<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/subject_classification_model_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install opendatasets library
!pip install opendatasets

In [3]:
# Import necessary libraries and functions
import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU,Dense,Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem.porter import PorterStemmer

In [None]:
# Fetch the NLTK resources used by the cleaning step: tokenizer models for
# word_tokenize and the stop-word lists.
# NOTE(review): recent NLTK releases load "punkt_tab" instead of the pickled
# "punkt" models, so both ids are requested; confirm against the installed
# NLTK version.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

In [5]:
# Shared helpers reused by the text-cleaning function
stemmer = PorterStemmer()  # Porter stemmer instance
English_stopwords = stopwords.words("english")  # NLTK's English stop-word list

In [None]:
# Download the Kaggle dataset into the working directory
DATASET_URL = "https://www.kaggle.com/datasets/mrutyunjaybiswal/iitjee-neet-aims-students-questions-data"
od.download(DATASET_URL)

In [None]:
# Read the questions CSV into a pandas DataFrame.
# The path is relative to the working directory the download cell wrote into,
# so the notebook is not tied to Colab's /content directory.
data = pd.read_csv("iitjee-neet-aims-students-questions-data/subjects-questions.csv")
data.head()

In [None]:
# Check the distribution of classes (number of rows per subject)
subject_counts = data['Subject'].value_counts()
subject_counts

In [None]:
# Define a function to clean texts
def clean_text(text):
  """Normalise a text into space-separated, stemmed content words.

  Steps: lower-case, replace every character outside a-z with a space,
  tokenize, drop English stop words, then Porter-stem the remaining tokens.

  Stop words are removed *before* stemming: the stop-word list holds
  unstemmed forms, so filtering already-stemmed tokens lets stems such as
  "thi" (this) or "wa" (was) slip through.

  Args:
    text: raw text. Non-string values (e.g. NaN from a missing cell) are
      treated as empty text instead of raising AttributeError.

  Returns:
    The cleaned text as a single string (possibly empty).
  """
  if not isinstance(text, str):
    return ''

  # lower-case, then blank out everything that is not a letter
  text = re.sub(r'[^a-z]', ' ', text.lower())

  # split into words
  tokens = word_tokenize(text)

  # remove stop words while the tokens are still in their dictionary form
  content_words = [word for word in tokens if word not in English_stopwords]

  # stem what is left and rebuild the text
  return ' '.join(stemmer.stem(word) for word in content_words)


# Clean every entry of the 'eng' column (the column is overwritten in place)
cleaned_questions = data['eng'].apply(clean_text)
data['eng'] = cleaned_questions
data.head()

In [10]:
# Build the feature/label arrays and hold out 10% of the rows for testing
encoder = LabelEncoder()
x = data['eng'].values
y = encoder.fit_transform(data['Subject'].values)  # subject names -> integer ids
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)


In [11]:
# One hot encoding
# Pin the width to the number of classes the label encoder was fitted on.
# Without num_classes, to_categorical infers the width from each array
# separately, so a split that happens to miss the highest class id would get
# a narrower matrix than the other one.
num_classes = len(encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [None]:
# Define the tokenizer and fit it on the training texts
NUM_WORDS = 10000
tok = Tokenizer(num_words=NUM_WORDS)
tok.fit_on_texts(x_train)
# word_index lists every word seen during fitting, but with num_words set the
# tokenizer only emits indices below NUM_WORDS, so the embedding never needs
# more rows than that.
vocab_size = min(NUM_WORDS, len(tok.word_index) + 1)

In [13]:
# Replace each text by its sequence of integer word indices
x_train, x_test = (
    tok.texts_to_sequences(texts) for texts in (x_train, x_test)
)


In [14]:
# Bring every sequence to a length of 80, padding at the end ("post")
pad_options = {"maxlen": 80, "padding": "post"}
x_train = pad_sequences(x_train, **pad_options)
x_test = pad_sequences(x_test, **pad_options)


In [None]:
# Step-decay learning rate schedule
def learning_rate_schedule(epoch):
    """Return the learning rate to use for the given epoch.

    The rate is 0.01 for epochs 0-4 and is then multiplied by 0.1 for every
    completed block of 5 epochs (0.001 for epochs 5-9, and so on).

    Args:
        epoch: epoch index.

    Returns:
        The learning rate for that epoch.
    """
    base_rate = 0.01  # rate used during the first five epochs
    shrink_factor = 0.1  # multiplier applied per completed block of 5 epochs
    if epoch >= 5:
        completed_blocks = epoch // 5
        return base_rate * np.power(shrink_factor, completed_blocks)
    return base_rate

# Create an EarlyStopping callback: stop once val_accuracy has not improved
# for 5 epochs and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

# Create a LearningRateScheduler callback driven by learning_rate_schedule
lr_scheduler = LearningRateScheduler(learning_rate_schedule)

# Size the output layer from the one-hot labels instead of hard-coding 4
num_classes = y_train.shape[1]

# Define and compile the model
model = Sequential()
# mask_zero=True makes the GRU skip the zeros appended by post-padding, so its
# final state comes from the last real token rather than from padding.
# input_length is omitted (deprecated in Keras 3); the input shape is inferred
# from the data on the first fit call.
model.add(Embedding(vocab_size, output_dim=70, mask_zero=True))
model.add(GRU(75, return_sequences=False))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model with both callbacks; 20% of the training data is held out
# for validation
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
    batch_size=32,
    verbose=1,
)


In [None]:
# Compute the compiled loss and metrics on the held-out test split
test_metrics = model.evaluate(x_test, y_test)
test_metrics

In [None]:
# Render the model architecture, with shapes and layer names, to model.png
plot_model(
    model,
    show_layer_names=True,
    show_shapes=True,
    to_file='model.png',
)

In [None]:
# Pull the per-epoch curves recorded by model.fit
train_loss, val_loss = history.history['loss'], history.history['val_loss']
train_accuracy, val_accuracy = history.history['accuracy'], history.history['val_accuracy']

# One panel per metric: training curve in red, validation curve in blue
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 6))
panels = [
    (loss_ax, train_loss, val_loss,
     'Training Loss', 'Validation Loss', 'Loss', 'Training and Validation Loss'),
    (accuracy_ax, train_accuracy, val_accuracy,
     'Training Accuracy', 'Validation Accuracy', 'Accuracy', 'Training and Validation Accuracy'),
]
for ax, train_curve, val_curve, train_label, val_label, y_label, title in panels:
    ax.plot(train_curve, label=train_label, c="red")
    ax.plot(val_curve, label=val_label, c="blue")
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [89]:
def subject_predictor(input_text, model, tok, max_length=80, label_encoder=None):
    """Predict the subject of a single piece of text.

    The text goes through the same steps as the training data: clean_text,
    tok.texts_to_sequences, then post-padding to max_length.

    Args:
        input_text: raw text to classify.
        model: trained model exposing ``predict``.
        tok: fitted tokenizer exposing ``texts_to_sequences``.
        max_length: length the sequence is padded to; should match the
            length used for training.
        label_encoder: optional fitted encoder exposing
            ``inverse_transform``. When given, the highest-scoring class is
            decoded back to its label.

    Returns:
        Without ``label_encoder``: the model's output row for the input (one
        score per class), as before. With ``label_encoder``: the decoded
        label of the highest-scoring class.
    """
    # Clean the input text
    cleaned_input = clean_text(input_text)

    # Tokenize the cleaned text using the fitted tokenizer (tok)
    input_sequence = tok.texts_to_sequences([cleaned_input])

    # Pad the sequence to a fixed length (max_length)
    padded_input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding="post")

    # The batch holds a single text, so keep only its row of scores
    class_scores = model.predict(padded_input_sequence)[0]

    if label_encoder is None:
        return class_scores

    # Decode the arg-max class id back to its label
    return label_encoder.inverse_transform([np.argmax(class_scores)])[0]


In [102]:
# Let us have some fun!
input_text='Can you give me some refrences in quantum mechanics?!'
class_scores = subject_predictor(input_text,model,tok,80)
# Report the subject name of the highest-scoring class along with the raw scores
predicted_subject = encoder.inverse_transform([np.argmax(class_scores)])[0]
print(predicted_subject, class_scores)