In [4]:
import pandas as pd
import zipfile
import os

# Define the path to the zip file
zip_file_path = r"/subjects-questions.csv.zip"
extracted_dir = "extracted_data"
csv_file_name = "subjects-questions.csv"
csv_file_path = os.path.join(extracted_dir, csv_file_name)

# Create a directory to extract the contents
os.makedirs(extracted_dir, exist_ok=True)

# Extract the CSV file from the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extract(csv_file_name, extracted_dir)

# creating a dataframe for our dataset "neet-jee questions"
subject_df = pd.read_csv(csv_file_path)

display(subject_df.head())

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
1,"Among the following organic acids, the acid pr...",Chemistry
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
4,Which of the following statement\nregarding tr...,Physics


In [5]:
subject_count = subject_df['Subject'].value_counts()
print(subject_count)

Subject
Physics      38438
Chemistry    37767
Maths        33190
Biology      13124
Name: count, dtype: int64


In [6]:
#data preprocessing
%pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (113 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.9/113.9 kB[0m 

In [7]:
import contractions

#Expanding contractions such as I'll to I will etc.
def expand_contractions(s):
    return contractions.fix(s)

In [8]:
#reducing the size of dataset and extracting physics questions
df_physics = subject_df[subject_df['Subject']=='Physics'].sample(n = 1500)
balanced_df = pd.concat([df_physics])
balanced_df = balanced_df.sample(frac = 1)
balanced_df.reset_index(drop=True, inplace=True)
subject_df_clean = balanced_df.copy()
subject_df_clean.rename(columns = {'eng':'Questions'}, inplace = True)
print(subject_df_clean)


                                              Questions  Subject
0     The adjoining diagram shows the\nspectral ener...  Physics
1     The energy band gap (distance between the cond...  Physics
2     Vertical displacement of a plank with a body o...  Physics
3     When you heat the water in a pot, it\nboils. W...  Physics
4     Match list A and list B accurately:\nList \( \...  Physics
...                                                 ...      ...
1495                  Temperature is measured in degree  Physics
1496  The horizontal component of earth's magnetic f...  Physics
1497  A circuit element shown in the figure as\nbox ...  Physics
1498  Assertion\nWe can live very happily if the fri...  Physics
1499  The potential energy function for\na particle ...  Physics

[1500 rows x 2 columns]


In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
def lemmatize_sentence(sentence):
    doc = nlp(sentence)
    # Create list of tokens from given string
    tokens = []
    for token in doc:
        tokens.append(token)
    lemmatized_sentence = " ".join([token.lemma_ for token in doc])
    return lemmatized_sentence


import re
from nltk.corpus import stopwords

def clean_data(clean_sentence):

    clean_sentence = clean_sentence.lower()  #converting to lowercase
    clean_sentence = re.sub(r'[^\w\s]', '', clean_sentence)  #removing punctuations like "/", ";" "[", "]" "=", "#" etc.
    clean_sentence = re.sub(r'\s', ' ', clean_sentence) #removing extra spaces
    clean_sentence = expand_contractions(clean_sentence)
    clean_sentence = clean_sentence.split()  #tokenization: splitting sentence into words

    stop_words = set(stopwords.words('english'))
    clean_lst = []  #removing stop words like if, but, or etc. and removing characters of length 1
    for word in clean_sentence:
        if word not in stop_words and len(word) > 1:
          clean_lst.append(word)
    return lemmatize_sentence(' '.join(clean_lst))

subject_df_clean['cleaned_question'] = subject_df_clean['Questions'].apply(clean_data)

sentence = subject_df_clean['Questions'][84]
print(sentence)
sentence = clean_data(sentence)
print("\n\n")
print(sentence)

The potential energy \( U \) for a force field \( \vec{F} \) is such that \( U=-K x y, \) where \( K \) is
a constant. Then
\( \mathbf{A} \cdot \vec{F}=K y \hat{i}+K x \hat{j} \)
В . \( \vec{F}=K x \hat{i}+K y \hat{j} \)
c. \( \bar{F} \) is a conservative force
D. \( \vec{F} \) is a non-conservation force



potential energy force field vecf uk constant mathbfa cdot vecfk hatik hatj vecfk hatik hatj barf conservative force vecf nonconservation force


In [10]:
import spacy.cli
spacy.cli.download("en_core_web_sm")


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
keyword_topic_map = {
    # Easy Topics
    "Kinematics": ["motion", "velocity", "displacement", "acceleration", "graph", "uniform", "non-uniform", "rest"],
    "Optics": ["mirror", "lens", "refraction", "reflection", "focal", "image", "optical", "light"],

    #  Medium Topics
    "Mechanics": ["force", "friction", "newton", "mass", "energy", "momentum", "projectile", "work", "power", "torque"],
    "Thermodynamics": ["temperature", "heat", "gas", "entropy", "internal energy", "expansion", "laws of thermodynamics"],

    # Hard Topics
    "Electricity": ["current", "voltage", "resistance", "ohm", "circuit", "capacitor", "inductor", "charge", "battery"],
    "Modern Physics": ["quantum", "photoelectric", "relativity", "nuclear", "radioactivity", "atom", "isotope", "electron", "de broglie", "dual nature", "planck", "wave-particle", "bohr"
    ]
}

def classify_topic(question):
    question = question.lower()
    for topic, keywords in keyword_topic_map.items():
        for kw in keywords:
            if kw in question:
                return topic
    return "Other"

subject_df_clean['topic'] = subject_df_clean['cleaned_question'].apply(classify_topic)

#defining difficulty level according to topics
difficulty_map = {
    "Kinematics": "Easy",
    "Optics": "Easy",
    "Mechanics": "Medium",
    "Thermodynamics": "Medium",
    "Electricity": "Hard",
    "Modern Physics": "Hard",
    "Other": "Unknown"
}

subject_df_clean['Topic_Difficulty'] = subject_df_clean['Topic'].map(difficulty_map)
print(subject_df_clean['Topic'].value_counts())





Topic
Other             381
Kinematics        331
Mechanics         320
Electricity       182
Optics            167
Thermodynamics     91
Modern Physics     28
Name: count, dtype: int64


In [20]:
import random

# Simulating student data for Physics quiz system
subject_df_clean['student_id'] = [random.randint(1001, 1100) for _ in range(len(subject_df_clean))]
subject_df_clean['quiz_attempt'] = [random.randint(1, 10) for _ in range(len(subject_df_clean))]
subject_df_clean['score'] = [random.randint(20, 100) for _ in range(len(subject_df_clean))]
subject_df_clean['time_taken'] = [random.randint(30, 300) for _ in range(len(subject_df_clean))]

# Difficulty level based on score
subject_df_clean['difficulty'] = subject_df_clean['score'].apply(
    lambda s: 'Easy' if s < 40 else ('Medium' if s < 70 else 'Hard')
)

# Rule-based recommendation of next topic
def recommend_next(row):
    if row['score'] < 40:
        return row['topic'].lower()  # Retry the same topic, ensure lowercase
    elif row['score'] < 70:
        return random.choice(['mechanics', 'optics','electricity']).lower() # Ensure lowercase
    else:
        return random.choice(['thermodynamics', 'kinematics']).lower() # Ensure lowercase

subject_df_clean['recommended_next_topic'] = subject_df_clean.apply(recommend_next, axis=1)

In [52]:
#AI Engine
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Step 1: Label Encoding
le_topic = LabelEncoder()
le_diff = LabelEncoder()
le_next = LabelEncoder()

subject_df_clean['topic_enc'] = le_topic.fit_transform(subject_df_clean['Topic'])
subject_df_clean['diff_enc'] = le_diff.fit_transform(subject_df_clean['difficulty'])
subject_df_clean['target_enc'] = le_next.fit_transform(subject_df_clean['recommended_next_topic'])

# Step 2: Define features and target
X = subject_df_clean[['topic_enc', 'diff_enc', 'score', 'quiz_attempt', 'time_taken']]
y = subject_df_clean['target_enc']

# Step 3: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Evaluate model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Get unique labels from y_test and y_pred
unique_labels = np.unique(np.concatenate((y_test, y_pred)))

# Get target names corresponding to the unique labels
target_names_subset = le_next.inverse_transform(unique_labels)

print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names_subset))

Accuracy: 0.5566666666666666

Classification Report:
                 precision    recall  f1-score   support

   electricity       0.41      0.46      0.44        41
    kinematics       0.67      0.56      0.61        87
     mechanics       0.59      0.53      0.56        57
        optics       0.45      0.46      0.46        41
         other       1.00      1.00      1.00        19
thermodynamics       0.45      0.56      0.50        55

      accuracy                           0.56       300
     macro avg       0.60      0.60      0.59       300
  weighted avg       0.57      0.56      0.56       300



In [25]:
# testing the model
sample_input = [[2, 1, 65, 3, 120]]  # topic_enc, diff_enc, score, quiz_attempt, time_taken
predicted_topic_enc = model.predict(sample_input)[0]
predicted_topic = le_next.inverse_transform([predicted_topic_enc])[0]
print("Predicted Recommended Topic:", predicted_topic)


Predicted Recommended Topic: mechanics




In [37]:
import pandas as pd
all_topics = subject_df_clean['topic'].unique()
all_topics_enc = le_topic.transform(all_topics)
difficulty_levels = ['Easy', 'Medium', 'Hard']
difficulty_enc = le_diff.transform(difficulty_levels)
test_data = []
for topic_enc, topic_name in zip(all_topics_enc, all_topics):
    for diff_name, diff_enc_val in zip(difficulty_levels, difficulty_enc):
        test_data.append({
            'topic': topic_name,
            'topic_enc': topic_enc,
            'diff_enc': diff_enc_val,
            'score': 60 if diff_name == 'Medium' else (35 if diff_name == 'Easy' else 85),
            'quiz_attempt': 2,
            'time_taken': 100
        })
test_df = pd.DataFrame(test_data)
X_test = test_df[['topic_enc', 'diff_enc', 'score', 'quiz_attempt', 'time_taken']]
test_df['predicted_topic_enc'] = model.predict(X_test)
test_df['predicted_recommendation'] = le_next.inverse_transform(test_df['predicted_topic_enc'])
result_df = test_df[['topic', 'score', 'diff_enc', 'predicted_recommendation']]
print(result_df.to_string(index=False))


         topic  score  diff_enc predicted_recommendation
     Mechanics     35         0                mechanics
     Mechanics     60         2              electricity
     Mechanics     85         1           thermodynamics
    Kinematics     35         0               kinematics
    Kinematics     60         2              electricity
    Kinematics     85         1           thermodynamics
        Optics     35         0                   optics
        Optics     60         2              electricity
        Optics     85         1               kinematics
         Other     35         0                    other
         Other     60         2              electricity
         Other     85         1               kinematics
   Electricity     35         0              electricity
   Electricity     60         2              electricity
   Electricity     85         1           thermodynamics
Thermodynamics     35         0           thermodynamics
Thermodynamics     60         2

In [40]:
# creating quizes per topic
subject_df_clean['quiz_id'] = ['QZ' + str(i).zfill(4) for i in range(len(subject_df_clean))]
subject_df_clean['quiz_question'] = subject_df_clean['Questions']

quiz_bank = {
    topic: group[['quiz_id', 'quiz_question']].sample(min(10, len(group)))
    for topic, group in subject_df_clean.groupby('Topic')
}

In [48]:
def recommend_next_quiz_set(row):
    input_row = pd.DataFrame([{
        'topic_enc': le_topic.transform([row['Topic']])[0],
        'diff_enc': le_diff.transform([row['difficulty']])[0],
        'score': row['score'],
        'quiz_attempt': row['quiz_attempt'],
        'time_taken': row['time_taken']
    }])

    # Predict
    predicted_topic = le_next.inverse_transform([model.predict(input_row)[0]])[0]

    # Rule-based
    if row['score'] < 40:
        predicted_topic = row['Topic']
        message = f"😐 You need more practice. Retrying topic: {predicted_topic}."
    elif row['score'] < 70:
        message = f"🙂 You're doing okay. Let's try a Medium level topic: {predicted_topic}."
    else:
        message = f"🔥 Great job! Advancing to a harder topic: {predicted_topic}."

    # Get quiz set
    quiz_set = quiz_bank.get(predicted_topic, pd.DataFrame())

    return {
        "Next Topic": predicted_topic,
        "Message": message,
        "Quiz Set": quiz_set
    }


In [50]:
student = subject_df_clean.iloc[0]
result = recommend_next_quiz_set(student)

print(result['Message'])
print(f"\n📘 Recommended Quiz on: {result['Next Topic']}")
for i, q in result['Quiz Set'].iterrows():
    print(f"{i+1}. {q['quiz_question']}")


🔥 Great job! Advancing to a harder topic: thermodynamics.

📘 Recommended Quiz on: thermodynamics


In [53]:
#evaluating the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5566666666666666
Confusion Matrix:
 [[19  0 12 10  0  0]
 [ 0 49  0  0  0 38]
 [14  0 30 13  0  0]
 [13  0  9 19  0  0]
 [ 0  0  0  0 19  0]
 [ 0 24  0  0  0 31]]
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.46      0.44        41
           1       0.67      0.56      0.61        87
           2       0.59      0.53      0.56        57
           4       0.45      0.46      0.46        41
           5       1.00      1.00      1.00        19
           6       0.45      0.56      0.50        55

    accuracy                           0.56       300
   macro avg       0.60      0.60      0.59       300
weighted avg       0.57      0.56      0.56       300



In [58]:
for i in range(10):
    student = subject_df_clean.iloc[i]
    result = recommend_next_quiz_set(student)
    print(f"\nStudent {student['student_id']}: {result['Message']}")
    print(f"Recommended Topic: {result['Next Topic']}")



Student 1049: 🔥 Great job! Advancing to a harder topic: thermodynamics.
Recommended Topic: thermodynamics

Student 1053: 🔥 Great job! Advancing to a harder topic: thermodynamics.
Recommended Topic: thermodynamics

Student 1017: 🔥 Great job! Advancing to a harder topic: kinematics.
Recommended Topic: kinematics

Student 1053: 🔥 Great job! Advancing to a harder topic: thermodynamics.
Recommended Topic: thermodynamics

Student 1025: 🙂 You're doing okay. Let's try a Medium level topic: optics.
Recommended Topic: optics

Student 1074: 🔥 Great job! Advancing to a harder topic: thermodynamics.
Recommended Topic: thermodynamics

Student 1001: 😐 You need more practice. Retrying topic: Kinematics.
Recommended Topic: Kinematics

Student 1041: 😐 You need more practice. Retrying topic: Other.
Recommended Topic: Other

Student 1042: 🙂 You're doing okay. Let's try a Medium level topic: optics.
Recommended Topic: optics

Student 1080: 🙂 You're doing okay. Let's try a Medium level topic: electricity.
