Importing libraries

In [1]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import Levenshtein
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

NLTK resources

In [2]:
# Download NLTK resources
# Install all required resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')  # The correct resource name
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rmkav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rmkav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rmkav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rmkav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Preprocessing

In [3]:
# Step 2: Text Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

Needed Functions

In [4]:
def get_wordnet_pos(treebank_tag):
    """Simplified POS mapping"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

def preprocess_text(text):
    """Updated preprocessing pipeline"""
    if not isinstance(text, str) or len(text.strip()) == 0:
        return ''

    # Case folding
    text = text.lower()
    # Remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove punctuation
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    # Tokenization
    tokens = word_tokenize(text)

    try:
        pos_tags = nltk.pos_tag(tokens)
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
        pos_tags = nltk.pos_tag(tokens)

    lemmatized = [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tags
        if token not in stop_words and len(token) > 1
    ]
    return ' '.join(lemmatized)

Load and Preprocessing

In [6]:
# from google.colab import drive
# drive.mount('/content/drive')

In [10]:
# Step 3: Load and Preprocess Data
df = pd.read_excel('dataset/data.xlsx')

In [11]:
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\rmkav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rmkav\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

Preprocess text fields

In [13]:
# Preprocess all text fields
df['processed_correct'] = df['correct_answer'].apply(preprocess_text)
df['processed_keywords'] = df['keywords'].apply(
    lambda x: ' '.join([preprocess_text(k) for k in x.split(',')]))
df['processed_student'] = df['student_answers'].apply(preprocess_text)

Combined reference creation (correct answer + keywords)

In [14]:
# Create combined reference (correct answer + keywords)
df['reference'] = df['processed_correct'] + ' ' + df['processed_keywords']

Similarity Scores

In [15]:
# Step 4: Feature Engineering (Similarity Scores)
# Load Word2Vec model for WMD (download and path setup required)
# word2vec_model = KeyedVectors.load_word2vec_format('path/to/GoogleNews-vectors-negative300.bin', binary=True)

def calculate_features(row):
    """Calculate all similarity features for a row"""
    ref = row['reference']
    stu = row['processed_student']

    # Cosine Similarity
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform([ref, stu])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    # Jaccard Similarity
    set_ref = set(ref.split())
    set_stu = set(stu.split())
    intersection = set_ref.intersection(set_stu)
    union = set_ref.union(set_stu)
    jaccard = len(intersection)/len(union) if union else 0

    # Word Mover's Distance (Requires Word2Vec model)
    wmd = 0  # Initialize with default value
    # if word2vec_model:
    #     wmd = word2vec_model.wmdistance(ref.split(), stu.split())

    # Levenshtein Distance
    lev = Levenshtein.distance(ref, stu)

    # WordNet Similarity (Simplified version)
    def wordnet_sim(text1, text2):
        # ... (implementation from previous explanation)
        return 0.5  # Placeholder

    wordnet_s = wordnet_sim(ref, stu)

    # BLEU Score
    smooth = SmoothingFunction().method1
    bleu = sentence_bleu([ref.split()], stu.split(), smoothing_function=smooth)

    return [cosine_sim, jaccard, wmd, lev, wordnet_s, bleu]

Feature Calculation

In [16]:
# Apply feature calculation
features = df.apply(calculate_features, axis=1, result_type='expand')
features.columns = ['cosine', 'jaccard', 'wmd', 'levenshtein', 'wordnet', 'bleu']

Data normalization

In [17]:
# Step 5: Data Normalization
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

Splitting dataset

In [18]:
# Step 6: Train-Test Split
X = scaled_features
y = df['student_marks'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Build and train the model

In [19]:
# Step 7: Build and Train Deep Learning Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(6,)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

In [20]:
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

In [21]:
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Evaluation of model

In [22]:
# Step 8: Evaluation
loss, mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test MAE: {mae:.4f}')

Test Loss: 2.7210
Test MAE: 1.3003


In [23]:
sample_input = scaler.transform([[
    0.8,  # cosine
    0.75, # jaccard
    0.2,  # wmd (normalized)
    0.3,  # levenshtein (normalized)
    0.6,  # wordnet
    0.4   # bleu
]])



In [24]:
prediction = model.predict(sample_input)
print(f'Predicted Marks: {prediction[0][0]:.2f}')

Predicted Marks: 6.61


In [25]:

import pandas as pd

def predict_marks(correct_answer, keywords, student_answer, model, scaler):
    """
    Predicts marks based on the correct answer, keywords, and student answer.

    Parameters:
        correct_answer (str): The correct answer.
        keywords (list): List of important keywords.
        student_answer (str): The student's response.
        model (tf.keras.Model): The trained deep learning model.
        scaler (MinMaxScaler): The scaler used for feature normalization.

    Returns:
        float: Predicted marks for the student's answer.
    """

    # Preprocess inputs
    processed_correct = preprocess_text(correct_answer)
    processed_keywords = ' '.join([preprocess_text(k) for k in keywords])
    processed_student = preprocess_text(student_answer)
    reference = processed_correct + ' ' + processed_keywords

    # Compute similarity features
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform([reference, processed_student])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

    set_ref = set(reference.split())
    set_stu = set(processed_student.split())
    jaccard = len(set_ref.intersection(set_stu)) / len(set_ref.union(set_stu)) if set_ref.union(set_stu) else 0

    lev = Levenshtein.distance(reference, processed_student)

    def wordnet_sim(text1, text2):
        return 0.5  # Placeholder for actual implementation

    wordnet_s = wordnet_sim(reference, processed_student)

    smooth = SmoothingFunction().method1
    bleu = sentence_bleu([reference.split()], processed_student.split(), smoothing_function=smooth)

    # Prepare feature array as DataFrame with column names
    feature_data = pd.DataFrame([[cosine_sim, jaccard, 0, lev, wordnet_s, bleu]],
                                columns=['cosine', 'jaccard', 'wmd', 'levenshtein', 'wordnet', 'bleu'])

    # Scale the features
    scaled_features = scaler.transform(feature_data)

    # Predict marks
    prediction = model.predict(scaled_features)
    return round(float(prediction[0][0]), 2)

In [26]:
correct_answer = "The mitochondria is the powerhouse of the cell."
keywords = ["mitochondria", "powerhouse", "cell"]
student_answer = "Mitochondria is the energy producer in a cell."

predicted_marks = predict_marks(correct_answer, keywords, student_answer, model, scaler)
print(f"Predicted Marks: {predicted_marks}")

Predicted Marks: 6.76


In [27]:
# !pip install flask flask-cors

saving the mdoel

In [28]:
import pickle

# Save in SavedModel format
model.save("flask_app/model/saved_model")

pickle.dump(scaler, open("flask_app/model/scaler.pkl", "wb"))

INFO:tensorflow:Assets written to: flask_app/model/saved_model\assets


or can use latest keras format

In [29]:
# # Save model in the newer .keras format
# model.save("flask_app/model/model.keras")

# # Save scaler with pickle (this part remains the same)
# pickle.dump(scaler, open("flask_app/model/scaler.pkl", "wb"))
