<a href="https://colab.research.google.com/github/Nehasatheesh04/ASAG/blob/main/ASAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the ASAG dataset from the mounted Google Drive.
file_path = "/content/drive/MyDrive/asag_dataset.csv"
df = pd.read_csv(file_path)

# df.info() prints its summary and returns None, so pairing it with df.head()
# in a tuple makes the cell render "(None, <plain-text frame>)".
# Call it on its own and leave df.head() as the last expression so the
# notebook shows the rich table preview.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646 entries, 0 to 645
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           646 non-null    int64  
 1   question             646 non-null    object 
 2   student_answer       607 non-null    object 
 3   grades_round         646 non-null    int64  
 4   student_modified     606 non-null    object 
 5   ref_answer           646 non-null    object 
 6   qn_modified          646 non-null    object 
 7   ref_modified         646 non-null    object 
 8   student_demoted      606 non-null    object 
 9   ref_demoted          646 non-null    object 
 10  length_ratio         646 non-null    float64
 11  embed_ref            646 non-null    object 
 12  embed_stud           646 non-null    object 
 13  embed_ref_demoted    646 non-null    object 
 14  embed_stud_demoted   646 non-null    object 
 15  aligned              646 non-null    obj

(None,
    Unnamed: 0                                           question  \
 0           0   Give a definition for the term "artificial ne...   
 1           1   Give a definition for the term "artificial ne...   
 2           2   Give a definition for the term "artificial ne...   
 3           3   Give a definition for the term "artificial ne...   
 4           4   Give a definition for the term "artificial ne...   
 
                                       student_answer  grades_round  \
 0  An artificial neural network is a massively pa...             2   
 1  Artificial neural network consists of: . Large...             2   
 2  An artificial neural network is a massive dist...             1   
 3  An ANN is a layered graphical model containing...             2   
 4  Artificial Neural Networks are large parallel ...             2   
 
                                     student_modified  \
 0  artificial neural network massively parallel d...   
 1  artificial neural network consi

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score



# Ensure required columns exist
required_columns = {'ref_answer', 'student_answer', 'grades_round'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Missing columns: {missing_columns}")

# Extract necessary columns.
# Missing answers become empty strings: a bare astype(str) would turn NaN into
# the literal text "nan" and treat it as something the student wrote.
ref_answers = df['ref_answer'].fillna("").astype(str).values
stud_answers = df['student_answer'].fillna("").astype(str).values
grades = df['grades_round'].astype(float).values  # Convert grades to float

# Initialize TF-IDF vectorizer (default norm="l2": every non-empty row has unit length)
vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the reference answers, then project the
# student answers into the same space.
ref_tfidf = vectorizer.fit_transform(ref_answers)
stud_tfidf = vectorizer.transform(stud_answers)

# Cosine similarity of each (reference, student) pair.
# Rows are already L2-normalised, so the row-wise dot product IS the cosine
# similarity; this avoids building the full n x n similarity matrix only to
# read its diagonal. Empty rows stay all-zero and score 0.
cos_sim = np.asarray(ref_tfidf.multiply(stud_tfidf).sum(axis=1)).reshape(-1, 1)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(cos_sim, grades, test_size=0.2, random_state=42)

# Train a Linear Regression model on the single similarity feature
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model.
# NOTE: "accuracy" here is 1 - MAE / mean(grade), a relative-error score,
# not a classification accuracy.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
accuracy = (1 - (mae / np.mean(y_test))) * 100

print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"Accuracy: {accuracy:.2f}%")


Mean Absolute Error (MAE): 0.529
R² Score: 0.209
Accuracy: 61.77%


LSTM


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/asag_dataset.csv")  # Update filename if necessary

# Ensure required columns exist
required_columns = {'ref_answer', 'student_answer', 'grades_round'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Missing columns: {missing_columns}")

# Extract necessary columns.
# Missing answers become empty strings: a bare astype(str) would turn NaN into
# the literal token "nan".
ref_answers = df['ref_answer'].fillna("").astype(str).values
stud_answers = df['student_answer'].fillna("").astype(str).values
grades = df['grades_round'].astype(float).values  # Convert grades to float

# Tokenization: one shared vocabulary for reference and student answers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((ref_answers, stud_answers), axis=0))

# Convert text to sequences of vocabulary indices
ref_sequences = tokenizer.texts_to_sequences(ref_answers)
stud_sequences = tokenizer.texts_to_sequences(stud_answers)

# Pad every answer to the length of the longest answer in the corpus
answer_length = max(max(len(seq) for seq in ref_sequences), max(len(seq) for seq in stud_sequences))
ref_padded = pad_sequences(ref_sequences, maxlen=answer_length, padding='post')
stud_padded = pad_sequences(stud_sequences, maxlen=answer_length, padding='post')

# Model input: reference tokens followed by student tokens.
# Vocabulary indices are arbitrary labels, so subtracting them (the previous
# |ref - student| feature) yields the index of an unrelated word that the
# Embedding layer would then look up. Concatenation keeps both answers intact.
pair_padded = np.concatenate([ref_padded, stud_padded], axis=1)
max_length = pair_padded.shape[1]  # length of the sequence fed to the model

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(pair_padded, grades, test_size=0.2, random_state=42)

# Define LSTM Model
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
embedding_dim = 50
model = Sequential([
    # `input_length` is omitted: it is deprecated in current Keras and the
    # layer infers the sequence length from the data.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(64, return_sequences=True),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # Regression output
])

# Compile Model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train Model.
# NOTE: the test split doubles as validation data for monitoring only; nothing
# is selected on it, but a separate validation split would be cleaner.
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))

# Predict and evaluate (flatten the (n, 1) prediction column to match y_test)
y_pred = model.predict(X_test).ravel()
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# NOTE: 1 - MAE / mean(grade) is a relative-error score, not a classification accuracy.
accuracy = (1 - (mae / np.mean(y_test))) * 100  # Compute accuracy as a percentage

print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"Accuracy: {accuracy:.2f}%")


Epoch 1/10




[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 115ms/step - loss: 1.2643 - mae: 0.9356 - val_loss: 0.5142 - val_mae: 0.6433
Epoch 2/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 138ms/step - loss: 0.5806 - mae: 0.6713 - val_loss: 0.4575 - val_mae: 0.6120
Epoch 3/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 101ms/step - loss: 0.5173 - mae: 0.6421 - val_loss: 0.4317 - val_mae: 0.5891
Epoch 4/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 98ms/step - loss: 0.4401 - mae: 0.5863 - val_loss: 0.4038 - val_mae: 0.5667
Epoch 5/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 138ms/step - loss: 0.3583 - mae: 0.5179 - val_loss: 0.5071 - val_mae: 0.5574
Epoch 6/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 100ms/step - loss: 0.3000 - mae: 0.4279 - val_loss: 0.4104 - val_mae: 0.5351
Epoch 7/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 98ms/step - loss: 0.2215 

Improved LSTM: bidirectional LSTM with pretrained GloVe word embeddings


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-03-30 20:39:38--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-30 20:39:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-30 20:39:39--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/asag_dataset.csv")  # Update filename if necessary

# Ensure required columns exist
required_columns = {'ref_answer', 'student_answer', 'grades_round'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Missing columns: {missing_columns}")

# Extract necessary columns.
# Missing answers become empty strings: a bare astype(str) would turn NaN into
# the literal token "nan".
ref_answers = df['ref_answer'].fillna("").astype(str).values
stud_answers = df['student_answer'].fillna("").astype(str).values
grades = df['grades_round'].astype(float).values  # Convert grades to float

# Tokenization: one shared vocabulary for reference and student answers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((ref_answers, stud_answers), axis=0))

# Convert text to sequences of vocabulary indices
ref_sequences = tokenizer.texts_to_sequences(ref_answers)
stud_sequences = tokenizer.texts_to_sequences(stud_answers)

# Pad every answer to the length of the longest answer in the corpus
answer_length = max(max(len(seq) for seq in ref_sequences), max(len(seq) for seq in stud_sequences))
ref_padded = pad_sequences(ref_sequences, maxlen=answer_length, padding='post')
stud_padded = pad_sequences(stud_sequences, maxlen=answer_length, padding='post')

# Model input: reference tokens followed by student tokens.
# Vocabulary indices are arbitrary labels, so subtracting them (the previous
# |ref - student| feature) yields the index of an unrelated word whose
# embedding would then be looked up. Concatenation keeps both answers intact.
pair_padded = np.concatenate([ref_padded, stud_padded], axis=1)

# `max_length` is the length of the sequence fed to the model (reference part
# plus student part), so any model sized with it matches the features.
max_length = pair_padded.shape[1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(pair_padded, grades, test_size=0.2, random_state=42)

# Load GloVe word embeddings
def load_glove_embeddings(filepath, tokenizer, embedding_dim):
    """Build an embedding matrix for the tokenizer's vocabulary from a GloVe text file.

    Each non-blank line of the file is expected to be ``word v1 v2 ... vN``
    separated by whitespace.

    Args:
        filepath: Path to the GloVe vectors file (read as UTF-8).
        tokenizer: Object exposing ``word_index``, a mapping word -> row index
            (row 0 is left for padding).
        embedding_dim: Number of components per vector in the file.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` float array. Rows of words
        that do not appear in the file stay all-zero.

    Raises:
        ValueError: If a vocabulary word's vector in the file does not have
            ``embedding_dim`` components (e.g. the wrong GloVe file was given).
    """
    word_index = tokenizer.word_index
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file).
                continue

            # Only vocabulary words are needed; skipping the rest avoids
            # parsing and holding every vector of the file in memory.
            row = word_index.get(values[0])
            if row is None:
                continue

            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"Vector for {values[0]!r} has {len(values) - 1} components, "
                    f"expected {embedding_dim}"
                )
            # If a word occurs more than once in the file, the last line wins.
            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

embedding_dim = 100  # Use GloVe 100D
embedding_matrix = load_glove_embeddings("glove.6B.100d.txt", tokenizer, embedding_dim)

# Define Improved LSTM Model
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
model = Sequential([
    # The pretrained vectors are injected through a Constant initializer and
    # frozen. The legacy `weights=` / `input_length=` keyword arguments are
    # avoided: they are deprecated (or rejected) by current Keras releases,
    # and the sequence length is inferred from the data anyway.
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dense(1, activation='linear')  # Regression output
])

# Compile Model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Train Model.
# NOTE: the test split doubles as validation data for monitoring only; nothing
# is selected on it, but a separate validation split would be cleaner.
model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_test, y_test))

# Predict and evaluate (flatten the (n, 1) prediction column to match y_test)
y_pred = model.predict(X_test).ravel()
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# NOTE: 1 - MAE / mean(grade) is a relative-error score, not a classification accuracy.
accuracy = (1 - (mae / np.mean(y_test))) * 100  # Compute accuracy as a percentage

print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"Accuracy: {accuracy:.2f}%")




Epoch 1/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 780ms/step - loss: 1.0668 - mae: 0.8551 - val_loss: 0.5060 - val_mae: 0.5918
Epoch 2/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 832ms/step - loss: 0.5681 - mae: 0.6020 - val_loss: 0.4937 - val_mae: 0.5815
Epoch 3/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 786ms/step - loss: 0.4979 - mae: 0.5790 - val_loss: 0.4075 - val_mae: 0.5377
Epoch 4/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 532ms/step - loss: 0.5269 - mae: 0.6086 - val_loss: 0.3998 - val_mae: 0.5314
Epoch 5/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 788ms/step - loss: 0.4920 - mae: 0.5909 - val_loss: 0.3841 - val_mae: 0.5212
Epoch 6/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 756ms/step - loss: 0.4686 - mae: 0.5863 - val_loss: 0.3792 - val_mae: 0.5130
Epoch 7/15
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 777ms/

 Pretrained BERT-Based ASAG

In [None]:
from huggingface_hub import login

# Interactive Hugging Face Hub sign-in (the saved output of this cell is a
# login widget). NOTE(review): presumably not required for the public
# "bert-base-uncased" checkpoint used below - confirm before keeping this cell.
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Sanity check: print the Hugging Face account the CLI is signed in as.
# NOTE(review): the saved output contains the account name - clear it before sharing the notebook.
!huggingface-cli whoami


SimplyNeha


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/asag_dataset.csv")  # Ensure correct filename

# Ensure required columns exist
required_columns = {'ref_answer', 'student_answer', 'grades_round'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Missing columns: {missing_columns}")

# Extract necessary columns.
# Some student answers are missing (607 of 646 are non-null in the dataset
# summary). Encode them as empty strings: a bare astype(str) would feed the
# literal word "nan" to BERT.
ref_answers = df['ref_answer'].fillna("").astype(str).tolist()
stud_answers = df['student_answer'].fillna("").astype(str).tolist()
grades = df['grades_round'].astype(float).values  # Convert grades to float

# Load Pretrained BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # feature extraction only: make sure dropout is disabled

# Function to extract BERT embeddings
def get_bert_embeddings(text_list, bert_tokenizer=None, bert_model=None, batch_size=16):
    """Return the [CLS] embedding of every text as a 2-D NumPy array.

    Args:
        text_list: Iterable of strings to embed.
        bert_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer``; pass it explicitly if that name has been rebound.
        bert_model: Encoder to use. Defaults to the notebook-level ``model``;
            pass it explicitly if that name has been rebound.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        Array of shape ``(len(text_list), hidden_size)``; an empty 1-D array
        when ``text_list`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Fall back to the notebook globals only when nothing was passed in.
    tok = tokenizer if bert_tokenizer is None else bert_tokenizer
    encoder = model if bert_model is None else bert_model

    texts = list(text_list)
    if not texts:
        return np.array([])

    # Run on the encoder's device when it exposes one.
    device = getattr(encoder, "device", None)

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pad within the batch; inputs longer than 512 tokens are truncated.
        inputs = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = encoder(**inputs)
        # Position 0 of the last hidden state is the [CLS] token.
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        chunks.append(cls_embeddings.detach().cpu().numpy())

    return np.concatenate(chunks, axis=0)

# Extract BERT embeddings
ref_embeddings = get_bert_embeddings(ref_answers)
stud_embeddings = get_bert_embeddings(stud_answers)

# Compute absolute difference between embeddings (distance measure)
X = np.abs(ref_embeddings - stud_embeddings)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, grades, test_size=0.2, random_state=42)

# Train Linear Regression Model.
# The regressor gets its own name: `model` is the BERT encoder that
# get_bert_embeddings relies on, and rebinding it to a LinearRegression makes
# any later embedding call in this session fail.
# NOTE(review): there is one feature per embedding dimension, which presumably
# exceeds the number of training rows (646 rows in total); an unregularised
# fit is likely to overfit - confirm and consider a regularised model.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict and Evaluate
y_pred = regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# NOTE: 1 - MAE / mean(grade) is a relative-error score, not a classification accuracy.
accuracy = (1 - (mae / np.mean(y_test))) * 100  # Compute accuracy

# Display Results
print(f"Mean Absolute Error (MAE): {mae:.3f}")
print(f"R² Score: {r2:.3f}")
print(f"Accuracy: {accuracy:.2f}%")


Mean Absolute Error (MAE): 0.696
R² Score: -0.756
Accuracy: 49.70%


 Fine-Tune BERT for ASAG

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Read the ASAG dataset from the mounted Drive
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/asag_dataset.csv")

# Tokenizer belonging to the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")

# Tokenization function
def tokenize_data(ref_texts, stud_texts, tokenizer, max_len=256):
    """Encode (reference, student) answer pairs with ``tokenizer``.

    Both inputs are materialised as lists and passed to the tokenizer as the
    first and second sequence of each pair. The tokenizer is asked to pad to
    ``max_len``, to truncate only the second sequence, and to return "pt"
    tensors; its result is returned unchanged.
    """
    first_segments = list(ref_texts)
    second_segments = list(stud_texts)
    encode_options = {
        "padding": 'max_length',
        "truncation": "only_second",
        "max_length": max_len,
        "return_tensors": "pt",
    }
    return tokenizer(first_segments, second_segments, **encode_options)

# Tokenize both reference and student answers.
# Missing answers are encoded as empty strings; a bare astype(str) would turn
# NaN into the literal word "nan".
tokenized_data = tokenize_data(
    df['ref_answer'].fillna("").astype(str),
    df['student_answer'].fillna("").astype(str),
    tokenizer,
)

input_ids = tokenized_data['input_ids']
# Kept for inspection only: the training/evaluation loops in this notebook
# rebuild the mask themselves as `input_ids > 0`.
attention_mask = tokenized_data['attention_mask']

# Features and targets for the split. They are defined here explicitly:
# `y` was never assigned anywhere, and `X` would otherwise be whatever an
# earlier cell left behind, so the cell failed on a fresh kernel.
X = input_ids
y = torch.tensor(df['grades_round'].astype(float).values, dtype=torch.float32)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define Fine-tuned BERT Model
class BertRegressionModel(nn.Module):
    """BERT encoder ("bert-base-uncased") followed by a one-unit linear regression head."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # The head maps the encoder's hidden size to a single regression output.
        hidden_size = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Apply the regression head to the encoder's pooled output and return the result."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.regressor(encoded.pooler_output)

# Pick the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the model and move it to the selected device
model = BertRegressionModel()
model = model.to(device)

# Mean-squared-error objective, optimised with Adam at a learning rate of 2e-5
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

# Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=15):
    """Train ``model`` on ``train_loader`` and print the mean loss of every epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one value per sample (shape ``(batch, 1)``).
        train_loader: Iterable of ``(input_ids, labels)`` batches.
        criterion: Loss applied to ``(predictions, labels)``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``train_loader``.
    """
    # Run on the device the model already lives on instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for input_ids, labels in train_loader:
            input_ids = input_ids.to(run_device)
            labels = labels.to(run_device)

            optimizer.zero_grad()
            # The mask marks every position whose token id is non-zero.
            attention_mask = (input_ids > 0).long()
            # squeeze(-1) drops only the output dimension; a bare squeeze()
            # would also collapse a batch of size 1 to a 0-d tensor that no
            # longer matches the shape of `labels`.
            predictions = model(input_ids=input_ids, attention_mask=attention_mask).squeeze(-1)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # An empty loader reports nan instead of raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

# Fine-tune for 15 epochs; the mean training loss of each epoch is printed as it runs.
train_model(model, train_loader, criterion, optimizer, epochs=15)


Epoch 1, Loss: 0.5464955893429843
Epoch 2, Loss: 0.30822799806341983
Epoch 3, Loss: 0.24821928775671756
Epoch 4, Loss: 0.20133014486143083
Epoch 5, Loss: 0.15017854715838577
Epoch 6, Loss: 0.1053885475478389
Epoch 7, Loss: 0.06486132295068467
Epoch 8, Loss: 0.041677157930804024
Epoch 9, Loss: 0.03009343816136772
Epoch 10, Loss: 0.027963122578732895
Epoch 11, Loss: 0.027171308400504517
Epoch 12, Loss: 0.024267390539700336
Epoch 13, Loss: 0.014683837889496124
Epoch 14, Loss: 0.012527317697690292
Epoch 15, Loss: 0.011810952599978808


In [None]:
# Evaluate Model
def evaluate(model, test_loader):
    """Run ``model`` over ``test_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one value per sample.
        test_loader: Iterable of ``(input_ids, labels)`` batches.

    Returns:
        ``(predictions, actuals)``: two flat lists with one entry per sample,
        in loader order.
    """
    # Run on the device the model already lives on instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions, actuals = [], []

    with torch.no_grad():
        for input_ids, labels in test_loader:
            input_ids = input_ids.to(run_device)
            # The mask marks every position whose token id is non-zero.
            attention_mask = (input_ids > 0).long()

            preds = model(input_ids=input_ids, attention_mask=attention_mask)
            # reshape(-1) always yields a 1-D tensor; a bare squeeze() turns a
            # batch of size 1 into a 0-d value that list.extend cannot iterate.
            predictions.extend(preds.reshape(-1).cpu().numpy())
            actuals.extend(labels.reshape(-1).cpu().numpy())

    return predictions, actuals

y_pred, y_true = evaluate(model=model, test_loader=test_loader)

# Error metrics on the held-out split
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
r2 = r2_score(y_true=y_true, y_pred=y_pred)
# "Accuracy" is reported as 100 * (1 - MAE / mean true grade)
accuracy = 100 * (1 - mae / np.mean(y_true))

for report_line in (
    f"Mean Absolute Error (MAE): {mae:.3f}",
    f"R² Score: {r2:.3f}",
    f"Accuracy: {accuracy:.2f}%",
):
    print(report_line)


Mean Absolute Error (MAE): 0.351
R² Score: 0.395
Accuracy: 74.66%
