In [1]:
!pip install transformers torch pandas scikit-learn



In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


In [19]:
from google.colab import files
uploaded = files.upload()

# Load the dataset
df = pd.read_csv('/content/DAOLEB_Dataset.csv')

print(df['Encoded Grade'].isnull().sum())

df = df.dropna(subset=['Encoded Grade'])  # Drop rows with missing values


# Inspect the first few rows
print(df.head())


Saving DAOLEB_Dataset.csv to DAOLEB_Dataset (2).csv
3
   Question_id                       Questions  \
0            1  What is unsupervised learning?   
1            2  What is unsupervised learning?   
2            3  What is unsupervised learning?   
3            4  What is unsupervised learning?   
4            5  What is unsupervised learning?   

                                             Answers      Grade  Encoded Grade  
0  Unsupervised learning is a type of machine lea...  Excellent            4.0  
1  It involves identifying structures or relation...       Good            3.0  
2  A technique where algorithms analyze data to u...    Average            2.0  
3  It is used when the output labels are unknown,...       Poor            1.0  
4  In unsupervised learning, there is no target v...  Very Poor            0.0  


In [21]:
# After loading the dataset and handling missing values

# Check and correct the values in 'Encoded Grade'
df['Encoded Grade'] = df['Encoded Grade'].astype(int)  # Convert to integers
df = df[df['Encoded Grade'].between(0, 4)]  # Keep rows with labels in the range 0-4

   # ... rest of your code ...

In [22]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize questions and answers
def tokenize_data(question, answer):
    combined_text = f"Question: {question} Answer: {answer}"
    return tokenizer(combined_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

# Apply tokenization
tokenized_data = df.apply(
    lambda row: tokenize_data(row['Questions'], row['Answers']),
    axis=1
)

# Extract input IDs and attention masks
df['input_ids'] = tokenized_data.apply(lambda x: x['input_ids'][0])
df['attention_mask'] = tokenized_data.apply(lambda x: x['attention_mask'][0])

# Convert labels to tensor
# labels = torch.tensor(df['Encoded Grade'].values)
# Convert labels to tensor with explicit type casting
labels = torch.tensor(df['Encoded Grade'].values, dtype=torch.long)

In [23]:
# Stack tensors for input IDs and attention masks
input_ids = torch.stack(df['input_ids'].tolist())
attention_masks = torch.stack(df['attention_mask'].tolist())

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, val_masks = train_test_split(attention_masks, test_size=0.2, random_state=42)

# Create DataLoaders
train_data = TensorDataset(X_train, train_masks, y_train)
val_data = TensorDataset(X_val, val_masks, y_val)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16, shuffle=False)


In [24]:
# # Load a pre-trained BERT model for sequence classification
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# # Move the model to GPU if available
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# Load a pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Change the problem_type to 'single_label_classification' to ensure correct loss function is used
model.config.problem_type = "single_label_classification"
# or alternatively set the loss function directly
# model.loss_fct = torch.nn.CrossEntropyLoss()
# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [25]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3  # Number of training epochs
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        b_input_ids, b_attention_mask, b_labels = [item.to(device) for item in batch]

        # Reset gradients
        model.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {total_loss / len(train_loader)}")


100%|██████████| 86/86 [30:38<00:00, 21.38s/it]


Epoch 1/3 - Loss: 1.5436663391978243


100%|██████████| 86/86 [30:32<00:00, 21.30s/it]


Epoch 2/3 - Loss: 1.3985369940136754


100%|██████████| 86/86 [30:14<00:00, 21.10s/it]

Epoch 3/3 - Loss: 1.291667734467706





In [26]:
model.eval()
total_eval_accuracy = 0

for batch in val_loader:
    b_input_ids, b_attention_mask, b_labels = [item.to(device) for item in batch]

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    total_eval_accuracy += (predictions == b_labels).sum().item()

accuracy = total_eval_accuracy / len(val_loader.dataset)
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.36443148688046645


In [27]:
 def predict_answer(question, answer):
    model.eval()
    combined_text = f"Question: {question} Answer: {answer}"
    inputs = tokenizer(combined_text, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_grade = torch.argmax(logits, dim=-1).item()
    return predicted_grade


In [28]:
# Example usage
test_question = "What is unsupervised learning?"
test_answer = "Unsupervised learning is a type of machine learning where the model learns patterns in data without labeled outcomes."

predicted_grade = predict_answer(test_question, test_answer)
print(f"The predicted grade for the answer is: {predicted_grade}")


The predicted grade for the answer is: 4
