In [None]:
!pip install transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [9]:
df = pd.read_csv('/content/balanced_data_with_value_counts.csv')
df.columns

Index(['Unnamed: 0', 'review_content', 'student1_email', 'student1annotation',
       'student2_email', 'student2annotation', 'student3_email',
       'student3annotation', 'annotation_count', 'agreement_type',
       'final_label', 'processed_review_content'],
      dtype='object')

In [10]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your data


# Encode labels to start from 0
df['final_label'] -= 1

# Load the model and tokenizer
model_name = 'roberta-base'
num_classes = 4
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Split data into train, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Separate features and labels for each split
X_train, y_train = train_df['processed_review_content'], train_df['final_label']
X_val, y_val = val_df['processed_review_content'], val_df['final_label']
X_test, y_test = test_df['processed_review_content'], test_df['final_label']

# Convert to strings and handle missing values
X_train = X_train.astype(str).fillna('')
X_val = X_val.astype(str).fillna('')
X_test = X_test.astype(str).fillna('')





# # Save the model to Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# model_path = "/content/drive/MyDrive/roberta_model.pth"
# torch.save(model.state_dict(), model_path)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess_data(texts, labels, max_length):
    encodings = tokenizer(texts.tolist(), truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
    labels = torch.tensor(labels.tolist()).to(device)
    dataset = TensorDataset(encodings.input_ids.to(device), encodings.attention_mask.to(device), labels)
    return dataset

max_length = 256  # Set your desired max sequence length

train_dataset = preprocess_data(X_train, y_train, max_length)
val_dataset = preprocess_data(X_val, y_val, max_length)
test_dataset = preprocess_data(X_test, y_test, max_length)

batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [13]:
# Model training
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

num_epochs = 20
early_stopping_patience = 3
best_val_loss = float('inf')
epochs_since_last_improvement = 0

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            predicted_labels = torch.argmax(outputs.logits, dim=1)
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)

    val_accuracy = correct / total
    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Avg. Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_since_last_improvement = 0
    else:
        epochs_since_last_improvement += 1
        if epochs_since_last_improvement >= early_stopping_patience:
            print("Early stopping triggered. Stopping training.")
            break

Epoch [1/20]
Validation Accuracy: 0.4995
Avg. Validation Loss: 0.9644
Epoch [2/20]
Validation Accuracy: 0.5051
Avg. Validation Loss: 0.9648
Epoch [3/20]
Validation Accuracy: 0.5285
Avg. Validation Loss: 0.9334
Epoch [4/20]
Validation Accuracy: 0.5565
Avg. Validation Loss: 0.9485
Epoch [5/20]
Validation Accuracy: 0.5845
Avg. Validation Loss: 0.9164
Epoch [6/20]
Validation Accuracy: 0.6340
Avg. Validation Loss: 0.8287
Epoch [7/20]
Validation Accuracy: 0.6713
Avg. Validation Loss: 0.8645
Epoch [8/20]
Validation Accuracy: 0.6639
Avg. Validation Loss: 0.7920
Epoch [9/20]
Validation Accuracy: 0.6900
Avg. Validation Loss: 0.8398
Epoch [10/20]
Validation Accuracy: 0.6723
Avg. Validation Loss: 0.8952
Epoch [11/20]
Validation Accuracy: 0.7087
Avg. Validation Loss: 1.0398
Early stopping triggered. Stopping training.


In [17]:
# Save the model to Google Drive
from google.colab import drive
drive.mount('/content/drive')

model_path = "/content/drive/MyDrive/roberta_model.pth"
torch.save(model.state_dict(), model_path)

Mounted at /content/drive


In [18]:
from transformers import RobertaForSequenceClassification

# Load the saved model
model2 = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
model2.load_state_dict(torch.load("/content/drive/MyDrive/roberta_model.pth"))
model2.to(device)

# Evaluate the model
model2.eval()
test_preds = []
test_true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model2(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        predicted_labels = torch.argmax(logits, dim=1)
        test_preds.extend(predicted_labels.cpu().numpy())
        test_true_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_true_labels, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate classification report
class_names = ["Class 0", "Class 1", "Class 2", "Class 3"]  # Replace with your actual class names
report = classification_report(test_true_labels, test_preds, target_names=class_names, digits=4)
print("Classification Report:")
print(report)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy: 0.7015
Classification Report:
              precision    recall  f1-score   support

     Class 0     0.9200    0.7782    0.8432       266
     Class 1     0.4841    0.4729    0.4784       258
     Class 2     0.9712    0.8613    0.9130       274
     Class 3     0.5312    0.6825    0.5974       274

    accuracy                         0.7015      1072
   macro avg     0.7266    0.6987    0.7080      1072
weighted avg     0.7288    0.7015    0.7104      1072

