In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers 
from transformers import AutoModel, BertTokenizerFast

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from sklearn.utils.class_weight import compute_class_weight

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Select the compute device: CUDA when a GPU is visible to torch, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Load the essay dataset; the 'full_text' and 'score' columns are used below.
df = pd.read_csv("./essay.csv")

# First split: 70% train / 30% held out, stratified on the score.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['full_text'],
    df['score'],
    test_size=0.3,
    stratify=df['score'],
    random_state=42,
)

# Second split of the held-out 30%: about two thirds become the test set and
# the remaining third the validation set (roughly 20% / 10% of all rows).
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.666,
    stratify=temp_labels,
    random_state=42,
)

# Pretrained encoder and its matching tokenizer.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# seq_len = [len(i.split()) for i in train_text]
# max is 1051

In [None]:
# Every essay is padded or truncated to exactly this many tokens.
MAX_SEQ_LEN = 512


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize a collection of essays into fixed-length model inputs.

    Args:
        texts: Object with a ``.tolist()`` method yielding strings (the pandas
            Series produced by the train/val/test split).
        max_length: Target sequence length; shorter texts are padded and
            longer ones truncated to this length.

    Returns:
        The tokenizer's batch encoding; later cells read its ``'input_ids'``
        and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )


# Encode the train, validation and test splits with identical settings so the
# three sets of inputs have the same shape.
tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)


In [None]:
# Convert each split's token ids, attention masks and labels to tensors.
train_seq, train_mask, train_y = (
    torch.tensor(tokens_train['input_ids']),
    torch.tensor(tokens_train['attention_mask']),
    torch.tensor(train_labels.tolist()),
)

val_seq, val_mask, val_y = (
    torch.tensor(tokens_val['input_ids']),
    torch.tensor(tokens_val['attention_mask']),
    torch.tensor(val_labels.tolist()),
)

test_seq, test_mask, test_y = (
    torch.tensor(tokens_test['input_ids']),
    torch.tensor(tokens_test['attention_mask']),
    torch.tensor(test_labels.tolist()),
)


# Number of essays per batch for every dataloader in this notebook.
batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches are read in dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    sampler=val_sampler,
    batch_size=batch_size,
)


In [None]:

class BERTRegressor(nn.Module):
    """Encoder followed by dropout and a single-output linear regression head.

    Args:
        bert: Encoder module. When called as
            ``bert(input_ids=..., attention_mask=..., return_dict=False)`` it
            must return a 2-tuple whose second element is the pooled
            representation; that tensor's last dimension must equal the
            encoder's hidden size.
        dropout: Dropout probability applied to the pooled representation
            before the regression head. Defaults to 0.1.
    """

    def __init__(self, bert, dropout=0.1):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config so encoders of any width
        # work; fall back to 768 when the encoder exposes no
        # ``config.hidden_size``.
        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 768)
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Predict one score per input sequence.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder.

        Returns:
            The regression head's output with its trailing size-1 dimension
            squeezed away (one value per row of the pooled representation).
        """
        # The first tuple element (per-token states) is not used.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        output = self.dropout(pooled_output)
        return self.regressor(output).squeeze(-1)

# model = BERT_Arch(bert)
# model = model.to(device)

# optimizer = AdamW(model.parameters(),lr = 1e-5)

# class_weights  = compute_class_weight('balanced',np.unique(train_labels),train_labels)

# print("Class Weights", class_weights)

# weights= torch.tensor(class_weights,dtype=torch.float)
# weights = weights.to(device)

# cross_entropy = nn.NLLLoss(weight=weights)


In [None]:
# Number of full passes over the training data.
epochs = 10

# Seed torch's global RNG before the model is built so that the regression
# head's initial weights, dropout masks and the random batch order are
# repeatable across Restart & Run All (GPU kernels may still introduce small
# nondeterminism). Uses the same seed value as the data splits.
torch.manual_seed(42)

model = BERTRegressor(bert)
model = model.to(device)

# Loss and optimizer: mean squared error on the predicted score, AdamW over
# every parameter of the model (encoder and head).
criterion = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=3e-5)
    

def train(model, dataloader, optimizer=None, criterion=None, device=None):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of 3-item batches
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer stepped once per batch. Defaults to the
            notebook-level ``optimizer``.
        criterion: Loss called as ``criterion(outputs, labels.float())``.
            Defaults to the notebook-level ``criterion``. The returned average
            assumes the loss is a per-batch mean (as ``nn.MSELoss()`` is by
            default).
        device: Device each batch is moved to. Defaults to the notebook-level
            ``device``.

    Returns:
        float: Sum of per-batch losses weighted by batch size, divided by the
        number of samples seen, so a short final batch is not over-weighted.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    # Fall back to the notebook-level objects so existing two-argument calls
    # keep working; explicit arguments avoid the hidden-state dependency.
    notebook_ns = globals()
    if optimizer is None:
        optimizer = notebook_ns["optimizer"]
    if criterion is None:
        criterion = notebook_ns["criterion"]
    if device is None:
        device = notebook_ns["device"]

    model.train()
    loss_sum = 0.0
    n_samples = 0
    for batch in dataloader:
        b_input_ids, b_attn_mask, b_labels = [item.to(device) for item in batch]
        optimizer.zero_grad()
        outputs = model(b_input_ids, b_attn_mask)
        loss = criterion(outputs, b_labels.float())
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size to recover a true
        # per-sample average at the end of the epoch.
        batch_len = b_labels.size(0)
        loss_sum += loss.item() * batch_len
        n_samples += batch_len

    if n_samples == 0:
        raise ValueError("train() received a dataloader that yielded no samples")
    return loss_sum / n_samples

In [None]:
def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of 3-item batches
            ``(input_ids, attention_mask, labels)``.

    Returns:
        tuple: ``(mse, preds, true_labels)`` where ``mse`` is sklearn's mean
        squared error between labels and predictions, and the two lists hold
        the accumulated per-batch predictions and labels (as numpy values).
    """
    model.eval()
    predictions = []
    targets = []
    # Gradients are not needed for scoring.
    with torch.no_grad():
        for input_ids, attn_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attn_mask = attn_mask.to(device)
            labels = labels.to(device)

            batch_out = model(input_ids, attn_mask)
            # Move results back to the CPU before accumulating them.
            predictions += list(batch_out.cpu().numpy())
            targets += list(labels.cpu().numpy())

    return mean_squared_error(targets, predictions), predictions, targets
            

In [None]:
# Fine-tune for the configured number of epochs, reporting the average
# training loss and the validation MSE after each one.
for epoch in range(epochs):
    train_loss = train(model, train_dataloader)
    val_mse, _, _ = evaluate(model, val_dataloader)
    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.3f} | Val MSE: {val_mse:.3f}")

# Final score on the held-out test split, read in dataset order.
test_data = TensorDataset(test_seq, test_mask, test_y)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
test_mse, test_preds, test_true = evaluate(model, test_dataloader)
print(f"Final Test MSE: {test_mse:.4f}")