In [64]:
import pandas as pd
from transformers import pipeline
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import numpy as np
from sklearn.metrics import f1_score
import matplotlib as plt

In [65]:
# Dataset directory. Two machine-specific locations are listed; the second
# assignment replaces the first, so only the later value is actually used.
path = 'C:/Users/anike/Desktop/IIIT D/NLP/Ass3'
path = 'c:\\Users\\Shobhit\\Desktop\\IIITacad\\Sem6\\NLP_Assignments\\Assignment3'

# Training split, read with pandas' default separator.
data = pd.read_csv(f"{path}\\train.csv")

# Validation split, read with a tab separator. Its first sentence column is
# spelled 'setence1' in the file, so it is renamed to 'sentence1'.
val_data = (
    pd.read_csv(f"{path}\\dev.csv", sep="\t")
    .rename(columns={'setence1': 'sentence1'})
)
val_data.head()

Unnamed: 0,score,sentence1,sentence2
0,5.0,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.
1,4.75,A young child is riding a horse.,A child is riding a horse.
2,5.0,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.
3,2.4,A woman is playing the guitar.,A man is playing guitar.
4,2.75,A woman is playing the flute.,A man is playing a flute.


In [66]:
# Pretrained checkpoint identifier used to load the tokenizer.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Column views over the training frame: both sentences and their score.
sentences1, sentences2, labels = (
    data[column] for column in ("sentence1", "sentence2", "score")
)

In [67]:
def plot_results(train_losses,val_losses):
    """Plot per-epoch training and validation losses on one set of axes.

    Args:
        train_losses: Sequence of training losses, one value per epoch.
        val_losses: Sequence of validation losses; plotted against the same
            epoch axis, so it must have the same length as ``train_losses``.
    """
    # Imported locally: the notebook-level ``plt`` alias is bound to the
    # top-level ``matplotlib`` package, which does not provide the pyplot
    # functions (``figure``, ``subplot``, ``plot``, ...) this function needs.
    import matplotlib.pyplot as plt

    epochs = range(1, len(train_losses) + 1)

    fig = plt.figure(figsize=(12, 5))

    # Plotting Losses (left panel of a 1x2 grid, as in the original layout)
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(epochs, train_losses, label='Training Loss')
    ax.plot(epochs, val_losses, label='Validation Loss')
    ax.set_title('Training and Validation Losses')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()

    fig.tight_layout()
    plt.show()

In [68]:
class model:
    """Thin wrapper around a pretrained ``BertForSequenceClassification``.

    Bundles the network, its tokenizer and the device they live on, and
    provides helpers for encoding sentence pairs, inference and a simple
    train/validate loop.
    """

    def __init__(self, model_name, device = "cpu"):
        """Load the pretrained network and tokenizer for ``model_name``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            device: Device used for the initial placement only. The network
                is subsequently moved to CUDA when it is available, otherwise
                to the CPU, and that choice is stored in ``self.device``.
        """
        self.model = BertForSequenceClassification.from_pretrained(model_name).to(device)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)
        self.model.eval()

    def encode(self, sentences1, sentences2):
        """Tokenize sentence pairs into tensors placed on ``self.device``.

        Inputs are padded/truncated to 30 tokens.

        Returns:
            The tokenizer output (PyTorch tensors) after moving it to the device.
        """
        inputs = self.tokenizer(sentences1, sentences2, return_tensors="pt", max_length = 30, padding="max_length", truncation=True)
        inputs.to(self.device)
        return inputs

    def add_linear_layer(self, num_labels):
        """Replace the classification head with a fresh linear layer.

        Args:
            num_labels: Number of outputs of the new head.
        """
        self.model.classifier = nn.Linear(self.model.config.hidden_size, num_labels)
        self.model.num_labels = num_labels
        self.model.to(self.device)

    def forward(self, inputs):
        """Run the network on already-encoded ``inputs`` without tracking gradients."""
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs

    def predict(self, sentences1, sentences2):
        """Encode the sentence pairs and return the network's logits for them."""
        inputs = self.encode(sentences1, sentences2)
        outputs = self.forward(inputs)
        return outputs.logits

    def train(self, train_dataloader,val_dataloader, optimizer, loss_fn, epochs):
        """Fine-tune the network and evaluate it after every epoch.

        Each batch from either loader must be an
        ``(input_ids, attention_mask, labels)`` triple of tensors.

        Args:
            train_dataloader: Iterable of training batches.
            val_dataloader: Iterable of validation batches.
            optimizer: Optimizer over ``self.model``'s parameters.
            loss_fn: Accepted for interface compatibility but not used; the
                loss is the one the network returns in ``outputs.loss`` when
                it is called with ``labels``.
            epochs: Number of passes over ``train_dataloader``.

        Returns:
            ``(train_losses, val_losses)``: lists with the average batch loss
            of every epoch. The curves are also plotted via ``plot_results``.
        """
        train_losses = []
        val_losses = []
        for epoch in range(epochs):
            self.model.train()  # Set the model to training mode
            total_train_loss = 0
            for i, data in enumerate(train_dataloader, start=1):
                print("Training Epoch: ", epoch, " Batch: ", i)
                optimizer.zero_grad()
                inputs = data[0].to(self.device)
                attention_mask = data[1].to(self.device)
                labels = data[2].to(self.device)
                outputs = self.model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                total_train_loss += loss.item()

            avg_train_loss = total_train_loss / len(train_dataloader)
            train_losses.append(avg_train_loss)
            print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}")

            self.model.eval()  # Set the model to evaluation mode
            total_val_loss = 0
            with torch.no_grad():
                for val_data in val_dataloader:
                    val_inputs, val_attention_mask, val_labels = (
                        tensor.to(self.device) for tensor in val_data
                    )
                    # Score the validation batch itself, not the tensors left
                    # over from the last training step.
                    val_outputs = self.model(
                        input_ids=val_inputs,
                        attention_mask=val_attention_mask,
                        labels=val_labels,
                    )
                    total_val_loss += val_outputs.loss.item()

            avg_val_loss = total_val_loss / len(val_dataloader)
            val_losses.append(avg_val_loss)
            print(f"Epoch {epoch + 1},  Validation Loss: {avg_val_loss}")
        plot_results(train_losses, val_losses)
        return train_losses,  val_losses

In [69]:
class CustomDataset(torch.utils.data.Dataset):
    """Sentence-pair dataset backed by a DataFrame.

    The frame must provide ``sentence1``, ``sentence2`` and ``score``
    columns. Pairs are tokenized with the BERT tokenizer of ``model_name``
    and padded/truncated to 256 tokens.
    """

    def __init__(self, dataframe, model_name):
        """Keep a reference to ``dataframe`` and load the tokenizer for ``model_name``."""
        self.data = dataframe
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def __len__(self):
        """Return the number of rows (sentence pairs)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(encoded_inputs, score)`` for the row at position ``idx``.

        ``score`` is a scalar float tensor holding the raw, un-normalized value.
        """
        row = self.data.iloc[idx]
        inputs = self.tokenizer(
            str(row['sentence1']),
            str(row['sentence2']),
            return_tensors="pt",
            max_length = 256,
            padding="max_length",
            truncation=True,
        )
        # Float dtype keeps fractional scores (e.g. 4.75) intact; an integer
        # dtype would truncate them before normalization.
        return inputs, torch.tensor(float(row['score']), dtype=torch.float)

    def get_encoded_data(self):
        """Tokenize every row and stack the results.

        Returns:
            ``(input_ids, attention_masks, labels)`` where the first two have
            one row per example and ``labels`` has shape ``(n, 1)`` and holds
            ``score / 5.0`` (presumably 5 is the maximum score - confirm
            against the data).
        """
        input_ids = []
        attention_masks = []
        labels = []
        for i in range(len(self)):
            encoded, label = self[i]
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
            labels.append(label.float() / 5.0)

        # Each encoding is (1, seq_len); concatenating gives (n, seq_len).
        # No squeeze afterwards, so a one-row dataset keeps its batch axis.
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        labels = torch.stack(labels).reshape(-1, 1)
        return input_ids, attention_masks, labels

    def get_dataloader(self, batch_size, shuffle=True):
        """Build a ``DataLoader`` over the fully encoded dataset.

        Args:
            batch_size: Number of examples per batch.
            shuffle: Whether to reshuffle every epoch. Pass ``False`` when the
                batch order must follow the row order (e.g. for predictions).
        """
        input_ids, attention_masks, labels = self.get_encoded_data()
        dataset = TensorDataset(input_ids, attention_masks, labels)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [70]:
def generate_predictions(model, test_loader):
    """Run ``model.predict`` over every batch and collect the predictions.

    Args:
        model: Object exposing ``predict(sentences1, sentences2)`` that
            returns a logits tensor with one row per sentence pair. If it
            keeps its network on a ``model`` attribute, that network is
            switched to evaluation mode first.
        test_loader: Iterable of ``(sentences1, sentences2)`` batches.

    Returns:
        A flat list with one prediction per pair, in loader order: the single
        output value when the logits have one column, otherwise the index of
        the largest logit.
    """
    # The wrapper keeps the actual network on ``.model``; fall back to the
    # object itself when it is passed directly.
    network = getattr(model, "model", model)
    if hasattr(network, "eval"):
        network.eval()

    all_predictions = []
    with torch.no_grad():
        for data in test_loader:
            inputs1, inputs2 = data
            logits = model.predict(inputs1, inputs2)
            if logits.dim() > 1 and logits.size(-1) > 1:
                predictions = logits.argmax(dim=-1)
            else:
                predictions = logits.reshape(-1)
            all_predictions.extend(predictions.cpu().tolist())
    return all_predictions

In [71]:
# Wrap the training frame and the validation frame (in that order) in
# tokenizing datasets that share the same checkpoint name.
dataset, val_dataset = (
    CustomDataset(frame, model_name) for frame in (data, val_data)
)

In [72]:
# Hyper-parameters for this run.
BATCH_SIZE = 8
EPOCHS = 1
# Fine-tuning BERT needs a small step size; Adam's default (1e-3) is far
# above the 2e-5 .. 5e-5 range recommended for it.
LEARNING_RATE = 2e-5

Model = model(model_name)
Model.add_linear_layer(1)  # single output: the similarity score is regressed
Model.train(
    dataset.get_dataloader(BATCH_SIZE),
    val_dataset.get_dataloader(BATCH_SIZE),
    torch.optim.Adam(Model.model.parameters(), lr=LEARNING_RATE),
    # Matches the one-output regression head. Note that `train` currently
    # reads the loss from the model output and does not call this object.
    nn.MSELoss(),
    EPOCHS,
)

test_data_path = 'test_data.csv'  # Modify this with your actual test data file path
# Apply the same column-name repair used for the dev split; it is a no-op
# when the column is already spelled correctly.
test_data = pd.read_csv(test_data_path).rename(columns={'setence1': 'sentence1'})

# Create DataLoader for test data: raw sentence pairs, kept in file order so
# that prediction i belongs to row i.
test_pairs = list(zip(test_data['sentence1'].astype(str), test_data['sentence2'].astype(str)))
test_loader = DataLoader(test_pairs, batch_size=BATCH_SIZE, shuffle=False)

# Pass the trained wrapper instance (`Model`), not the `model` class.
predictions = generate_predictions(Model, test_loader)
# Training targets were score / 5.0 (see CustomDataset.get_encoded_data), so
# scale the outputs back to the original score range.
predictions = [value * 5.0 for value in predictions]

# Create CSV file
output_df = pd.DataFrame({'Prediction': predictions})
output_df.to_csv('sample_demo.csv', index=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens 

Training Epoch:  0  Batch:  1
Training Epoch:  0  Batch:  2
Training Epoch:  0  Batch:  3
Training Epoch:  0  Batch:  4
Training Epoch:  0  Batch:  5
Training Epoch:  0  Batch:  6
Training Epoch:  0  Batch:  7
Training Epoch:  0  Batch:  8
Training Epoch:  0  Batch:  9
Training Epoch:  0  Batch:  10
Training Epoch:  0  Batch:  11
Training Epoch:  0  Batch:  12


KeyboardInterrupt: 