In [28]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification

In [16]:
DATA_PATH_TRAIN = "datasets/WASSA23_essay_level_train_preproc.tsv"
df_train = pd.read_csv(DATA_PATH_TRAIN, sep='\t')
df_train.head(10)

Unnamed: 0,conversation_id,article_id,essay,empathy,distress,speaker_id,gender,education,race,age,...,essay_id,emotion,Surprise,Hope,Neutral,Sadness,Joy,Fear,Anger,Disgust
0,2,35,It breaks my heart to see people living in tho...,6.833333,6.625,30,1,6,3,37,...,1,Hope/Sadness,0,1,0,1,0,0,0,0
1,3,35,I wonder why there aren't more people trying t...,5.833333,6.0,19,1,6,2,32,...,2,Anger,0,0,0,0,0,0,1,0
2,5,35,"After reading the article, you can't help but ...",1.0,1.375,17,1,6,1,29,...,4,Sadness,0,0,0,1,0,0,0,0
3,6,213,It is so sad that someone who had such an amaz...,6.166667,6.625,16,2,5,1,28,...,5,Sadness,0,0,0,1,0,0,0,0
4,8,213,"From reading the article, it looks like the wo...",6.833333,1.0,30,1,6,3,37,...,7,Neutral,0,0,1,0,0,0,0,0
5,10,213,That's sad. Regardless of what they find out ...,1.666667,1.125,49,1,5,1,31,...,9,Sadness,0,0,0,1,0,0,0,0
6,11,78,"After reading the article, my reaction is that...",1.5,1.0,17,1,6,1,29,...,10,Sadness,0,0,0,1,0,0,0,0
7,13,78,It sounds like these boys had a really rough l...,2.0,1.0,24,2,7,1,38,...,12,Sadness,0,0,0,1,0,0,0,0
8,14,78,This is a tragic and sad story about how some ...,6.0,3.0,43,2,6,1,33,...,13,Sadness,0,0,0,1,0,0,0,0
9,17,336,Hello. I feel really terrible about the curren...,7.0,1.0,31,unknown,unknown,unknown,unknown,...,16,Disgust/Sadness,0,0,0,1,0,0,0,1


In [18]:
DATA_PATH_DEV = "datasets/WASSA23_essay_level_dev_preproc.tsv"
df_dev = pd.read_csv(DATA_PATH_DEV, sep='\t')
df_dev.head(10)

Unnamed: 0,conversation_id,article_id,essay,speaker_id,gender,education,race,age,income,speaker_number,...,iri_fantasy,iri_empathatic_concern,Sadness,Anger,Surprise,Neutral,Joy,Hope,Disgust,Fear
0,1,35,How sad is it that this kind of pain and suffe...,68,2,2,1,21,20000,1,...,3.143,3.286,1,0,0,0,0,0,0,0
1,4,35,The article is kind of tragic and hits close t...,79,1,6,3,33,64000,1,...,2.429,1.429,1,0,0,0,0,0,0,0
2,7,213,"I think that these kinds of stories, are sad, ...",68,2,2,1,21,20000,1,...,3.143,3.286,1,0,0,0,0,0,0,0
3,9,213,It's crazy that random accidents like this hap...,84,2,4,1,25,55000,1,...,3.571,3.143,0,0,0,1,0,0,0,0
4,12,78,This story makes me so so sad.... As someone w...,68,2,2,1,21,20000,1,...,3.143,3.286,1,0,0,0,0,0,0,0
5,15,78,"After reading the article, my first reaction a...",70,1,6,1,29,85000,1,...,4.143,4.643,1,0,0,0,0,0,0,0
6,16,336,I didn't know coal mining had such adverse eff...,81,1,4,1,30,27000,1,...,4.571,4.0,0,0,0,1,0,0,0,0
7,20,336,This is very sad. I can't imagine having elep...,73,2,7,1,38,42000,1,...,2.571,3.857,1,0,0,0,0,0,0,0
8,23,281,"Guys, reading this article really hits home fo...",63,1,4,1,25,29000,1,...,2.571,4.857,1,0,0,0,0,0,0,0
9,26,171,Hey guys. So I just read this article about Ir...,63,1,4,1,25,29000,1,...,2.571,4.857,0,0,0,1,0,0,0,0


In [19]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, target_list):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.essay = dataframe.essay
        self.targets = self.data[target_list]
        self.max_len = max_len

    def __len__(self):
        return len(self.essay)

    def __getitem__(self, index):
        essay = self.essay[index]

        encoding = self.tokenizer.encode_plus(
            essay,
            max_length=self.max_len,  # Adjust according to your requirements
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        ids = encoding['input_ids']
        mask = encoding['attention_mask']
        token_type_ids = encoding["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [30]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
N_LABELS = 8
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
emotions_list = ["Sadness", "Anger", "Surprise", "Neutral",	"Joy", "Hope", "Disgust", "Fear"]
dataset_train = CustomDataset(df_train, tokenizer, MAX_LEN, emotions_list)
dataset_dev = CustomDataset(df_dev, tokenizer, MAX_LEN, emotions_list)



In [25]:
dataset_train

<__main__.CustomDataset at 0x1279e7a60>

In [26]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(dataset_train, **train_params)
testing_loader = DataLoader(dataset_dev, **test_params)

In [42]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-cased')
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output = self.dropout(output)
        output = self.linear(output)
        return output

In [31]:
modelxxx = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=N_LABELS)

Downloading model.safetensors: 100%|██████████| 436M/436M [00:33<00:00, 13.0MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [43]:
model = BERTClass()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [44]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [45]:

val_targets=[]
val_outputs=[]

In [33]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [46]:
def train(
    epochs, 
    train_loader, 
    val_loader, 
    model, 
    optimizer): 
    #checkpoint_path, 
    #best_model_path):
  valid_loss_min = np.Inf

  for epoch in range(1, epochs+1):
    train_loss = 0
    valid_loss = 0

    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch_idx, data in enumerate(train_loader):

        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        #if batch_idx%5000==0:
         #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
   
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
      for batch_idx, data in enumerate(val_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

      print('############# Epoch {}: Validation End     #############'.format(epoch))

      train_loss = train_loss/len(train_loader)
      valid_loss = valid_loss/len(val_loader)

      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))

      checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
      }
        
      # save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        
      ## TODO: save the model if validation loss has decreased
      if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).'.format(valid_loss_min,valid_loss))

        # save_ckp(checkpoint, True, checkpoint_path, best_model_path)
        valid_loss_min = valid_loss

    print('############# Epoch {}  Done   #############\n'.format(epoch))

  return model
    

In [47]:
trained_model = train(EPOCHS, training_loader, testing_loader, model, optimizer)#, ckpt_path, best_model_path)

############# Epoch 1: Training Start   #############


  'ids': torch.tensor(ids, dtype=torch.long),
  'mask': torch.tensor(mask, dtype=torch.long),
  'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),


KeyError: 420