In [25]:
from datasets import load_dataset
import torch
import transformers
from transformers import AutoTokenizer, BertModel
import pandas as pd
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
import gc
from sklearn import metrics
import numpy as np

gc.collect()
torch.cuda.empty_cache()

In [64]:
data = load_dataset("dair-ai/emotion",trust_remote_code=True)
df1 = data['train'].to_pandas()
df2 = data['validation'].to_pandas()
print(data)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [48]:
df1.text.map(len).max()

300

In [49]:
df1['label'].unique()

array([0, 3, 2, 5, 4, 1], dtype=int64)

In [50]:
df1['text'] = df1['text'].str.replace(r'http\S+', '', regex=True)
df1['text'] = df1['text'].str.replace(r'[^\w\s]', '', regex=True)
df1['text'] = df1['text'].str.replace(r'\s+', ' ', regex=True)
df1['text'] = df1['text'].str.replace(r'\d+', '', regex=True)
df1['text'] = df1['text'].str.lower()
cachedStopWords = set(stopwords.words("english"))
df1["text"] = df1['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in cachedStopWords]))
df1['text'] = df1['text'].str.replace(r'[^a-zA-Z\s]', '',regex=True)


In [51]:
df1.text.map(len).max()

232

In [63]:
MAX_LEN = 232
TRAIN_BATCH_SIZE = 4
VAL_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-5
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [53]:
class EmotionDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer , max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = self.data['label']
        self.text = self.data['text']

    def __len__(self):
        return len(self.data.text)

    def __getitem__(self, index):
        text = self.text[index]

        inputs = self.tokenizer(
            text,
            max_length = self.max_len,
            padding = 'max_length'
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids,dtype=torch.long),
            'targets': torch.tensor(self.targets[index],dtype=torch.long)
        }
        


In [54]:
text = df1['text'][1]
token = tokenizer(text,max_length = MAX_LEN,
          padding = 'max_length'
          )
token

{'input_ids': [101, 2175, 3110, 20625, 9636, 17772, 2105, 2619, 14977, 8300, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [55]:
# train_size = 0.8 
# train_data = df.sample(frac=train_size,random_state=420)
# test_data = df.drop(train_data.index).reset_index(drop=True)
# train_data = train_data.reset_index(drop=True)

train_data = df1
test_data = df2

# print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

train_set = EmotionDataset(train_data, tokenizer, MAX_LEN)
test_set = EmotionDataset(test_data, tokenizer, MAX_LEN)

TRAIN Dataset: (16000, 2)
TEST Dataset: (2000, 2)


In [56]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VAL_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(train_set, **train_params)
testing_loader = torch.utils.data.DataLoader(test_set, **test_params)

In [57]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [58]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [59]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [61]:
def train(epoch):
    model.train(True)
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%200==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        # optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [62]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  1.8439944982528687


KeyboardInterrupt: 

In [34]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [35]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

[[0.5647835731506348, 0.5935133099555969, 0.479469895362854, 0.5204168558120728, 0.5096961855888367, 0.5454772114753723], [0.5545200109481812, 0.5992230772972107, 0.4827253520488739, 0.5253337025642395, 0.49767130613327026, 0.5628164410591125]]
[1.0, 1.0]


<generator object Module.parameters at 0x000001D9B8D6DEB0>
