In [1]:
!pip install datasets
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import pyarrow
import torch
import torchtext
from tqdm import tqdm
import transformers
from torch.utils.data import DataLoader
from torch import nn
import time
import warnings
warnings.filterwarnings("ignore")

In [3]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [4]:
import random
def seed_everything(seed=73):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(1234)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.0.0+cu117
torchtext Version:  0.15.1+cpu
Using GPU.


In [5]:
dataset  = load_dataset("go_emotions", "simplified")

train = dataset['train'].to_pandas()
validation = dataset['validation'].to_pandas()
test = dataset['test'].to_pandas()

Found cached dataset go_emotions (/user/HS400/ma04274/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
initial_emotion_dict = {0: 'admiration',
1: 'amusement',
2: 'anger',
3: 'annoyance',
4: 'approval',
5: 'caring',
6: 'confusion',
7: 'curiosity',
8: 'desire',
9: 'disappointment',
10: 'disapproval',
11: 'disgust',
12: 'embarrassment',
13: 'excitement',
14: 'fear',
15: 'gratitude',
16: 'grief',
17: 'joy',
18: 'love',
19: 'nervousness',
20: 'optimism',
21: 'pride',
22: 'realization',
23: 'relief',
24: 'remorse',
25: 'sadness',
26: 'surprise',
27: 'neutral'}

n_labels = len(initial_emotion_dict)+1

In [7]:
subsets = train.labels.value_counts().index[0:14]
print(subsets) #returns most frequent 13 indexes + other_emotions for the rest.
kept_labels = [index[0] for index in subsets]
print(kept_labels)
kept_labels = np.array(kept_labels)

Index([[27], [0], [4], [15], [1], [3], [18], [10], [7], [2], [20], [6], [17],
       [25]],
      dtype='object')
[27, 0, 4, 15, 1, 3, 18, 10, 7, 2, 20, 6, 17, 25]


In [8]:
N_CLASSES = len(kept_labels)+1  #for other_emotions

not_kept_labels = np.arange(0,28)
not_kept_labels = np.delete(not_kept_labels, kept_labels)

def one_hot_encoder(df):
    one_hot_encoding = []
    for i in tqdm(range(len(df))):
        temp = [0]*n_labels
        label_indices = df.iloc[i]["labels"]
        for index in label_indices:
            if index in kept_labels:
              temp[index] = 1
            else:
              temp[n_labels-1] = 1 #other_emotions become 1
        temp = np.delete(temp,not_kept_labels)
        one_hot_encoding.append(temp)
    return pd.DataFrame(one_hot_encoding)

In [9]:
train_labels = one_hot_encoder(train)
valid_labels = one_hot_encoder(validation)
test_labels = one_hot_encoder(test)

100%|██████████████████████████████████| 43410/43410 [00:02<00:00, 19976.10it/s]
100%|████████████████████████████████████| 5426/5426 [00:00<00:00, 20166.67it/s]
100%|████████████████████████████████████| 5427/5427 [00:00<00:00, 19936.28it/s]


In [10]:
train = pd.concat([train, train_labels], axis=1)
valid = pd.concat([validation, valid_labels], axis=1)
test = pd.concat([test, test_labels], axis=1)

In [11]:
train.head()

Unnamed: 0,text,labels,id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,My favourite food is anything I didn't have to...,[27],eebbqej,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,To make her feel threatened,[14],ed7ypvh,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,Dirty Southern Wankers,[3],ed0bdzj,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [12]:
emotion_dict = dict()
for index,value in enumerate(np.sort(kept_labels)):
    emotion_dict[index] = initial_emotion_dict[value]
emotion_dict[len(kept_labels)] = 'other_emotions'
emotion_dict

{0: 'admiration',
 1: 'amusement',
 2: 'anger',
 3: 'annoyance',
 4: 'approval',
 5: 'confusion',
 6: 'curiosity',
 7: 'disapproval',
 8: 'gratitude',
 9: 'joy',
 10: 'love',
 11: 'optimism',
 12: 'sadness',
 13: 'neutral',
 14: 'other_emotions'}

In [18]:
LEARNING_RATE = 5e-5
MAX_LEN = 40
BATCH_SIZE = 64
N_EPOCHS = 10
DROPOUT = 0.3
THRESHOLD = 0.6

In [19]:
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.base import TransformerMixin, BaseEstimator

bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class GoEmotionDataset:
    def __init__(self, tokenizer, max_len: int = MAX_LEN, batch_size: int = BATCH_SIZE):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.texts = None
        self.labels = None
        self.dataset = None
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.__call__(text,
                                        add_special_tokens = True,
                                        max_length = self.max_len,
                                        truncation = True,
                                        padding = 'max_length', 
                                        return_token_type_ids = True,
                                        return_attention_mask = True,
                                        verbose = True
                                        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_ids = inputs['token_type_ids']

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_ids":torch.tensor(token_ids, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

    def transform(self, texts, labels=None):
        self.texts = texts
        if labels:
            self.labels = labels
        return self.dataset

    def fit(self, texts,labels):
        self.texts = texts
        self.labels = labels
        self.dataset = DataLoader(
            self,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=2
        )
        return self.dataset

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
train_dataset = GoEmotionDataset(tokenizer)
train_dataloader = train_dataset.fit(train.text.tolist(), train[range(N_CLASSES)].values.tolist())

In [36]:
class BERTClass(torch.nn.Module):
    def __init__(self, n_train_steps, n_classes, dropout):
        super(BERTClass, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(768, n_classes)
        self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"
        
    def forward(self, ids, mask):

        hidden_state =  self.bert(input_ids=ids, attention_mask=mask)[0]

        pooled_output = hidden_state[:, 0]  

        pooled_output = self.dropout(pooled_output)

        logits = self.classifier(pooled_output)

        return logits

    def fit(self, train_dataloader):

        def loss_fn(outputs, targets):
            criterion = nn.BCEWithLogitsLoss()
            criterion = criterion.to(DEVICE)
            loss = criterion(outputs.view(-1, N_CLASSES), 
                          targets.float().view(-1, N_CLASSES))
            if targets is None:
                return None
            return loss

        optimizer = torch.optim.AdamW(params =  self.parameters(), lr=LEARNING_RATE)

        def ret_scheduler(optimizer, num_train_steps):
            sch = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
            return sch

        scheduler = ret_scheduler(optimizer, self.n_train_steps)

        def epoch_time(start_time, end_time):
            elapsed_time = end_time - start_time
            elapsed_mins = int(elapsed_time / 60)
            elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
            return elapsed_mins, elapsed_secs

        for epoch in range(N_EPOCHS):
            train_loss = 0.0
            self.train()  # Set the model to training mode

            for bi, d in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
                ids = d["ids"]
                mask = d["mask"]
                token_ids = d["token_ids"]
                targets = d["labels"]

                ids = ids.to(DEVICE, dtype=torch.long)
                mask = mask.to(DEVICE, dtype=torch.long)
                token_ids = token_ids.to(DEVICE,dtype=torch.long)
                targets = targets.to(DEVICE, dtype=torch.float)

                optimizer.zero_grad()
                outputs = self(ids=ids, mask=mask)
                
                loss = loss_fn(outputs, targets)
                loss.backward()
                train_loss += loss.item()
                optimizer.step()
                scheduler.step()

                self.zero_grad()

            print(train_loss/len(train_dataloader))

        return train_loss/len(train_dataloader)

    def predict(self, sentence):
        max_len = MAX_LEN

        inputs = tokenizer.__call__(sentence,
                            None,
                            add_special_tokens=True,
                            max_length=max_len,
                            padding="max_length",
                            truncation=True,
                            )
        
        ids = inputs['input_ids']
        ids = torch.tensor(ids, dtype=torch.long)
        mask = inputs['attention_mask']
        mask = torch.tensor(mask, dtype=torch.long)

        ids = ids.to(DEVICE, dtype=torch.long).unsqueeze(0)
        mask = mask.to(DEVICE, dtype=torch.long).unsqueeze(0)

        self.eval()
        logits = self(ids=ids, mask=mask)
        result = torch.sigmoid(logits)

        threshold = THRESHOLD
        valid_result = torch.ceil(result-threshold)

        return result, valid_result

In [37]:
n_train_steps = int(len(train) / BATCH_SIZE * 10)
model = BERTClass(n_train_steps,N_CLASSES,DROPOUT)
model.to(DEVICE)
model.fit(train_dataloader)

100%|█████████████████████████████████████████| 679/679 [02:45<00:00,  4.09it/s]

0.08460275066223692



100%|█████████████████████████████████████████| 679/679 [02:48<00:00,  4.03it/s]

0.040693302283582



100%|█████████████████████████████████████████| 679/679 [02:49<00:00,  4.01it/s]

0.030131448903813716



100%|█████████████████████████████████████████| 679/679 [02:49<00:00,  4.00it/s]

0.022828093044108686



100%|█████████████████████████████████████████| 679/679 [02:49<00:00,  4.00it/s]

0.01810119340034402



100%|█████████████████████████████████████████| 679/679 [02:49<00:00,  4.00it/s]

0.014018819489775552



100%|█████████████████████████████████████████| 679/679 [02:49<00:00,  4.00it/s]

0.010839861164429117



100%|█████████████████████████████████████████| 679/679 [02:49<00:00,  3.99it/s]

0.00864474419936461



100%|█████████████████████████████████████████| 679/679 [02:50<00:00,  3.99it/s]

0.007203779633901739



100%|█████████████████████████████████████████| 679/679 [02:50<00:00,  3.99it/s]

0.006113793565487058





0.006113793565487058

In [38]:
model.predict("Troll, bro. They know they're saying stupid shit. The motherfucker does nothing but stink up libertarian subs talking shit")

(tensor([[1.0308e-03, 2.1113e-03, 9.9846e-01, 1.5558e-03, 1.3114e-03, 8.4783e-04,
          6.4232e-04, 1.2279e-03, 1.8579e-03, 1.0868e-03, 9.7520e-04, 9.1735e-04,
          7.2227e-04, 1.2945e-03, 3.0550e-03]], device='cuda:0',
        grad_fn=<SigmoidBackward0>),
 tensor([[-0., -0., 1., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.]],
        device='cuda:0', grad_fn=<CeilBackward0>))

In [39]:
torch.save(model, './saved_model')