In [1]:
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, ConcatDataset
from transformers import BertModel, BertConfig, DistilBertModel, DistilBertTokenizer, DistilBertConfig
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_scheduler
import torch
from tqdm.notebook import tqdm
import evaluate
import random
import argparse
import os
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize

In [3]:
global device
global tokenizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def tokenize_function(example):
    return tokenizer(example, padding="max_length", truncation=True,return_attention_mask=True, return_token_type_ids=True, return_tensors='pt')

cuda


Here, I have created my Custom Dataset which gives a dict of tokenized_data, labels and the actual string.

using batch_size = 16 with shuffling on.

Train:Test split is 750:250 here, since there is less data, preferred only 2 splits.

In [4]:
from scipy import rand
import csv
from torch.utils.data import Dataset
from collections import defaultdict

inv_mapping = {}

class myDataset(Dataset):
    def __init__(self, csv_file):
        self.data = []
        self.mapping = {}
        self.labels = []
        self.distribution = defaultdict()
        with open(csv_file, 'r') as csvfile:
            csvreader = csv.reader(csvfile)
            ignore_header = 1
            for row in csvreader:
                if ignore_header:
                    ignore_header = 0
                    continue
                row = tuple(row)
                if row[1] in self.distribution.keys():
                  self.distribution[row[1]]+=1
                else:
                  self.distribution[row[1]] = 1
                self.labels.append(row[1])
                self.data.append(row)
        global unique_labels, inv_mapping
        unique_labels = set(self.labels)
        counter = 0
        for label in unique_labels:
            self.mapping[label] = counter
            inv_mapping[counter] = label
            counter+=1

    def __len__(self):
        return len(self.data)
    
    def get_dist(self):
        return self.distribution 
        
    def __getitem__(self, index):
        item = self.data[index]
        tokenize_input = tokenize_function(item[0]+' [SEP] '+ item[1])
        tokenize_input["label"] = self.mapping[item[1]]
        tokenize_input["text"] = item[0]
        return tokenize_input

def countlabels(train_dataset):
  train_labels = {}
  for x in train_dataset:
    if inv_mapping[x['label']] in train_labels.keys():
      train_labels[inv_mapping[x['label']]]+=1
    else:
      train_labels[inv_mapping[x['label']]]=1
  return train_labels


dataset = myDataset('assignment_B.csv')
print(dataset.get_dist())
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [750, 250])
train_labels = countlabels(train_dataset)
print(train_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

defaultdict(None, {'family': 64, 'work': 382, 'money': 99, 'medical': 99, 'emotional': 347, 'zp': 7, 'food': 1, 'miscellaneous': 1})
{'work': 287, 'emotional': 260, 'money': 77, 'food': 1, 'family': 45, 'medical': 75, 'zp': 4, 'miscellaneous': 1}


Taking the sentence embedding i.e. zero-th embedding and projected into 768-d (the last operation was layer normalization in DistilBert) followed by a non-linear activation (i have chosen Tanh here), dropout and finally projected to #labels-dimension(the logits)

In [5]:
class Model(torch.nn.Module):
    def __init__(self, bert, num_classes):
        super(Model, self).__init__()
        self.bert = bert
        for key, value in self.bert.named_parameters():
          value.requires_grad = True
        self.num_classes = num_classes
        self.linear1 = nn.Linear(768,768)
        self.dropout1 = nn.Dropout(0.3)
        self.linear2 = nn.Linear(768,self.num_classes)        

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        x = outputs[0][:,0,:]
        x = self.linear1(x)
        x = nn.Tanh()(x)
        x = self.dropout1(x)
        x = self.linear2(x)
        return x

In [6]:
distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased',dropout=0.2)
model = Model(distilbert,len(unique_labels))
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay = 0.01)
n_epochs = 3
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.2, inplace=False)
            (lin1): Linear(in_feature

In [7]:
def do_eval(eval_dataloader, print_count = 0):
    model.eval()
    metric = evaluate.load("accuracy")
    counter = 0
    for batch in tqdm(eval_dataloader):
        batch_input_ids = batch['input_ids'].squeeze(1).to(device)
        batch_token_type_ids = batch['token_type_ids'].squeeze(1).to(device)
        batch_attention_mask = batch['attention_mask'].squeeze(1).to(device)
        batch_labels = batch['label'].float().to(device)
        outputs = model(batch_input_ids,batch_attention_mask,batch_token_type_ids)
        _, predictions = torch.max(outputs,1)
        metric.add_batch(predictions=predictions, references=batch_labels)
        wrong_indices = torch.nonzero(predictions!=batch_labels)
        for i in range(wrong_indices.shape[0]):
            if counter == print_count:
                break
            counter+=1
            index = wrong_indices[i,0].item()
            print(f"{counter}, Text: {batch['text'][index]}, Actual prediction: {inv_mapping[batch_labels[index].item()]} My Prediction: {inv_mapping[predictions[index].item()]}")
    score = metric.compute()
    
    return score

In [8]:
num_training_steps = n_epochs * len(train_loader)
progress_bar = tqdm(range(num_training_steps))
lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

for epoch in range(n_epochs):
    model.train()
    metric = evaluate.load("accuracy")
    for batch in train_loader:
        batch_input_ids = batch['input_ids'].squeeze(1).to(device)
        batch_token_type_ids = batch['token_type_ids'].squeeze(1).to(device)
        batch_attention_mask = batch['attention_mask'].squeeze(1).to(device)
        batch_labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(batch_input_ids,batch_attention_mask,batch_token_type_ids)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        _, predictions = torch.max(outputs,1)
        metric.add_batch(predictions=predictions, references=batch_labels)
    print("Training Accuracy: ", metric.compute())
    print("Test Accuracy: ",do_eval(test_loader,print_count=0))
        

  0%|          | 0/141 [00:00<?, ?it/s]

Training Accuracy:  {'accuracy': 0.6226666666666667}


  0%|          | 0/16 [00:00<?, ?it/s]

Test Accuracy:  {'accuracy': 0.96}
Training Accuracy:  {'accuracy': 0.9813333333333333}


  0%|          | 0/16 [00:00<?, ?it/s]

Test Accuracy:  {'accuracy': 0.988}
Training Accuracy:  {'accuracy': 0.992}


  0%|          | 0/16 [00:00<?, ?it/s]

Test Accuracy:  {'accuracy': 0.988}


Wrong Predictions as per ground truth: 

I think all these 3 are multi-labelled, they have multiple topics in the same sentences. Even the model's predictions are reasonable here.



In [9]:
do_eval(test_loader,10)

  0%|          | 0/16 [00:00<?, ?it/s]

1, Text: this site does not work, Actual prediction: zp My Prediction: work
2, Text: i am going to stop the zp challenge  and do it later in the year as i've been working crazy hours and not sleeping well do my health, exerxise,and eating right are way off, Actual prediction: zp My Prediction: medical
3, Text: how do i signup to this zp better choices to make better choices and win money?, Actual prediction: zp My Prediction: money


{'accuracy': 0.988}

In [10]:
do_eval(train_loader,10)

  0%|          | 0/47 [00:00<?, ?it/s]

1, Text: being afraid of  being fired  because of non productive work, Actual prediction: miscellaneous My Prediction: work
2, Text: i made a comment yesterday in a fit. i am trying to remove it so i don't get in trouble at work. however i can't find it. i just had to blow off some steam. irresponsible people drive me nuts. how can i find it to remove it? thx for being a sounding board!, Actual prediction: zp My Prediction: emotional
3, Text: money, Actual prediction: zp My Prediction: money
4, Text: food , house payment, house insurance, about to loose my car, can’t pay car insurance. not enough money to pay bills or get food or medicine, Actual prediction: food My Prediction: medical
5, Text: how does this app work, Actual prediction: zp My Prediction: work
6, Text: so how does this work, Actual prediction: zp My Prediction: work


{'accuracy': 0.992}