# Multiclass Classification task with BERT

In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 27.4 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 73.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.0-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 72.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 75.3 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 71.3 MB/s 
Installing collected packag

In [2]:
!pip install Transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 65.0 MB/s 
Installing collected packages: tokenizers, Transformers
Successfully installed Transformers-4.21.1 tokenizers-0.12.1


In [3]:
import csv
import numpy as np
import pandas as pd
import re

from datasets import Dataset

from sklearn.model_selection import train_test_split


#Transformers library for BERT
import transformers
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

import tensorflow as tf
import tensorflow_hub as hub

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import time

from google.colab import output
output.enable_custom_widget_manager()

In [4]:
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [5]:
df1 = pd.read_csv('Data/cleaned_tweet.csv')
df1 = df1.drop(columns=['Unnamed: 0'])

In [6]:
df1= df1.dropna()

In [7]:
df1 = df1[['Tweet',"Type"]]

{"not_cyberbullying": 0, "religion": 1, "age": 2, "gender": 3, "ethnicity": 4}

In [8]:
possible_labels = sorted(list(df1.Type.unique()))
possible_labels

[0, 1, 2, 3, 4]

In [9]:
df1

Unnamed: 0,Tweet,Type
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,@XochitlSuckkks a classy whore? Or more red ve...,0
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0
4,@RudhoeEnglish This is an ISIS account pretend...,0
...,...,...
39397,"Black ppl aren't expected to do anything, depe...",4
39398,Turner did not withhold his disappointment. Tu...,4
39399,I swear to God. This dumb nigger bitch. I have...,4
39400,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,4


In [10]:
df1['Tweet'] = df1['Tweet'].apply(lambda x: text_preprocessing(x))

In [11]:
df1

Unnamed: 0,Tweet,Type
0,"In other words #katandandre, your food was cra...",0
1,Why is #aussietv so white? #MKR #theblock #ImA...,0
2,a classy whore? Or more red velvet cupcakes?,0
3,"meh. :P thanks for the heads up, but not too c...",0
4,This is an ISIS account pretending to be a Kur...,0
...,...,...
39397,"Black ppl aren't expected to do anything, depe...",4
39398,Turner did not withhold his disappointment. Tu...,4
39399,I swear to God. This dumb nigger bitch. I have...,4
39400,Yea fuck you RT IF YOURE A NIGGER FUCKING UNFO...,4


## Dataset split
As first thing I decided to take a little  sample of the original dataset, because BERT model is time consuming in order to make predictions.
Later, I split the sample in train and validation set.

In [12]:
#df = df1.sample(frac=0.05, replace=True, random_state=42)
df = df1.copy()
df.shape

(39401, 2)

In [13]:
max_len = max([len(sent) for sent in df.Tweet])
print('Max length: ', max_len)

Max length:  1734


In [14]:
#TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(df.index.values, 
                                                   df.Type.values,
                                                   test_size = 0.33,
                                                   random_state = 42,
                                                   stratify = df.Type.values)

In [15]:
df['data_type'] = ['not_set'] * df.shape[0]

In [16]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_test, 'data_type'] = 'test'

#groupby count
df.groupby(['Type', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tweet
Type,data_type,Unnamed: 2_level_1
0,test,2601
0,train,5280
1,test,2630
1,train,5338
2,test,2606
2,train,5291
3,test,2580
3,train,5239
4,test,2586
4,train,5250


## BERT Tokenization

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True) #o AutoTokenizer with bert-base-uncased

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:
MAX_LENGTH= 128

#encode train set
encoded_data_train = tokenizer.batch_encode_plus(df[df.data_type == 'train'].Tweet.values,
                                                add_special_tokens = True,
                                                return_attention_mask = True,
                                                pad_to_max_length = True,
                                                max_length = MAX_LENGTH,
                                                return_tensors = 'pt')
                                                
#encode validation set
encoded_data_test = tokenizer.batch_encode_plus(df[df.data_type == 'test'].Tweet.values,
                                                add_special_tokens = True,
                                                return_attention_mask = True,
                                                pad_to_max_length = True,
                                                max_length = MAX_LENGTH,
                                                return_tensors = 'pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
#train set
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].Type.values)

#test set
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df[df.data_type == 'test'].Type.values)

## BERT pre-trained model

In [20]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(possible_labels),
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
#Create dataloaders
from torch.utils.data import TensorDataset

#train set
dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

#test set
dataset_test = TensorDataset(input_ids_test, 
                             attention_masks_test, 
                             labels_test)

In [22]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

#train set
dataloader_train = DataLoader(dataset_train,
                              sampler = RandomSampler(dataset_train),
                              batch_size = batch_size)

#validation set
dataloader_test = DataLoader(dataset_test,
                              sampler = RandomSampler(dataset_test),
                              batch_size = batch_size)


In [23]:
#Set Up Optimizer and Scheduler

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                 lr = 1e-5,
                 eps = 1e-8) 
                 
epochs = 2

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = len(dataloader_train)*epochs)



In [24]:
#Define evaluation performance

def evaluate(dataloader_val):

    #evaluation mode 
    model.eval()
    
    #tracking variables
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm_notebook(dataloader_val):
        
        #load into GPU
        batch = tuple(b.to(device) for b in batch)
        
        #define inputs
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2]}

        #compute logits
        with torch.no_grad():        
            outputs = model(**inputs)
        
        #compute loss
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        #compute accuracy
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    #compute average loss
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [25]:
#F1 Score
import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds, average = 'weighted')

In [26]:
#accuracy score
def accuracy_per_class(preds, labels):   
    #make prediction
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {possible_labels[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

## Train model

In [27]:
#Set seeds for reproducibility
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [28]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [29]:
from tqdm.notebook import tqdm_notebook #progress bar
import torch 

#to.device() unpacks our data from the DataLoader and load the data onto the GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

for epoch in tqdm_notebook(range(1, epochs+1)):
    model.to(device)
    model.train()
    
    loss_train_total = 0
    
    progress_bar = tqdm_notebook(dataloader_train, 
                        desc = 'Epoch {:1d}'.format(epoch), 
                        leave = False, 
                        disable = False)
    
    for batch in progress_bar:
        
        model.zero_grad() #set gradient to 0
    
        batch = tuple(b.to(device) for b in batch)
        
#input will take three input: ids, attention_mask and labels
        inputs = {'input_ids': batch[0], 
                  'attention_mask': batch[1], 
                  'labels': batch[2]}
        
        outputs = model(**inputs) #unpack the dict straight into inputs
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()
#clips the norm of the gradients to 1.0 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step() #updates the model’s parameters
        scheduler.step() #updates learning rate
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})
            
    tqdm_notebook.write(f'\n Epoch {epoch}')
    
    loss_train_ave = loss_train_total / len(dataloader_train)
    tqdm_notebook.write(f'Training loss: {loss_train_ave}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_test)

    tqdm_notebook.write(f'Validation loss: {val_loss}')


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1650 [00:00<?, ?it/s]


 Epoch 1
Training loss: 0.3154056619677805


  0%|          | 0/813 [00:00<?, ?it/s]

Validation loss: 0.20933804220175153


Epoch 2:   0%|          | 0/1650 [00:00<?, ?it/s]


 Epoch 2
Training loss: 0.15880730627810188


  0%|          | 0/813 [00:00<?, ?it/s]

Validation loss: 0.20598869557093788


In [30]:
_, predictions, true_vals = evaluate(dataloader_test)

  0%|          | 0/813 [00:00<?, ?it/s]

In [31]:
accuracy_per_class(predictions, true_vals)

Class: 0
Accuracy:2296/2601

Class: 1
Accuracy:2547/2630

Class: 2
Accuracy:2551/2606

Class: 3
Accuracy:2317/2580

Class: 4
Accuracy:2529/2586

