In [3]:
!pip install -q transformers

In [4]:
import os
import time
import random
import warnings
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader,RandomSampler,SequentialSampler

import transformers
from transformers import (AdamW,
                          WarmUp,
                          get_linear_schedule_with_warmup,
                          DistilBertTokenizer, 
                          DistilBertModel)

import nltk
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

SEED=42
warnings.filterwarnings(action="ignore",category=UserWarning)
%matplotlib inline

Mounted at /content/drive


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
class Config:
  """Notebook-wide settings: optimisation hyper-parameters, file locations and device."""

  # --- optimisation -------------------------------------------------------
  lr = 3e-5            # learning rate handed to AdamW
  n_epochs = 100       # upper bound on the number of training epochs
  patience = 5         # epochs without improvement tolerated by Engine.fit
  batch_size = 32      # training batch size
  num_workers = 4      # DataLoader worker processes

  # --- checkpoints --------------------------------------------------------
  load_weights_path = "model/"
  save_file_name = "model_weights_distilbert"

  # --- pretrained model ---------------------------------------------------
  MODEL_NAME = "distilbert-base-uncased"
  # NOTE(review): the two paths below are not read anywhere in the visible cells.
  huggingface_model = "model/huggingFace"
  huggingface_tokenizer = "model/huggingFace"

  # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
def seed_everything(seed):
  """Seed every random number generator used in this notebook.

  Seeds Python's ``random``, NumPy's global generator and torch (CPU and all
  CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic
  mode.

  Args:
    seed: integer seed applied to all generators.

  Returns:
    None. Do not pass the result of this call on as a seed.
  """
  random.seed(seed)
  os.environ['PYTHONHASHSEED']=str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seed every visible GPU, not only the current one; a no-op without CUDA.
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic=True
  # Benchmark mode lets cuDNN pick convolution algorithms by timing them,
  # which can differ between runs, so it must be off for reproducibility.
  torch.backends.cudnn.benchmark=False
seed_everything(SEED)  

In [8]:
def hamming_score(y_true,y_pred,normalize=True,sample_weight=None):
  """Mean per-sample Jaccard overlap between true and predicted label sets.

  For each row, the indices of the truthy entries are treated as a label set.
  The row score is |true & pred| / |true | pred|, or 1 when both sets are
  empty. ``normalize`` and ``sample_weight`` are accepted but not used.

  Args:
    y_true: 2-D indicator array of ground-truth labels, one row per sample.
    y_pred: 2-D indicator array of predicted labels, indexed like ``y_true``.

  Returns:
    The mean of the row scores, as computed by ``np.mean``.
  """
  def _row_score(true_row, pred_row):
    true_labels = set(np.where(true_row)[0])
    pred_labels = set(np.where(pred_row)[0])
    if not true_labels and not pred_labels:
      # Nothing expected and nothing predicted counts as a perfect match.
      return 1
    overlap = true_labels.intersection(pred_labels)
    combined = true_labels.union(pred_labels)
    return len(overlap) / float(len(combined))

  row_scores = [_row_score(y_true[row], y_pred[row]) for row in range(y_true.shape[0])]
  return np.mean(row_scores)

In [9]:
def get_model():
  """Load the pretrained DistilBERT encoder and its tokenizer.

  Both are loaded from ``Config.MODEL_NAME``. The tokenizer is told about the
  ``[SEP]``, ``[PAD]`` and ``[CLS]`` special tokens and the encoder's embedding
  matrix is resized to the resulting tokenizer length.

  Returns:
    A ``(model, tokenizer)`` tuple.
  """
  tokenizer = DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)
  tokenizer.add_special_tokens(dict(sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]'))

  model = DistilBertModel.from_pretrained(Config.MODEL_NAME)
  # Keep the embedding table the same size as the tokenizer vocabulary.
  model.resize_token_embeddings(len(tokenizer))
  return model, tokenizer

### EDA and preprocessing of the data

In [10]:
dataset=pd.read_csv("drive/MyDrive/dataset/Twitter_Instagram_Annotated1.csv",encoding='UTF-8')
dataset.head()

Unnamed: 0,ID,Text,Label
0,0,@ActuFoot_ @Betclic Raclé contre ce pays qui a...,
1,1,RT @realmarcel1: L'enchaînement du gros plan s...,
2,2,@Acermendax Je note que pour certain dénoncer ...,
3,3,RT @IlanLamar: @Cdanslair Le gros problème chè...,
4,4,RT @ricofiascojr: @AmraneHB @Marvel_Fit Aux US...,


In [11]:
dataset["Text"][0]

'@ActuFoot_ @Betclic Raclé contre ce pays qui a réfugié des milliers de nazis et qui a décimé sa population noir.'

In [12]:
def clean_text(data):
    """
    Return a cleaned copy of the 'Text' and 'Label' columns of ``data``.

    Each text is lower-cased, stripped of French stop words (matched on
    whitespace-separated words), split into ``\\w+`` tokens, lemmatized and
    re-joined with single spaces. The cleaned string replaces the 'Text'
    column; 'Label' is passed through unchanged and the index is preserved.

    input: data: a dataframe with string 'Text' and 'Label' columns
    return: a new two-column dataframe ('Text', 'Label'); ``data`` is not modified

    NOTE(review): WordNetLemmatizer is an English lemmatizer, so on this French
    corpus it mangles some words (e.g. "pays" becomes "pay" in the cleaned
    output) -- consider a French-aware lemmatizer.
    """
    # A set makes the per-word stop-word test O(1).
    stop_words = set(stopwords.words('french'))
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    def _clean_one(text):
        """Apply the lower-case / stop-word / tokenize / lemmatize chain to one string."""
        kept = " ".join(word for word in text.lower().split() if word not in stop_words)
        tokens = tokenizer.tokenize(kept)
        return " ".join(lemmatizer.lemmatize(token) for token in tokens)

    clean_data = data[['Text', 'Label']].copy()
    # Whole-column assignment avoids chained indexing (`df[col][idx] = ...`),
    # which is not guaranteed to write back, and works for any index.
    clean_data['Text'] = clean_data['Text'].map(_clean_one)
    return clean_data

In [13]:
dataset=clean_text(dataset)

In [14]:
# Collapse the spelling variants of each label (capitalisation, trailing space)
# onto one canonical name. The result is assigned back to the column rather
# than using `inplace=True` on `dataset["Label"]`: that form is a chained
# operation on a column selection, which pandas does not guarantee to write
# through to `dataset` (and does not under Copy-on-Write).
label_aliases={
    "None":"none",
    "none ":"none",
    "Sexism":"sexism",
    "sexism ":"sexism",
    "Sexism ":"sexism",
    "Homophobia":"homophobia",
}
dataset["Label"]=dataset["Label"].replace(label_aliases)

In [20]:
dataset["Label"].value_counts()

none           4012
homophobia     1197
Bullying        513
Hate_Speech     453
Racism          253
sexism          206
Name: Label, dtype: int64

In [21]:
# Integer id for each label name.
labeldict={"none":0,"homophobia":1,"Bullying":2,"Hate_Speech":3,"Racism":4,"sexism":5}
label_ids=dataset["Label"].map(labeldict)
# `.map` yields NaN for any value missing from the dict (an unexpected
# spelling, or labels already mapped by a previous run of this cell). Fail
# loudly instead of letting those rows carry NaN labels downstream.
unmapped=dataset.loc[label_ids.isna(),"Label"].unique()
if len(unmapped):
  raise ValueError(f"Labels missing from labeldict: {list(unmapped)}")
dataset["Label"]=label_ids.astype(int)

In [22]:
dataset["Label"].unique()

array([0, 3, 5, 4, 2, 1])

In [23]:
# One-hot encode the integer label; the vector length follows the number of
# classes in `labeldict` instead of a hard-coded 6.
n_classes=len(labeldict)
dataset["Label"] = dataset["Label"].apply(lambda x: [1 if i == x else 0 for i in range(n_classes)])

In [24]:
# Modelling frame: cleaned text plus its one-hot target vector. Columns are
# selected by name rather than by position (`iloc[:,1:]`), which assigned a
# 2-D (n, 1) array and silently depended on the column order of `dataset`.
data=pd.DataFrame({"text":dataset["Text"],"labels":dataset["Label"]})

In [25]:
data.head()

Unnamed: 0,text,labels
0,actufoot_ betclic raclé contre pay a réfugié m...,"[1, 0, 0, 0, 0, 0]"
1,rt realmarcel1 l enchaînement gros plan l énor...,"[1, 0, 0, 0, 0, 0]"
2,acermendax note certain dénoncer harcèlement c...,"[1, 0, 0, 0, 0, 0]"
3,rt ilanlamar cdanslair gros problème chère mad...,"[1, 0, 0, 0, 0, 0]"
4,rt ricofiascojr amranehb marvel_fit usa c est ...,"[1, 0, 0, 0, 0, 0]"


In [26]:
class SentimentDataset(Dataset):
  """Torch dataset that tokenizes one text row at a time.

  Args:
    data: DataFrame with a ``text`` column and a ``labels`` column holding one
      target vector per row.
    tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
    max_len: every sample is padded/truncated to this many tokens.
  """
  def __init__(self,data,tokenizer,max_len):
    self.tokenizer=tokenizer
    self.data=data
    self.text=data.text
    self.targets=self.data.labels
    self.max_len=max_len

  def __len__(self):
    return len(self.text)

  def __getitem__(self,index):
    """Return the tensors for the sample at position ``index``.

    Returns:
      dict with long tensors ``ids``, ``mask``, ``token_type_ids`` and a float
      tensor ``targets``.
    """
    # Samplers hand out positions 0..len-1, so look rows up by position; a
    # label-based lookup only works if the frame has a fresh RangeIndex.
    text=str(self.text.iloc[index])
    # Collapse runs of whitespace into single spaces.
    text=" ".join(text.split())

    # `padding="max_length"` is the supported spelling; the deprecated
    # `pad_to_max_length` flag that used to accompany it has been dropped.
    inputs=self.tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        truncation=True,
        max_length=self.max_len,
        padding="max_length",
        return_token_type_ids=True
      )
    ids=inputs['input_ids']
    mask=inputs['attention_mask']
    token_type_ids=inputs['token_type_ids']

    return {
        'ids':torch.tensor(ids,dtype=torch.long),
        'mask':torch.tensor(mask,dtype=torch.long),
        'token_type_ids':torch.tensor(token_type_ids,dtype=torch.long),
        'targets':torch.tensor(self.targets.iloc[index],dtype=torch.float)
    }

In [None]:
train_size=0.8
# Seed the sampling explicitly. `seed_everything()` returns None, so passing
# its return value as `random_state` left pandas unseeded and the split only
# reproducible as a side effect of NumPy's global RNG state.
train_data=data.sample(frac=train_size,random_state=SEED)
# Everything not drawn for training becomes the test set; both frames get a
# fresh 0..n-1 index.
test_data=data.drop(train_data.index).reset_index(drop=True)
train_data=train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))


FULL Dataset: (6634, 2)
TRAIN Dataset: (5307, 2)
TEST Dataset: (1327, 2)


In [None]:
# Pretrained encoder and tokenizer, then one tokenizing dataset per split.
# Each sample is padded/truncated to 100 tokens.
model, tokenizer = get_model()
training_set = SentimentDataset(data=train_data, tokenizer=tokenizer, max_len=100)
testing_set = SentimentDataset(data=test_data, tokenizer=tokenizer, max_len=100)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Training batches are reshuffled every epoch.
train_params={'batch_size':Config.batch_size,
              'shuffle':True,
              'num_workers':Config.num_workers}

# Evaluation keeps the dataset order so that the i-th prediction returned by
# `validation` corresponds to the i-th row of `test_data`.
test_params={
    'batch_size':16,
    'shuffle':False,
    'num_workers':Config.num_workers
}

training_loader=DataLoader(training_set,**train_params)
testing_loader=DataLoader(testing_set,**test_params)

In [None]:
class DistilBERTClass(torch.nn.Module):
  """DistilBERT encoder followed by a small classification head.

  The head takes the hidden state of the first token, applies a linear layer,
  tanh and dropout, and projects to ``n_classes`` logits.

  Args:
    backbone: encoder module called as ``backbone(input_ids=..., attention_mask=...)``
      whose first output is the hidden-state tensor. When omitted, the
      notebook-level ``model`` is used, as before.
    hidden_size: width of the encoder's hidden state.
    n_classes: number of output logits.
    dropout: dropout probability applied inside the head.
  """
  def __init__(self,backbone=None,hidden_size=768,n_classes=6,dropout=0.1):
    super(DistilBERTClass,self).__init__()
    if backbone is None:
      backbone=model
      # The notebook rebinds `model` to an instance of this class right after
      # constructing it. Unwrap it so that re-running the cell does not nest
      # one classifier inside another.
      if isinstance(backbone,DistilBERTClass):
        backbone=backbone.l1
    self.l1=backbone
    self.pre_classifier=torch.nn.Linear(hidden_size,hidden_size)
    self.dropout=torch.nn.Dropout(dropout)
    self.classifier=torch.nn.Linear(hidden_size,n_classes)

  def forward(self, input_ids, attention_mask,token_type_ids=None):
    """Return raw logits of shape (batch, n_classes).

    ``token_type_ids`` is accepted so existing callers keep working, but it is
    not forwarded to the encoder; it is optional so callers may omit it.
    """
    output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
    hidden_state = output_1[0]
    # Hidden state of the first token of every sequence.
    pooler = hidden_state[:, 0]
    pooler = self.pre_classifier(pooler)
    pooler = torch.tanh(pooler)
    pooler = self.dropout(pooler)
    output = self.classifier(pooler)
    return output

model=DistilBERTClass()
model.to(Config.device)    


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [None]:
def loss_fn(outputs,targets):
  """Mean binary cross-entropy between raw logits ``outputs`` and float ``targets``.

  The sigmoid is applied inside the loss, so ``outputs`` must be unnormalised logits.
  """
  return torch.nn.functional.binary_cross_entropy_with_logits(outputs,targets)

In [None]:
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=Config.lr)

In [None]:
def train(epoch,log_every=5000):
  """Run one optimisation pass over ``training_loader`` with the global model.

  Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
  ``training_loader``.

  Args:
    epoch: epoch number, used only in the log line.
    log_every: print the batch loss every this many steps (step 0 included).
      The default matches the previous hard-coded value, which only logs the
      first batch whenever an epoch has fewer than 5000 steps.
  """
  model.train()
  # tqdm wraps the loader itself (not `enumerate(...)`) so that it can read the
  # loader's length and show a total.
  for step,batch in enumerate(tqdm(training_loader)):
    ids=batch["ids"].to(Config.device,dtype=torch.long)
    mask=batch["mask"].to(Config.device,dtype=torch.long)
    token_type_ids=batch["token_type_ids"].to(Config.device,dtype=torch.long)
    targets=batch["targets"].to(Config.device,dtype=torch.float)

    optimizer.zero_grad()
    outputs=model(ids,mask,token_type_ids)
    loss=loss_fn(outputs,targets)
    if step%log_every==0:
      print(f'Epoch: {epoch}, Loss:  {loss.item()}')

    loss.backward()
    optimizer.step()

In [None]:
for epoch in range(Config.n_epochs):
  train(epoch)

In [None]:
def validation(testing_loader):
    """Score every batch of ``testing_loader`` with the global ``model``.

    The model is put in eval mode and run without gradient tracking.

    Args:
        testing_loader: iterable of batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` tensors.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one entry per sample,
        holding the sigmoid of the model logits and the targets respectively.
    """
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        # Wrapping the loader directly lets tqdm display a total instead of "0it".
        for batch in tqdm(testing_loader):
            ids = batch['ids'].to(Config.device, dtype = torch.long)
            mask = batch['mask'].to(Config.device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(Config.device, dtype = torch.long)
            targets = batch['targets'].to(Config.device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

0it [00:00, ?it/s]

In [None]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.7471740768651093
Hamming Loss = 0.08239135895503642


In [None]:
output_model_file = 'pytorch_distilbert_omdenaAI.pt'
output_checkpoint_file = 'pytorch_distilbert_omdenaAI_checkpoint.pt'
output_vocab_file = 'vocab_distilbert_omdenaAI.pt'

# Weights-only checkpoint with its score and epoch. It gets its own file:
# writing it to `output_model_file` meant the full-model save below
# overwrote it immediately.
torch.save({'model_state_dict': model.state_dict(),'best_score': val_hamming_score, 'epoch': epoch},  output_checkpoint_file)

# Whole pickled module, kept under the original file name. Loading it requires
# the DistilBERTClass definition to be importable.
torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Saved')

In [None]:
class AverageMeter(object):
  """Tracks the latest value together with a weighted running mean.

  Attributes:
    val: value passed to the most recent ``update``.
    sum: weighted sum of all values seen since the last ``reset``.
    count: total weight seen since the last ``reset``.
    avg: ``sum / count``.
  """
  def __init__(self):
    self.reset()

  def reset(self):
    """Zero all statistics."""
    self.val=self.avg=self.sum=self.count=0

  def update(self,val,n=1):
    """Record ``val`` with weight ``n`` and refresh the running mean."""
    self.val=val
    self.sum+=val*n
    self.count+=n
    self.avg=self.sum/self.count

In [None]:
class Engine:
  """Training loop with validation, best-checkpoint saving and early stopping.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=..., token_type_ids=...)``
      and returning logits.
    device: device the batches are moved to.
    config: object providing ``lr``, ``n_epochs`` and ``patience``.
    save_file_name: checkpoint file name without the ``.pt`` suffix.
    weight_path: directory the checkpoint is written to.

  Attributes:
    train_loss / valid_loss: dict mapping epoch -> mean loss of that epoch.
    best_score / best_loss: lowest validation loss seen so far (``best_score``
      keeps its original name because it is also the key stored in the
      checkpoint).
  """
  def __init__(self, model, device, config, save_file_name = 'model_weights', weight_path='./'):
    self.train_loss=dict()
    self.valid_loss=dict()
    self.valid_acc=dict()
    self.model=model
    # The training/validation loops move every batch to this device.
    self.device=device
    self.config=config
    self.best_score=0
    self.best_loss=5000
    self.save_file_name=save_file_name
    self.weight_path=weight_path

  def fit(self,train_loader,valid_loader):
    """Train for up to ``config.n_epochs`` epochs, validating after each one.

    The checkpoint is rewritten whenever the validation loss matches or beats
    the best value so far; training stops once ``config.patience`` consecutive
    epochs fail to do so.
    """
    # len(train_loader) is already the number of batches (= optimizer steps)
    # per epoch, so the schedule length is simply steps-per-epoch * epochs.
    num_train_steps=len(train_loader)*self.config.n_epochs
    self.optimizer=torch.optim.AdamW(self.model.parameters(),lr=self.config.lr)
    self.scheduler=get_linear_schedule_with_warmup(self.optimizer,num_warmup_steps=0,num_training_steps=num_train_steps)
    patience=self.config.patience

    for epoch in range(self.config.n_epochs):
      t=time.time()
      print("Training started...")

      summary_loss=self.train_one_epoch(train_loader)
      self.train_loss[epoch]=summary_loss.avg

      print(f'Train : Epoch {epoch}: | Summary Loss: {summary_loss.avg} | Training time: {time.time() - t}')

      t=time.time()
      print("Validation Started...")

      summary_loss,_,_ = self.validation(valid_loader)
      self.valid_loss[epoch] = summary_loss.avg

      print(f'Valid : Epoch {epoch}: | Summary Loss: {summary_loss.avg} | Validation time: {time.time() - t}')

      # `best_score` starts at 0, which stands for "no validation run yet".
      if not self.best_score or summary_loss.avg<=self.best_score:
        self.best_score=summary_loss.avg
        self.best_loss=self.best_score
        patience=self.config.patience
        print(f"Saving model with lowest validation loss as {self.best_score}")
        # Save on every improvement so the file on disk is always the best model.
        os.makedirs(self.weight_path,exist_ok=True)
        torch.save({'model_state_dict': self.model.state_dict(),'best_score': self.best_score, 'epoch': epoch},  os.path.join(self.weight_path,f"{self.save_file_name}.pt"))
      else:
        patience-=1
        print("Patience Reduced")
        if patience<=0:
          print(f"Early stopping. Lowest validation loss achieved")
          break

  def train_one_epoch(self,train_loader):
    """Run one optimisation pass over ``train_loader``.

    Returns:
      AverageMeter whose ``avg`` is the sample-weighted mean training loss.
    """
    self.model.train()
    summary_loss=AverageMeter()

    for steps,data in enumerate(tqdm(train_loader)):
      ids = data["ids"].to(self.device, dtype=torch.long)
      mask = data["mask"].to(self.device, dtype=torch.long)
      token_type_ids=data["token_type_ids"].to(self.device,dtype=torch.long)
      # The BCE-with-logits loss needs floating point targets.
      targets = data["targets"].to(self.device,dtype=torch.float)

      self.optimizer.zero_grad()
      outputs=self.model(input_ids=ids,attention_mask=mask,token_type_ids=token_type_ids)
      loss=loss_fn(outputs,targets)
      loss.backward()

      self.optimizer.step()
      self.scheduler.step()

      # Weight by the real batch size; the last batch may be smaller.
      summary_loss.update(loss.detach().item(),targets.size(0))

    # Returned after the loop so the whole epoch is processed.
    return summary_loss

  def validation(self,valid_loader,token_type_ids=None):
    """Evaluate the model on ``valid_loader`` without gradient tracking.

    Args:
      valid_loader: iterable of batches with ``ids``, ``mask``,
        ``token_type_ids`` and ``targets`` tensors.
      token_type_ids: unused; kept (now optional) for backward compatibility.

    Returns:
      ``(summary_loss, fin_outputs, fin_targets)``: the AverageMeter of the
      validation loss, and per-sample lists of sigmoid outputs and targets.
    """
    self.model.eval()
    summary_loss=AverageMeter()
    fin_targets=[]
    fin_outputs=[]

    with torch.no_grad():
      for steps,data in enumerate(tqdm(valid_loader)):
        ids = data["ids"].to(self.device, dtype=torch.long)
        mask = data["mask"].to(self.device, dtype=torch.long)
        batch_token_type_ids=data["token_type_ids"].to(self.device,dtype=torch.long)
        targets = data["targets"].to(self.device,dtype=torch.float)

        outputs=self.model(input_ids=ids,attention_mask=mask,token_type_ids=batch_token_type_ids)
        loss =loss_fn(outputs,targets)

        fin_targets.extend(targets.cpu().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

        summary_loss.update(loss.detach().item(),targets.size(0))

    return summary_loss,fin_outputs,fin_targets


