In [None]:
!pip install -q lightning
!pip install -q transformers
!pip install -q torchmetrics
!pip install -q watermark
!pip install -q torchsampler

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import time
import random
import warnings
import os.path as op
from google.colab import drive
drive.mount("/content/drive")

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

import lightning as L
from watermark import watermark
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import torch
import torchmetrics
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torchsampler import ImbalancedDatasetSampler
from torch.utils.data import WeightedRandomSampler,Dataset, DataLoader,TensorDataset

import transformers
from transformers import (
    AdamW,
    WarmUp,
    get_linear_schedule_with_warmup,
    DistilBertTokenizer,
    DistilBertModel
)

import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

SEED=42
warnings.filterwarnings(action="ignore",category=UserWarning)
%matplotlib inline

Mounted at /content/drive


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def seed_everything(seed):
  """Seed every random number generator the notebook relies on.

  Seeds Python's ``random`` module, NumPy and PyTorch (CPU plus all CUDA
  devices), exports ``PYTHONHASHSEED`` and puts cuDNN in deterministic mode.

  Args:
    seed: integer seed applied to all generators.

  Returns:
    None.
  """
  random.seed(seed)
  os.environ["PYTHONHASHSEED"]=str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Seed every visible GPU, not only the current one; a no-op without CUDA.
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic=True
  # Benchmark mode lets cuDNN auto-tune (and therefore vary) the kernels it picks,
  # which defeats the deterministic flag above, so it has to stay off.
  torch.backends.cudnn.benchmark=False
seed_everything(SEED)

In [None]:
class Config:
  """Single place for the hyper-parameters and names used across the notebook."""
  # --- pretrained checkpoint and artefact naming ---
  MODEL_NAME="distilbert-base-uncased"
  load_weights_path="model/"
  save_file_name="model_weights_distilbert_lightning_v1"

  # --- optimisation ---
  lr=3e-5
  epochs=20

  # --- data loading ---
  batch_size=32
  num_workers=4

  # --- hardware: CUDA when present, CPU otherwise ---
  device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def get_model():
  """Load the pretrained DistilBERT encoder and its tokenizer.

  The checkpoint name is read from ``Config.MODEL_NAME``. The separator,
  padding and classification tokens are registered on the tokenizer and the
  encoder's embedding table is resized to the tokenizer's vocabulary size.

  Returns:
    tuple: ``(model, tokenizer)`` in that order.
  """
  special_tokens={'sep_token':'[SEP]','pad_token':'[PAD]','cls_token':'[CLS]'}

  tok=DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)
  tok.add_special_tokens(special_tokens)

  encoder=DistilBertModel.from_pretrained(Config.MODEL_NAME)
  # Keep the embedding matrix the same length as the tokenizer vocabulary.
  encoder.resize_token_embeddings(len(tok))
  return encoder,tok

## EDA

In [None]:
# Load the annotated tweets from the mounted Google Drive folder.
# The relative path only resolves when the working directory is the parent of the
# `drive` mount point (the mount earlier in the notebook uses /content/drive).
# NOTE(review): the label clean-up cells below expect the literal string "None" as a
# label value; newer pandas releases reportedly parse "None" as a missing value by
# default -- confirm, and pass keep_default_na / na_values explicitly if so.
dataset=pd.read_csv("drive/MyDrive/dataset/Twitter_Instagram_Annotated1.csv",encoding='UTF-8')
# Preview the first rows.
dataset.head()

Unnamed: 0,ID,Text,Label
0,0,@ActuFoot_ @Betclic Raclé contre ce pays qui a...,
1,1,RT @realmarcel1: L'enchaînement du gros plan s...,
2,2,@Acermendax Je note que pour certain dénoncer ...,
3,3,RT @IlanLamar: @Cdanslair Le gros problème chè...,
4,4,RT @ricofiascojr: @AmraneHB @Marvel_Fit Aux US...,


In [None]:
def clean_text(data):
    """
    Normalise the texts in ``data`` for modelling.

    Each text is lower-cased, stripped of French stop words, split into runs
    of word characters, lemmatized with NLTK's WordNet lemmatizer and joined
    back together with single spaces.

    input: data: a dataframe with at least the columns 'Text' and 'Label'
    return: a new dataframe holding only 'Text' (replaced by the cleaned text)
            and 'Label'; ``data`` itself is not modified
    """
    # A set gives O(1) membership tests instead of scanning a list per word.
    stop_words = set(stopwords.words('french'))
    tokenizer = RegexpTokenizer(r'\w+')
    # NOTE(review): the WordNet lemmatizer presumably targets English, so it may
    # leave most French tokens unchanged -- confirm this is intended.
    lemmatizer = WordNetLemmatizer()

    def _clean_one(text):
        """Clean a single text; missing values become an empty string."""
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        kept = " ".join(word for word in text.lower().split() if word not in stop_words)
        return " ".join(lemmatizer.lemmatize(token) for token in tokenizer.tokenize(kept))

    clean_data = data[['Text', 'Label']].copy()
    # Whole-column assignment: independent of the index labels and free of the
    # chained-assignment pitfalls of writing clean_data['Text'][idx].
    clean_data['Text'] = clean_data['Text'].map(_clean_one)
    return clean_data

In [None]:
dataset=clean_text(dataset)

In [None]:
# Collapse the casing / trailing-space variants of each label onto one canonical
# spelling. Assigning the result back (instead of calling replace(inplace=True) on
# the column selection) keeps the update explicit and independent of whether
# dataset["Label"] hands back a view or a copy.
LABEL_ALIASES={
    "None":"none",
    "none ":"none",
    "Sexism":"sexism",
    "sexism ":"sexism",
    "Sexism ":"sexism",
    "Homophobia":"homophobia",
}
dataset["Label"]=dataset["Label"].replace(LABEL_ALIASES)

In [None]:
dataset['Label'].value_counts()

none           4012
homophobia     1197
Bullying        513
Hate_Speech     453
Racism          253
sexism          206
Name: Label, dtype: int64

In [None]:
# Integer id assigned to each canonical label string.
labeldict={"none":0,"homophobia":1,"Bullying":2,"Hate_Speech":3,"Racism":4,"sexism":5}

# Series.map turns every value missing from the mapping into NaN without complaint
# (including when this cell is run a second time on already-encoded labels), so
# refuse to continue if any non-missing label is unknown.
unmapped_labels=set(dataset["Label"].dropna().unique())-set(labeldict)
if unmapped_labels:
  raise ValueError(f"Labels without an id in labeldict: {sorted(map(str,unmapped_labels))}")
dataset["Label"]=dataset["Label"].map(labeldict)

In [None]:
# Build the modelling frame with the column names the dataset class reads
# (`text`, `labels`). Columns are picked by name rather than by position, so the
# label column stays one-dimensional and does not depend on column order.
data=pd.DataFrame({"text":dataset["Text"],"labels":dataset["Label"]})

In [None]:
data.head()

Unnamed: 0,text,labels
0,actufoot_ betclic raclé contre pay a réfugié m...,0
1,rt realmarcel1 l enchaînement gros plan l énor...,0
2,acermendax note certain dénoncer harcèlement c...,0
3,rt ilanlamar cdanslair gros problème chère mad...,0
4,rt ricofiascojr amranehb marvel_fit usa c est ...,0


In [None]:
class LightningSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        data: dataframe with a ``text`` column and a ``labels`` column.
        tokenizer: object exposing ``encode_plus`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` entries.
        max_len: length every encoded sequence is padded / truncated to.
    """
    def __init__(self,data,tokenizer,max_len=128):
        super().__init__()
        self.data=data
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text=self.data.text
        self.targets=self.data.labels

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self,index):
        """Return the encoded text and its label as ``torch.long`` tensors.

        ``index`` is a position (0 .. len-1), so rows are read with ``iloc`` and
        the dataframe does not need a default RangeIndex.
        """
        text=str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text=" ".join(text.split())

        inputs=self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "input_ids":torch.tensor(inputs["input_ids"],dtype=torch.long),
            "attention_mask":torch.tensor(inputs["attention_mask"],dtype=torch.long),
            "targets":torch.tensor(self.targets.iloc[index],dtype=torch.long)
        }

In [None]:
#@title I'll be back later
class DistilBertClass(nn.Module):
  """DistilBERT encoder followed by a two-layer classification head.

  Args:
    num_features: width of the encoder output fed to the head.
    num_classes: number of logits produced per example.
  """
  def __init__(self,num_features,num_classes):
    super(DistilBertClass,self).__init__()
    # get_model() returns (model, tokenizer); keep only the encoder so that it is
    # registered as a sub-module instead of storing the whole tuple.
    self.l1,_=get_model()
    self.classifier=nn.Sequential(
        nn.Linear(num_features,768),
        nn.Tanh(),
        nn.Dropout(0.8),
        nn.Linear(768,num_classes)
    )

  def forward(self,input_ids,attention_mask):
    """Return the classification logits for a batch of encoded texts."""
    output_1=self.l1(input_ids=input_ids,attention_mask=attention_mask)
    hidden_state=output_1[0]
    # Representation at the first token position of every sequence.
    pooler=hidden_state[:,0]
    logits=self.classifier(pooler)
    return logits

In [None]:
class LightningModel(L.LightningModule):
    """DistilBERT encoder plus classification head, wrapped for Lightning.

    Args:
        learning_rate: learning rate handed to AdamW.
        num_features: width of the encoder output fed to the head.
        num_classes: number of target classes; sizes both the head and the
            accuracy metrics.
    """
    def __init__(self,learning_rate,num_features,num_classes):
        super(LightningModel,self).__init__()
        self.save_hyperparameters()
        self.learning_rate=learning_rate
        self.num_features=num_features
        self.num_classes=num_classes

        # One metric object per stage so their running state never mixes.
        self.train_acc=torchmetrics.Accuracy(task="multiclass",num_classes=num_classes)
        self.val_acc=torchmetrics.Accuracy(task="multiclass",num_classes=num_classes)
        self.test_acc=torchmetrics.Accuracy(task="multiclass",num_classes=num_classes)

        self.l1,_=get_model()
        self.classifier=nn.Sequential(
            nn.Linear(self.num_features,768),
            nn.Tanh(),
            nn.Dropout(0.8),
            nn.Linear(768,self.num_classes)
        )

    def forward(self,input_ids,attention_mask):
        """Return the classification logits for a batch of encoded texts."""
        output_1=self.l1(input_ids=input_ids,attention_mask=attention_mask)
        hidden_state=output_1[0]
        # Representation at the first token position of every sequence.
        pooler=hidden_state[:,0]
        logits=self.classifier(pooler)
        return logits

    def _shared_step(self,batch):
        """Run the model on ``batch``; return (loss, predicted labels, true labels)."""
        true_labels=batch["targets"]
        logits=self(input_ids=batch["input_ids"],attention_mask=batch["attention_mask"])
        loss=F.cross_entropy(logits,true_labels)
        predicted_labels=torch.argmax(logits,dim=1)
        return loss,predicted_labels,true_labels

    def training_step(self,batch,batch_idx):
        """Compute and log the training loss / accuracy; return the loss."""
        loss,predicted_labels,true_labels=self._shared_step(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.train_acc(predicted_labels,true_labels)
        self.log("train_acc", self.train_acc, on_epoch=True, on_step=False)
        return loss

    def validation_step(self,batch,batch_idx):
        """Compute and log the validation loss / accuracy."""
        loss,predicted_labels,true_labels=self._shared_step(batch)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.val_acc(predicted_labels,true_labels)
        self.log("val_acc",self.val_acc,prog_bar=True)

    def test_step(self,batch,batch_idx):
        """Compute and log the test loss / accuracy (required by ``Trainer.test``)."""
        loss,predicted_labels,true_labels=self._shared_step(batch)
        self.log("test_loss",loss,prog_bar=True)
        self.test_acc(predicted_labels,true_labels)
        self.log("test_acc",self.test_acc,prog_bar=True)

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at ``self.learning_rate``."""
        optimizer=torch.optim.AdamW(self.parameters(),lr=self.learning_rate)
        return optimizer

In [None]:
if __name__=='__main__':
  # Record the library versions and hardware this run used.
  print(watermark(packages="torch,lightning,transformers", python=True), flush=True)
  print("Torch CUDA available?", torch.cuda.is_available(), flush=True)

  ###################### Set up DataLoaders####################
  _,tokenizer=get_model()

  # Re-seed all RNGs, then split 80/20 with an explicit random_state.
  # seed_everything() returns None, so it cannot be used as the random_state itself.
  seed_everything(SEED)
  train_size=0.8
  train_data=data.sample(frac=train_size,random_state=SEED)
  test_data=data.drop(train_data.index).reset_index(drop=True)
  train_data=train_data.reset_index(drop=True)
  training_set = LightningSentimentDataset(train_data, tokenizer, 100)
  testing_set = LightningSentimentDataset(test_data, tokenizer, 100)

  # Reshuffle the training examples every epoch; evaluation order stays fixed.
  train_loader=DataLoader(
    training_set,
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=Config.num_workers)

  val_loader=DataLoader(
      testing_set,
      batch_size=Config.batch_size,
      num_workers=Config.num_workers)

  lightning_model=LightningModel(learning_rate=Config.lr,num_features=768,num_classes=6)

  callbacks = [
        ModelCheckpoint(save_top_k=1, mode="max", monitor="val_acc")  # save top 1 model
    ]
  logger=CSVLogger(save_dir="logs/",name=Config.save_file_name)

  # Use the GPU with mixed precision when one is present, otherwise fall back to
  # full-precision CPU training instead of failing at Trainer construction.
  use_gpu=torch.cuda.is_available()
  trainer=L.Trainer(
      max_epochs=Config.epochs,
      callbacks=callbacks,
      accelerator="gpu" if use_gpu else "cpu",
      precision="16-mixed" if use_gpu else "32-true",
      logger=logger,
      log_every_n_steps=10,
      )

  start = time.time()
  trainer.fit(lightning_model,train_loader,val_loader)

  end=time.time()
  elapsed = end - start
  print(f"Time elapsed {elapsed/60:.2f} min")

  # Score the best checkpoint on the held-out loader. Trainer.validate only needs
  # validation_step, which the model defines; the held-out data is the same
  # loader that was used for validation during training.
  test_acc = trainer.validate(lightning_model, dataloaders=val_loader, ckpt_path="best")
  print(test_acc)

  with open(op.join(trainer.logger.log_dir, "outputs.txt"), "w") as f:
    f.write((f"Time elapsed {elapsed/60:.2f} min\n"))
    f.write(f"Test acc: {test_acc}")


Python implementation: CPython
Python version       : 3.10.11
IPython version      : 7.34.0

torch       : 2.0.1+cu118
lightning   : 2.0.2
transformers: 4.29.2

Torch CUDA available? True


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- T

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=20` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Time elapsed 7.21 min


NameError: ignored