In [None]:
%%capture

!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub

In [None]:
%%capture

import torch
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import load_dataset
import nltk
nltk.download('punkt')
from torch import nn
from transformers import TrainingArguments
from transformers import Trainer
##others
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["WANDB_DISABLED"] = "true"

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)

In [None]:
## Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## Show the current working directory of the runtime (shell command).
!pwd

/content


In [None]:
## Path of the input CSV on the mounted Google Drive.
data_path= "/content/drive/My Drive/Colab Notebooks/natural-language-processing/clean_copy.csv"

In [None]:
## Read the CSV at data_path into a pandas DataFrame.
data= pd.read_csv(data_path)

In [None]:
## Preview the first five rows.
data.head()

Unnamed: 0.1,Unnamed: 0,clean_tweet,label,agreement
0,0,amp big homie meanboy stegman st,0.0,1.0
1,1,im thinking devoting career proving autism isn...,1.0,1.0
2,2,vaccines vaccinate child,-1.0,1.0
3,3,mean immunize kid something wont secretly kill...,-1.0,1.0
4,4,thanks catch performing la nuit nyc st ave sho...,0.0,1.0


In [None]:
## Count the missing values in each column (nothing is dropped in this cell).
data.isna().sum()

Unnamed: 0      0
clean_tweet    29
label           0
agreement       0
dtype: int64

In [None]:
## Show the rows whose clean_tweet value is missing.
data[data["clean_tweet"].isnull()]

Unnamed: 0.1,Unnamed: 0,clean_tweet,label,agreement
444,444,,0.0,1.0
1523,1523,,0.0,1.0
2155,2155,,0.0,1.0
2515,2515,,0.0,1.0
3062,3062,,0.0,0.666667
3204,3204,,0.0,1.0
3819,3819,,1.0,0.666667
4631,4631,,0.0,1.0
4638,4638,,0.0,1.0
4770,4770,,0.0,1.0


In [None]:
## Remove every row that has a missing value in any column, then discard the
## "Unnamed: 0" column.

data = data.dropna().drop(columns="Unnamed: 0")

In [None]:
## Keep every tweet as ONE plain string and only normalise its whitespace.
## The frame is written to CSV and read back before tokenization, so a tuple of
## words would be stored as its textual repr, e.g. "('free', 'vaccine')", and the
## tokenizer would then see the quotes, commas and parentheses as part of the tweet.
## A plain string survives the CSV round trip unchanged, and this cell can be
## re-run safely (a tuple column has no .split()).

data['clean_tweet'] = data['clean_tweet'].str.split().str.join(" ")

##ii. Splitting Data

In [None]:
## Stratified 80/20 split on the label column. random_state pins the split so the
## same rows land in train/eval on every rerun (without it each run draws a
## different split and the reported metrics are not comparable).
train_set, eval_set= train_test_split(data, test_size= 0.2, stratify= data["label"], random_state= 42)

In [None]:
## Display the training split.
train_set

Unnamed: 0,clean_tweet,label,agreement
635,"(free, backtoschool, immunizations, offered, a...",1.0,0.666667
6229,"(rang, dibiasimb, brightside, tavern)",0.0,1.000000
1516,"(cnn, measles, outbreak, fuels, vaccine, debat...",1.0,1.000000
2949,"(definition, stupidity, try, something, wait, ...",0.0,0.666667
2578,"(people, vaccinate, children, let, lesson, ple...",1.0,1.000000
...,...,...,...
349,"(aids, n, children, hiv, often, lack, measles,...",0.0,0.666667
3163,"(university, study, flu, vaccine, effectivenes...",-1.0,1.000000
2007,"(vaccinations, fail, prevent, disease, proven,...",-1.0,0.333333
8564,"(current, cases, attributable, travel, countri...",0.0,1.000000


In [None]:
## Save the train and eval splits as CSV files under /content.
## The DataFrame index is written as well (no index=False), so each file
## gets an extra leading unnamed column.

train_set.to_csv("/content/train_set.csv")
eval_set.to_csv("/content/eval_set.csv")

##iii. Datasets Loading

In [None]:
## Load the two CSV splits into a Hugging Face DatasetDict.
## - The paths are the absolute ones the splits were written to, so the cell does
##   not depend on the current working directory.
## - The files were produced by DataFrame.to_csv, which writes UTF-8 by default;
##   reading them as ISO-8859-1 would garble any non-ASCII character.

dataset = load_dataset(
    "csv",
    data_files={
        "train_set": "/content/train_set.csv",
        "eval_set": "/content/eval_set.csv",
    },
    encoding="utf-8",
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train_set split: 0 examples [00:00, ? examples/s]

Generating eval_set split: 0 examples [00:00, ? examples/s]

In [None]:
## Inspect the loaded DatasetDict: split names, columns and row counts.
dataset

DatasetDict({
    train_set: Dataset({
        features: ['Unnamed: 0', 'clean_tweet', 'label', 'agreement'],
        num_rows: 7976
    })
    eval_set: Dataset({
        features: ['Unnamed: 0', 'clean_tweet', 'label', 'agreement'],
        num_rows: 1994
    })
})

##Tokenization

In [None]:
## Instantiate the tokenizer that belongs to the "roberta-base" checkpoint.
tokenizer= AutoTokenizer.from_pretrained("roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
## The raw labels are -1, 0 and 1; the model needs class ids 0, 1 and 2.

def transform_labels(input):
  """Map one example's raw "label" to a class id stored under "labels".

  -1 -> 0, 0 -> 1, 1 -> 2. Any other value falls back to class id 0.

  Args:
    input: mapping that has a "label" entry.

  Returns:
    A dict with the single key "labels" holding the class id.
  """
  class_ids = {-1: 0, 0: 1, 1: 2}
  return {"labels": class_ids.get(input["label"], 0)}

def tokenize(example):
  """Tokenize the "clean_tweet" text of a batch of examples.

  Sequences are truncated to the model's maximum length but NOT padded here.
  Padding every short tweet to the full maximum length makes each training and
  evaluation step process mostly padding tokens. Both Trainers in this notebook
  are created with `tokenizer=tokenizer`, which lets the Trainer pad each batch
  to its own longest sequence instead.

  No `return_tensors` is requested: `datasets.map` stores plain lists, so
  building torch tensors here would only be converted straight back.

  Args:
    example: batch mapping that has a "clean_tweet" entry.

  Returns:
    The tokenizer output for the batch (e.g. input_ids and attention_mask).
  """
  return tokenizer(example["clean_tweet"], truncation=True)


In [None]:
## Tokenize every split in batches, then add the remapped "labels" column and
## drop the original CSV columns that are no longer needed.

dataset= dataset.map(tokenize, batched= True)
remove_columns= ['Unnamed: 0', 'clean_tweet', 'label', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/7976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1994 [00:00<?, ? examples/s]

Map:   0%|          | 0/7976 [00:00<?, ? examples/s]

Map:   0%|          | 0/1994 [00:00<?, ? examples/s]

In [None]:
## Inspect the DatasetDict again after tokenization and column removal.
dataset

DatasetDict({
    train_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7976
    })
    eval_set: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1994
    })
})

## iv. Modelling

In [None]:
## Load the pretrained "roberta-base" checkpoint with a sequence-classification
## head sized for three labels (num_labels=3), one per sentiment class.

model= AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels= 3)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
## Evaluation metric: weighted F1, chosen because the classes are imbalanced.

def compute_metrics(pred):
  """Compute the weighted F1 score for one evaluation pass.

  Args:
    pred: object exposing `predictions` (per-class scores; argmax over the last
      axis gives the predicted class) and `label_ids` (true class ids).

  Returns:
    A dict with the single key "f1".
  """
  predicted_classes = pred.predictions.argmax(-1)
  score = f1_score(pred.label_ids, predicted_classes, average="weighted")
  return {"f1": score}

In [None]:
# Batch size chosen for the training runs in this notebook.
batch_size= 16

In [None]:
## Training arguments for the baseline fine-tuning run.
## batch_size (defined in the cell above) is passed explicitly for both training
## and evaluation; without these two arguments the variable is never used and the
## library's default per-device batch size is applied instead.
## Evaluation and checkpointing both run on a step schedule, which
## load_best_model_at_end requires to be aligned.

training_args = TrainingArguments(
    output_dir="Finetuned-Roberta-Base-Sentiment-identifier",
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    push_to_hub=True,
)

In [None]:
## Shuffle both splits with a fixed seed so the row order is identical on every rerun.
train_dataset= dataset['train_set'].shuffle(seed=10)
eval_dataset= dataset['eval_set'].shuffle(seed=10)

In [None]:
## Log in to the Hugging Face Hub from the notebook (shows an interactive login widget).

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
## Build the Trainer for the baseline run from the model, the training
## arguments, the two shuffled splits, the tokenizer and the F1 metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:



## Run the baseline fine-tuning; evaluation results are reported on the step schedule.

trainer.train()

Step,Training Loss,Validation Loss,F1
500,0.8545,0.825081,0.642813
1000,0.7952,0.783138,0.644541
1500,0.7962,0.793544,0.649463
2000,0.7669,0.754446,0.652032
2500,0.7468,0.761416,0.672393
3000,0.76,0.733157,0.662205
3500,0.7352,0.865141,0.603566
4000,0.7454,0.741987,0.658407
4500,0.7302,0.765233,0.657342
5000,0.7099,0.73718,0.669748


TrainOutput(global_step=9970, training_loss=0.7369963099748464, metrics={'train_runtime': 8977.3387, 'train_samples_per_second': 8.885, 'train_steps_per_second': 1.111, 'total_flos': 2.098592619798528e+16, 'train_loss': 0.7369963099748464, 'epoch': 10.0})

In [None]:
# Launch the final evaluation on the eval split and return its metrics.
trainer.evaluate()

{'eval_loss': 0.7331567406654358,
 'eval_f1': 0.6622045631874821,
 'eval_runtime': 59.6305,
 'eval_samples_per_second': 33.439,
 'eval_steps_per_second': 4.192,
 'epoch': 10.0}

In [None]:
## Push the trained model together with its results to the Hugging Face Hub.

trainer.push_to_hub()

'https://huggingface.co/HerbertAIHug/Finetuned-Roberta-Base-Sentiment-identifier/tree/main/'

##Imbalance Handling

In [None]:
## Class weights: each class gets (1 - its share of the rows), so the rarest
## class receives the largest weight. sort_index() orders the raw labels
## ascending, which matches the class-id order used for training.

label_share = data["label"].value_counts().sort_index() / len(data)
class_weights = (1 - label_share).values
class_weights

array([0.89618857, 0.50992979, 0.59388164])

In [None]:
## Convert the class weights to a float32 tensor on the training device.
## - Falls back to the CPU when no GPU is available instead of failing on "cuda".
## - torch.as_tensor accepts both a NumPy array and an existing tensor, so the
##   cell can be re-run without error (torch.from_numpy rejects a tensor).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(device)

In [None]:
##creating a custom Trainer subclass that applies the class weights in the loss

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping that contains "labels" plus the model inputs.
            return_outputs: when True, also return the model outputs.
            num_items_in_batch: accepted (and ignored) because recent versions of
                the Trainer pass it to compute_loss; without the parameter the
                override fails with a TypeError on those versions.

        Returns:
            The loss, or a `(loss, outputs)` tuple when `return_outputs` is True.
        """
        labels = inputs["labels"]
        # Build the forward-pass arguments without mutating the caller's batch.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits.float()
        # Keep the weights on the same device as the logits, wherever the model runs.
        loss_func = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
## Training arguments for the class-weighted run.
## batch_size (defined earlier in the notebook) is passed explicitly for both
## training and evaluation; without these two arguments the variable is never
## used and the library's default per-device batch size is applied instead.

weight_training_args = TrainingArguments(
    output_dir="Roberta-classweight-Sentiment-identifier",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    push_to_hub=True,
)

In [None]:
## Re-shuffle both splits with a fixed seed (12) for the class-weighted run;
## this rebinds train_dataset and eval_dataset.
train_dataset= dataset['train_set'].shuffle(seed=12)
eval_dataset= dataset['eval_set'].shuffle(seed=12)



In [None]:
## Instantiate the class-weighted trainer on a FRESH roberta-base model.
## `model` has already been fine-tuned by the baseline run above; reusing it
## would continue training from those weights, so the class-weight experiment
## would not be an independent comparison against the baseline.
weighted_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

class_trainer = WeightedLossTrainer(
    model=weighted_model,
    args=weight_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
## Train with the class-weighted loss.
class_trainer.train()

Step,Training Loss,Validation Loss,F1
500,0.8212,1.157472,0.248156
1000,0.8156,0.816441,0.642135
1500,0.8291,0.815836,0.659205
2000,0.8044,0.831459,0.637163
2500,0.7937,0.836249,0.661771
3000,0.8094,0.818811,0.67031
3500,0.833,0.844671,0.664955
4000,0.8259,0.847842,0.663132
4500,0.8374,0.825473,0.662561


TrainOutput(global_step=4985, training_loss=0.8165188436402959, metrics={'train_runtime': 4462.8653, 'train_samples_per_second': 8.936, 'train_steps_per_second': 1.117, 'total_flos': 1.049296309899264e+16, 'train_loss': 0.8165188436402959, 'epoch': 5.0})

In [None]:
## Evaluate the class-weighted model on the eval split and return its metrics.
class_trainer.evaluate()


{'eval_loss': 0.8158360123634338,
 'eval_f1': 0.6592052935678993,
 'eval_runtime': 61.7523,
 'eval_samples_per_second': 32.29,
 'eval_steps_per_second': 4.048,
 'epoch': 5.0}

In [None]:
## Push the class-weighted model and its results to the Hugging Face Hub.
class_trainer.push_to_hub()

'https://huggingface.co/HerbertAIHug/Roberta-classweight-Sentiment-identifier/tree/main/'