<a href="https://colab.research.google.com/github/JumanaKhrais/Transformer-Based-Deep-Learning-Models-for-Sarcasm-Detection-with-an-Imbalanced-Dataset./blob/main/Roberta_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mounting Drive**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so later cells can read the CSV files
# from Drive and write model checkpoints back to it.
drive.mount('/content/drive')

Mounted at /content/drive


**Importing Libraries**

In [None]:
# Install Hugging Face Transformers into the notebook runtime.
# NOTE(review): the version is not pinned; the recorded run below resolved
# transformers 4.16.2 — consider pinning it so later cells see the same API.
! pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.8 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 41.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 27.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch

from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoTokenizer 
from transformers import BertForSequenceClassification

**Reading Dataset + Handling Null Values**


In [None]:
# Load the full English training CSV from Drive.
# The path is relative — presumably to /content, where Drive was mounted; confirm the working directory.
pathT = "drive/MyDrive/TrainEnglish.csv"
train = pd.read_csv(filepath_or_buffer=pathT)

In [None]:
# Keep only the tweet text and its sarcasm label, as an independent copy of `train`.
data = train.loc[:, ['tweet', 'sarcastic']].copy()


In [None]:
# Drop every row that has a missing value in any column.
# `how` and `thresh` are mutually exclusive in recent pandas releases (passing
# both raises TypeError), so only `how` is given; `thresh`/`subset` keep their
# defaults. Rebinding instead of `inplace=True` keeps the cell safe to re-run.
data = data.dropna(axis=0, how="any")

In [None]:
# Display (rows, columns) of `data` to check how many examples remain.
data.shape

(3467, 2)

**Defining Model Tokenizer + 
Defining Pretrained Model**

In [None]:
# Tokenizer and model are both loaded from the same "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# RoBERTa with a sequence-classification head configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    return_dict=True,
)


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

**Reading Validation and Training Datasets**

In [None]:
# Load the training split; this rebinds `train`, replacing the frame read earlier.
pathT = "drive/MyDrive/train.csv"
train = pd.read_csv(filepath_or_buffer=pathT)

In [None]:
# Load the validation split from Drive.
pathT = "drive/MyDrive/valid.csv"
valid = pd.read_csv(filepath_or_buffer=pathT)

In [None]:
# Display (rows, columns) of the training split.
train.shape

(2946, 2)

In [None]:
# Display (rows, columns) of the validation split.
valid.shape

(521, 2)

**Preparing Data for the Tokenizer**

In [None]:
# Convert the tweet and label columns into plain Python lists

In [None]:
# Tweet texts for the tokenizer.
tr = train['tweet'].tolist()
vl = valid['tweet'].tolist()

# Matching sarcasm labels, in the same row order as the texts above.
trainL = train['sarcastic'].tolist()
vlL = valid['sarcastic'].tolist()

In [None]:
# Encode the training tweets: truncate to at most 80 tokens and pad the batch.
tokenizedTrain = tokenizer(
    tr,
    max_length=80,
    truncation=True,
    padding=True,
)

In [None]:
# Encode the validation tweets with the same settings as the training tweets.
tokenizedvalid = tokenizer(
    vl,
    max_length=80,
    truncation=True,
    padding=True,
)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to an indexable sequence with one
            entry per example. It must contain the key "input_ids", which is
            used to determine the dataset length.
        labels: Optional indexable sequence of labels, one per example.
            When omitted (or empty), items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly instead of relying on truthiness:
        # `if self.labels:` raises ValueError for NumPy arrays / tensors with
        # more than one element. An empty label sequence is still treated as
        # "no labels", exactly as before.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])



In [None]:
# Wrap the encoded tweets and their labels for the Trainer.
trainDataset = Dataset(encodings=tokenizedTrain, labels=trainL)
validDataset = Dataset(encodings=tokenizedvalid, labels=vlL)

**Reading + Tokenizing + Creating torch test dataset**

In [None]:
# Read the English test set from Drive.
test = pd.read_csv(filepath_or_buffer="drive/MyDrive/TestEnglish.csv")


In [None]:
print(test.shape)

# The test file keeps its tweets in the 'text' column.
testData = test['text'].tolist()

# Same tokenizer settings as for the training and validation tweets.
testDatatokenized = tokenizer(
    testData,
    max_length=80,
    truncation=True,
    padding=True,
)

# No labels are passed, so each item holds only the encoded inputs.
test_dataset = Dataset(testDatatokenized)

(1400, 2)


**Fine-Tuning the Pretrained Model**

In [None]:
# Function used by the Trainer to compute the evaluation metrics

def compute_metrics(p):
    """Score a batch of predictions against the reference labels.

    Args:
        p: A pair ``(predictions, labels)``. The predictions are reduced with
            ``argmax`` over axis 1, so they must have at least two dimensions
            (presumably one score per class — confirm against the caller).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(y_true=labels, y_pred=predicted) for name, scorer in scorers.items()}

# Number of leading encoder layers to freeze.
#   0 / None -> nothing is frozen
#   -1       -> only the embedding layer is frozen
#   n > 0    -> the embeddings and the first n encoder layers are frozen
freeze_layer_count = 9
if freeze_layer_count:
    # Freeze the embeddings of the model.
    for param in model.roberta.embeddings.parameters():
        param.requires_grad = False

    if freeze_layer_count != -1:
        # Freeze the first `freeze_layer_count` encoder layers. The slices use
        # the variable (not a hard-coded 9) so changing it above takes effect.
        for layer in model.roberta.encoder.layer[:freeze_layer_count]:
            for param in layer.parameters():
                param.requires_grad = False
        # Make sure all remaining encoder layers stay trainable.
        for layer in model.roberta.encoder.layer[freeze_layer_count:]:
            for param in layer.parameters():
                param.requires_grad = True



# Training configuration
args = TrainingArguments(
    # Checkpoints are written under this Drive folder.
    output_dir="drive/MyDrive/RobModBal/Roberta-output2",
    num_train_epochs=3,
    per_device_train_batch_size=7,
    per_device_eval_batch_size=7,
    # Evaluate on the validation set every 500 optimization steps.
    evaluation_strategy="steps",
    eval_steps=500,
    # Restore the best evaluated checkpoint once training finishes.
    load_best_model_at_end=True,
    seed=0,
)

# Trainer wiring: model, data, metrics and early stopping (patience of 2 evaluations).
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=trainDataset,
    eval_dataset=validDataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Fine-tune the pre-trained model
trainer.train()



PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2946
  Num Epochs = 3
  Instantaneous batch size per device = 7
  Total train batch size (w. parallel, distributed & accumulation) = 7
  Gradient Accumulation steps = 1
  Total optimization steps = 1263


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5135,0.680397,0.731286,0.688889,0.197452,0.306931
1000,0.4097,0.802393,0.731286,0.595506,0.33758,0.430894


***** Running Evaluation *****
  Num examples = 521
  Batch size = 7
Saving model checkpoint to drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500
Configuration saved in drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500/config.json
Model weights saved in drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 521
  Batch size = 7
Saving model checkpoint to drive/MyDrive/RobModBal/Roberta-output2/checkpoint-1000
Configuration saved in drive/MyDrive/RobModBal/Roberta-output2/checkpoint-1000/config.json
Model weights saved in drive/MyDrive/RobModBal/Roberta-output2/checkpoint-1000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500 (score: 0.6803973317146301).


TrainOutput(global_step=1263, training_loss=0.43981599543458966, metrics={'train_runtime': 89.0751, 'train_samples_per_second': 99.22, 'train_steps_per_second': 14.179, 'total_flos': 363339923011200.0, 'train_loss': 0.43981599543458966, 'epoch': 3.0})

**Loading the model + prediction + metric**

In [None]:
from sklearn import metrics 
from transformers import AutoModelForSequenceClassification, AutoTokenizer 

# Reload the fine-tuned weights from the saved checkpoint directory.
model_path1 = "drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500"
model1 = AutoModelForSequenceClassification.from_pretrained(
    model_path1,
    num_labels=2,
)

# A Trainer built from the model alone is used here only to run prediction.
test_trainer = Trainer(model1)

# predict() returns three values; only the first (raw model outputs) is used.
raw_pred1, _, _ = test_trainer.predict(test_dataset)

# Turn the raw outputs into class ids by taking the highest-scoring column.
y_pred1 = np.argmax(raw_pred1, axis=1)
predicted1 = y_pred1

# Reference labels from the test file.
testL = test['sarcastic']

# Print accuracy, precision, recall and F1 on the test set, in that order.
for score_fn in (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
):
    print(score_fn(testL, predicted1))

#**Loading the model + prediction + metric**

loading configuration file drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500/config.json
Model config RobertaConfig {
  "_name_or_path": "drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file drive/MyDrive/RobModBal/Roberta-output2/checkpoint-500/pytorch_model.bin
All mod

0.8292857142857143
0.39226519337016574
0.355
0.37270341207349084
