<a href="https://colab.research.google.com/github/JumanaKhrais/Transformer-Based-Deep-Learning-Models-for-Sarcasm-Detection-with-an-Imbalanced-Dataset./blob/main/Bert_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mounting Drive**

In [None]:
from google.colab import drive

drive.mount('/content/drive') # mount Google Drive so the notebook can read data from it and write checkpoints to it

Mounted at /content/drive


**Importing Libraries**

In [None]:
! pip install transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoTokenizer 
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

**Reading Dataset + Handling Null Values**




In [None]:
# Read the raw English dataset from Google Drive (columns used below: 'tweet' and 'sarcastic')
pathT = "drive/MyDrive/TrainEnglish.csv"
train = pd.read_csv(pathT)

In [None]:
# Keep only the tweet text and its sarcasm label; copy so later edits never touch `train`.
data = train.loc[:, ['tweet', 'sarcastic']].copy()


In [None]:
# Shape of the two-column frame before rows with missing values are removed.
data.shape

(3468, 2)

In [None]:
# Drop every row with a missing tweet or label. Only `how` is passed: `how` and
# `thresh` are mutually exclusive, and recent pandas releases raise a TypeError
# when both are supplied (even as thresh=None). Rebinding instead of
# inplace=True keeps the cell idempotent on re-run.
data = data.dropna(axis=0, how="any")

In [None]:
# Shape after dropping rows with missing values.
data.shape

(3467, 2)

In [None]:
# Peek at the last rows of the cleaned frame.
data.tail()

Unnamed: 0,tweet,sarcastic
3463,The population spike in Chicago in 9 months is...,0
3464,You'd think in the second to last English clas...,0
3465,I’m finally surfacing after a holiday to Scotl...,0
3466,Couldn't be prouder today. Well done to every ...,0
3467,Overheard as my 13 year old games with a frien...,0


**Defining Model Tokenizer + 
Defining Pretrained Model**

In [None]:
# One checkpoint name drives both objects so the tokenizer always matches the model.
checkpoint_name = "bert-base-uncased"

# BERT encoder with a freshly initialised 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=2,
    return_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

**Splitting Dataset into Train and Validation**


*   15% validation
*   85% training



In [None]:
"""train, valid  = train_test_split(data, test_size=0.15, shuffle = True , random_state= 42) """


'train, valid  = train_test_split(data, test_size=0.15, shuffle = True , random_state= 42) '

In [None]:
"""train= pd.DataFrame(train)
valid = pd.DataFrame(valid)
train.to_csv("drive/MyDrive/paperModels/train.csv", index= False) 
valid.to_csv("drive/MyDrive/paperModels/valid.csv", index = False) """

'train= pd.DataFrame(train)\nvalid = pd.DataFrame(valid)\ntrain.to_csv("drive/MyDrive/paperModels/train.csv", index= False) \nvalid.to_csv("drive/MyDrive/paperModels/valid.csv", index = False) '

**Reading Validation and Training Datasets**

In [None]:
# Load the stored training split; this rebinds `train`, which previously held the full raw dataset.
# NOTE(review): the disabled export cell writes to drive/MyDrive/paperModels/train.csv,
# but this reads drive/MyDrive/train.csv — confirm both refer to the same split.
pathT = "drive/MyDrive/train.csv"
train  = pd.read_csv(pathT)

In [None]:
# Load the stored validation split.
# NOTE(review): the disabled export cell writes to drive/MyDrive/paperModels/valid.csv,
# but this reads drive/MyDrive/valid.csv — confirm both refer to the same split.
pathT = "drive/MyDrive/valid.csv"
valid = pd.read_csv(pathT)

In [None]:
# Rows/columns of the training split.
train.shape

(2946, 2)

In [None]:
# Rows/columns of the validation split.
valid.shape

(521, 2)

**Preparing Data for the Tokenizer**

In [None]:
# Converting the tweet and label columns into plain Python lists

In [None]:
# Tweets (tokenizer input) and labels (dataset targets) as plain Python lists.
tr, vl = train['tweet'].tolist(), valid['tweet'].tolist()
trainL, vlL = train['sarcastic'].tolist(), valid['sarcastic'].tolist()

In [None]:
# Tokenize the training tweets: pad to the longest tweet in the list and truncate anything beyond 80 tokens.
tokenizedTrain = tokenizer(tr, padding=True, truncation=True, max_length=80)  

In [None]:
# Tokenize the validation tweets with the same settings as the training tweets.
tokenizedvalid = tokenizer(vl , padding=True , truncation=True,  max_length=80)

**Creating Torch Dataset**

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values. It must contain the key "input_ids", which defines the
            dataset length.
        labels: Optional per-example labels (list, NumPy array, ...). Leave
            as None for unlabeled data, e.g. when only predicting.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example `idx` as tensors, plus "labels" if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:` — truthiness
        # raises ValueError for NumPy arrays and pandas Series with more than
        # one element. An empty label container is still treated as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])

# The Trainer API expects the data (not the model) to be wrapped in a torch.utils.data.Dataset.


In [None]:
# Pair each split's encodings with its labels so the Trainer can consume them.
trainDataset= Dataset(tokenizedTrain, trainL)
validDataset= Dataset(tokenizedvalid, vlL)


**Reading + Tokenizing + Creating the torch test dataset**

In [None]:
# Load the test data (columns used below: 'text' for the tweets, 'sarcastic' for the labels)
test= pd.read_csv("drive/MyDrive/TestEnglish.csv")


In [None]:
print(test.shape)
# The test file keeps the tweets in a 'text' column (the training data uses 'tweet').
testData = list(test['text'])
# Same tokenizer settings as for the training and validation tweets.
testDatatokenized = tokenizer(testData, padding=True, truncation=True, max_length=80)
# Wrap the encodings without labels; this dataset is only used for prediction.
test_dataset = Dataset(testDatatokenized)

(1400, 2)


**Fine-Tuning the Pretrained Model**

params: 


*   Batch size = 10 
*   Number of epochs = 3
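*   The last 3 of BERT's 12 encoder layers and the classification head are trainable (the embeddings and the first 9 encoder layers are frozen)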









In [None]:

# Metric function handed to the Trainer
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with the classes along axis 1, ``labels`` the true class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = p
    # Highest-scoring class per example.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # zero_division=0: if a class is never predicted (or never present) the
    # score is undefined; report 0 explicitly instead of relying on the
    # warn-and-return-0 default.
    recall = recall_score(y_true=labels, y_pred=pred, zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Choose which parts of BERT stay frozen during fine-tuning:
#   0  -> nothing is frozen
#   -1 -> only the embedding layer is frozen
#   N  -> the embedding layer and the first N encoder layers are frozen
freeze_layer_count = 9

frozen_modules = []
if freeze_layer_count:
    frozen_modules.append(model.bert.embeddings)
    if freeze_layer_count != -1:
        frozen_modules.extend(model.bert.encoder.layer[:freeze_layer_count])

# Switch off gradient tracking for every parameter of the selected modules.
for frozen_module in frozen_modules:
    for weight in frozen_module.parameters():
        weight.requires_grad = False



# Define Trainer arguments 
# NOTE(review): the training log of this run reports 885 optimization steps, so with
# eval_steps=500 only one evaluation (step 500) takes place and load_best_model_at_end
# can only ever select checkpoint-500. Consider a shorter evaluation/save interval.
args = TrainingArguments(
    output_dir="drive/MyDrive/BertMod/Bert-output1",  # checkpoints are written below this folder
    evaluation_strategy="steps",  # evaluate every `eval_steps` optimizer steps
    eval_steps=500,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,  # reload the best-scoring checkpoint once training ends
) 

# Build the Trainer: fine-tunes `model` on trainDataset and evaluates on validDataset with compute_metrics.
# NOTE(review): early_stopping_patience=3 needs several evaluations to act; the log of this
# run shows a single evaluation (step 500), so early stopping cannot trigger as configured.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=trainDataset,
    eval_dataset=validDataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])



# Fine-tune the pre-trained model
trainer.train()

# A callback is an object that can perform actions at various stages of training (e.g. at the start or end of an epoch, before or after a single batch, etc.).

***** Running training *****
  Num examples = 2946
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 885


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.5414,0.578044,0.71977,0.596491,0.216561,0.317757


***** Running Evaluation *****
  Num examples = 521
  Batch size = 10
Saving model checkpoint to drive/MyDrive/BertMod/Bert-output1/checkpoint-500
Configuration saved in drive/MyDrive/BertMod/Bert-output1/checkpoint-500/config.json
Model weights saved in drive/MyDrive/BertMod/Bert-output1/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from drive/MyDrive/BertMod/Bert-output1/checkpoint-500 (score: 0.5780436992645264).


TrainOutput(global_step=885, training_loss=0.49830879168321857, metrics={'train_runtime': 67.4442, 'train_samples_per_second': 131.042, 'train_steps_per_second': 13.122, 'total_flos': 363339923011200.0, 'train_loss': 0.49830879168321857, 'epoch': 3.0})

**Loading the model + prediction + metric**

In [None]:
from sklearn import metrics

# Restore the fine-tuned weights from the checkpoint written at step 500.
model_path1 = "drive/MyDrive/BertMod/Bert-output1/checkpoint-500"
model1 = BertForSequenceClassification.from_pretrained(model_path1, num_labels=2)

# A Trainer with default arguments is sufficient for inference.
test_trainer = Trainer(model1)

# predict() yields three values; only the first (the raw class scores) is needed here.
raw_pred1, _, _ = test_trainer.predict(test_dataset)

# Turn the raw scores into class ids and compare them with the gold test labels.
y_pred1 = np.argmax(raw_pred1, axis=1)
predicted1 = y_pred1
testL = test['sarcastic']

# Printed in this order: accuracy, precision, recall, F1.
for score_fn in (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
):
    print(score_fn(testL, predicted1))

loading configuration file drive/MyDrive/BertMod/Bert-output1/checkpoint-500/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/BertMod/Bert-output1/checkpoint-500/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequen

0.8135714285714286
0.3081761006289308
0.245
0.2729805013927576
