# **4 - DistilBERT**

In [2]:
import os
import csv

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification, TFDistilBertForSequenceClassification, TFTrainingArguments, TFTrainer
# from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from src.data_exploration.data_preprocessing import train_test_split, tokenization
from src.utils.eval_utils import *

  from .autonotebook import tqdm as notebook_tqdm


# Test 1 - With data preprocessing

In this first test, we will use the preprocessed data generated in the `0-EDA.ipynb`

## Load dataset


In [10]:
dataset = os.path.join("data", "Sarcasm_Headlines_Dataset_v2.csv")

df = pd.read_csv(dataset, sep=";")
df.head()

Unnamed: 0,headline,label
0,scientist unveil doomsday clock hair loss,1
1,dem rep totally nail congress falling short ge...,0
2,eat different recipe,0
3,weather prevents liar getting work,1
4,mother come pretty close using word streaming ...,1


## Split data into train and test sets

We need to generate both training and test sets. We will use 80% of the data for training and the remaining 20% for testing. Moreover, in classification tasks it is important to maintain the same proportion of classes in both training and test sets (otherwise, the model might be affected by it during the training process, and the validation metrics may also be distorted). `scikit-learn` offers the class `StratifiedShuffleSplit` for achieving this, which will be used. We have developed the `train_test_split` function, which is stored in `src/data_preprocessing.py`:

In [11]:
X_train, y_train, X_test, y_test = train_test_split(df, "label")

## Tokenization, Padding and Sequencing

In [8]:
# Define tokenizer
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [15]:
# Tokenize train and test sets
X_train_tokenized = tokenizer(X_train["headline"].tolist(), truncation=True, padding=True)
X_test_tokenized = tokenizer(X_test["headline"].tolist(), truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    X_train_tokenized["input_ids"],
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    X_test_tokenized["input_ids"],
    y_test
))

## Build the model

In [8]:
training_args = TFTrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_steps = 10,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
)

In [9]:
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

## Train the model

In [10]:
trainer.train()

In [11]:
# See loss
trainer.evaluate(test_dataset)

{'eval_loss': 0.7847300809972426}

In [35]:
# Predict validation set
output = tf.argmax(trainer.predict(test_dataset)[0], axis=1)

In [36]:
cm = confusion_matrix(y_test, output)
cm

array([[1538, 1261],
       [1448, 1154]], dtype=int64)

In [37]:
print(classification_report(y_test, output))

              precision    recall  f1-score   support

           0       0.52      0.55      0.53      2799
           1       0.48      0.44      0.46      2602

    accuracy                           0.50      5401
   macro avg       0.50      0.50      0.50      5401
weighted avg       0.50      0.50      0.50      5401



In [15]:
model_path = os.path.join("models", "distilbert_model")
trainer.save_model(model_path)

## Test on OOS-Data

In [16]:
oos_data = pd.read_csv(os.path.join("data", "Sarcasm_Headlines_Dataset_OOS_Prep.csv"), sep=";", quoting=csv.QUOTE_ALL)
oos_data.head()

Unnamed: 0,headline,label
0,nextdoor ceo recruit army fanatic holy crusade...,1
1,exclusive interview clarence thomas,1
2,pro con banning book,1
3,know train derailment toxic chemical ohio,1
4,cia criticized use abusive etiquette coach bla...,1


In [31]:
# Predict on new headlines
headlines_tokenized = tokenizer(oos_data["headline"].tolist(), truncation=True, padding=True)
headlines_dataset = tf.data.Dataset.from_tensor_slices((
    headlines_tokenized["input_ids"],
    oos_data["label"]
))

predictions = tf.argmax(trainer.predict(headlines_dataset)[0], axis=1)

In [32]:
# Get the confussion matrix
cm = confusion_matrix(oos_data["label"], predictions)
print(cm)

[[169 311]
 [226 539]]


In [34]:
print(classification_report(oos_data["label"], predictions))

              precision    recall  f1-score   support

           0       0.43      0.35      0.39       480
           1       0.63      0.70      0.67       765

    accuracy                           0.57      1245
   macro avg       0.53      0.53      0.53      1245
weighted avg       0.55      0.57      0.56      1245



# Test 2 - Without data preprocessing

As the original dataset has already been cleaned from punctuation marks, special characters and some other noisy or irrelevant information, we will check if simply 

In [16]:
df_noprep = pd.read_json('./data/Sarcasm_Headlines_Dataset_v2.json', lines=True)[['headline', 'is_sarcastic']]
# capitalize every word in the headline
df_noprep['headline'] = df_noprep['headline'].apply(lambda x: ' '.join([word.capitalize() for word in x.split()]))
df_noprep.head()

Unnamed: 0,headline,is_sarcastic
0,Thirtysomething Scientists Unveil Doomsday Clo...,1
1,Dem Rep. Totally Nails Why Congress Is Falling...,0
2,Eat Your Veggies: 9 Deliciously Different Recipes,0
3,Inclement Weather Prevents Liar From Getting T...,1
4,Mother Comes Pretty Close To Using Word 'strea...,1


In [17]:
X_train, y_train, X_test, y_test = train_test_split(df_noprep, "is_sarcastic")

## Tokenization, Padding and Sequencing

As mentioned in the previous notebook, we need to tokenize the text before feeding the data into our model. We will use the pre-trained DistilBERT tokenizer from the transformers class

In [35]:
# Tokenize train and test sets
X_train_tokenized = tokenizer(X_train["headline"].tolist(), truncation=True, padding=True)
X_test_tokenized = tokenizer(X_test["headline"].tolist(), truncation=True, padding=True)

train_dataset = tf.data.Dataset.from_tensor_slices((
    X_train_tokenized["input_ids"],
    y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    X_test_tokenized["input_ids"],
    y_test
))

## Build the model

In [36]:
training_args = TFTrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_steps = 10,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=10,
)

In [37]:
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_39', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

## Train the model

In [38]:
trainer.train()

In [39]:
# See loss
trainer.evaluate(test_dataset)

{'eval_loss': 0.7140190972222222}

In [40]:
# Predict validation set
output = tf.argmax(trainer.predict(test_dataset)[0], axis=1)

In [41]:
cm = confusion_matrix(y_test, output)
cm

array([[1320, 1677],
       [ 856, 1871]], dtype=int64)

In [42]:
print(classification_report(y_test, output))

              precision    recall  f1-score   support

           0       0.61      0.44      0.51      2997
           1       0.53      0.69      0.60      2727

    accuracy                           0.56      5724
   macro avg       0.57      0.56      0.55      5724
weighted avg       0.57      0.56      0.55      5724



In [3]:
model_path = os.path.join("models", "distilbert_model")

In [43]:
trainer.save_model(model_path)

## Test on OOS-Data

In [4]:
# Load the model from the path
model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

Some layers from the model checkpoint at models\distilbert_model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at models\distilbert_model and are newly initialized: ['dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
oos_data = pd.read_csv(os.path.join("data", "Sarcasm_Headlines_Dataset_OOS.csv"), sep=";", quoting=csv.QUOTE_ALL)
oos_data.head()

Unnamed: 0,headline,label,news_source
0,Nextdoor CEO Recruits Army Of Fanatics For Hol...,1,The Onion
1,Exclusive Interview With Clarence Thomas,1,The Onion
2,Pros And Cons Of Banning Books,1,The Onion
3,What To Know About The Train Derailment And To...,1,The Onion
4,What To Know About ChatGPT,1,The Onion


In [19]:
# Predict on new headlines
headlines_tokenized = tokenizer(oos_data["headline"].tolist(), truncation=True, padding=True)
headlines_dataset = tf.data.Dataset.from_tensor_slices((
    headlines_tokenized["input_ids"],
    oos_data["label"]
))

predictions = tf.argmax(model.predict(headlines_dataset)[0], axis=1)



In [20]:
# Get the confussion matrix
cm = confusion_matrix(oos_data["label"], predictions)
print(cm)

[[259 349]
 [107 663]]


In [21]:
print(classification_report(oos_data["label"], predictions))

              precision    recall  f1-score   support

           0       0.71      0.43      0.53       608
           1       0.66      0.86      0.74       770

    accuracy                           0.67      1378
   macro avg       0.68      0.64      0.64      1378
weighted avg       0.68      0.67      0.65      1378

