In [None]:
#!pip install transformers datasets pandas torch evaluate accelerate


In [2]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import evaluate

# Load the accuracy metric from the Hugging Face `evaluate` library.
accuracy = evaluate.load(path="accuracy")

In [4]:
# Read the model-ready jokes table, using the first CSV column as the index,
# and preview the first rows.
df = pd.read_csv(
    '../data/interim/ready_for_model.csv',
    index_col=0,
)
df.head()

Unnamed: 0,joke_new,score,date,log_score,score_class
0,When I was young my parents used to move a lot...,1.0,1473143000.0,0.693147,1
1,I guess you can say Tom Brady... Dropped the b...,1.0,1517828000.0,0.693147,1
2,"A Mathematician, a Biologist, and an Engineer ...",1.0,1504674000.0,0.693147,1
3,The coach from the thailand football team that...,0.0,1531343000.0,0.0,1
4,Donald Trump figured out how to get Mexico to ...,0.0,1516913000.0,0.0,1


In [None]:
# Work on a copy so that later in-place edits of `subset_df` (e.g. the label
# shift) do not silently rewrite `df`, whose preview was displayed above.
# To train on a random subsample instead, use df.sample(frac=...) here.
subset_df = df.copy()

In [None]:
# Shift the class labels down by one so they start at 0, matching the
# 0-based class ids used by the classifier.
# The shift is applied only while the smallest label is still >= 1, so
# re-running this cell does not push the labels to -1.
# NOTE(review): assumes the raw labels are 1-based and class 1 is present - confirm.
score_class_int = subset_df['score_class'].astype(int)
if score_class_int.min() >= 1:
    score_class_int = score_class_int - 1
subset_df['score_class'] = score_class_int

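### Train DistilBERT Model
The cell below runs the entire pipeline: it tokenizes the jokes, fine-tunes `distilbert-base-uncased` to predict the five score classes, evaluates on a held-out 10% split, and plots a confusion matrix.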

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset
import numpy as np
import evaluate
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Fetch the pretrained tokenizer for the uncased DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

# Define preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of jokes and attach their class labels.

    Args:
        examples: batch mapping that provides the 'joke_new' texts and the
            matching 'score_class' labels.

    Returns:
        The tokenizer output for the batch with an extra 'labels' entry
        holding the 'score_class' values.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding that
    # is handed to the Trainer, so each batch is padded to its own longest
    # sequence instead of padding every joke to the model's maximum length.
    result = tokenizer(examples['joke_new'], truncation=True)
    result['labels'] = examples['score_class']
    return result

# Wrap the pandas DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(subset_df)

# Tokenize in batches; every original column is dropped afterwards so only
# the fields produced by preprocess_function remain
tokenized_subset = hf_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=hf_dataset.column_names,
)

# Accuracy metric consumed by compute_metrics
accuracy_metric = evaluate.load("accuracy")

# Collator that pads the examples of a batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric callback: accuracy, top-2 accuracy and weighted precision/recall/F1
def compute_metrics(eval_pred):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        eval_pred: pair that unpacks into ``(predictions, labels)``, where
            ``predictions`` holds one row of per-class scores per example
            and ``labels`` holds the true class ids.

    Returns:
        dict with the keys "accuracy", "top_2_accuracy", "precision",
        "recall" and "f1".
    """
    class_scores, true_labels = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)

    # Weighted precision / recall / F1; zero_division=0 makes classes that
    # were never predicted contribute 0 instead of raising a warning
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels,
        predicted_classes,
        average='weighted',
        zero_division=0,
    )

    # Top-2: an example counts as a hit when its true label is among the two
    # highest-scoring classes
    two_best = np.argsort(class_scores, axis=1)[:, -2:]
    hit_in_two_best = (two_best == np.expand_dims(true_labels, axis=1)).any(axis=1)

    # Plain accuracy of the arg-max prediction
    accuracy_result = accuracy_metric.compute(
        predictions=predicted_classes,
        references=true_labels,
    )

    return {
        "accuracy": accuracy_result['accuracy'],
        "top_2_accuracy": np.mean(hit_in_two_best),
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Define label mappings: class id -> human-readable name
id2label = {
    0: "Label 1",
    1: "Label 2",
    2: "Label 3",
    3: "Label 4",
    4: "Label 5",
}
# Inverse mapping (name -> class id). Passing both directions keeps the model
# config consistent; with only id2label, label2id would not match it.
label2id = {label: class_id for class_id, label in id2label.items()}

# Load the pretrained encoder with a classification head that has one output
# per entry in the label mapping
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Hold out 10% of the tokenized data for evaluation; the fixed seed keeps the
# split reproducible. The result is stored under its own name so it does not
# shadow the `train_test_split` function imported from sklearn above.
dataset_splits = tokenized_subset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits['train']
eval_dataset = dataset_splits['test']

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub
    output_dir="./my_awesome_model",
    push_to_hub=False,
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.001,
    warmup_ratio=0.05,  # Fraction of the training steps spent on learning-rate warmup
    # Batch sizes per device
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Assemble the Trainer from the model, its configuration, the two data splits
# and the helpers defined above.
# NOTE(review): newer transformers releases appear to rename `tokenizer=` to
# `processing_class=` - confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run fine-tuning
trainer.train()

# Evaluate on the held-out split with a single inference pass.
# predict() returns the raw predictions, the true label ids and the metrics
# together, so a separate evaluate() call over the same data is not needed.
# metric_key_prefix="eval" keeps the metric names in `results` prefixed with
# "eval_".
eval_output = trainer.predict(eval_dataset, metric_key_prefix="eval")
results = eval_output.metrics
predictions = eval_output.predictions
labels = eval_output.label_ids
preds = np.argmax(predictions, axis=1)

# Compute and plot the confusion matrix over the five class ids
cm = confusion_matrix(labels, preds, labels=[0, 1, 2, 3, 4])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1, 2, 3, 4])
disp.plot(cmap=plt.cm.Blues)
plt.show()
