In [12]:
import pandas as pd
import numpy as np
import torch
import nltk
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Fetch the NLTK 'punkt' resource into the local nltk_data directory.
nltk.download('punkt')

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


[nltk_data] Downloading package punkt to C:\Users\jorda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Location of the cleaned training data.
train_csv_path = r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\cleaned_train.csv"
train_df = pd.read_csv(train_csv_path)

# Inputs: the 'text' column as a plain Python list, with missing values replaced by "".
train_texts = train_df["text"].fillna("").tolist()
# Targets: the 'target' column as an array.
train_labels = train_df["target"].values

# Hold out 20% of the rows for validation, stratified on the labels, with a fixed seed.
split_parts = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=42,
)
X_train_texts, X_val_texts, y_train_labels, y_val_labels = split_parts

In [18]:
# Load tokenizer and model
# The sequence-classification head (pre_classifier / classifier) is not stored in the
# "distilbert-base-uncased" checkpoint, so it is randomly initialised when the model is
# built. That happens before the Trainer applies its own seed, so seed torch here to make
# the head start from the same weights on every run.
MODEL_SEED = 42  # same value as random_state used for the train/validation split
torch.manual_seed(MODEL_SEED)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

def _encode_split(texts, labels):
    """Tokenize one split and wrap it, together with its labels, in a ``Dataset``.

    Texts are truncated to at most 128 tokens and padded (``padding=True``,
    i.e. to the longest sequence among ``texts``).

    Returns:
        A ``(encodings, dataset)`` pair: the raw tokenizer output and a
        ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels`` columns.
    """
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
    columns = {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,
    }
    return encodings, Dataset.from_dict(columns)


# Tokenize train
train_encodings, train_dataset = _encode_split(X_train_texts, y_train_labels)

# Tokenize validation
val_encodings, val_dataset = _encode_split(X_val_texts, y_val_labels)

# Define metrics
def compute_metrics(eval_pred):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            (example, class); ``labels`` holds the reference class ids.

    Returns:
        A dict with the single key ``"f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predicted = np.argmax(scores, axis=1)
    f1 = f1_score(gold, predicted)
    return {"f1": f1}

# Training arguments
# Evaluation and checkpointing both happen once per epoch so that the best epoch
# (by validation F1) can be reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # Save after each epoch
    # Limit the number of checkpoints kept on disk. With load_best_model_at_end=True the
    # best checkpoint is always retained, so the best and the latest may both be present.
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics
    greater_is_better=True,
    # Mixed precision only when a CUDA GPU is present; the notebook otherwise falls back
    # to the CPU, where fp16 AMP training is not usable.
    fp16=torch.cuda.is_available(),
)

# Collect everything the Trainer needs: the model, its training configuration,
# the two tokenized splits and the metric callback used at evaluation time.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; as the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3809,0.378412,0.808953
2,0.3101,0.396556,0.81374
3,0.2118,0.42265,0.812983


TrainOutput(global_step=1143, training_loss=0.34491164707240574, metrics={'train_runtime': 41.3886, 'train_samples_per_second': 441.426, 'train_steps_per_second': 27.616, 'total_flos': 397060678455840.0, 'train_loss': 0.34491164707240574, 'epoch': 3.0})

In [19]:
# Evaluate on validation set manually
# Inference runs in mini-batches: pushing the whole validation set through the model in a
# single forward pass makes memory use grow with the dataset and can exhaust GPU memory.
EVAL_BATCH_SIZE = 32  # same value as per_device_eval_batch_size in the training arguments

model.eval()  # disable dropout for inference
logit_chunks = []
with torch.no_grad():
    for start in range(0, len(X_val_texts), EVAL_BATCH_SIZE):
        batch_texts = X_val_texts[start:start + EVAL_BATCH_SIZE]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Move each batch of logits to the CPU right away so they do not pile up on the GPU.
        logit_chunks.append(model(**inputs).logits.cpu())

# Full-validation logits (on CPU) and the predicted class per example.
logits = torch.cat(logit_chunks, dim=0)
preds = torch.argmax(logits, dim=1).numpy()

print("Accuracy:", accuracy_score(y_val_labels, preds))
print("F1 Score:", f1_score(y_val_labels, preds))
print("\nClassification Report:\n", classification_report(y_val_labels, preds))
print("\nConfusion Matrix:\n", confusion_matrix(y_val_labels, preds))

Accuracy: 0.8397898883782009
F1 Score: 0.8137404580152672

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       869
           1       0.81      0.81      0.81       654

    accuracy                           0.84      1523
   macro avg       0.84      0.84      0.84      1523
weighted avg       0.84      0.84      0.84      1523


Confusion Matrix:
 [[746 123]
 [121 533]]


In [20]:
# Persist the fine-tuned model and its tokenizer side by side in one directory,
# so both can later be loaded from that same path.
save_path = r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\finetuned_bert"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Finetuned model saved at {save_path}")

Finetuned model saved at C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\basic-models\finetuned_bert
