### Install Required Libraries

In [None]:
!pip install transformers datasets

### Imports

In [1]:
import os
import time
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_recall_fscore_support

import torch

import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


### Run locally if Macbook has a GPU

In [2]:

# Is MPS even available? macOS 12.3+

print(torch.backends.mps.is_available())

# Was the current version of PyTorch built with MPS activated?
print(torch.backends.mps.is_built())

dtype = torch.float
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using GPU: Metal Performance Shaders (MPS)")
else:
    device = torch.device('cpu')
    print("Using CPU")

True
True
Using GPU: Metal Performance Shaders (MPS)


 ### Data Preparation

In [3]:
data = pd.read_csv("../data/tokenized_reviews.csv").dropna()
data["quote"] = data["quote"].astype(int)
data["tokenized_words"] = data["tokenized_words"].apply(lambda x: x.strip("[']").replace("', '", " "))
data["label"] = data["popular"].astype(int)  # Hugging Face expects 'label' column

# Downsampling for class balance (as before)
X = data.drop(columns=["popular"])
y = data["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=229)

# Downsample majority class in training set
train_df = X_train.copy()
train_df["label"] = y_train
majority = train_df[train_df.label == 0]
minority = train_df[train_df.label == 1]
majority_downsampled = majority.sample(n=len(minority), random_state=229)
train_df = pd.concat([majority_downsampled, minority]).sample(frac=1, random_state=229)

test_df = X_test.copy()
test_df["label"] = y_test

### Convert to Hugging Face Dataset

In [5]:
train_dataset = Dataset.from_pandas(train_df[["tokenized_words", "label"]])
test_dataset = Dataset.from_pandas(test_df[["tokenized_words", "label"]])
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [6]:
train_dataset.shape, test_dataset.shape

((431298, 3), (256518, 3))

### Tokenization

In [7]:
save_path = "./distilbert-ft"

In [8]:
model_name = "distilbert-base-uncased"

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["tokenized_words"], padding="max_length", truncation=True, max_length=64)

start_time = time.time()

tokenized_datasets = dataset.map(tokenize_function, batched=True)

total_time = time.time() - start_time
print(f"\nCompleted in: {total_time:.2f} seconds\n\n")

Map: 100%|████████████████████████████████████| 431298/431298 [00:20<00:00, 21479.24 examples/s]
Map: 100%|████████████████████████████████████| 256518/256518 [00:10<00:00, 25157.84 examples/s]


Completed in: 30.28 seconds







In [18]:
tokenizer.save_pretrained(save_path)

('./distilbert-ft/tokenizer_config.json',
 './distilbert-ft/special_tokens_map.json',
 './distilbert-ft/vocab.txt',
 './distilbert-ft/added_tokens.json',
 './distilbert-ft/tokenizer.json')

### Load Pretrained Model

In [19]:
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

for param in model.base_model.parameters():
    param.requires_grad = False
    
model.to(device)
print("Model is on:", next(model.parameters()).device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: mps:0


In [20]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"Trainable parameters: {trainable_params} / {total_params}")

Trainable parameters: 592130 / 66955010


### Training Arguments

In [21]:
# revised params
training_args = TrainingArguments(
    output_dir=save_path,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_strategy = "no",
    save_strategy="no",
    report_to="none",  
)

### Compute Metrics

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()
    preds = np.argmax(probs, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)

    roc_auc = roc_auc_score(labels, probs[:, 1])  # class 1 probability
    
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    specificity = tn / (tn + fp)
    
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "roc_auc": roc_auc
    }

### Trainer Setup and Training

In [23]:
os.environ["WANDB_DISABLED"] = "true"

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
start_time = time.time()

trainer.train()

total_time = time.time() - start_time
print(f"\nCompleted in: {total_time:.2f} seconds\n\n")

Step,Training Loss



Completed in: 2409.62 seconds




In [25]:
trainer.save_model(save_path)

### Evaluation on Test data

In [26]:
predictions = trainer.predict(tokenized_datasets["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

In [27]:
# revised params
metrics = compute_metrics((predictions.predictions, predictions.label_ids))

for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")

accuracy: 0.645
f1: 0.346
precision: 0.238
recall: 0.633
specificity: 0.647
roc_auc: 0.688


### Diagnostics