In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
import seaborn as sns
import torchvision
import io
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Debug aid for CUDA errors: set CUDA_LAUNCH_BLOCKING before any GPU work.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print("Using device:", DEVICE)

# Load the Yelp full-review dataset.
dataset = load_dataset("yelp_review_full")


def _to_sentence_class(frame):
    """Return a frame with only "Sentence" (lowercased text) and "Class" (label)."""
    return frame.assign(
        Sentence=frame["text"].str.lower(),
        Class=frame["label"],
    )[["Sentence", "Class"]]


# Preprocessing: convert each split to pandas and keep the two model columns.
train_df = _to_sentence_class(dataset["train"].to_pandas())
test_df = _to_sentence_class(dataset["test"].to_pandas())

# Work on fixed-seed subsamples so development runs stay fast.
train_df = train_df.sample(n=30000, random_state=42)
test_df = test_df.sample(n=5000, random_state=42)

# Hold out 20% of the sampled training rows as a validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["Sentence"].values,
    train_df["Class"].values,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
test_texts, test_labels = test_df["Sentence"].values, test_df["Class"].values

# Tokenization: all three splits are encoded with identical settings.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Truncation and padding are enabled, with sequences capped at 128 tokens.
_tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 128}

train_encodings = tokenizer(list(train_texts), **_tokenize_kwargs)
val_encodings = tokenizer(list(val_texts), **_tokenize_kwargs)
test_encodings = tokenizer(list(test_texts), **_tokenize_kwargs)


class YelpDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    ``encodings`` must support ``encodings[key][idx]`` for the keys
    ``'input_ids'`` and ``'attention_mask'``; ``labels`` must support
    ``len()`` and integer indexing.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of ``torch.long`` tensors."""
        token_ids = self.encodings['input_ids'][idx]
        mask = self.encodings['attention_mask'][idx]
        target = self.labels[idx]
        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'labels': torch.tensor(target, dtype=torch.long),
        }

# Wrap each split's encodings and labels in a YelpDataset.
train_dataset, val_dataset, test_dataset = (
    YelpDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Only the training loader shuffles; the evaluation loaders use defaults.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Pretrained DistilBERT with a 5-way classification head, moved to DEVICE.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=5,
)
model = model.to(DEVICE)

optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# TensorBoard writer
writer = SummaryWriter(log_dir="runs/yelp_distilbert_experiment")

# Training loop: each epoch runs one optimisation pass over train_loader
# followed by one evaluation pass over val_loader, then logs the epoch's
# metrics to TensorBoard.
num_epochs = 3
for epoch in range(num_epochs):
    # --- Training pass ---
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        optimizer.zero_grad()
        # Labels are passed so the loss can be read from the model output.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}")

    # --- Validation pass (no gradient tracking) ---
    model.eval()
    val_loss, val_preds, val_true = 0.0, [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            val_loss += loss.item()

            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(logits, dim=1)
            # Collect plain Python ints; the true labels are read from the
            # batch itself rather than copied back from DEVICE.
            val_preds.extend(preds.tolist())
            val_true.extend(batch['labels'].tolist())

    avg_val_loss = val_loss / len(val_loader)
    val_acc = accuracy_score(val_true, val_preds)

    print(f"Validation Loss: {avg_val_loss:.4f} | Validation Acc: {val_acc:.4f}")

    # TensorBoard logging (the zero-based epoch index is the step).
    writer.add_scalar("Loss/Train", avg_train_loss, epoch)
    writer.add_scalar("Loss/Validation", avg_val_loss, epoch)
    writer.add_scalar("Accuracy/Validation", val_acc, epoch)

# Final Testing
# Class ids 0..4 and their display names, shared by every report and plot
# below so the matrix shape and the tick labels always agree.
CLASS_NAMES = [f"{i+1} Star" for i in range(5)]
CLASS_IDS = list(range(len(CLASS_NAMES)))

model.eval()
test_preds, test_true = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)

        # No labels are passed here: only the logits are needed.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_preds.extend(preds.tolist())
        # The labels are only compared on the host, so they are read from
        # the batch directly instead of being moved to DEVICE and back.
        test_true.extend(batch['labels'].tolist())

# Metrics
print("\n--- Final Test Results ---")
print("Accuracy:", accuracy_score(test_true, test_preds))
print("Precision:", precision_score(test_true, test_preds, average='macro'))
print("Recall:", recall_score(test_true, test_preds, average='macro'))
print(
    "\nClassification Report:\n",
    classification_report(
        test_true,
        test_preds,
        labels=CLASS_IDS,
        target_names=CLASS_NAMES,
    ),
)

# Confusion Matrix
# Explicit labels keep the matrix 5x5 even if a class is missing from the
# sampled test set or from the predictions.
cm = confusion_matrix(test_true, test_preds, labels=CLASS_IDS)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
plt.title("Confusion Matrix")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.tight_layout()
plt.savefig("confusion_matrix_distilbert.png")
plt.show()

# TensorBoard Confusion Matrix Logging
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=CLASS_NAMES,
    yticklabels=CLASS_NAMES,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
# Hand the figure to the writer directly. The previous approach saved a
# JPEG into an unnamed in-memory buffer and read it back with plt.imread
# without stating the format, which is not a reliable way to decode it.
# add_figure also closes the figure by default.
writer.add_figure("Confusion Matrix", fig, global_step=0)

writer.close()


  from .autonotebook import tqdm as notebook_tqdm
2025-04-28 13:06:40.689850: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745838400.710663  501200 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745838400.719218  501200 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745838400.738773  501200 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745838400.738792  501200 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745838400.738794  501200

Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
