In [None]:
# Mount Google Drive if needed
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully")
except:
    print("Not running in Colab or Drive already mounted")

# Install required packages if needed
try:
    import optuna
except:
    print("Installing Optuna...")
    !pip install optuna

Mounted at /content/drive
Google Drive mounted successfully
Installing Optuna...
Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1


In [None]:
# Regular Imports
import pandas as pd
import numpy as np
import time
import os
from tabulate import tabulate as tabulate_func
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

# NLP Preprocessing Imports
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re

# HPO & Training imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
import optuna

# Saving Files
import shutil
import glob
from typing import List, Optional

In [None]:
# SQLite file on Drive that holds the hyper-parameter search history
db_path = "/content/drive/My Drive/ADL Final Project/results/distilbert_hpo.db"

# Open the "distilbert_hpo" study; load_if_exists=True reuses the stored study
# when one with this name is already present in the database.
study = optuna.create_study(
    storage="sqlite:///" + db_path,
    study_name="distilbert_hpo",
    direction="minimize",
    load_if_exists=True,
)

[I 2025-04-13 14:01:34,659] Using an existing study with name 'distilbert_hpo' instead of creating a new one.


In [None]:
# Hyper-parameters of the best trial recorded in the study
best_params = study.best_params
print(best_params)

{'learning_rate': 1.336606944413412e-05, 'weight_decay': 2.5038567374664683e-05, 'lr_scheduler_type': 'linear'}


In [None]:
# Pick the compute device: CUDA GPU when one is visible, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Report the name of the first GPU
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: Tesla T4


In [None]:
# Download NLTK data if needed
try:
    stop_words = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except:
    !python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# List the project folder on Drive to confirm the dataset CSVs and results directory are present
!ls "/content/drive/My Drive/ADL Final Project"

 distilbert_hpo.db	      optuna_study.db	      yelp_dataset_train.csv
'DistilBERT Training.ipynb'   results
 final_updated.ipynb	      yelp_dataset_test.csv


In [None]:
# Read the Yelp train and test CSVs from the project folder on Drive
train_df, test_df = map(
    pd.read_csv,
    (
        "/content/drive/My Drive/ADL Final Project/yelp_dataset_train.csv",
        "/content/drive/My Drive/ADL Final Project/yelp_dataset_test.csv",
    ),
)

In [None]:
# Hold out 20% of the training rows as a validation set, stratified on the label column.
# NOTE: train_df is reassigned from itself, so re-running this cell splits the
# already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    stratify=train_df['class_index'],
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each split
print(
    f"Train: {len(train_df)} samples",
    f"Validation: {len(val_df)} samples",
    f"Test: {len(test_df)} samples",
    sep="\n",
)

Train: 336000 samples
Validation: 84000 samples
Test: 30000 samples


In [None]:
# Initialize tokenizer
# Loads the pretrained "distilbert-base-uncased" tokenizer; the same object is used
# by tokenize_data and later handed to the Trainer.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize function
def tokenize_data(df):
    """Tokenize the reviews of one split.

    Args:
        df: DataFrame with a 'review_text' column (inputs) and a
            'class_index' column (labels).

    Returns:
        A ``(encodings, labels)`` pair: the output of the module-level
        ``tokenizer`` for all review texts (truncated to at most 512 tokens
        and padded), and the labels as a plain Python list.
    """
    review_texts, class_labels = (
        df[column].tolist() for column in ('review_text', 'class_index')
    )
    batch = tokenizer(review_texts, max_length=512, padding=True, truncation=True)
    return batch, class_labels

# Dataset class
class YelpDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to a sequence
            holding one entry per sample.
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Encode each split and wrap it in a YelpDataset, one split at a time
train_encodings, train_labels = tokenize_data(train_df)
train_dataset = YelpDataset(train_encodings, train_labels)

val_encodings, val_labels = tokenize_data(val_df)
val_dataset = YelpDataset(val_encodings, val_labels)

test_encodings, test_labels = tokenize_data(test_df)
test_dataset = YelpDataset(test_encodings, test_labels)

In [None]:
# Metrics container: one independent list per per-epoch series
logs = {
    series: []
    for series in (
        'train_loss',
        'eval_loss',
        'accuracy',
        'precision',
        'recall',
        'f1',
        'epoch_time',
    )
}
# Filled in once, after training, from the test-set predictions
logs['confusion_matrix'] = None

In [None]:
# Track metrics per epoch
class LoggingCallback(TrainerCallback):
    """Collect per-epoch training/evaluation metrics into the module-level ``logs`` dict.

    The Trainer fires ``on_epoch_end`` *before* it runs the end-of-epoch
    evaluation, so evaluation metrics are not yet in ``state.log_history`` at
    that point. Metrics are therefore recorded in ``on_evaluate``, which
    receives them directly, and epoch duration is measured explicitly because
    the Trainer does not log a per-epoch runtime.

    The callback only has an effect when it is registered with the Trainer,
    e.g. ``Trainer(..., callbacks=[LoggingCallback()])``. The lists in ``logs``
    stay aligned as long as evaluation runs once per epoch.
    """

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Remember when the epoch started so its duration can be measured."""
        self._epoch_start = time.perf_counter()

    def on_epoch_end(self, args, state, control, **kwargs):
        """Append the wall-clock duration (seconds) of the epoch that just finished."""
        started = getattr(self, "_epoch_start", None)
        # Fall back to 0.0 if on_epoch_begin was never observed.
        elapsed = time.perf_counter() - started if started is not None else 0.0
        logs['epoch_time'].append(elapsed)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Append the latest training loss and the evaluation metrics just computed."""
        metrics = metrics or {}
        # The most recent history entry with a 'loss' key is the last training log;
        # evaluation entries use 'eval_'-prefixed keys instead.
        latest_train_loss = next(
            (entry['loss'] for entry in reversed(state.log_history) if 'loss' in entry),
            None,
        )
        logs['train_loss'].append(latest_train_loss)
        logs['eval_loss'].append(metrics.get('eval_loss'))
        logs['accuracy'].append(metrics.get('eval_accuracy'))
        logs['precision'].append(metrics.get('eval_precision'))
        logs['recall'].append(metrics.get('eval_recall'))
        logs['f1'].append(metrics.get('eval_f1'))

In [None]:
# Compute_metrics function
def compute_metrics(pred):
    """Turn a Trainer prediction object into scalar classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, argmax-ed over axis 1).

    Returns:
        Dict with 'accuracy', 'precision', 'recall' and 'f1'; precision,
        recall and F1 are support-weighted averages, with undefined values
        reported as 0.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    weighted_scores = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=weighted_scores[0],
        recall=weighted_scores[1],
        f1=weighted_scores[2],
    )

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./final_model",
    logging_dir="./final_logs",
    # Optimisation settings; tuned values come from the best Optuna trial
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
    lr_scheduler_type=best_params["lr_scheduler_type"],
    warmup_steps=500,
    num_train_epochs=3,
    fp16=True,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the best accuracy at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Log every 10 steps
    logging_steps=10,
)



In [None]:
# Pretrained DistilBERT encoder with a freshly initialised 3-class classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 3).to(device)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
    # Register the metrics callback; without this it is never invoked and
    # the per-epoch lists in `logs` stay empty.
    callbacks = [LoggingCallback()]
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Start a Weights & Biases run for this session (prompts for an API key when not logged in).
# NOTE(review): this import sits outside the notebook's main import cell — consider moving it there.
import wandb
wandb.init(project="distilbert_training_testing")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mneel-1999[0m ([33mneel-1999-humber-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Fine-tune the model; evaluation and checkpoint cadence come from training_args
trainer.train()

# Score the held-out test split and summarise per-class performance
test_results = trainer.predict(test_dataset)
logs['confusion_matrix'] = confusion_matrix(
    test_results.label_ids,
    test_results.predictions.argmax(axis=-1),
)
report = classification_report(
    test_results.label_ids,
    test_results.predictions.argmax(axis=-1),
    digits=4,
)
print(report)



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2512,0.246979,0.907798,0.910932,0.907798,0.908563
2,0.1074,0.231465,0.917821,0.918294,0.917821,0.918007
3,0.1813,0.282536,0.918679,0.918486,0.918679,0.918552


              precision    recall  f1-score   support

           0     0.9629    0.9676    0.9652     10000
           1     0.9397    0.9291    0.9344     10000
           2     0.9609    0.9670    0.9639     10000

    accuracy                         0.9546     30000
   macro avg     0.9545    0.9546    0.9545     30000
weighted avg     0.9545    0.9546    0.9545     30000



In [None]:
# Save model and tokenizer
# Both are written into ./final_model, the same directory TrainingArguments uses as
# output_dir. The tuple of tokenizer file paths returned by the last call is the cell's output.
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json')

In [None]:
# Destination folder inside the mounted Google Drive project directory
drive_save_path = "/content/drive/My Drive/ADL Final Project/final_model"

# Save model and tokenizer to Google Drive
# (presumably so the artifacts outlive the Colab runtime — confirm this is the intended persistent copy)
model.save_pretrained(drive_save_path)
tokenizer.save_pretrained(drive_save_path)

('/content/drive/My Drive/ADL Final Project/final_model/tokenizer_config.json',
 '/content/drive/My Drive/ADL Final Project/final_model/special_tokens_map.json',
 '/content/drive/My Drive/ADL Final Project/final_model/vocab.txt',
 '/content/drive/My Drive/ADL Final Project/final_model/added_tokens.json')