In [2]:
# Install required libraries
!pip install transformers[torch] datasets evaluate kaggle scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
import os
from google.colab import files

# Upload the kaggle.json API key
print("Please upload your kaggle.json file")
files.upload()

Please upload your kaggle.json file


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"gamos3476","key":"e15addf5c4419f151c0322270ae3d7e7"}'}

In [4]:
# Set up Kaggle directory and permissions
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip the dataset
!kaggle datasets download -d ilhamfp31/yelp-review-dataset
!unzip -o yelp-review-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/ilhamfp31/yelp-review-dataset
License(s): unknown
Downloading yelp-review-dataset.zip to /content
 88% 142M/162M [00:00<00:00, 1.48GB/s]
100% 162M/162M [00:00<00:00, 1.26GB/s]
Archive:  yelp-review-dataset.zip
  inflating: yelp_review_polarity_csv/readme.txt  
  inflating: yelp_review_polarity_csv/test.csv  
  inflating: yelp_review_polarity_csv/train.csv  


In [5]:
import pandas as pd

# 1. Load the pre-split data
train_df_raw = pd.read_csv('yelp_review_polarity_csv/train.csv', header=None)
test_df_raw = pd.read_csv('yelp_review_polarity_csv/test.csv', header=None)

# 2. Preprocess and format the data
# Column 0 is the label (1 or 2), Column 1 is the text.
train_df_raw.columns = ['label', 'text']
test_df_raw.columns = ['label', 'text']

# Create our dataframes
# We convert labels: 1 -> 0 (Negative), 2 -> 1 (Positive)
train_df = pd.DataFrame({
    'text': train_df_raw['text'],
    'label': train_df_raw['label'].apply(lambda x: 0 if x == 1 else 1)
})

test_df = pd.DataFrame({
    'text': test_df_raw['text'],
    'label': test_df_raw['label'].apply(lambda x: 0 if x == 1 else 1)
})

In [6]:
# 3. Verify the data
print(f"Total training reviews: {len(train_df)}")
print(f"Total test reviews: {len(test_df)}")

# We will use a smaller sample to avoid Colab time limits.
TRAIN_SAMPLE_SIZE = 50000
TEST_SAMPLE_SIZE = 5000

# Create the sample using 'random_state=42' so the sample is the same every time
train_df = train_df.sample(n=TRAIN_SAMPLE_SIZE, random_state=42)
test_df = test_df.sample(n=TEST_SAMPLE_SIZE, random_state=42)

print(f"\n- USING A SMALLER SAMPLE")
print(f"New training reviews: {len(train_df)}")
print(f"New test reviews: {len(test_df)}")

Total training reviews: 560000
Total test reviews: 38000

- USING A SMALLER SAMPLE
New training reviews: 50000
New test reviews: 5000


In [7]:
import torch
import numpy as np
import evaluate # Hugging Face's evaluation library
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# 1. Define Model and Tokenizer
MODEL_NAME = 'google/mobilebert-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Convert Pandas to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 3. Tokenize the Data
def tokenize_function(examples):
    # 'truncation=True' cuts reviews longer than the model's max length
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# 4. Define Evaluation Metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [9]:
# 5. Define Training Arguments
training_args = TrainingArguments(
    output_dir="yelp_sentiment_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)

# 6. Create and Run the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

print(" Starting Fine-Tuning --")
trainer.train()
print(" Fine-Tuning Complete-")

 Starting Fine-Tuning --


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1333,0.135368,0.959
2,0.0956,0.146506,0.9608


 Fine-Tuning Complete-


In [10]:
from transformers import pipeline
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

# Load a larger, general-purpose model for zero-shot classification
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device=0) # Use 0 for GPU

# Define labels
candidate_labels = ["positive", "negative"]

# Running on all 5000 test reviews takes a long time, so we ensure we use the sampled set
test_sample = test_df.copy() # reusing the 5000 sample from earlier

predictions_large_model = []
true_labels_large_model = []

print(f" Running Zero-Shot Evaluation on {len(test_sample)} reviews -")

Device set to use cuda:0


 Running Zero-Shot Evaluation on 5000 reviews -


In [11]:
# Loop through the sample
for index, row in tqdm(test_sample.iterrows(), total=test_sample.shape[0]):
    text = row['text']
    true_label_int = row['label']

    # Get the model's prediction
    result = classifier(
        text,
        candidate_labels,
        hypothesis_template="The sentiment of this review is {}."
    )

    # The result['labels'][0] is the label with the highest score
    predicted_label_str = result['labels'][0]

    # Convert string "positive"/"negative" back to 0/1
    predicted_label_int = 1 if predicted_label_str == "positive" else 0

    predictions_large_model.append(predicted_label_int)
    true_labels_large_model.append(true_label_int)

  0%|          | 0/5000 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [17]:
large_model_accuracy = accuracy_score(true_labels_large_model, predictions_large_model)

print(f"\n Evaluating Large Zero-Shot Model (BART-Large) --")
print(f"Test Set (Sample) Accuracy: {large_model_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels_large_model, predictions_large_model, target_names=['Negative', 'Positive']))

print(" Final Experiment Results --\n")
print(f"Small, Fine-Tuned Model (MobileBERT):")
print(f" Accuracy on {len(test_df)} reviews: {small_model_accuracy:.4f}")
print("\n")
print(f"Large, Zero-Shot Model (BART-Large):")
print(f" Accuracy on {len(test_sample)} reviews: {large_model_accuracy:.4f}")
print("\n")

print("Conclusion")
if small_model_accuracy > large_model_accuracy:
    print("Hypothesis confirmed: The small model fine-tuned on Yelp data")
    print("outperformed the large, general-purpose model.")
else:
    print("Hypothesis not confirmed: The large, general-purpose model")
    print("was more accurate than the small, fine-tuned model.")


 Evaluating Large Zero-Shot Model (BART-Large) --
Test Set (Sample) Accuracy: 0.9568

Classification Report:
 Final Experiment Results --

Small, Fine-Tuned Model (MobileBERT):


NameError: name 'small_model_accuracy' is not defined

In [14]:
import torch
import numpy as np
import pandas as pd
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import classification_report
from datasets import Dataset
import evaluate, gc

# Clear memory
del model, trainer, classifier
torch.cuda.empty_cache()
gc.collect()

# Reuse the data from earlier steps
# Ensure labels are int and drop NaNs
train_df["label"] = train_df["label"].astype(int)
test_df["label"] = test_df["label"].astype(int)

# Setup Model + Tokenizer
MODEL_NAME = "google/mobilebert-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Convert to HF datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Tokenize (Reduced max_length for LoRA efficiency)
def tokenize_function(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

print("Loading MobileBERT...")
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Correct LoRA modules for MobileBERT
LORA_TARGET_MODULES = [
    "attention.self.query",
    "attention.self.key",
    "attention.self.value",
    "attention.output.dense"
]

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    modules_to_save=["classifier"] # ensures classifier head is trained
)

peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Loading MobileBERT...


Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 541,698 || all params: 25,124,612 || trainable%: 2.1560


In [15]:
# Training Settings (Optimized for Colab T4)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

training_args = TrainingArguments(
    output_dir="mobilebert_lora_yelp",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=False, # IMPORTANT: MobileBERT becomes NaN with fp16!
    logging_steps=50,
    warmup_ratio=0.1,
    optim="adamw_torch",
    report_to="none",
    dataloader_pin_memory=True
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

print("Training started...")
trainer.train()
print("Training finished.")

Training started...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.177,0.154786,0.9486
2,0.1385,0.145392,0.9512
3,0.1105,0.141881,0.9532


Training finished.


In [None]:
# Evaluation
results = trainer.evaluate()
print("\nEvaluation:", results)

preds = trainer.predict(tokenized_test).predictions
pred_labels = np.argmax(preds, axis=1)

print("\nClassification Report:")
#print(classification_report(test_df["label"], pred_labels, target_names=["Negative", "Positive"]))