In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV loaded
# later in this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')


# **Introduction**
This project focuses on sentiment analysis using the IMDB Movie Reviews dataset, which contains 50,000 movie reviews evenly split between positive and negative sentiment labels. The dataset is available at:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

We fine-tune a pretrained Transformer model, specifically DistilBERT (distilbert-base-uncased), using a parameter-efficient method called LoRA (Low-Rank Adaptation). LoRA lets us adapt large models efficiently by injecting small trainable low-rank matrices into the attention layers, which reduces the number of trainable parameters and the computational cost.

In [None]:
# ===============================
# 1. Install Required Libraries
# ===============================
!pip install transformers datasets evaluate peft --quiet

In [None]:
# ===============================
# 2. Load and Prepare Dataset
# ===============================
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Load and sample IMDB dataset
path = "/content/drive/MyDrive/DL/project3/IMDB Dataset.csv"
df = pd.read_csv(path)
# Work on a fixed-seed random subset of 10,000 reviews.
df = df.sample(n=10000, random_state=42).reset_index(drop=True)

# Map the text sentiment to integer class ids. `Series.map` turns every value
# that is missing from the mapping into NaN (and the column into floats), so
# validate here and fail loudly instead of failing deep inside training.
label_map = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(label_map)
unmapped = df['label'].isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment values in dataset: {bad_values}")
df['label'] = df['label'].astype(int)

# Hold out 20% of the sampled reviews for validation (fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

In [None]:
# ===============================
# 3. Tokenize Texts
# ===============================
# Checkpoint name used for the tokenizer here and for the model later on.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits are encoded with the same settings: truncation and padding
# enabled, with a maximum length of 256.
tokenize_options = {"truncation": True, "padding": True, "max_length": 256}
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# ===============================
# 4. Create Dataset Class
# ===============================
import torch

class IMDBDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name
            to a per-example indexable sequence.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under ``"labels"``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example["labels"] = torch.tensor(self.labels[idx])
        return example

# Wrap the tokenized training and validation splits, each paired with its labels.
train_dataset = IMDBDataset(train_encodings, train_labels)
val_dataset = IMDBDataset(val_encodings, val_labels)

In [None]:
# ===============================
# 5. Load Model with LoRA
# ===============================
from transformers import AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

# Sequence-classification model with two output labels, loaded from the same
# checkpoint name as the tokenizer.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# LoRA settings: rank 8, alpha 32, dropout 0.1, applied to the modules named
# "q_lin" and "v_lin".
lora_settings = {
    "task_type": TaskType.SEQ_CLS,
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "bias": "none",
    "target_modules": ["q_lin", "v_lin"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


In [None]:
# ===============================
# 6. Define Training Configuration
# ===============================
from transformers import TrainingArguments

# Evaluate, checkpoint and log once per epoch for three epochs; external
# experiment reporting is disabled.
training_args = TrainingArguments(
    output_dir="./bert-imdb",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    report_to="none",
    # Trainer warns that it cannot set label_names automatically for a
    # PEFT-wrapped model, so name the label field produced by the dataset
    # explicitly instead of relying on the fallback.
    label_names=["labels"],
)

In [None]:
# ===============================
# 7. Define Evaluation Metrics
# ===============================
import evaluate
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``; ``logits`` must
            support ``.argmax(axis=-1)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = logits.argmax(axis=-1)
    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="weighted")
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:
# ===============================
# 8. Train the Model
# ===============================
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases
    # and emits a FutureWarning; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.408,0.314374,0.874,0.873665
2,0.3039,0.284283,0.8825,0.882496
3,0.2864,0.281029,0.886,0.885992


TrainOutput(global_step=1500, training_loss=0.332762685139974, metrics={'train_runtime': 441.1292, 'train_samples_per_second': 54.406, 'train_steps_per_second': 3.4, 'total_flos': 1616872882176000.0, 'train_loss': 0.332762685139974, 'epoch': 3.0})

In [None]:
# ===============================
# 9. Evaluate the Model
# ===============================
# Run a final evaluation pass with the trainer and print every returned
# metric to four decimal places.
results = trainer.evaluate()
print("\n📊 Final Evaluation:")
for metric_name in results:
    print(f"{metric_name}: {results[metric_name]:.4f}")



📊 Final Evaluation:
eval_loss: 0.2810
eval_accuracy: 0.8860
eval_f1: 0.8860
eval_runtime: 15.1251
eval_samples_per_second: 132.2310
eval_steps_per_second: 8.2640
epoch: 3.0000


In [None]:
# ===============================
# 10. Predict Sample
# ===============================
def predict(text):
    """Classify the sentiment of ``text`` with the fine-tuned model.

    Args:
        text: Review text to classify.

    Returns:
        Tuple ``(label, probabilities)``: ``label`` is ``"positive"`` when
        class 1 has the highest probability and ``"negative"`` otherwise;
        ``probabilities`` is the softmax output of the first input row as a
        list ordered by class id (0 = negative, 1 = positive).
    """
    # Switch off dropout (the LoRA config uses lora_dropout=0.1) so the result
    # does not depend on whichever mode the model was last left in.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move inputs to the same device as the model
    with torch.no_grad():
        outputs = model(**inputs)
    # Probabilities of the first row only, matching what is returned below.
    probs = torch.softmax(outputs.logits, dim=-1)[0]
    # argmax over the class dimension of that row; a flat argmax over the
    # whole batch would yield an index outside {0, 1} for multi-row input.
    predicted_class = int(torch.argmax(probs, dim=-1))
    return "positive" if predicted_class == 1 else "negative", probs.tolist()

# Example prediction: classify one short sample sentence and print the
# predicted label together with the class probabilities.
print("\n🧪 Example Prediction:")
example_text = "i dont any word ."
pred, prob = predict(example_text)
print(f"Text: {example_text}\nPrediction: {pred}, Probabilities: {prob}")


🧪 Example Prediction:
Text: i dont any word .
Prediction: negative, Probabilities: [0.7518314719200134, 0.2481684684753418]
