In [1]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; the dataset CSV is read from a
# path underneath this mount point later in the notebook.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


# **Introduction**
In this project, we perform sentiment analysis on the IMDB Movie Reviews dataset, which contains 50,000 movie reviews labeled as either positive or negative. This notebook draws a random sample of 2,000 reviews from it and splits them 80/20 into training and validation sets. The dataset is publicly available at:
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

We fine-tune the pretrained Transformer model distilbert-base-uncased for this binary classification task. DistilBERT is a compressed version of BERT that retains over 95% of BERT’s accuracy, while being significantly more efficient — 40% smaller and 60% faster in inference.

Unlike BERT, which is computationally heavier and slower to train, DistilBERT provides a practical trade-off between performance and efficiency. This makes it especially useful for real-world applications where speed and resource constraints matter.

We tokenize input reviews using the DistilBERT tokenizer, truncating each review to at most 256 tokens and padding shorter ones, and then fine-tune the model for three epochs using the Hugging Face `Trainer`. The model is evaluated on the held-out validation split using accuracy and weighted F1-score to measure its performance in predicting the sentiment of unseen reviews.

In [None]:
# ===============================
# 1. Install Required Libraries
# ===============================
!pip install transformers evaluate



In [None]:
# ===============================
# 2. Load Dataset
# ===============================
import pandas as pd
from sklearn.model_selection import train_test_split

# Settings for this cell (make sure the path is correct or upload the CSV)
DATA_PATH = "/content/drive/MyDrive/DL/project3/IMDB Dataset.csv"
SAMPLE_SIZE = 2000    # number of reviews drawn from the CSV
VAL_FRACTION = 0.2    # share of the sample held out for validation
SEED = 42             # used for both sampling and splitting
LABEL_MAP = {'positive': 1, 'negative': 0}

# Load the IMDB CSV and draw a fixed random subsample
# (capped at the file size so a smaller CSV does not raise).
df = pd.read_csv(DATA_PATH)
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED).reset_index(drop=True)
df['label'] = df['sentiment'].map(LABEL_MAP)

# .map() yields NaN for any sentiment value missing from LABEL_MAP; fail here
# with a clear message instead of passing NaN labels on to training.
unmapped = df.loc[df['label'].isna(), 'sentiment'].unique()
if len(unmapped):
    raise ValueError(f"Unexpected sentiment values: {unmapped.tolist()}")
df['label'] = df['label'].astype(int)

# Split into training and validation texts/labels
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].tolist(), df['label'].tolist(), test_size=VAL_FRACTION, random_state=SEED
)

In [None]:
# ===============================
# 3. Tokenize with DistilBERT Tokenizer
# ===============================
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded; 'bert-base-uncased' is an alternative
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits share the same settings: truncate to at most 256 tokens and pad
# shorter reviews.
encode_options = dict(truncation=True, padding=True, max_length=256)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# ===============================
# 4. Create Torch Dataset
# ===============================
import torch

class IMDBDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to a sequence holding one entry per example.
        labels: Sequence of labels, one per example.

    Raises:
        ValueError: If an encoding field does not have exactly
            ``len(labels)`` entries.
    """

    def __init__(self, encodings, labels):
        # Catch misaligned inputs up front; otherwise the mismatch only shows
        # up later as an IndexError or as silently ignored examples.
        for key, val in encodings.items():
            if len(val) != len(labels):
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} entries "
                    f"but {len(labels)} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in an IMDBDataset
train_dataset, val_dataset = (
    IMDBDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [None]:
# ===============================
# 5. Load Pretrained Model
# ===============================
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification

# Load the same checkpoint the tokenizer was created from (`model_name`), so
# changing the checkpoint in the tokenizer cell cannot leave tokenizer and
# model out of sync. The Auto class selects the matching architecture.
# num_labels=2: negative (0) / positive (1).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ===============================
# 6. Training Configuration
# ===============================
from transformers import TrainingArguments, Trainer

# Evaluation, checkpointing and logging all happen once per epoch.
per_epoch_schedule = {
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
}

training_args = TrainingArguments(
    output_dir="./distilbert-base-uncased-imdb",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Reload the best checkpoint when training ends; no metric_for_best_model
    # is set, so the library default decides which checkpoint is "best".
    load_best_model_at_end=True,
    report_to="none",
    **per_epoch_schedule,
)

In [None]:
# ===============================
# 7. Define Metrics
# ===============================
import evaluate

# Metric objects from the `evaluate` library, used by compute_metrics
accuracy, f1 = (evaluate.load(metric_id) for metric_id in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation result.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis
    predicted_ids = logits.argmax(axis=-1)
    scoring_inputs = dict(predictions=predicted_ids, references=labels)
    accuracy_value = accuracy.compute(**scoring_inputs)["accuracy"]
    f1_value = f1.compute(average="weighted", **scoring_inputs)["f1"]
    return {"accuracy": accuracy_value, "f1": f1_value}

In [None]:

# ===============================
# 8. Train the Model
# ===============================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Trainer's `tokenizer=` argument is deprecated in favour of
    # `processing_class=`; pass the tokenizer under the new name so the
    # notebook keeps working once the old argument is removed.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5781,0.365769,0.845,0.845295
2,0.2814,0.418856,0.81,0.801918
3,0.1319,0.390446,0.85,0.850501


TrainOutput(global_step=150, training_loss=0.3304743258158366, metrics={'train_runtime': 80.4009, 'train_samples_per_second': 29.85, 'train_steps_per_second': 1.866, 'total_flos': 158960878387200.0, 'train_loss': 0.3304743258158366, 'epoch': 3.0})

In [None]:

# ===============================
# 9. Evaluate the Model
# ===============================
results = trainer.evaluate()

# Print every returned metric with four decimal places
print("\n📊 Final Evaluation:")
for metric_name in results:
    print(f"{metric_name}: {results[metric_name]:.4f}")


📊 Final Evaluation:
eval_loss: 0.3658
eval_accuracy: 0.8450
eval_f1: 0.8453
eval_runtime: 1.4134
eval_samples_per_second: 141.5040
eval_steps_per_second: 9.1980
epoch: 3.0000


In [None]:
# ===============================
# 10. Predict Sample
# ===============================
def predict(text, max_length=256):
    """Classify the sentiment of a single review with the fine-tuned model.

    Args:
        text: Review text to classify.
        max_length: Maximum number of tokens; longer inputs are truncated.
            Defaults to 256, the limit used when tokenizing the dataset.

    Returns:
        Tuple ``(label, probs)`` where ``label`` is ``"positive"`` or
        ``"negative"`` and ``probs`` is the list of class probabilities
        ordered ``[negative, positive]`` (label ids 0 and 1).
    """
    # Switch to inference mode explicitly (disables dropout) instead of relying
    # on an earlier trainer.evaluate() call having left the model in eval mode.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move inputs to the same device as the model
    with torch.no_grad():
        outputs = model(**inputs)
    # Take row 0 before argmax so the class index and the returned
    # probabilities always refer to the same (first) input sequence.
    probs = torch.softmax(outputs.logits, dim=-1)[0]
    predicted_class = int(torch.argmax(probs))
    return "positive" if predicted_class == 1 else "negative", probs.tolist()

# Example prediction
print("\n🧪 Example Prediction:")
example_text = "this man ."
pred, prob = predict(example_text)
print(f"Text: {example_text}\nPrediction: {pred}, Probabilities: {prob}")


🧪 Example Prediction:
Text: this man .
Prediction: positive, Probabilities: [0.3708809018135071, 0.6291190385818481]
