In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import classification_report
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import os
import zipfile
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

In [3]:
# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Load dataset

In [4]:
# Extract archive.zip
zip_path = "/content/drive/MyDrive/CMI/Sem 4/AML/archive.zip"
extract_dir = "dataset"

if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("Dataset extracted!")

Dataset extracted!


In [5]:
# Load the dataset
train_df = pd.read_csv("dataset/train.csv", encoding="ISO-8859-1").dropna()
test_df = pd.read_csv("dataset/test.csv", encoding="ISO-8859-1").dropna()

# Rename columns if necessary
train_df = train_df.rename(columns={"text": "review", "sentiment": "label"})
test_df = test_df.rename(columns={"text": "review", "sentiment": "label"})

# Convert labels to integers: Negative (0), Neutral (1), Positive (2)
label_map = {"negative": 0, "neutral": 1, "positive": 2}
train_df["label"] = train_df["label"].map(label_map)
test_df["label"] = test_df["label"].map(label_map)

In [None]:
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples["review"], padding="max_length", truncation=True)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create datasets
train_dataset = SentimentDataset(train_df["review"].tolist(), train_df["label"].tolist(), tokenizer)
test_dataset = SentimentDataset(test_df["review"].tolist(), test_df["label"].tolist(), tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

## Load Pretrained DistilBERT Model

In [7]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

## Training the model

In [9]:
# Training loop
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, preds = torch.max(outputs.logits, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        progress_bar.set_postfix(loss=running_loss/total, acc=100 * correct/total)

    train_acc = 100 * correct / total
    print(f"\nEpoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.2f}%")

print("\nTraining complete!")

Epoch 1/3: 100%|██████████| 3435/3435 [05:52<00:00,  9.75it/s, acc=76.3, loss=0.0715]



Epoch [1/3], Loss: 0.5720, Train Accuracy: 76.34%


Epoch 2/3: 100%|██████████| 3435/3435 [05:50<00:00,  9.79it/s, acc=83.4, loss=0.0525]



Epoch [2/3], Loss: 0.4197, Train Accuracy: 83.42%


Epoch 3/3: 100%|██████████| 3435/3435 [05:50<00:00,  9.79it/s, acc=89, loss=0.0359]


Epoch [3/3], Loss: 0.2869, Train Accuracy: 89.03%

Training complete!





## Evaluation

In [10]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].cpu().numpy()

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

# Print classification report
print("\nClassification Report:\n", classification_report(all_labels, all_preds, target_names=label_map.keys()))

Evaluating: 100%|██████████| 442/442 [00:13<00:00, 32.77it/s]


Classification Report:
               precision    recall  f1-score   support

    negative       0.75      0.82      0.78      1001
     neutral       0.77      0.71      0.74      1430
    positive       0.82      0.84      0.83      1103

    accuracy                           0.78      3534
   macro avg       0.78      0.79      0.78      3534
weighted avg       0.78      0.78      0.78      3534




