In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertModel
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Constants
LABEL_MAPPING = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}
NUM_LABELS = len(LABEL_MAPPING)
BATCH_SIZE = 16
EPOCHS = 4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx]
        }

# Classifier Head
class SentimentClassifier(nn.Module):
    def __init__(self, hidden_size=768, num_classes=NUM_LABELS):
        super(SentimentClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token representation
        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

# Load & preprocess
def load_and_prepare_data(file_path):
    df = pd.read_csv(file_path, encoding='latin1', on_bad_lines='skip')
    df.columns = df.columns.str.strip()
    df = df[['OriginalTweet', 'Sentiment']]
    df.columns = ['text', 'label']
    df['label'] = df['label'].map(LABEL_MAPPING)
    return df.dropna()

train_df = load_and_prepare_data("Corona_NLP_train.csv")
test_df = load_and_prepare_data("Corona_NLP_test.csv")

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df["text"].tolist(),
    train_df["label"].tolist(),
    test_size=0.375,
    random_state=42
)

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)
test_dataset = SentimentDataset(test_df["text"].tolist(), test_df["label"].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Model, Loss, Optimizer
model = SentimentClassifier().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training Loop
best_val_loss = float('inf')
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    val_preds, val_true = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = model.state_dict().copy()
        print("Best model saved.")

# Testing
model.load_state_dict(best_model_state)
model.eval()
test_preds, test_true = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_true.extend(labels.cpu().numpy())

print("\nTest Results:")
print("Accuracy:", accuracy_score(test_true, test_preds))
print("Precision:", precision_score(test_true, test_preds, average='macro'))
print("Recall:", recall_score(test_true, test_preds, average='macro'))
print("\nClassification Report:\n", classification_report(test_true, test_preds, target_names=LABEL_MAPPING.keys()))


  from .autonotebook import tqdm as notebook_tqdm
2025-04-24 15:55:26.607679: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745502926.627521     154 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745502926.633646     154 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745502926.649649     154 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745502926.649667     154 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745502926.649669     154

Epoch 1/4, Train Loss: 0.8530
Validation Loss: 0.5817
Best model saved.
Epoch 2/4, Train Loss: 0.4606
Validation Loss: 0.4424
Best model saved.
Epoch 3/4, Train Loss: 0.3064
Validation Loss: 0.4381
Best model saved.
Epoch 4/4, Train Loss: 0.2144
Validation Loss: 0.4575

Test Results:
Accuracy: 0.8309636650868878
Precision: 0.8416191977029783
Recall: 0.8347038076661489

Classification Report:
                     precision    recall  f1-score   support

Extremely Negative       0.87      0.82      0.85       592
          Negative       0.82      0.82      0.82      1041
           Neutral       0.92      0.84      0.88       619
          Positive       0.78      0.81      0.79       947
Extremely Positive       0.83      0.88      0.85       599

          accuracy                           0.83      3798
         macro avg       0.84      0.83      0.84      3798
      weighted avg       0.83      0.83      0.83      3798

