In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import pandas as pd
from copy import deepcopy


# Free any unused cached GPU memory before the run starts
torch.cuda.empty_cache()

# Sentiment names listed from most negative to most positive; each name's
# position in the tuple is the integer class id it is mapped to.
LABEL_MAPPING = {
    sentiment: class_id
    for class_id, sentiment in enumerate((
        "Extremely Negative",
        "Negative",
        "Neutral",
        "Positive",
        "Extremely Positive",
    ))
}

class SentimentDataset(Dataset):
    """Dataset that tokenizes one sentence per item, lazily on access.

    Args:
        sentences: Indexable collection of texts; each item is passed
            through ``str()`` before tokenization.
        labels: Indexable collection of class ids, aligned with
            ``sentences`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``
            whose result is indexable by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences, self.labels = sentences, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenized sentence and its label for position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer
            outputs with dimension 0 squeezed away) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        text = str(self.sentences[idx])
        target = self.labels[idx]

        tokenizer_kwargs = {
            'max_length': self.max_len,
            'padding': "max_length",
            'truncation': True,
            'return_tensors': "pt",
        }
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # squeeze(0) drops the leading dimension of the tokenizer output
        # (presumably the size-1 batch axis added by return_tensors="pt").
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

def preprocess_pandas(data):
    """Lowercase the text and encode sentiment names as integer class ids.

    The work is done on a copy, so the DataFrame passed in is never
    modified. This also avoids pandas' SettingWithCopyWarning when the
    caller passes a column slice of a larger frame.

    Args:
        data: DataFrame with a string column ``'Sentence'`` and a column
            ``'Class'`` holding keys of ``LABEL_MAPPING``.

    Returns:
        A new DataFrame in which ``'Sentence'`` is lowercased, ``'Class'``
        is replaced by its ``LABEL_MAPPING`` value, and every row containing
        a missing value is dropped. Rows whose label is not a key of
        ``LABEL_MAPPING`` become NaN under ``map`` and are therefore dropped
        as well.
    """
    data = data.copy()
    data['Sentence'] = data['Sentence'].str.lower()
    data['Class'] = data['Class'].map(LABEL_MAPPING)
    return data.dropna()

def _load_tweets(csv_path):
    """Read one Corona_NLP CSV and return it as preprocessed Sentence/Class rows."""
    frame = pd.read_csv(csv_path, delimiter=',', encoding='latin1', on_bad_lines='skip')
    frame.columns = frame.columns.str.strip()
    # Copy the two-column selection so renaming and preprocessing operate on
    # an independent frame rather than a slice of the full CSV.
    frame = frame[['OriginalTweet', 'Sentiment']].copy()
    frame.columns = ['Sentence', 'Class']
    return preprocess_pandas(frame)


def _run_epoch(model, loader, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    With an optimizer, the model is put in train mode and its parameters are
    updated after every batch. Without one, the model is put in eval mode and
    gradient tracking is disabled.
    """
    training = optimizer is not None
    model.train(training)
    running_loss = 0.0

    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            if training:
                outputs.loss.backward()
                optimizer.step()
            running_loss += outputs.loss.item()

    return running_loss / len(loader)


if __name__ == "__main__":
    # Resolve the device once and reuse it for the model and every batch
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Load and preprocess dataset
    data = _load_tweets("Corona_NLP_train.csv")

    # Split data (stratified so both splits keep the class proportions)
    train_sentences, val_sentences, train_labels, val_labels = train_test_split(
        data['Sentence'].values,
        data['Class'].values,
        test_size=0.2,
        random_state=0,
        stratify=data['Class'],
    )

    # Load BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create datasets and data loaders
    max_len = 128
    batch_size = 16
    train_dataset = SentimentDataset(train_sentences, train_labels, tokenizer, max_len)
    val_dataset = SentimentDataset(val_sentences, val_labels, tokenizer, max_len)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Load BERT model with one output per sentiment class
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(LABEL_MAPPING)
    )
    model = model.to(device)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Training loop
    num_epochs = 4
    best_state = None
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        total_loss = _run_epoch(model, train_loader, device, optimizer)
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss:.4f}")

        # Validation
        val_loss = _run_epoch(model, val_loader, device)
        print(f"Validation Loss: {val_loss:.4f}")

        # Save the best model: snapshot only the weights, on the CPU, so no
        # second full model has to live on the training device.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }
            print(f"Saved Best Model at Epoch {epoch+1}")

    # Restore the best weights. If no epoch ever improved on the initial
    # bound (e.g. the validation loss was NaN), keep the final weights
    # instead of failing on a missing model.
    if best_state is not None:
        model.load_state_dict(best_state)
    best_model = model

    # Load and preprocess test data
    test_data = _load_tweets("Corona_NLP_test.csv")

    test_sentences = test_data['Sentence'].values
    test_labels = test_data['Class'].values
    test_dataset = SentimentDataset(test_sentences, test_labels, tokenizer, max_len)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Evaluate the best model on test data
    best_model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = best_model(input_ids, attention_mask=attention_mask)
            predicted = outputs.logits.argmax(dim=1)
            all_preds.extend(predicted.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # be moved to the device.
            all_labels.extend(batch['label'].tolist())

    print("\nTest Results:")
    print("Accuracy:", accuracy_score(all_labels, all_preds))
    print("Precision:", precision_score(all_labels, all_preds, average='macro'))
    print("Recall:", recall_score(all_labels, all_preds, average='macro'))
    # Pass the class ids explicitly so the report still lines up with the
    # names when some class does not occur in the test split.
    print(
        "\nClassification Report:\n",
        classification_report(
            all_labels,
            all_preds,
            labels=list(LABEL_MAPPING.values()),
            target_names=list(LABEL_MAPPING),
        ),
    )


  from .autonotebook import tqdm as notebook_tqdm
2025-04-15 14:17:44.903603: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744719464.922940  274139 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744719464.928796  274139 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744719464.944188  274139 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744719464.944207  274139 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744719464.944209  274139

Epoch [1/4], Training Loss: 0.7701
Validation Loss: 0.5242
Saved Best Model at Epoch 1
Epoch [2/4], Training Loss: 0.3761
Validation Loss: 0.4169
Saved Best Model at Epoch 2
Epoch [3/4], Training Loss: 0.2621
Validation Loss: 0.4051
Saved Best Model at Epoch 3
Epoch [4/4], Training Loss: 0.1934
Validation Loss: 0.3910
Saved Best Model at Epoch 4

Test Results:
Accuracy: 0.8464981569246972
Precision: 0.8509993687856623
Recall: 0.8506496378254369

Classification Report:
                     precision    recall  f1-score   support

Extremely Negative       0.89      0.86      0.88       592
          Negative       0.85      0.83      0.84      1041
           Neutral       0.82      0.87      0.85       619
          Positive       0.82      0.83      0.82       947
Extremely Positive       0.88      0.86      0.87       599

          accuracy                           0.85      3798
         macro avg       0.85      0.85      0.85      3798
      weighted avg       0.85      0.85     