In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Ensure NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')

LABEL_MAPPING = {
    "Extremely Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive": 4
}

def preprocess_pandas(data):
    data['Sentence'] = data['Sentence'].str.lower()
    data['Sentence'] = data['Sentence'].replace(r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+', '', regex=True)
    data['Sentence'] = data['Sentence'].replace(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', regex=True)
    data['Sentence'] = data['Sentence'].str.replace(r'[^\w\s]', '', regex=True)
    data['Sentence'] = data['Sentence'].replace(r'\d', '', regex=True)

    stop_words = set(stopwords.words('english'))
    data['Sentence'] = data['Sentence'].apply(lambda x: " ".join(
        [word for word in word_tokenize(x) if word not in stop_words]
    ))
    return data

class SentimentANN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, dropout=0.3):
        super(SentimentANN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 5)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x

if __name__ == "__main__":
    # Load and preprocess training data
    train_df = pd.read_csv("Corona_NLP_train.csv", encoding='latin1', on_bad_lines='skip')
    train_df.columns = train_df.columns.str.strip()
    train_df = train_df[['OriginalTweet', 'Sentiment']]
    train_df.columns = ['Sentence', 'Class']
    train_df['Class'] = train_df['Class'].map(LABEL_MAPPING)
    train_df = train_df.dropna()
    train_df = preprocess_pandas(train_df)

    # Load and preprocess test data
    test_df = pd.read_csv("Corona_NLP_test.csv", encoding='latin1', on_bad_lines='skip')
    test_df.columns = test_df.columns.str.strip()
    test_df = test_df[['OriginalTweet', 'Sentiment']]
    test_df.columns = ['Sentence', 'Class']
    test_df['Class'] = test_df['Class'].map(LABEL_MAPPING)
    test_df = test_df.dropna()
    test_df = preprocess_pandas(test_df)

    # Train/Val Split
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_df['Sentence'].values.astype('U'),
        train_df['Class'].values.astype('int32'),
        test_size=0.375,  # 30% val
        random_state=0,
        shuffle=True
    )

    test_texts = test_df['Sentence'].values.astype('U')
    test_labels = test_df['Class'].values.astype('int32')

    # TF-IDF and Scaling
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=10000, max_df=0.5, use_idf=True, norm='l2')
    train_vec = vectorizer.fit_transform(train_texts).toarray()
    val_vec = vectorizer.transform(val_texts).toarray()
    test_vec = vectorizer.transform(test_texts).toarray()

    scaler = MinMaxScaler()
    train_vec = scaler.fit_transform(train_vec)
    val_vec = scaler.transform(val_vec)
    test_vec = scaler.transform(test_vec)

    # Tensor conversion
    train_x_tensor = torch.tensor(train_vec, dtype=torch.float32)
    train_y_tensor = torch.tensor(train_labels, dtype=torch.long)
    val_x_tensor = torch.tensor(val_vec, dtype=torch.float32)
    val_y_tensor = torch.tensor(val_labels, dtype=torch.long)
    test_x_tensor = torch.tensor(test_vec, dtype=torch.float32)
    test_y_tensor = torch.tensor(test_labels, dtype=torch.long)

    input_dim = train_x_tensor.shape[1]
    model = SentimentANN(input_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    batch_size = 16
    train_loader = DataLoader(TensorDataset(train_x_tensor, train_y_tensor), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(val_x_tensor, val_y_tensor), batch_size=batch_size)
    test_loader = DataLoader(TensorDataset(test_x_tensor, test_y_tensor), batch_size=batch_size)

    num_epochs = 50
    patience = 5
    best_val_loss = float('inf')
    best_model_state = None
    epochs_no_improve = 0

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}")

        model.eval()
        val_loss = 0
        val_preds, val_true = [], []
        
        # Validation loop
        with torch.no_grad():
            for val_x, val_y in val_loader:
                outputs = model(val_x)
                loss = criterion(outputs, val_y)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                val_preds.extend(predicted.tolist())
                val_true.extend(val_y.tolist())

        avg_val_loss = val_loss / len(val_loader)
        print(f"Validation Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state = model.state_dict().copy()
            epochs_no_improve = 0
            print("Best model updated.")
        else:
            epochs_no_improve += 1
            print(f"No improvement. Patience left: {patience - epochs_no_improve}")

        if epochs_no_improve >= patience:
            print("\nEarly stopping triggered.")
            break

    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)
        print("\nLoaded best model for testing.")

    # Testing
    model.eval()
    test_preds, test_true = [], []
    with torch.no_grad():
        for test_x, test_y in test_loader:
            outputs = model(test_x)
            _, predicted = torch.max(outputs, 1)
            test_preds.extend(predicted.tolist())
            test_true.extend(test_y.tolist())

    print("\nTest Results:")
    print("Accuracy:", accuracy_score(test_true, test_preds))
    print("Precision:", precision_score(test_true, test_preds, average='macro'))
    print("Recall:", recall_score(test_true, test_preds, average='macro'))
    print("\nClassification Report:\n", classification_report(test_true, test_preds, target_names=LABEL_MAPPING.keys()))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch [1/50], Training Loss: 1.2493
Validation Loss: 1.0753
Best model updated.
Epoch [2/50], Training Loss: 0.8784
Validation Loss: 1.0554
Best model updated.
Epoch [3/50], Training Loss: 0.6700
Validation Loss: 1.1124
No improvement. Patience left: 4
Epoch [4/50], Training Loss: 0.5142
Validation Loss: 1.2479
No improvement. Patience left: 3
Epoch [5/50], Training Loss: 0.4207
Validation Loss: 1.3743
No improvement. Patience left: 2
Epoch [6/50], Training Loss: 0.3406
Validation Loss: 1.4630
No improvement. Patience left: 1
Epoch [7/50], Training Loss: 0.3000
Validation Loss: 1.5615
No improvement. Patience left: 0

Early stopping triggered.

Loaded best model for testing.

Test Results:
Accuracy: 0.5194839389152185
Precision: 0.5327908842939264
Recall: 0.5344947882651091

Classification Report:
                     precision    recall  f1-score   support

Extremely Negative       0.60      0.52      0.56       592
          Negative       0.48      0.43      0.45      1041
         