In [23]:
!pip install torch==2.3.0 torchtext==0.18.0 torchdata==0.8.0



In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, ToTensor, PadTransform
import torch.nn.functional as F

In [25]:
# Read the CSV from the mounted Drive path and preview the first rows.
csv_path = '/content/drive/MyDrive/kaggle api/cleaned_text.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4


In [26]:
# Drop rows with a missing text or label *before* extracting the columns.
# Without this, `astype(str)` turns a missing text into the literal string
# "nan", which is then tokenized and trained on as if it were real input.
data_clean = data.dropna(subset=['Text', 'Label'])

# Plain Python lists: X holds the texts as strings, y the matching labels.
X = data_clean['Text'].astype(str).to_list()
y = data_clean['Label'].to_list()

In [27]:
# NOTE: `dropna()` returns a new DataFrame and the result is not assigned here,
# so `data` itself is left unchanged; this cell only displays the filtered frame.
data.dropna()

Unnamed: 0,Text,Label
0,feel really helpless heavy hearted,4
1,ive enjoyed able slouch relax unwind frankly n...,0
2,gave internship dmrg feeling distraught,4
3,dont know feel lost,0
4,kindergarten teacher thoroughly weary job take...,4
...,...,...
416804,feel like telling horny devils find site suite...,2
416805,began realize feeling agitated restless would ...,3
416806,feel curious previous early dawn time seek tro...,5
416807,feel becuase tyranical nature government el sa...,3


In [28]:
# Number of rows per class label, most frequent label first.
data.value_counts(subset="Label")

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
1,141067
0,121187
3,57317
4,47712
2,34554
5,14972


In [29]:
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data):
    """Yield the token list of each entry in ``data``, one entry at a time.

    Every entry is passed through ``str`` before tokenizing, so non-string
    values are tokenized by their string form.
    """
    for entry in data:
        tokens = tokenizer(str(entry))
        yield tokens


# Build the vocabulary over all texts in X, registering the special tokens.
special_tokens = ["<unk>", "<pad>", "<bos>", "<eos>"]
vocab = build_vocab_from_iterator(yield_tokens(X), specials=special_tokens)

# Tokens that are not in the vocabulary resolve to the <unk> index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

In [30]:
def text_to_indices(texts, vocab, tokenizer):
    """Encode each text as a 1-D ``torch.long`` tensor of vocabulary indices.

    Args:
        texts: Iterable of texts to encode.
        vocab: Mapping-like object; ``vocab[token]`` must return the token's index.
        tokenizer: Callable turning one text into an iterable of tokens.

    Returns:
        A list with one tensor per input text, in input order. A text that
        produces no tokens yields an empty tensor.
    """
    encoded = []
    for text in texts:
        token_ids = [vocab[token] for token in tokenizer(text)]
        encoded.append(torch.tensor(token_ids, dtype=torch.long))
    return encoded

In [31]:
# Encode every text once, then make a stratified 80/20 split with a fixed seed.
X_indices = text_to_indices(X, vocab, tokenizer)
X_train, X_test, y_train, y_test = train_test_split(
    X_indices,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [32]:
# Length (in tokens) of the longest encoded sequence across all texts.
max_len = max(map(len, X_indices))
print(max_len)

79


In [33]:
# torchtext transforms; both padding-aware transforms use the <pad> index.
pad_index = vocab["<pad>"]
text_transform = VocabTransform(vocab)
tensor_transform = ToTensor(padding_value=pad_index)
pad_transform = PadTransform(max_length=max_len, pad_value=pad_index)

In [34]:
class TextDataset(Dataset):
    """Index-aligned pairing of sequences with their labels.

    Item ``i`` is the tuple ``(sequences[i], labels[i])``.
    """

    def __init__(self, sequences, labels):
        # Both containers are stored as given; nothing is copied or validated.
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of sequences.
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return sequence, label

In [35]:
def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into batch tensors.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs as produced by the dataset.

    Returns:
        A tuple ``(padded_seqs, labels, lengths)``: the sequences right-padded
        with the ``<pad>`` index and stacked batch-first, the labels as a
        ``torch.long`` tensor, and the unpadded length of every sequence.
    """
    seqs, targets = zip(*batch)

    # Record the true lengths before padding hides them.
    lengths = torch.tensor(list(map(len, seqs)))

    pad_id = vocab["<pad>"]
    padded_seqs = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=pad_id)
    labels = torch.tensor(targets, dtype=torch.long)

    return padded_seqs, labels, lengths

In [36]:
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

batch_size = 64  # Increased batch size

# Both loaders share batch size and collation; only the training loader shuffles.
shared_loader_args = {"batch_size": batch_size, "collate_fn": collate_fn}
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_args)

In [37]:
# Check class distribution: occurrences of each label id 0..5 in y
class_counts = list(map(y.count, range(6)))
print(f"Class distribution: {class_counts}")

# Calculate class weights to handle imbalance:
# weight_c = total_samples / (num_classes * count_c), so rarer classes weigh more.
total_samples = len(y)
num_classes = len(class_counts)
class_weights = torch.tensor(
    [total_samples / (num_classes * count) for count in class_counts],
    dtype=torch.float,
)
print(f"Class weights: {class_weights}")

Class distribution: [121187, 141067, 34554, 57317, 47712, 14972]
Class weights: tensor([0.5732, 0.4924, 2.0104, 1.2120, 1.4560, 4.6399])


In [38]:
# Token count of the first training example.
first_sequence = train_dataset[0][0]
len(first_sequence)

4

In [39]:
# Number of examples in the training split.
len(train_dataset)

333447

In [40]:
class LSTMClassifier(nn.Module):
    """Bidirectional, multi-layer LSTM text classifier.

    Token ids are embedded, run through a packed bidirectional LSTM, and the
    last layer's final forward and backward hidden states are concatenated,
    passed through dropout and projected to ``output_dim`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied between LSTM layers (only when
            ``num_layers > 1``) and to the concatenated hidden state.
        pad_idx: Index of the padding token. When ``None`` (the default) it is
            looked up as ``vocab["<pad>"]`` from the notebook-level ``vocab``,
            which keeps existing call sites working unchanged.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            # Backward-compatible fallback to the notebook-level vocabulary.
            pad_idx = vocab["<pad>"]
        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True,
                            # Inter-layer dropout is only meaningful with more than one layer.
                            dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        # Bidirectional LSTM means hidden dimension is doubled
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: Batch-first tensor of token ids, shape ``(batch, seq_len)``.
            lengths: 1-D tensor with the unpadded length of every sequence.
                Zero-length entries are treated as a single padding token
                instead of raising inside ``pack_padded_sequence``.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with unnormalized logits.
        """
        # A batch made only of empty sequences has zero width; give it a single
        # padding column so there is something to embed and pack.
        if x.size(1) == 0:
            x = x.new_full((x.size(0), 1), self.pad_idx)

        # Embed the input
        embedded = self.embedding(x)

        # pack_padded_sequence needs the lengths on the CPU and rejects
        # zero-length samples, so clamp them to at least one step.
        safe_lengths = lengths.cpu().clamp(min=1)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, safe_lengths, batch_first=True, enforce_sorted=False
        )

        # Only the final hidden state is needed; per-step outputs are discarded.
        _, (hidden, _) = self.lstm(packed)

        # Concatenate the last layer's final forward (-2) and backward (-1) states
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(hidden)

        # Pass through the fully connected layer
        return self.fc(hidden)

In [41]:
# Model hyperparameters
embedding_dim = 128
hidden_dim = 256
num_layers = 2
dropout = 0.5
output_dim = 6
vocab_size = len(vocab)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [42]:
# Build the classifier and move its parameters to the target device.
model = LSTMClassifier(
    vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout
)
model = model.to(device)

# The per-class loss weights are moved to the same device as the model.
class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.AdamW(model.parameters(), weight_decay=1e-5, lr=0.0003)

In [43]:
# Training schedule
epochs = 30
patience = 3  # epochs without an accuracy improvement tolerated before stopping

# Early-stopping bookkeeping
counter = 0
best_accuracy = 0.0
best_val_loss = float('inf')  # NOTE(review): not read by the training loop in this notebook

In [44]:
from sklearn.metrics import classification_report

# Class names passed as `target_names` to every classification report.
target_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
checkpoint_path = 'best_emotion_model.pt'


def _evaluate(model, loader, criterion=None):
    """Run ``model`` over ``loader`` in eval mode without tracking gradients.

    Args:
        model: Module called as ``model(sequences, lengths)`` returning logits.
        loader: Iterable yielding ``(sequences, labels, lengths)`` batches.
        criterion: Optional loss; when given, the per-batch loss is averaged
            over the number of batches. When ``None`` the returned loss is 0.0.

    Returns:
        Tuple ``(mean_loss, accuracy, all_preds, all_labels)`` where the last
        two are flat Python lists of predicted and true label ids.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sequences, labels, lengths in loader:
            sequences, labels = sequences.to(device), labels.to(device)
            # `lengths` stays on the CPU: the model calls `.cpu()` on it before
            # packing, so a round trip through the GPU would be wasted work.
            outputs = model(sequences, lengths)
            if criterion is not None:
                loss_sum += criterion(outputs, labels).item()

            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Save for classification report
            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    return loss_sum / len(loader), correct / total, all_preds, all_labels


# NOTE(review): the same held-out split (`test_loader`) drives early stopping /
# checkpoint selection and the final report, so the final numbers are optimistic.
for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    for sequences, labels, lengths in train_loader:
        sequences, labels = sequences.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(sequences, lengths)
        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}")

    # Validation
    val_loss, accuracy, all_preds, all_labels = _evaluate(model, test_loader, criterion)
    print(f"Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

    # Print classification report
    print(classification_report(all_labels, all_preds, target_names=target_names))

    # Early stopping check: checkpoint on improvement, otherwise count towards patience
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), checkpoint_path)
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model and evaluate.
# `weights_only=True` restricts unpickling to tensors/state-dict contents, and
# `map_location` lets a GPU-saved checkpoint load on whatever device is in use.
model.load_state_dict(torch.load(checkpoint_path, map_location=device, weights_only=True))
_, final_accuracy, all_preds, all_labels = _evaluate(model, test_loader)

print(f"Final Test Accuracy: {final_accuracy:.4f}")
print("Final Classification Report:")
print(classification_report(all_labels, all_preds, target_names=target_names))

Epoch 1/30, Training Loss: 0.3841
Validation Loss: 0.1497, Accuracy: 0.9262
              precision    recall  f1-score   support

     sadness       0.99      0.94      0.96     24238
         joy       0.98      0.90      0.94     28214
        love       0.75      0.99      0.85      6911
       anger       0.92      0.94      0.93     11463
        fear       0.90      0.89      0.89      9542
    surprise       0.71      1.00      0.83      2994

    accuracy                           0.93     83362
   macro avg       0.87      0.94      0.90     83362
weighted avg       0.94      0.93      0.93     83362

Epoch 2/30, Training Loss: 0.1347
Validation Loss: 0.1200, Accuracy: 0.9362
              precision    recall  f1-score   support

     sadness       1.00      0.95      0.97     24238
         joy       1.00      0.91      0.95     28214
        love       0.76      1.00      0.86      6911
       anger       0.92      0.95      0.94     11463
        fear       0.90      0.90 