In [29]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics import accuracy_score

In [30]:
import nltk
nltk.download('punkt')

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
df = pd.read_csv('/content/all-data.csv', encoding='ISO-8859-1',names=['sentiment', 'text'])

print(df.shape)
print(df.head())

#drop rows with NaN in sentiment and text columns
df = df.dropna(subset=['sentiment', 'text'])

#convert sentiment to numerical labels
df['sentiment'] = df['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

#check if dataset size is sufficient
if len(df) == 0:
    raise ValueError("After preprocessing, the dataset is empty. Please check the preprocessing steps.")


(4846, 2)
  sentiment                                               text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...


In [32]:
#preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', str(text), re.I|re.A)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if not word in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df['text'] = df['text'].apply(preprocess_text)

#check if any text is empty after preprocessing
empty_texts = df[df['text'] == '']
if not empty_texts.empty:
    print(f"There are {len(empty_texts)} rows with empty text after preprocessing.")
    print(empty_texts.head())

In [33]:
#tokenization and padding
class Tokenizer:
    def __init__(self, max_features, maxlen):
        self.max_features = max_features
        self.maxlen = maxlen
        self.word_index = {}
        self.index_word = {}

    def fit_on_texts(self, texts):
        word_counts = Counter()
        for text in texts:
            words = text.split()
            word_counts.update(words)
        common_words = word_counts.most_common(self.max_features)
        self.word_index = {word: idx + 1 for idx, (word, count) in enumerate(common_words)}
        self.index_word = {idx + 1: word for word, idx in self.word_index.items()}
        return self

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            words = text.split()
            sequence = [self.word_index.get(word, 0) for word in words][:self.maxlen]
            sequences.append(sequence)
        return sequences

    def pad_sequences(self, sequences):
        padded_sequences = np.zeros((len(sequences), self.maxlen))
        for i, sequence in enumerate(sequences):
            if len(sequence) == 0:
                continue  # skip empty sequences
            padded_sequence = sequence[:self.maxlen]
            padded_sequences[i, -len(padded_sequence):] = padded_sequence
        return padded_sequences.astype(int)

max_features = 10000
maxlen = 200

tokenizer = Tokenizer(max_features, maxlen)
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])
X = tokenizer.pad_sequences(X)

In [59]:
#convert labels to one-hot encoding
y = np.zeros((len(df), 3))
y[np.arange(len(df)), df['sentiment'].astype(int)] = 1


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

batch_size = 32

train_size = len(X_train) - (len(X_train) % batch_size)
test_size = len(X_test) - (len(X_test) % batch_size)

X_train = X_train[:train_size]
y_train = y_train[:train_size]
X_test = X_test[:test_size]
y_test = y_test[:test_size]

#check the final sizes after truncation
print(f"Final Training Dataset Shape: {X_train.shape}, {y_train.shape}")
print(f"Final Testing Dataset Shape: {X_test.shape}, {y_test.shape}")


Final Training Dataset Shape: (3872, 200), (3872, 3)
Final Testing Dataset Shape: (960, 200), (960, 3)


In [67]:
#custom Dataset class
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.float)

train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [70]:
#define the RNN model
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, no_layers, drop_prob=0.5):
        super(RNNClassifier, self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, no_layers, dropout=drop_prob, batch_first=True)

        self.dropout = nn.Dropout(drop_prob)

        self.fc = nn.Linear(hidden_dim , output_dim)

        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, x, hidden):
        batch_size = x.size(0)

        embeds = self.embedding(x)
        rnn_out, hidden = self.rnn(embeds, hidden)

        attention_w = torch.softmax(self.attention(rnn_out).squeeze(), dim=-1).unsqueeze(-1)
        attention_out = torch.sum(attention_w * rnn_out, dim=1)

        out = self.dropout(attention_out)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.no_layers , batch_size, self.hidden_dim).zero_())
        return hidden

#hyperparameters
vocab_size = max_features + 1
embedding_dim = 128
hidden_dim = 256
output_dim = 3
no_layers = 2

model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, no_layers)
print(model)


RNNClassifier(
  (embedding): Embedding(10001, 128)
  (rnn): RNN(128, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (attention): Linear(in_features=256, out_features=1, bias=True)
)


In [72]:
#loss and optimization functions
lr = 0.001
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

#training params
epochs = 20
patience = 5
best_val_loss = np.inf
counter = 0
best_model = None

#train the model
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    h = model.init_hidden(batch_size).to(device)
    train_acc = 0.0
    model.train()

    for inputs, labels in tqdm(train_loader):
        # Detach hidden state to prevent backpropagation through the entire training history
        h = h.detach()
        inputs, labels = inputs.to(device), labels.to(device)

        model.zero_grad()

        output, h = model(inputs, h)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        pred = torch.round(torch.sigmoid(output))
        correct_tensor = pred.eq(labels.view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        train_acc += np.mean(correct)

        train_loss += loss.item() * inputs.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    train_acc = train_acc / len(train_loader)

    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    h = model.init_hidden(batch_size)

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            h = model.init_hidden(batch_size).to(device)
            inputs, labels = inputs.to(device), labels.to(device)

            output, h = model(inputs, h)
            loss = criterion(output, labels)

            pred = torch.round(torch.sigmoid(output))
            correct_tensor = pred.eq(labels.view_as(pred))
            correct = np.squeeze(correct_tensor.cpu().numpy())
            val_acc += np.mean(correct)

            val_loss += loss.item() * inputs.size(0)

    val_loss = val_loss / len(test_loader.dataset)
    val_acc = val_acc / len(test_loader)

    print(f'Epoch: {epoch+1}/{epochs} \tTraining Loss: {train_loss:.6f} \tTraining Acc: {train_acc:.6f} \tValidation Loss: {val_loss:.6f} \tValidation Acc: {val_acc:.6f}')

    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        best_model = model.state_dict()
    else:
        counter += 1
        if counter >= patience:
            print(f'Early stopping after {epoch+1} epochs.')
            break

100%|██████████| 121/121 [00:35<00:00,  3.37it/s]
100%|██████████| 30/30 [00:02<00:00, 12.62it/s]


Epoch: 1/20 	Training Loss: 0.549698 	Training Acc: 0.745093 	Validation Loss: 0.547090 	Validation Acc: 0.740625


100%|██████████| 121/121 [00:35<00:00,  3.38it/s]
100%|██████████| 30/30 [00:02<00:00, 12.67it/s]


Epoch: 2/20 	Training Loss: 0.480154 	Training Acc: 0.779614 	Validation Loss: 0.477499 	Validation Acc: 0.773264


100%|██████████| 121/121 [00:34<00:00,  3.54it/s]
100%|██████████| 30/30 [00:02<00:00, 11.34it/s]


Epoch: 3/20 	Training Loss: 0.428639 	Training Acc: 0.809143 	Validation Loss: 0.457192 	Validation Acc: 0.795486


100%|██████████| 121/121 [00:35<00:00,  3.41it/s]
100%|██████████| 30/30 [00:02<00:00, 12.74it/s]


Epoch: 4/20 	Training Loss: 0.379892 	Training Acc: 0.843233 	Validation Loss: 0.462850 	Validation Acc: 0.800000


100%|██████████| 121/121 [00:33<00:00,  3.58it/s]
100%|██████████| 30/30 [00:03<00:00,  7.99it/s]


Epoch: 5/20 	Training Loss: 0.326863 	Training Acc: 0.873020 	Validation Loss: 0.483905 	Validation Acc: 0.793403


100%|██████████| 121/121 [00:34<00:00,  3.50it/s]
100%|██████████| 30/30 [00:02<00:00, 12.50it/s]


Epoch: 6/20 	Training Loss: 0.278621 	Training Acc: 0.894886 	Validation Loss: 0.504306 	Validation Acc: 0.803125


100%|██████████| 121/121 [00:34<00:00,  3.46it/s]
100%|██████████| 30/30 [00:03<00:00,  9.05it/s]


Epoch: 7/20 	Training Loss: 0.208258 	Training Acc: 0.931904 	Validation Loss: 0.515714 	Validation Acc: 0.799306


100%|██████████| 121/121 [00:34<00:00,  3.53it/s]
100%|██████████| 30/30 [00:02<00:00, 12.44it/s]

Epoch: 8/20 	Training Loss: 0.189063 	Training Acc: 0.941202 	Validation Loss: 0.526605 	Validation Acc: 0.804167
Early stopping after 8 epochs.



