In [None]:
pip install pandas scikit-learn torch transformers tqdm

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd

In [None]:
# Load the preprocessed train/test splits. Paths are relative, so both CSV files
# must sit in the notebook's working directory.
train_df = pd.read_csv("processed_train.csv")
test_df = pd.read_csv("processed_test.csv")

In [None]:
# Report per-column null counts for both processed files (train first, then test).
for position, (frame_name, frame) in enumerate((("train_df", train_df), ("test_df", test_df))):
    # Every report after the first is separated from the previous one by a blank line.
    separator = "\n" if position else ""
    print(f"{separator}Null values in {frame_name}:")
    print(frame.isnull().sum())

Null values in train_df:
sentiment           0
processed_review    0
dtype: int64

Null values in test_df:
sentiment           0
processed_review    0
dtype: int64


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519951 entries, 0 to 519950
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   sentiment         519951 non-null  int64 
 1   processed_review  519951 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39998 entries, 0 to 39997
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentiment         39998 non-null  int64 
 1   processed_review  39998 non-null  object
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
None


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled training data for validation (90% stays for training).
# The seed is fixed so the partition is identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_df["processed_review"],
    train_df["sentiment"],
    random_state=42,
    test_size=0.1,
)

# The test file is used as-is for the final evaluation.
X_test, y_test = test_df["processed_review"], test_df["sentiment"]

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 467955, Validation size: 51996, Test size: 39998


In [None]:
import torch

# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Running on: {device}")

Running on: cuda


In [None]:
!pip install torch==2.0.0 torchtext==0.15.1

[0m

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm import tqdm

# Tokenizer shared by vocabulary building and by the encoding step below.
text_tokenizer = get_tokenizer("basic_english")

# Resolve the compute device again so this cell also works when run on its own.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Tokenization Running on: {device}")

def yield_tokens(data):
    """Lazily yield the token list of every text in `data`, showing a progress bar."""
    progress = tqdm(data, desc="Building Vocabulary", unit="sentence")
    yield from map(text_tokenizer, progress)

# Build the vocabulary from the training texts only (validation/test text is not
# seen here), registering "<unk>" and "<pad>" as special tokens.
vocab = build_vocab_from_iterator(yield_tokens(X_train), specials=["<unk>", "<pad>"])
# Make the "<unk>" index the vocabulary's default index, i.e. the fallback used for
# tokens that are not in the vocabulary.
vocab.set_default_index(vocab["<unk>"])

def tokenize_texts(texts, vocab, tokenizer, max_len=256, batch_size=512):
    """Encode texts as a fixed-width tensor of vocabulary indices.

    Each text is tokenized, mapped through `vocab`, and then truncated or
    right-padded with the "<pad>" index to exactly `max_len` entries. The work is
    done in slices of `batch_size` texts and every slice is moved to the
    module-level `device` (GPU or CPU, whichever was selected).

    Args:
        texts: Sliceable sequence of strings (e.g. a list or a pandas Series).
        vocab: Mapping from token to integer index; must contain "<pad>".
        tokenizer: Callable turning one string into an iterable of tokens.
        max_len: Number of indices kept per text.
        batch_size: Number of texts encoded per slice.

    Returns:
        A `torch.long` tensor of shape (len(texts), max_len) located on `device`.
        For empty `texts` the result has shape (0, max_len).
    """
    from itertools import islice

    pad_index = vocab["<pad>"]  # looked up once instead of once per sequence
    encoded_batches = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Tokenizing", unit="batch"):
        rows = []
        for text in texts[start:start + batch_size]:
            # Cut the token stream at max_len before the vocabulary lookup so that
            # over-long texts do not pay for indices that would be discarded anyway.
            ids = [vocab[token] for token in islice(tokenizer(text), max_len)]
            ids.extend([pad_index] * (max_len - len(ids)))
            rows.append(ids)

        encoded_batches.append(torch.tensor(rows, dtype=torch.long).to(device))

    if not encoded_batches:
        # torch.cat rejects an empty list; return a correctly shaped empty tensor.
        return torch.empty((0, max_len), dtype=torch.long, device=device)

    return torch.cat(encoded_batches, dim=0)

# Encode the three splits, in train/validation/test order, with the training vocabulary.
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(split_texts, vocab, text_tokenizer)
    for split_texts in (X_train, X_val, X_test)
)

print("Tokenization completed and moved to GPU!")

Tokenization Running on: cuda


Building Vocabulary: 100%|██████████| 467955/467955 [00:09<00:00, 48055.67sentence/s]
Tokenizing: 100%|██████████| 914/914 [00:27<00:00, 33.51batch/s]
Tokenizing: 100%|██████████| 102/102 [00:02<00:00, 35.48batch/s]
Tokenizing: 100%|██████████| 79/79 [00:02<00:00, 37.87batch/s]

Tokenization completed and moved to GPU!





In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

class YelpDataset(Dataset):
    """Map-style dataset pairing pre-encoded reviews with their labels.

    Args:
        encodings: Indexable collection of already tokenized and padded sequences,
            one row per example (e.g. the tensor returned by `tokenize_texts`).
        labels: Labels aligned row-for-row with `encodings`. A pandas Series (its
            `.values` array is used) or any array-like accepted by `torch.tensor`.

    Raises:
        ValueError: If `encodings` and `labels` do not have the same length.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings  # Already tokenized and padded sequences
        # pandas objects expose their raw array as `.values`; plain lists and arrays
        # are handed to torch.tensor unchanged.
        raw_labels = getattr(labels, "values", labels)
        self.labels = torch.tensor(raw_labels).to(device)

        # Fail fast on misaligned inputs instead of surfacing an IndexError (or
        # silently ignoring trailing rows) in the middle of training.
        if hasattr(encodings, "__len__") and len(encodings) != len(self.labels):
            raise ValueError(
                f"encodings and labels must have the same length, "
                f"got {len(encodings)} and {len(self.labels)}"
            )

    def __len__(self):
        """Number of examples, defined by the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(encoding, label)` pair stored at position `idx`."""
        return self.encodings[idx], self.labels[idx]

# Wrap each split's encodings and labels in a YelpDataset (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    YelpDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)

print("Datasets created and moved to GPU!")

Datasets created and moved to GPU!


In [None]:
BATCH_SIZE = 32

# Training batches are drawn in random order; validation and test batches keep the
# dataset order so predictions line up with the original rows.
train_loader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=BATCH_SIZE,
)
val_loader, test_loader = (
    DataLoader(eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=BATCH_SIZE)
    for eval_dataset in (val_dataset, test_dataset)
)

print("DataLoaders ready for training!")

DataLoaders ready for training!


In [None]:
import torch.nn as nn
import torch.optim as optim

# Defining the LSTM model
class LSTMClassifier(nn.Module):
    """Bidirectional multi-layer LSTM followed by a two-layer classification head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of output classes.
        dropout: Dropout probability passed to `nn.LSTM` (applied between stacked layers).
        pad_idx: Index of the padding token in the embedding table. When left as
            None it falls back to `vocab["<pad>"]` from the module-level `vocab`,
            which is what this class always used before the parameter existed.
    """
    def __init__(self, vocab_size, embed_dim=512, hidden_dim=512, num_layers=4, output_dim=2,
                 dropout=0.3, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            # Backward-compatible default: read the padding index from the global vocab.
            pad_idx = vocab["<pad>"]
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        # The head consumes the concatenated forward and backward states, hence hidden_dim * 2.
        self.fc1 = nn.Linear(hidden_dim * 2, 1024)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(1024, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token indices to (batch, output_dim) log-probabilities."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # `hidden` stacks the final state of every layer and direction along dim 0;
        # its last two entries are the top layer's forward and backward states.
        sentence_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(sentence_state)
        out = self.relu(out)
        out = self.fc2(out)
        return self.softmax(out)

# Size the embedding table from the vocabulary, build the classifier and place it on
# the selected device.
VOCAB_SIZE = len(vocab)
model = LSTMClassifier(
    vocab_size=VOCAB_SIZE,
    embed_dim=512,
    hidden_dim=512,
    num_layers=4,
    output_dim=2,
    dropout=0.3,
)
model = model.to(device)

print(f"Model loaded on: {device}")

Model loaded on: cuda


In [None]:
EPOCHS = 3  # maximum number of training epochs (default for train_model's `epochs`)
PATIENCE = 2  # epochs without validation-loss improvement tolerated before stopping early

class EarlyStopping:
    """Track the best validation loss and signal when it has stalled.

    A stop is signalled once `patience` consecutive checks have failed to beat
    the best loss seen so far.
    """
    def __init__(self, patience=2):
        self.patience = patience
        self.best_loss = float("inf")
        self.counter = 0

    def check_early_stop(self, val_loss):
        """Record `val_loss`; return True when training should stop, else False."""
        if val_loss < self.best_loss:
            # New best: remember it and start counting stalled checks from zero again.
            self.best_loss, self.counter = val_loss, 0
            return False

        self.counter += 1
        if self.counter < self.patience:
            return False

        print("\n Early stopping triggered! Stopping training.")
        return True

# NOTE(review): train_model does not reference this instance; it keeps its own
# best-loss and patience counters. The object only matters to code that calls
# early_stopping.check_early_stop(...) directly.
early_stopping = EarlyStopping(patience=PATIENCE)

In [None]:
def train_model(model, train_loader, val_loader, epochs=EPOCHS, lr=2e-5, patience=None,
                save_path='LSTM_Best_Model_State.pt'):
    """Train `model`, validate after every epoch, and checkpoint the best weights.

    After each epoch the mean validation loss is compared with the best one seen so
    far. On improvement the model's state dict is written to `save_path`; otherwise
    a counter is incremented and training stops once it reaches `patience`.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        train_loader: Iterable of `(inputs, labels)` batches; must support `len()`
            and expose `.dataset` (used as the accuracy denominator).
        val_loader: Same contract as `train_loader`, used for validation.
        epochs: Maximum number of epochs.
        lr: Learning rate for the Adam optimizer.
        patience: Consecutive non-improving epochs tolerated before stopping.
            None (the default) uses the module-level `PATIENCE`.
        save_path: File that receives the best model's state dict.

    Returns:
        None. Progress is printed and the best weights are written to `save_path`.
    """
    if patience is None:
        patience = PATIENCE

    # Batches are moved to the device the model lives on (evaluate_model moves its
    # batches too); this is a no-op when the dataset already stores tensors there.
    model_device = next(model.parameters()).device

    best_val_loss = float("inf")
    patience_counter = 0
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # CrossEntropyLoss applies log-softmax internally. If the model already returns
    # log-probabilities (as LSTMClassifier does) that second log-softmax leaves them
    # unchanged, so the loss equals the negative log-likelihood either way.
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        print(f"\n Epoch {epoch+1}/{epochs}")
        model.train()
        total_loss, total_correct = 0, 0
        loop = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", unit="batch", dynamic_ncols=True)

        for texts, labels in loop:
            texts, labels = texts.to(model_device), labels.to(model_device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar loss once; each .item() call forces a device sync.
            batch_loss = loss.item()
            total_loss += batch_loss
            total_correct += (outputs.argmax(dim=1) == labels).sum().item()
            loop.set_postfix(loss=batch_loss)

        avg_train_loss = total_loss / len(train_loader)
        train_acc = total_correct / len(train_loader.dataset)
        print(f"\n Training Loss: {avg_train_loss:.4f}, Accuracy: {train_acc:.4f}")

        # Validation Phase
        model.eval()
        total_correct, total_loss = 0, 0
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(model_device), labels.to(model_device)
                outputs = model(texts)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
                total_correct += (outputs.argmax(dim=1) == labels).sum().item()

        avg_val_loss = total_loss / len(val_loader)
        val_acc = total_correct / len(val_loader.dataset)
        print(f"\n Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.4f}")

        # Checking for improvement and saving the best model's state
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0  # Resetting the patience counter
            # Saving the best model's state dictionary in .pt format
            torch.save(model.state_dict(), save_path)
            print("Best model saved!")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("\nEarly stopping triggered! Stopping training.")
                break

# Train for at most EPOCHS epochs. train_model writes the weights with the lowest
# validation loss to 'LSTM_Best_Model_State.pt', which the evaluation cell reloads.
train_model(model, train_loader, val_loader, epochs=EPOCHS)


 Epoch 1/3


Training Epoch 1: 100%|██████████| 14624/14624 [26:14<00:00,  9.29batch/s, loss=0.119]



 Training Loss: 0.2825, Accuracy: 0.8805

 Validation Loss: 0.2285, Accuracy: 0.9046
Best model saved!

 Epoch 2/3


Training Epoch 2: 100%|██████████| 14624/14624 [26:01<00:00,  9.36batch/s, loss=0.224]



 Training Loss: 0.2101, Accuracy: 0.9153

 Validation Loss: 0.1998, Accuracy: 0.9191
Best model saved!

 Epoch 3/3


Training Epoch 3: 100%|██████████| 14624/14624 [26:29<00:00,  9.20batch/s, loss=0.222]



 Training Loss: 0.1828, Accuracy: 0.9270

 Validation Loss: 0.1883, Accuracy: 0.9253
Best model saved!


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_model(model, test_loader):
    """Run `model` over `test_loader` and collect predicted and true class labels.

    Returns:
        A `(predictions, labels)` pair of flat lists with one entry per example,
        in the order the loader produced them.
    """
    model.eval()  # Setting the model to evaluation mode
    predicted, actual = [], []

    progress = tqdm(test_loader, desc="Evaluating Test Set", unit="batch")

    with torch.no_grad():
        for batch_inputs, batch_targets in progress:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            scores = model(batch_inputs)

            # The highest-scoring class is the prediction; results are copied to the
            # host as NumPy arrays before being appended.
            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
            actual.extend(batch_targets.cpu().numpy())

    return predicted, actual

In [None]:
# Restore the best checkpoint written during training. map_location remaps the saved
# tensors onto the current device, so a checkpoint saved from a CUDA run also loads
# on a CPU-only machine.
model.load_state_dict(torch.load('LSTM_Best_Model_State.pt', map_location=device))
test_preds, test_labels = evaluate_model(model, test_loader)

# Computing performance metrics
accuracy = accuracy_score(test_labels, test_preds)
# average="binary" reports precision/recall/F1 for the positive class (label 1).
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average="binary")

print(f"\nTest Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Evaluating Test Set: 100%|██████████| 1250/1250 [00:44<00:00, 28.34batch/s]



Test Accuracy: 0.9252
Precision: 0.9270
Recall: 0.9233
F1-Score: 0.9251
