In [None]:
# ==============================
# 1. Imports and Data Loading
# ==============================
import pandas as pd
import numpy as np
import re

# Sklearn imports for classical models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# PyTorch imports for LSTM
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Dataset-1

In [None]:
# Load the Excel dataset (adjust path if needed).
# The path is relative, so it is resolved against the current working directory.
file_path = "Dataset-1.xlsx"  # Adjust your file path as necessary
df = pd.read_excel(file_path)

# Display first few rows (for debugging purposes)
print("First few rows of the dataset:")
print(df.head())

# =========================================
# 2. Data Preprocessing
# =========================================

# --- Create target label from multiple subject columns ---
subject_columns = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
]


def get_subject(row):
    """Return the first subject column whose value in *row* equals 1.

    Columns are tested in the order they appear in ``subject_columns``, so a
    row flagged for several subjects is assigned the earliest one. When no
    column equals 1 the string "Unknown" is returned.
    """
    flagged = (name for name in subject_columns if row[name] == 1)
    return next(flagged, "Unknown")

# Derive one label per row by applying get_subject row-wise (axis=1).
# get_subject returns the first subject column equal to 1, or "Unknown" if none is.
df["target"] = df.apply(get_subject, axis=1)
print("\nUnique targets:", df["target"].unique())

# --- Combine 'TITLE' and 'ABSTRACT' into one text field ---
# Both columns are cast to str before joining, then the result is lowercased.
df["text"] = (df["TITLE"].astype(str) + " " + df["ABSTRACT"].astype(str)).str.lower()

# --- Encode the target labels ---
# Map each target string to an integer id; num_classes is later passed to the
# LSTM classifier as its output dimension.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["target"])
num_classes = len(label_encoder.classes_)
print("\nEncoded classes:", label_encoder.classes_)

# --- Manual splitting per class ---
# For each class, split samples into 50% train, 30% validation, 20% test.
# The split is done per label so every class is divided with the same ratios.
np.random.seed(42)  # for reproducibility (seeds NumPy's global RNG used by shuffle below)

train_indices = []
val_indices = []
test_indices = []

for label in df["label"].unique():
    group = df[df["label"] == label]
    indices = group.index.tolist()
    np.random.shuffle(indices)  # in-place shuffle of this class's row indices
    n = len(indices)
    # int() truncates, so train/val are rounded down and the test split
    # receives whatever remains (at least the nominal 20%).
    train_count = int(0.5 * n)
    val_count = int(0.3 * n)
    train_indices.extend(indices[:train_count])
    val_indices.extend(indices[train_count:train_count + val_count])
    test_indices.extend(indices[train_count + val_count:])

# Create train, validation, and test sets using the computed indices
# (.loc selects by index label, matching the labels collected above).
X_train = df.loc[train_indices, "text"]
y_train = df.loc[train_indices, "label"]
X_val   = df.loc[val_indices, "text"]
y_val   = df.loc[val_indices, "label"]
X_test  = df.loc[test_indices, "text"]
y_test  = df.loc[test_indices, "label"]

print(f"\nDataset split sizes by manual per-class splitting:")
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

# =========================================
# 3. Classical Models with TF-IDF Features
# =========================================
# Fit the TF-IDF vocabulary on the training split only, then reuse it to
# transform validation and test so no information leaks from those splits.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf   = vectorizer.transform(X_val)  # NOTE(review): not used by the models below
X_test_tfidf  = vectorizer.transform(X_test)

# ----- Naive Bayes -----
# Train on the training split and report accuracy / per-class metrics on test.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
print("\nNaive Bayes Accuracy on Test Set:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_, zero_division=0))

# ----- Support Vector Machine (Linear SVC) -----
# Default hyperparameters; no random_state is set here.
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("\nSVM Accuracy on Test Set:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_, zero_division=0))

# ----- Random Forest -----
# 100 trees with a fixed random_state so the forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
print("\nRandom Forest Accuracy on Test Set:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_, zero_division=0))

# =========================================
# 4. LSTM Model with PyTorch
# =========================================
from collections import Counter

# --- Build a Vocabulary ---
def build_vocab(texts, min_freq=1):
    """Build a token -> integer id mapping from an iterable of strings.

    Each text is split on whitespace. Ids 0 and 1 are reserved for "<PAD>"
    and "<UNK>"; every token seen at least ``min_freq`` times then receives
    the next id, in order of first appearance.
    """
    token_counts = Counter(tok for doc in texts for tok in doc.split())
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for tok, count in token_counts.items():
        if count < min_freq:
            continue
        vocab[tok] = len(vocab)
    return vocab

# Build vocabulary on the training texts only; tokens that occur solely in
# the validation/test splits are mapped to <UNK> by text_to_sequence.
vocab = build_vocab(X_train)
vocab_size = len(vocab)  # includes the two reserved entries <PAD> and <UNK>
print(f"\nVocabulary Size: {vocab_size}")

# --- Helper Function: Convert Text to Sequence ---
def text_to_sequence(text, vocab):
    """Convert *text* to a list of ids, one per whitespace-separated token.

    Tokens missing from ``vocab`` are replaced by the id stored under "<UNK>".
    """
    ids = []
    for word in text.split():
        ids.append(vocab.get(word, vocab["<UNK>"]))
    return ids

# --- PyTorch Dataset for LSTM ---
class TextDataset(Dataset):
    """Serve (token_ids, label) tensor pairs with a fixed sequence length.

    ``texts`` and ``labels`` are re-indexed from 0 on construction so items
    can be looked up by position. Each text is converted with
    ``text_to_sequence`` and then right-padded with the "<PAD>" id, or cut
    down, to exactly ``max_len`` ids.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        # Drop the original DataFrame index; DataLoader asks for 0..len-1.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text, target = self.texts[idx], self.labels[idx]
        token_ids = text_to_sequence(raw_text, self.vocab)
        missing = self.max_len - len(token_ids)
        if missing > 0:
            # Too short: append pad ids up to max_len.
            token_ids = token_ids + [self.vocab["<PAD>"]] * missing
        else:
            # Long enough: keep only the first max_len ids.
            token_ids = token_ids[:self.max_len]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

max_len = 100  # maximum sequence length (texts are padded/truncated to this many tokens)
batch_size = 32

# One dataset per split, all sharing the vocabulary built from the training texts.
train_dataset = TextDataset(X_train, y_train, vocab, max_len)
val_dataset   = TextDataset(X_val, y_val, vocab, max_len)
test_dataset  = TextDataset(X_test, y_test, vocab, max_len)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --- Define the LSTM Classifier Model ---
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over padded id sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes (size of the returned logits).
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions and concatenate the
            two final hidden states.

    Index 0 is the padding id. Inputs are expected to be right-padded with
    it, which is how TextDataset builds them.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=bidirectional)
        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(lstm_output_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape [batch, output_dim] for ids x of shape [batch, max_len]."""
        embedded = self.embedding(x)  # [batch, max_len, embed_dim]

        # Pack by true length so the LSTM skips trailing padding. Without
        # this the forward state is read after the pad steps and the backward
        # pass starts on padding, so logits depend on the amount of padding.
        # clamp(min=1) keeps an all-padding row valid for packing.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        if self.lstm.bidirectional:
            # Last layer: h_n[-2] is the forward state, h_n[-1] the backward state.
            h = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h = h_n[-1]
        return self.fc(h)

# --- Set up CUDA ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Seed torch's RNG so weight initialisation and the shuffled training batches
# are reproducible, in line with the NumPy / scikit-learn seeds used above.
torch.manual_seed(42)

embed_dim = 128
hidden_dim = 128
num_epochs = 50   # maximum epochs
learning_rate = 0.001
patience = 5      # early stopping patience

lstm_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, bidirectional=True)
lstm_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)

print("\nTraining LSTM Model")
best_val_loss = float('inf')
patience_counter = 0
# Snapshot of the weights from the epoch with the lowest validation loss.
best_model_state = None

for epoch in range(num_epochs):
    # Training Phase
    lstm_model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss is the batch mean; weight by batch size for a per-sample average.
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_dataset)

    # Validation Phase
    lstm_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = lstm_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    val_loss /= len(val_dataset)

    print(f"Epoch {epoch+1}/{num_epochs} -- Train Loss: {train_loss:.4f} -- Val Loss: {val_loss:.4f}")

    # Early Stopping Check with Patience
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() returns live references, so clone to freeze this epoch.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in lstm_model.state_dict().items()
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Validation loss did not improve for {patience} consecutive epochs; early stopping.")
            break

# Roll back to the best checkpoint; otherwise the model evaluated afterwards
# is the one from the last (already degraded) epoch.
if best_model_state is not None:
    lstm_model.load_state_dict(best_model_state)

# --- Evaluate the LSTM Model on the Test Set ---
# eval() + no_grad(): inference only, no gradient tracking.
lstm_model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = lstm_model(inputs)
        # Predicted class = index of the largest logit along dim 1.
        _, preds = torch.max(outputs, 1)
        # Move back to CPU and collect as NumPy scalars for the sklearn metrics.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\nLSTM Test Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division=0))


# A2D2 dataset

In [None]:
# =============================================
# Text Classification on A2D2.xlsx
# =============================================


# Load the Excel dataset "A2D2.xlsx"
# (relative path, resolved against the current working directory).
file_path = "A2D2.xlsx"  # Adjust the path if necessary
df = pd.read_excel(file_path)

print("First few rows of A2D2.xlsx:")
print(df.head())

# ------------------------------
# 2. Data Preprocessing
# ------------------------------
# For A2D2, we use the "Content" column as our text feature
# (cast to str, then lowercased).
df["text"] = df["Content"].astype(str).str.lower()

# Encode target labels from the "Domain" column as integer ids;
# num_classes is later passed to the LSTM classifier as its output dimension.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Domain"])
num_classes = len(label_encoder.classes_)
print("\nEncoded classes:", label_encoder.classes_)

# --- Manual per-class splitting ---
# For each class, split samples into 50% train, 30% validation, and 20% test.
np.random.seed(42)  # For reproducibility (seeds NumPy's global RNG used by shuffle below)

train_indices = []
val_indices = []
test_indices = []

for lbl in df["label"].unique():
    group = df[df["label"] == lbl]
    indices = group.index.tolist()
    np.random.shuffle(indices)  # in-place shuffle of this class's row indices
    n = len(indices)
    # int() truncates, so train/val are rounded down and the test split
    # receives whatever remains.
    train_count = int(0.5 * n)
    val_count = int(0.3 * n)
    train_indices.extend(indices[:train_count])
    val_indices.extend(indices[train_count:train_count + val_count])
    test_indices.extend(indices[train_count + val_count:])

# Create train, validation, and test sets
# (.loc selects by index label, matching the labels collected above).
X_train_text = df.loc[train_indices, "text"]
y_train = df.loc[train_indices, "label"]
X_val_text = df.loc[val_indices, "text"]
y_val = df.loc[val_indices, "label"]
X_test_text = df.loc[test_indices, "text"]
y_test = df.loc[test_indices, "label"]

print(f"\nManual split sizes: Train={len(X_train_text)}, Val={len(X_val_text)}, Test={len(X_test_text)}")

# ------------------------------
# 3. Classical Models with TF-IDF Features
# ------------------------------
# Fit the TF-IDF vocabulary on the training split only, then reuse it for test.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf  = vectorizer.transform(X_test_text)
# (You may also vectorize X_val_text if desired; the classical models below
# do not use the validation split.)

# ----- Naive Bayes -----
# Train on the training split and report accuracy / per-class metrics on test.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
print("\nNaive Bayes Accuracy on Test Set:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_, zero_division=0))

# ----- Support Vector Machine (LinearSVC) -----
# Default hyperparameters; no random_state is set here.
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("\nSVM Accuracy on Test Set:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_, zero_division=0))

# ----- Random Forest -----
# 100 trees with a fixed random_state so the forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
print("\nRandom Forest Accuracy on Test Set:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_, zero_division=0))

# ------------------------------
# 4. LSTM Model with PyTorch
# ------------------------------
from collections import Counter

# --- Build a Vocabulary ---
def build_vocab(texts, min_freq=1):
    """Return a dict mapping tokens to integer ids.

    Texts are tokenized by whitespace. "<PAD>" is id 0 and "<UNK>" is id 1;
    tokens occurring at least ``min_freq`` times follow, numbered in order
    of first appearance.
    """
    frequencies = Counter()
    for document in texts:
        frequencies.update(document.split())

    vocab = {"<PAD>": 0, "<UNK>": 1}
    frequent_tokens = [tok for tok, seen in frequencies.items() if seen >= min_freq]
    for tok in frequent_tokens:
        vocab[tok] = len(vocab)
    return vocab

# Build vocabulary on training texts only; tokens that occur solely in the
# validation/test splits are mapped to <UNK> by text_to_sequence.
vocab = build_vocab(X_train_text)
vocab_size = len(vocab)  # includes the two reserved entries <PAD> and <UNK>
print(f"\nVocabulary Size: {vocab_size}")

# --- Helper Function: Convert Text to Sequence ---
def text_to_sequence(text, vocab):
    """Translate whitespace-separated tokens of *text* into vocabulary ids.

    A token that is not a key of ``vocab`` gets the id stored under "<UNK>".
    """
    return list(map(lambda word: vocab.get(word, vocab["<UNK>"]), text.split()))

# --- PyTorch Dataset for LSTM ---
class TextDataset(Dataset):
    """Dataset yielding fixed-length id tensors and their integer labels.

    The text and label Series are re-indexed from 0 so they can be addressed
    by position. Every item is converted with ``text_to_sequence`` and then
    padded on the right with the "<PAD>" id, or truncated, to ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        # Drop the original DataFrame index; DataLoader asks for 0..len-1.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text, target = self.texts[idx], self.labels[idx]
        token_ids = text_to_sequence(raw_text, self.vocab)
        # Pad or truncate the sequence to fixed length
        deficit = self.max_len - len(token_ids)
        if deficit > 0:
            token_ids = token_ids + [self.vocab["<PAD>"]] * deficit
        else:
            token_ids = token_ids[:self.max_len]
        ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        label_tensor = torch.tensor(target, dtype=torch.long)
        return ids_tensor, label_tensor

max_len = 100  # Maximum sequence length (texts are padded/truncated to this many tokens)
batch_size = 32

# One dataset per split, all sharing the vocabulary built from the training texts.
train_dataset = TextDataset(X_train_text, y_train, vocab, max_len)
val_dataset   = TextDataset(X_val_text, y_val, vocab, max_len)
test_dataset  = TextDataset(X_test_text, y_test, vocab, max_len)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --- Define the LSTM Classifier Model ---
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over padded id sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes (size of the returned logits).
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions and concatenate the
            two final hidden states.

    Index 0 is the padding id. Inputs are expected to be right-padded with
    it, which is how TextDataset builds them.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=bidirectional)
        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(lstm_output_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape [batch, output_dim] for ids x of shape [batch, max_len]."""
        embedded = self.embedding(x)  # [batch, max_len, embed_dim]

        # Pack by true length so the LSTM skips trailing padding. Without
        # this the forward state is read after the pad steps and the backward
        # pass starts on padding, so logits depend on the amount of padding.
        # clamp(min=1) keeps an all-padding row valid for packing.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        if self.lstm.bidirectional:
            # Last layer: h_n[-2] is the forward state, h_n[-1] the backward state.
            h = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h = h_n[-1]
        return self.fc(h)

# --- Set up CUDA ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Seed torch's RNG so weight initialisation and the shuffled training batches
# are reproducible, in line with the NumPy / scikit-learn seeds used above.
torch.manual_seed(42)

# Hyperparameters for LSTM
embed_dim = 128
hidden_dim = 128
num_epochs = 50   # Maximum epochs
learning_rate = 0.001
patience = 5      # Early stopping patience

lstm_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, bidirectional=True)
lstm_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)

print(f"\nTraining LSTM Model with Early Stopping (patience = {patience})...")
best_val_loss = float('inf')
patience_counter = 0
# Snapshot of the weights from the epoch with the lowest validation loss.
best_model_state = None

for epoch in range(num_epochs):
    # Training Phase
    lstm_model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss is the batch mean; weight by batch size for a per-sample average.
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_dataset)

    # Validation Phase
    lstm_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = lstm_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    val_loss /= len(val_dataset)

    print(f"Epoch {epoch+1}/{num_epochs} -- Train Loss: {train_loss:.4f} -- Val Loss: {val_loss:.4f}")

    # Early Stopping Check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() returns live references, so clone to freeze this epoch.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in lstm_model.state_dict().items()
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Validation loss did not improve for {patience} consecutive epochs; early stopping.")
            break

# Roll back to the best checkpoint; otherwise the model evaluated afterwards
# is the one from the last (already degraded) epoch.
if best_model_state is not None:
    lstm_model.load_state_dict(best_model_state)

# --- Evaluate the LSTM Model on the Test Set ---
# eval() + no_grad(): inference only, no gradient tracking.
lstm_model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = lstm_model(inputs)
        # Predicted class = index of the largest logit along dim 1.
        _, preds = torch.max(outputs, 1)
        # Move back to CPU and collect as NumPy scalars for the sklearn metrics.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\nLSTM Test Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division=0))


# Dataset-2

In [None]:
# Load the Excel dataset (adjust path if needed)
# (relative path, resolved against the current working directory).
file_path = "Dataset-2.xlsx"
df = pd.read_excel(file_path)

# Display first few rows for debugging
print("First few rows:")
print(df.head())

# =========================================
# 2. Data Preprocessing for All Models
# =========================================

# Combine 'Title' and 'Content' into a single text field (and lowercase it).
# Both columns are cast to str before joining.
df["text"] = (df["Title"].astype(str) + " " + df["Content"].astype(str)).str.lower()

# Encode target labels from the "Domain" column as integer ids;
# num_classes is later passed to the LSTM classifier as its output dimension.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Domain"])
num_classes = len(label_encoder.classes_)
print("\nEncoded classes:", label_encoder.classes_)

# --- Manual per-class splitting into 50% train, 30% validation, 20% test ---
np.random.seed(42)  # for reproducibility (seeds NumPy's global RNG used by shuffle below)

train_indices = []
val_indices = []
test_indices = []

for lbl in df["label"].unique():
    group = df[df["label"] == lbl]
    indices = group.index.tolist()
    np.random.shuffle(indices)  # in-place shuffle of this class's row indices
    n = len(indices)
    # int() truncates, so train/val counts are rounded down.
    train_count = int(0.5 * n)
    val_count = int(0.3 * n)
    # The remaining samples go to test (20% or remainder)
    train_indices.extend(indices[:train_count])
    val_indices.extend(indices[train_count:train_count + val_count])
    test_indices.extend(indices[train_count + val_count:])

# Create train, validation, and test sets
# (.loc selects by index label, matching the labels collected above).
X_train_text = df.loc[train_indices, "text"]
y_train = df.loc[train_indices, "label"]
X_val_text = df.loc[val_indices, "text"]
y_val = df.loc[val_indices, "label"]
X_test_text = df.loc[test_indices, "text"]
y_test = df.loc[test_indices, "label"]

print(f"\nManual split sizes: Train={len(X_train_text)}, Val={len(X_val_text)}, Test={len(X_test_text)}")

# =========================================
# 3. Classical Models using TF-IDF Features
# =========================================

# Vectorize text with TF-IDF.
# The vocabulary is fitted on the training split only and reused for test.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf  = vectorizer.transform(X_test_text)

# zero_division=0 is passed to every report below so a class that is never
# predicted scores 0 without an UndefinedMetricWarning, matching the reports
# produced for the other datasets in this notebook.

# ----- Naive Bayes -----
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
print("\nNaive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_, zero_division=0))

# ----- Support Vector Machine (Linear SVC) -----
# Default hyperparameters; no random_state is set here.
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("\nSVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_, zero_division=0))

# ----- Random Forest -----
# 100 trees with a fixed random_state so the forest is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_, zero_division=0))

# =========================================
# 4. LSTM Model with PyTorch
# =========================================
from collections import Counter

# --- Build a Vocabulary ---
def build_vocab(texts, min_freq=1):
    """Create the token -> id lookup used to encode texts.

    Each text is split on whitespace and token occurrences are counted over
    all texts. Ids 0 ("<PAD>") and 1 ("<UNK>") are reserved; the remaining
    ids go to tokens with a count of at least ``min_freq``, in order of
    first appearance.
    """
    tally = Counter()
    for entry in texts:
        tally.update(entry.split())

    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word in filter(lambda w: tally[w] >= min_freq, tally):
        vocab[word] = len(vocab)
    return vocab

# Build vocabulary on training texts only; tokens that occur solely in the
# validation/test splits are mapped to <UNK> by text_to_sequence.
vocab = build_vocab(X_train_text)
vocab_size = len(vocab)  # includes the two reserved entries <PAD> and <UNK>
print(f"\nVocabulary Size: {vocab_size}")

# --- Helper Function: Convert Text to Sequence ---
def text_to_sequence(text, vocab):
    """Encode *text* as a list of vocabulary ids (whitespace tokenization).

    Out-of-vocabulary tokens are encoded with the id stored under "<UNK>".
    """
    encoded = []
    for piece in text.split():
        encoded += [vocab.get(piece, vocab["<UNK>"])]
    return encoded

# --- PyTorch Dataset for LSTM ---
class TextDataset(Dataset):
    """Pairs each text, encoded to exactly ``max_len`` ids, with its label.

    Both input Series are re-indexed from 0 so that positional indices work.
    Encoding is delegated to ``text_to_sequence``; short sequences are
    right-padded with the "<PAD>" id and long ones are cut at ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len=100):
        # Drop the original DataFrame index; DataLoader asks for 0..len-1.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.vocab, self.max_len = vocab, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        ids = text_to_sequence(sample_text, self.vocab)
        # Pad or truncate the sequence to fixed length
        n_pad = self.max_len - len(ids)
        if n_pad > 0:
            ids = ids + [self.vocab["<PAD>"]] * n_pad
        else:
            ids = ids[:self.max_len]
        return (
            torch.tensor(ids, dtype=torch.long),
            torch.tensor(sample_label, dtype=torch.long),
        )

# Create datasets and DataLoaders for train, validation, and test
max_len = 100  # maximum sequence length (texts are padded/truncated to this many tokens)
batch_size = 32

# One dataset per split, all sharing the vocabulary built from the training texts.
train_dataset = TextDataset(X_train_text, y_train, vocab, max_len)
val_dataset   = TextDataset(X_val_text, y_val, vocab, max_len)
test_dataset  = TextDataset(X_test_text, y_test, vocab, max_len)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --- Define the LSTM Classifier Model ---
class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> linear classifier over padded id sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes (size of the returned logits).
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions and concatenate the
            two final hidden states.

    Index 0 is the padding id. Inputs are expected to be right-padded with
    it, which is how TextDataset builds them.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=1, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=bidirectional)
        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(lstm_output_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape [batch, output_dim] for ids x of shape [batch, max_len]."""
        embedded = self.embedding(x)  # [batch, max_len, embed_dim]

        # Pack by true length so the LSTM skips trailing padding. Without
        # this the forward state is read after the pad steps and the backward
        # pass starts on padding, so logits depend on the amount of padding.
        # clamp(min=1) keeps an all-padding row valid for packing.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # For bidirectional LSTM, concatenate final forward and backward hidden states
        if self.lstm.bidirectional:
            # Last layer: h_n[-2] is the forward state, h_n[-1] the backward state.
            h = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h = h_n[-1]
        return self.fc(h)

# --- Set up CUDA ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Seed torch's RNG so weight initialisation and the shuffled training batches
# are reproducible, in line with the NumPy / scikit-learn seeds used above.
torch.manual_seed(42)

# Hyperparameters for LSTM
embed_dim = 128
hidden_dim = 128
num_epochs = 50   # maximum epochs
learning_rate = 0.001
patience = 5      # early stopping patience

lstm_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, bidirectional=True)
lstm_model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)

# --- Training Loop with Early Stopping ---
print("\nTraining LSTM Model")
best_val_loss = float('inf')
patience_counter = 0
# Snapshot of the weights from the epoch with the lowest validation loss.
best_model_state = None

for epoch in range(num_epochs):
    # Training Phase
    lstm_model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss is the batch mean; weight by batch size for a per-sample average.
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_dataset)

    # Validation Phase
    lstm_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = lstm_model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    val_loss /= len(val_dataset)

    print(f"Epoch {epoch+1}/{num_epochs} -- Train Loss: {train_loss:.4f} -- Val Loss: {val_loss:.4f}")

    # Early Stopping Check with Patience
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() returns live references, so clone to freeze this epoch.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in lstm_model.state_dict().items()
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Validation loss did not improve for {patience} consecutive epochs; early stopping.")
            break

# Roll back to the best checkpoint; otherwise the model evaluated afterwards
# is the one from the last (already degraded) epoch.
if best_model_state is not None:
    lstm_model.load_state_dict(best_model_state)

# --- Evaluate the LSTM Model on the Test Set ---
# eval() + no_grad(): inference only, no gradient tracking.
lstm_model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = lstm_model(inputs)
        # Predicted class = index of the largest logit along dim 1.
        _, preds = torch.max(outputs, 1)
        # Move back to CPU and collect as NumPy scalars for the sklearn metrics.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\nLSTM Test Accuracy:", accuracy_score(all_labels, all_preds))
# zero_division=0: a class the model never predicts scores 0 without an
# UndefinedMetricWarning, matching the reports for the other datasets.
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_, zero_division=0))
