<a href="https://colab.research.google.com/github/Hirwabrian/Group19-Machine_Learning_Techniques_I/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re
import string

def preprocess_ag_news(file_path):
    # 1. Load data - skipping the header row
    df = pd.read_csv(file_path, header=0, names=['label', 'title', 'description'], engine='python')

    # 2. Map numeric labels to names for better visualization
    label_map = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}
    df['class_name'] = df['label'].map(label_map)

    # 3. Combine Title and Description
    df['text'] = df['title'] + " " + df['description']

    def clean_text(text):
        # Lowercase
        text = text.lower()
        # Remove backslash escapes like \n or \b
        text = re.sub(r'\\[nb]', ' ', text)
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text

    print(f"Cleaning {file_path}...")
    df['text'] = df['text'].apply(clean_text)

    # Keep only what is necessary for the models
    return df[['text', 'label', 'class_name']]

# EXECUTION
train_cleaned = preprocess_ag_news('/content/drive/MyDrive/AG News Classification Dataset/train.csv')
test_cleaned = preprocess_ag_news('/content/drive/MyDrive/AG News Classification Dataset/test.csv')

# Save to shared CSVs
train_cleaned.to_csv('ag_news_train_cleaned.csv', index=False)
test_cleaned.to_csv('ag_news_test_cleaned.csv', index=False)

print("\nSample Output:")
print(train_cleaned.head())
print("\nClass Distribution:")
print(train_cleaned['class_name'].value_counts())

Cleaning /content/drive/MyDrive/AG News Classification Dataset/train.csv...
Cleaning /content/drive/MyDrive/AG News Classification Dataset/test.csv...

Sample Output:
                                                text  label class_name
0  wall st bears claw back into the black reuters...      3   Business
1  carlyle looks toward commercial aerospace reut...      3   Business
2  oil and economy cloud stocks outlook reuters r...      3   Business
3  iraq halts oil exports from main southern pipe...      3   Business
4  oil prices soar to alltime record posing new m...      3   Business

Class Distribution:
class_name
Business    30000
Sci/Tech    30000
Sports      30000
World       30000
Name: count, dtype: int64


In [3]:
!pip install matplotlib-venn



In [4]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [5]:
import urllib.request
import zipfile
import os

def download_glove():
    """Download and extract GloVe embeddings"""
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    zip_file = "glove.6B.zip"

    # Download if not exists
    if not os.path.exists(zip_file):
        print("Downloading GloVe embeddings (862 MB)...")
        urllib.request.urlretrieve(url, zip_file)
        print("Download complete!")

    # Extract the 100d file
    if not os.path.exists("glove.6B.100d.txt"):
        print("Extracting glove.6B.100d.txt...")
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extract("glove.6B.100d.txt")
        print("Extraction complete!")

    print("GloVe embeddings ready!")

# Run this before your main code
download_glove()

Downloading GloVe embeddings (862 MB)...
Download complete!
Extracting glove.6B.100d.txt...
Extraction complete!
GloVe embeddings ready!


In [6]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.1-py3-none-any.whl (293 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-linux_x86_64.whl size=4498209 sha256=111a1761a139a262d37e8e5364b826d8abccab8236b7da495e4e74be7ba30eab
  Stored in directory: /root/.cache/pip/wheels/20/27/95/a7baf1b435f1cbde017cabd

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from gensim.models import Word2Vec, FastText
import re
import pickle
import time
from typing import Tuple, Dict
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

# Create the directory
output_path = Path('/mnt/user-data/outputs/')
output_path.mkdir(parents=True, exist_ok=True)

# Save your model
# model.save(output_path / 'tfidf_lstm_model.keras') # This line was causing the NameError, as 'model' was not defined here.

# Set random seeds for reproducibility
np.random.seed(42)
import tensorflow as tf
tf.random.set_seed(42)

# Constants
MAX_WORDS = 20000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100
BATCH_SIZE = 128
EPOCHS = 5  # Reduced to 5
SAMPLE_SIZE = 0.2  # Use 20% of data for faster training

class DataPreprocessor:
    """Handle data loading and preprocessing"""

    def __init__(self, train_path: str, test_path: str):
        self.train_path = train_path
        self.test_path = test_path
        self.tokenizer = None
        self.label_encoder = None

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load train and test datasets with sampling"""
        print("Loading datasets...")
        train_df = pd.read_csv(self.train_path)
        test_df = pd.read_csv(self.test_path)

        # Sample the data for faster training
        print(f"\nOriginal train samples: {len(train_df)}")
        print(f"Original test samples: {len(test_df)}")

        train_df = train_df.sample(frac=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
        test_df = test_df.sample(frac=SAMPLE_SIZE, random_state=42).reset_index(drop=True)

        print(f"\nSampled train samples ({SAMPLE_SIZE*100}%): {len(train_df)}")
        print(f"Sampled test samples ({SAMPLE_SIZE*100}%): {len(test_df)}")

        # AG News has columns: Class Index, Title, Description
        # Combine Title and Description for better context
        train_df['text'] = train_df['Title'] + ' ' + train_df['Description']
        test_df['text'] = test_df['Title'] + ' ' + test_df['Description']

        print(f"Classes: {sorted(train_df['Class Index'].unique())}")

        return train_df, test_df

    def clean_text(self, text: str) -> str:
        """Clean and preprocess text"""
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def prepare_data(self, train_df: pd.DataFrame, test_df: pd.DataFrame):
        """Prepare data for training"""
        print("\nCleaning text...")
        train_df['cleaned_text'] = train_df['text'].apply(self.clean_text)
        test_df['cleaned_text'] = test_df['text'].apply(self.clean_text)

        # Encode labels
        self.label_encoder = LabelEncoder()
        y_train = self.label_encoder.fit_transform(train_df['Class Index'])
        y_test = self.label_encoder.transform(test_df['Class Index'])

        # Tokenize text
        print("Tokenizing text...")
        self.tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<UNK>')
        self.tokenizer.fit_on_texts(train_df['cleaned_text'])

        # Convert to sequences
        X_train_seq = self.tokenizer.texts_to_sequences(train_df['cleaned_text'])
        X_test_seq = self.tokenizer.texts_to_sequences(test_df['cleaned_text'])

        # Pad sequences
        X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
        X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

        # One-hot encode labels
        y_train_cat = to_categorical(y_train)
        y_test_cat = to_categorical(y_test)

        print(f"Vocabulary size: {len(self.tokenizer.word_index)}")
        print(f"X_train shape: {X_train_padded.shape}")
        print(f"X_test shape: {X_test_padded.shape}")

        return X_train_padded, X_test_padded, y_train_cat, y_test_cat, train_df['cleaned_text'], test_df['cleaned_text']


class TFIDFEmbedding:
    """Create TF-IDF weighted word embeddings"""

    def __init__(self, tokenizer, vocab_size: int, embedding_dim: int):
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tfidf_vectorizer = None
        self.embedding_matrix = None

    def create_embedding_matrix(self, texts):
        """Create embedding matrix using TF-IDF weights"""
        print("\n=== Creating TF-IDF Embedding Matrix ===")

        # Calculate TF-IDF
        self.tfidf_vectorizer = TfidfVectorizer(max_features=self.vocab_size)
        self.tfidf_vectorizer.fit(texts)

        # Initialize random embedding matrix
        self.embedding_matrix = np.random.randn(self.vocab_size + 1, self.embedding_dim) * 0.01

        # Weight embeddings by TF-IDF scores
        word_index = self.tokenizer.word_index
        tfidf_feature_names = self.tfidf_vectorizer.get_feature_names_out()
        tfidf_vocab = {word: idx for idx, word in enumerate(tfidf_feature_names)}

        for word, idx in word_index.items():
            if idx < self.vocab_size and word in tfidf_vocab:
                # Use TF-IDF score to scale the random embedding
                tfidf_idx = tfidf_vocab[word]
                tfidf_scores = self.tfidf_vectorizer.idf_[tfidf_idx]
                self.embedding_matrix[idx] = np.random.randn(self.embedding_dim) * tfidf_scores * 0.01

        print(f"TF-IDF Embedding matrix shape: {self.embedding_matrix.shape}")
        return self.embedding_matrix


class Word2VecEmbedding:
    """Create Word2Vec Skip-gram embeddings"""

    def __init__(self, tokenizer, vocab_size: int, embedding_dim: int):
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.w2v_model = None
        self.embedding_matrix = None

    def create_embedding_matrix(self, texts):
        """Create embedding matrix using Word2Vec Skip-gram"""
        print("\n=== Creating Word2Vec Skip-gram Embedding Matrix ===")

        # Tokenize texts into words
        sentences = [text.split() for text in texts]

        # Train Word2Vec model with Skip-gram (sg=1)
        print("Training Word2Vec Skip-gram model...")
        self.w2v_model = Word2Vec(
            sentences=sentences,
            vector_size=self.embedding_dim,
            window=5,
            min_count=2,
            workers=4,
            sg=1,  # Skip-gram
            epochs=10,
            seed=42
        )

        # Create embedding matrix
        self.embedding_matrix = np.zeros((self.vocab_size + 1, self.embedding_dim))
        word_index = self.tokenizer.word_index

        found = 0
        for word, idx in word_index.items():
            if idx < self.vocab_size:
                try:
                    self.embedding_matrix[idx] = self.w2v_model.wv[word]
                    found += 1
                except KeyError:
                    # Word not in Word2Vec vocabulary, use random initialization
                    self.embedding_matrix[idx] = np.random.randn(self.embedding_dim) * 0.01

        print(f"Word2Vec Embedding matrix shape: {self.embedding_matrix.shape}")
        print(f"Found embeddings for {found}/{min(len(word_index), self.vocab_size)} words")

        return self.embedding_matrix


class FastTextEmbedding:
    """Create FastText embeddings"""

    def __init__(self, tokenizer, vocab_size: int, embedding_dim: int):
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.fasttext_model = None
        self.embedding_matrix = None

    def create_embedding_matrix(self, texts):
        """Create embedding matrix using FastText"""
        print("\n=== Creating FastText Embedding Matrix ===")

        # Tokenize texts into words
        sentences = [text.split() for text in texts]

        # Train FastText model
        print("Training FastText model...")
        self.fasttext_model = FastText(
            sentences=sentences,
            vector_size=self.embedding_dim,
            window=5,
            min_count=2,
            workers=4,
            sg=1,  # Skip-gram
            epochs=10,
            seed=42
        )

        # Create embedding matrix
        self.embedding_matrix = np.zeros((self.vocab_size + 1, self.embedding_dim))
        word_index = self.tokenizer.word_index

        found = 0
        for word, idx in word_index.items():
            if idx < self.vocab_size:
                try:
                    # FastText can generate embeddings for OOV words using subword information
                    self.embedding_matrix[idx] = self.fasttext_model.wv[word]
                    found += 1
                except KeyError:
                    # Use random initialization as fallback
                    self.embedding_matrix[idx] = np.random.randn(self.embedding_dim) * 0.01

        print(f"FastText Embedding matrix shape: {self.embedding_matrix.shape}")
        print(f"Found embeddings for {found}/{min(len(word_index), self.vocab_size)} words")

        return self.embedding_matrix


class GloVeEmbedding:
    """Load and use pre-trained GloVe embeddings"""

    def __init__(self, tokenizer, vocab_size: int, embedding_dim: int, glove_path: str = None):
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.glove_path = glove_path
        self.embedding_matrix = None

    def load_glove_embeddings(self):
        """Load GloVe embeddings from file"""
        embeddings_index = {}

        if self.glove_path is None:
            print("\nNote: GloVe file path not provided. Creating synthetic GloVe-style embeddings.")
            print("For real GloVe embeddings, download from: https://nlp.stanford.edu/projects/glove/")
            return None

        print(f"\nLoading GloVe embeddings from {self.glove_path}...")
        with open(self.glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = vector

        print(f"Loaded {len(embeddings_index)} word vectors")
        return embeddings_index

    def create_embedding_matrix(self, texts=None):
        """Create embedding matrix using GloVe"""
        print("\n=== Creating GloVe Embedding Matrix ===")

        embeddings_index = self.load_glove_embeddings()

        # Initialize with zeros
        self.embedding_matrix = np.zeros((self.vocab_size + 1, self.embedding_dim))
        word_index = self.tokenizer.word_index

        if embeddings_index is None:
            # Create synthetic embeddings if GloVe file not available
            print("Creating synthetic context-based embeddings...")
            for word, idx in word_index.items():
                if idx < self.vocab_size:
                    # Use hash-based deterministic initialization
                    np.random.seed(hash(word) % (2**32))
                    self.embedding_matrix[idx] = np.random.randn(self.embedding_dim) * 0.01
        else:
            # Use real GloVe embeddings
            found = 0
            for word, idx in word_index.items():
                if idx < self.vocab_size:
                    embedding_vector = embeddings_index.get(word)
                    if embedding_vector is not None:
                        self.embedding_matrix[idx] = embedding_vector
                        found += 1
                    else:
                        # Random initialization for OOV words
                        self.embedding_matrix[idx] = np.random.randn(self.embedding_dim) * 0.01

            print(f"Found embeddings for {found}/{min(len(word_index), self.vocab_size)} words")

        print(f"GloVe Embedding matrix shape: {self.embedding_matrix.shape}")
        return self.embedding_matrix

    def evaluate(self, X_test, y_test):
        """Evaluate the model"""
        print(f"\n=== Evaluating {self.name} ===")

        loss, accuracy = self.model.evaluate(X_test, y_test, verbose=0)
        print(f"Test Loss: {loss:.4f}")
        print(f"Test Accuracy: {accuracy:.4f}")

        return loss, accuracy


def plot_training_history(histories: Dict, save_path: str = '/mnt/user-data/outputs/training_comparison.png'):
    """Plot training history comparison"""
    fig, axes = plt.subplots(1, 2, figsize=(16, 5))

    colors = {'TF-IDF': '#3498db', 'Skip-gram': '#e74c3c', 'FastText': '#9b59b6', 'GloVe': '#2ecc71'}

    for name, history in histories.items():
        color = colors.get(name, '#000000')
        # Plot accuracy
        axes[0].plot(history.history['accuracy'], label=f'{name} - Train',
                    alpha=0.8, color=color, linewidth=2)
        axes[0].plot(history.history['val_accuracy'], label=f'{name} - Val',
                    linestyle='--', alpha=0.8, color=color, linewidth=2)

    axes[0].set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Accuracy')
    axes[0].legend(loc='lower right', fontsize=9)
    axes[0].grid(True, alpha=0.3)

    for name, history in histories.items():
        color = colors.get(name, '#000000')
        # Plot loss
        axes[1].plot(history.history['loss'], label=f'{name} - Train',
                    alpha=0.8, color=color, linewidth=2)
        axes[1].plot(history.history['val_loss'], label=f'{name} - Val',
                    linestyle='--', alpha=0.8, color=color, linewidth=2)

    axes[1].set_title('Model Loss Comparison', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Loss')
    axes[1].legend(loc='upper right', fontsize=9)
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"\nTraining history plot saved to {save_path}")
    plt.close()


def plot_results_comparison(results: Dict, save_path: str = '/mnt/user-data/outputs/results_comparison.png'):
    """Plot final results comparison"""
    models = list(results.keys())
    accuracies = [results[model]['accuracy'] for model in models]
    losses = [results[model]['loss'] for model in models]

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    colors = ['#3498db', '#e74c3c', '#9b59b6', '#2ecc71']

    # Accuracy comparison
    bars1 = axes[0].bar(models, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
    axes[0].set_title('Test Accuracy Comparison', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('Accuracy')
    axes[0].set_ylim([0, 1])
    axes[0].grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar in bars1:
        height = bar.get_height()
        axes[0].text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.4f}',
                    ha='center', va='bottom', fontweight='bold', fontsize=10)

    # Loss comparison
    bars2 = axes[1].bar(models, losses, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
    axes[1].set_title('Test Loss Comparison', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('Loss')
    axes[1].grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar in bars2:
        height = bar.get_height()
        axes[1].text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.4f}',
                    ha='center', va='bottom', fontweight='bold', fontsize=10)

    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Results comparison plot saved to {save_path}")
    plt.close()


def main():
    """Main training pipeline"""
    print("="*80)
    print("LSTM TEXT CLASSIFICATION WITH MULTIPLE EMBEDDING APPROACHES")
    print("="*80)
    print(f"Epochs: {EPOCHS}")
    print(f"Data sample size: {SAMPLE_SIZE*100}%")
    print("="*80)

    # Paths - UPDATE THESE WITH YOUR LOCAL PATHS
    train_path = '/content/drive/MyDrive/AG News Classification Dataset/train.csv'
    test_path = '/content/drive/MyDrive/AG News Classification Dataset/test.csv'
    glove_path = None  # Set to 'glove.6B.100d.txt' if you have GloVe file

    # 1. Load and preprocess data
    preprocessor = DataPreprocessor(train_path, test_path)
    train_df, test_df = preprocessor.load_data()
    X_train, X_test, y_train, y_test, train_texts, test_texts = preprocessor.prepare_data(train_df, test_df)

    # Split training data for validation
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42
    )

    vocab_size = min(len(preprocessor.tokenizer.word_index), MAX_WORDS)
    num_classes = y_train.shape[1]

    # Store results
    histories = {}
    results = {}

    # 2. TF-IDF Embedding Approach
    print("\n" + "="*80)
    print("APPROACH 1: TF-IDF WEIGHTED EMBEDDINGS")
    print("="*80)

    tfidf_emb = TFIDFEmbedding(preprocessor.tokenizer, vocab_size, EMBEDDING_DIM)
    tfidf_matrix = tfidf_emb.create_embedding_matrix(train_texts)

    # Define the LSTMClassifier class first
    class LSTMClassifier:
        def __init__(self, vocab_size, embedding_dim, num_classes, name="LSTM_Model"):
            self.vocab_size = vocab_size
            self.embedding_dim = embedding_dim
            self.num_classes = num_classes
            self.name = name
            self.model = None

        def build_model(self, embedding_matrix=None, trainable_embeddings=True):
            model = Sequential()
            if embedding_matrix is not None:
                model.add(Embedding(self.vocab_size + 1, self.embedding_dim, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=trainable_embeddings))
            else:
                model.add(Embedding(self.vocab_size + 1, self.embedding_dim, input_length=MAX_SEQUENCE_LENGTH))
            model.add(Bidirectional(LSTM(64, return_sequences=True)))
            model.add(Dropout(0.3))
            model.add(Bidirectional(LSTM(32)))
            model.add(Dropout(0.3))
            model.add(Dense(self.num_classes, activation='softmax'))

            model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
            self.model = model
            print(f"\n{self.name} Model Summary:")
            self.model.summary()

        def train(self, X_train, y_train, X_val, y_val):
            print(f"\n=== Training {self.name} ===")
            early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)
            history = self.model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr], verbose=1)
            return history

        def evaluate(self, X_test, y_test):
            print(f"\n=== Evaluating {self.name} ===")
            loss, accuracy = self.model.evaluate(X_test, y_test, verbose=0)
            print(f"Test Loss: {loss:.4f}")
            print(f"Test Accuracy: {accuracy:.4f}")
            return loss, accuracy

    tfidf_model = LSTMClassifier(vocab_size, EMBEDDING_DIM, num_classes, "TF-IDF LSTM")
    tfidf_model.build_model(embedding_matrix=tfidf_matrix, trainable_embeddings=True)
    tfidf_history = tfidf_model.train(X_train_split, y_train_split, X_val, y_val)
    tfidf_loss, tfidf_acc = tfidf_model.evaluate(X_test, y_test)

    histories['TF-IDF'] = tfidf_history
    results['TF-IDF'] = {'loss': tfidf_loss, 'accuracy': tfidf_acc}

    # Save model
    tfidf_model.model.save('/mnt/user-data/outputs/tfidf_lstm_model.keras')

    # 3. Word2Vec Skip-gram Approach
    print("\n" + "="*80)
    print("APPROACH 2: WORD2VEC SKIP-GRAM EMBEDDINGS")
    print("="*80)

    w2v_emb = Word2VecEmbedding(preprocessor.tokenizer, vocab_size, EMBEDDING_DIM)
    w2v_matrix = w2v_emb.create_embedding_matrix(train_texts)

    w2v_model = LSTMClassifier(vocab_size, EMBEDDING_DIM, num_classes, "Skip-gram LSTM")
    w2v_model.build_model(embedding_matrix=w2v_matrix, trainable_embeddings=True)
    w2v_history = w2v_model.train(X_train_split, y_train_split, X_val, y_val)
    w2v_loss, w2v_acc = w2v_model.evaluate(X_test, y_test)

    histories['Skip-gram'] = w2v_history
    results['Skip-gram'] = {'loss': w2v_loss, 'accuracy': w2v_acc}

    # Save model
    w2v_model.model.save('/mnt/user-data/outputs/skipgram_lstm_model.keras')

    # 4. FastText Approach
    print("\n" + "="*80)
    print("APPROACH 3: FASTTEXT EMBEDDINGS")
    print("="*80)

    fasttext_emb = FastTextEmbedding(preprocessor.tokenizer, vocab_size, EMBEDDING_DIM)
    fasttext_matrix = fasttext_emb.create_embedding_matrix(train_texts)

    fasttext_model = LSTMClassifier(vocab_size, EMBEDDING_DIM, num_classes, "FastText LSTM")
    fasttext_model.build_model(embedding_matrix=fasttext_matrix, trainable_embeddings=True)
    fasttext_history = fasttext_model.train(X_train_split, y_train_split, X_val, y_val)
    fasttext_loss, fasttext_acc = fasttext_model.evaluate(X_test, y_test)

    histories['FastText'] = fasttext_history
    results['FastText'] = {'loss': fasttext_loss, 'accuracy': fasttext_acc}

    # Save model
    fasttext_model.model.save('/mnt/user-data/outputs/fasttext_lstm_model.keras')

    # 5. GloVe Embedding Approach
    print("\n" + "="*80)
    print("APPROACH 4: GLOVE EMBEDDINGS")
    print("="*80)

    glove_emb = GloVeEmbedding(preprocessor.tokenizer, vocab_size, EMBEDDING_DIM, glove_path)
    glove_matrix = glove_emb.create_embedding_matrix(train_texts)

    glove_model = LSTMClassifier(vocab_size, EMBEDDING_DIM, num_classes, "GloVe LSTM")
    glove_model.build_model(embedding_matrix=glove_matrix, trainable_embeddings=False)
    glove_history = glove_model.train(X_train_split, y_train_split, X_val, y_val)
    glove_loss, glove_acc = glove_model.evaluate(X_test, y_test)

    histories['GloVe'] = glove_history
    results['GloVe'] = {'loss': glove_loss, 'accuracy': glove_acc}

    # Save model
    glove_model.model.save('/mnt/user-data/outputs/glove_lstm_model.keras')

    # 6. Generate comparison plots
    plot_training_history(histories)
    plot_results_comparison(results)

    # 7. Save results summary
    results_summary = pd.DataFrame(results).T
    results_summary = results_summary.sort_values('accuracy', ascending=False)
    results_summary.to_csv('/mnt/user-data/outputs/results_summary.csv')

    # 8. Print final summary
    print("\n" + "="*80)
    print("FINAL RESULTS SUMMARY")
    print("="*80)
    print(results_summary)
    print("\n" + "="*80)
    print(f"Best Model: {results_summary.index[0]}")
    print(f"Best Accuracy: {results_summary['accuracy'].iloc[0]:.4f}")
    print("="*80)

    # Save preprocessing objects
    with open('/mnt/user-data/outputs/tokenizer.pkl', 'wb') as f:
        pickle.dump(preprocessor.tokenizer, f)

    with open('/mnt/user-data/outputs/label_encoder.pkl', 'wb') as f:
        pickle.dump(preprocessor.label_encoder, f)

    print("\n" + "="*80)
    print("TRAINING COMPLETE!")
    print("="*80)
    print("\nSaved files:")
    print(" - tfidf_lstm_model.keras")
    print(" - skipgram_lstm_model.keras")
    print(" - fasttext_lstm_model.keras")
    print(" - glove_lstm_model.keras")
    print(" - training_comparison.png")
    print(" - results_comparison.png")
    print(" - results_summary.csv")
    print(" - tokenizer.pkl")
    print(" - label_encoder.pkl")
    print("="*80)


if __name__ == "__main__":
    main()

LSTM TEXT CLASSIFICATION WITH MULTIPLE EMBEDDING APPROACHES
Epochs: 5
Data sample size: 20.0%
Loading datasets...

Original train samples: 120000
Original test samples: 7600

Sampled train samples (20.0%): 24000
Sampled test samples (20.0%): 1520
Classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4)]

Cleaning text...
Tokenizing text...
Vocabulary size: 42852
X_train shape: (24000, 100)
X_test shape: (1520, 100)

APPROACH 1: TF-IDF WEIGHTED EMBEDDINGS

=== Creating TF-IDF Embedding Matrix ===
TF-IDF Embedding matrix shape: (20001, 100)

TF-IDF LSTM Model Summary:



=== Training TF-IDF LSTM ===
Epoch 1/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 767ms/step - accuracy: 0.5816 - loss: 0.9945 - val_accuracy: 0.8779 - val_loss: 0.3912 - learning_rate: 0.0010
Epoch 2/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 680ms/step - accuracy: 0.9106 - loss: 0.2962 - val_accuracy: 0.8821 - val_loss: 0.3675 - learning_rate: 0.0010
Epoch 3/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 684ms/step - accuracy: 0.9517 - loss: 0.1769 - val_accuracy: 0.8796 - val_loss: 0.4048 - learning_rate: 0.0010
Epoch 4/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 668ms/step - accuracy: 0.9691 - loss: 0.1193 - val_accuracy: 0.8621 - val_loss: 0.5000 - learning_rate: 0.0010
Epoch 5/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 721ms/step - accuracy: 0.9787 - loss: 0.0853 - val_accuracy: 0.8792 - val_loss: 0.4687 - learning_rate: 2.0000e-04

=== Evaluating TF-ID


=== Training Skip-gram LSTM ===
Epoch 1/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 677ms/step - accuracy: 0.7553 - loss: 0.7155 - val_accuracy: 0.8908 - val_loss: 0.3387 - learning_rate: 0.0010
Epoch 2/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 684ms/step - accuracy: 0.9047 - loss: 0.3003 - val_accuracy: 0.8946 - val_loss: 0.3247 - learning_rate: 0.0010
Epoch 3/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 777ms/step - accuracy: 0.9288 - loss: 0.2347 - val_accuracy: 0.8950 - val_loss: 0.3252 - learning_rate: 0.0010
Epoch 4/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 684ms/step - accuracy: 0.9458 - loss: 0.1833 - val_accuracy: 0.8971 - val_loss: 0.3267 - learning_rate: 0.0010
Epoch 5/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 723ms/step - accuracy: 0.9609 - loss: 0.1386 - val_accuracy: 0.9021 - val_loss: 0.3363 - learning_rate: 2.0000e-04

=== Evaluating Sk


=== Training FastText LSTM ===
Epoch 1/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 700ms/step - accuracy: 0.7721 - loss: 0.6964 - val_accuracy: 0.8838 - val_loss: 0.3485 - learning_rate: 0.0010
Epoch 2/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 683ms/step - accuracy: 0.9032 - loss: 0.3118 - val_accuracy: 0.8929 - val_loss: 0.3227 - learning_rate: 0.0010
Epoch 3/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 679ms/step - accuracy: 0.9281 - loss: 0.2365 - val_accuracy: 0.8958 - val_loss: 0.3246 - learning_rate: 0.0010
Epoch 4/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 725ms/step - accuracy: 0.9447 - loss: 0.1881 - val_accuracy: 0.8908 - val_loss: 0.3627 - learning_rate: 0.0010
Epoch 5/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 693ms/step - accuracy: 0.9587 - loss: 0.1422 - val_accuracy: 0.9054 - val_loss: 0.3369 - learning_rate: 2.0000e-04

=== Evaluating Fas


=== Training GloVe LSTM ===
Epoch 1/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 596ms/step - accuracy: 0.2851 - loss: 1.3674 - val_accuracy: 0.4038 - val_loss: 1.2750 - learning_rate: 0.0010
Epoch 2/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 584ms/step - accuracy: 0.4130 - loss: 1.2504 - val_accuracy: 0.5117 - val_loss: 1.1531 - learning_rate: 0.0010
Epoch 3/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 586ms/step - accuracy: 0.5131 - loss: 1.1356 - val_accuracy: 0.5258 - val_loss: 1.1262 - learning_rate: 0.0010
Epoch 4/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 587ms/step - accuracy: 0.5317 - loss: 1.1037 - val_accuracy: 0.5333 - val_loss: 1.1081 - learning_rate: 0.0010
Epoch 5/5
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 583ms/step - accuracy: 0.5527 - loss: 1.0726 - val_accuracy: 0.5433 - val_loss: 1.0859 - learning_rate: 0.0010

=== Evaluating GloVe LSTM ==