<a href="https://colab.research.google.com/github/Ishika-Pattnaik/Reviews-sentiment-analysis/blob/main/ML_Task02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

## Step 1: Preprocessing text

### Cleaning text

In [None]:
import tarfile

dataset_path = '/content/sample_data/aclImdb_v1.tar'
extract_path = '/content/'

# Unpack the IMDb archive into the Colab working directory.
with tarfile.open(dataset_path, 'r') as tar:
    # Prefer the "data" extraction filter when this Python provides it: it
    # refuses absolute paths, links that escape the destination and special
    # files. Older interpreters without extraction filters fall back to the
    # plain call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(path=extract_path, filter='data')
    else:
        tar.extractall(path=extract_path)

print("✅ Dataset extracted successfully!")


  tar.extractall(path=extract_path)


✅ Dataset extracted successfully!


In [None]:
import pandas as pd
import numpy as np
import os
import re
import string
import nltk
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure every NLTK resource used below is installed; a resource is only
# downloaded when the local lookup fails.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)

class SentimentAwarePreprocessorTFIDF:
    """
    Text preprocessor optimized for TF-IDF + Logistic Regression.
    Focuses on noise reduction and vocabulary simplification while preserving sentiment.

    Pipeline applied by ``preprocess_text``: lowercase -> strip URLs, HTML tags
    and e-mail addresses -> turn emoticons into POSITIVE/NEGATIVE tokens ->
    expand contractions -> delete punctuation -> collapse repeated letters ->
    drop short tokens and stopwords -> lemmatize.
    """

    # An emoticon is "eyes, optional nose, mouth" or its mirror image. The
    # text is lowercased before matching, so letter mouths are lowercase.
    # The look-arounds stop the letter mouths from firing inside ordinary
    # words ("video:", ":point").
    emoticon_pattern = (
        r'[:;=]-?[\)\(\[\]dpo\|\\/\{\}@><\*](?!\w)'
        r'|(?<!\w)[\)\(\[\]dpo\|\\/\{\}@><\*]-?[:;=]'
    )

    # A letter repeated three or more times ("sooo"); collapsed to one letter.
    emphasis_pattern = r'([a-zA-Z])\1{2,}'

    # Emoticons (nose removed, lowercased) that count as positive; every other
    # match is tagged NEGATIVE, mirroring the original four-item list.
    # NOTE(review): faces such as ";)" or "(:" therefore map to NEGATIVE.
    _positive_emoticons = frozenset({':)', ':d', '=)'})

    # Irregular forms must come before the generic "n't" suffix, otherwise
    # "won't" would become "wo not". Regular forms ("don't", "isn't", ...)
    # are covered by the generic suffix rules.
    _contraction_fixes = {
        "won't": "will not", "can't": "can not",
        "n't": " not", "'re": " are", "'ve": " have", "'ll": " will",
        "'d": " would", "'m": " am",
    }

    # Translation table that deletes every ASCII punctuation character.
    _punctuation_table = str.maketrans('', '', string.punctuation)

    def __init__(self):
        # Initialize lemmatizer
        self.lemmatizer = WordNetLemmatizer()

        # Standard English stopwords
        self.base_stopwords = set(stopwords.words('english'))

        # Preserve sentiment-important words
        self.sentiment_preservers = {
            'not', 'no', 'never', 'none', 'neither', 'nobody', 'nothing', 'nowhere',
            'without', 'barely', 'hardly', 'scarcely', 'seldom', 'rarely',
            'very', 'really', 'extremely', 'quite', 'rather', 'too', 'so', 'such',
            'more', 'most', 'much', 'many', 'few', 'little', 'less', 'least',
            'but', 'however', 'although', 'though', 'despite', 'yet',
            'good', 'bad', 'best', 'worst', 'better', 'worse'
        }

        # Remove sentiment preservers from stopwords
        self.stopwords = self.base_stopwords - self.sentiment_preservers

    def _emoticon_token(self, match):
        """Return the space-padded sentiment token for one emoticon match."""
        face = match.group(0).replace('-', '')
        return ' POSITIVE ' if face in self._positive_emoticons else ' NEGATIVE '

    def clean_text_efficiently(self, text):
        """
        Clean text for TF-IDF: remove noise, preserve sentiment, simplify features.

        Args:
            text: Raw review text; NaN/None and the empty string yield ''.

        Returns:
            A lowercase, punctuation-free, single-spaced string in which
            emoticons appear as the uppercase tokens POSITIVE / NEGATIVE.
        """
        if pd.isna(text) or text == '':
            return ''

        text = str(text).lower()

        # Treat typographic apostrophes like ASCII ones so contractions expand.
        text = text.replace('\u2019', "'")

        # Strip URLs, HTML tags and e-mails *before* looking for emoticons,
        # so "http://" is not read as ":/". Replace with a space so the words
        # on either side of a tag ("good.<br />bad") do not fuse together.
        text = re.sub(r'https?://[^\s]+', ' ', text)
        text = re.sub(r'www\.[^\s]+', ' ', text)
        text = re.sub(r'<[^>]+>', ' ', text)
        text = re.sub(r'\S+@\S+', ' ', text)

        # Replace each emoticon in a single pass; the tokens contain neither
        # punctuation nor repeated letters, so later steps leave them intact.
        text = re.sub(self.emoticon_pattern, self._emoticon_token, text)

        # Handle contractions
        for contraction, expansion in self._contraction_fixes.items():
            text = text.replace(contraction, expansion)

        # Remove all punctuation
        text = text.translate(self._punctuation_table)

        # Normalize emphasis (e.g., "sooo" -> "so")
        text = re.sub(self.emphasis_pattern, r'\1', text)

        # Remove extra whitespace
        return re.sub(r'\s+', ' ', text).strip()

    def tokenize_and_filter(self, text):
        """
        Tokenize and filter for TF-IDF: lemmatize and remove stopwords.

        Args:
            text: Output of ``clean_text_efficiently``.

        Returns:
            List of lemmatized tokens; an empty list for empty input.
        """
        if not text:
            return []

        # Keep tokens of length >= 2 (or sentiment preservers of any length)
        # that are not in the reduced stopword list.
        kept = [
            token for token in text.split()
            if (len(token) >= 2 or token in self.sentiment_preservers)
            and token not in self.stopwords
        ]

        return [self.lemmatizer.lemmatize(token) for token in kept]

    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline for a single text.

        Returns the space-joined tokens, or the sentinel 'EMPTY_REVIEW' when
        nothing survives filtering.
        """
        cleaned_text = self.clean_text_efficiently(text)
        tokens = self.tokenize_and_filter(cleaned_text)
        return ' '.join(tokens) if tokens else 'EMPTY_REVIEW'

    def preprocess_corpus(self, texts):
        """
        Preprocess entire corpus efficiently.

        Returns a list with one processed string per input text, in order.
        """
        return [self.preprocess_text(text) for text in texts]

def load_imdb_dataset(dataset_path):
    """
    Load IMDb dataset from folder structure.

    Expects ``<dataset_path>/{train,test}/{pos,neg}/*.txt`` where each file
    name has the form ``<id>_<rating>.txt``.

    Args:
        dataset_path: Root folder of the extracted dataset. If it does not
            exist but the Colab default ``/content/aclImdb`` does, the default
            is used instead (this keeps callers that pass a placeholder path
            working).

    Returns:
        DataFrame with columns ``review``, ``sentiment`` (1 = pos, 0 = neg),
        ``rating`` and ``split`` ('train' / 'test'), or ``None`` when the
        dataset folders cannot be found.
    """
    print("📁 Loading IMDb Movie Reviews Dataset")
    print("="*50)

    dataset_path = Path(dataset_path)
    colab_default = Path("/content/aclImdb")
    if not dataset_path.exists() and colab_default.exists():
        print(f"⚠️ Warning: '{dataset_path}' not found. Falling back to '{colab_default}'.")
        dataset_path = colab_default

    if not dataset_path.exists():
        print(f"❌ Dataset path '{dataset_path}' not found!")
        return None

    train_path = dataset_path / "train"
    test_path = dataset_path / "test"

    if not train_path.exists() or not test_path.exists():
        print("❌ Train or test folders not found!")
        return None

    def load_reviews_from_folder(folder_path):
        """Read the pos/ and neg/ reviews below ``folder_path`` into parallel lists."""
        reviews, labels, ratings = [], [], []
        for sentiment, sent_label in [('pos', 1), ('neg', 0)]:
            sent_path = folder_path / sentiment
            if not sent_path.exists():
                print(f"⚠️ Warning: {sent_path} not found. Skipping {sentiment} reviews.")
                continue
            # Sorted so the row order is reproducible across runs and machines.
            files = sorted(sent_path.glob("*.txt"))
            if not files:
                print(f"⚠️ Warning: No .txt files found in {sent_path}.")
                continue
            for file_path in files:
                try:
                    review = file_path.read_text(encoding='utf-8').strip()
                    if not review:
                        print(f"⚠️ Warning: Empty file {file_path}")
                        continue
                    # "<id>_<rating>" -> rating; a name without "_" raises IndexError.
                    rating = int(file_path.stem.split('_')[1])
                except (UnicodeDecodeError, ValueError, IndexError) as e:
                    print(f"⚠️ Warning: Failed to process {file_path}: {e}")
                    continue
                # Append only once everything parsed, so the three lists can
                # never end up with different lengths.
                reviews.append(review)
                labels.append(sent_label)
                ratings.append(rating)
        return reviews, labels, ratings

    train_reviews, train_labels, train_ratings = load_reviews_from_folder(train_path)
    test_reviews, test_labels, test_ratings = load_reviews_from_folder(test_path)

    train_df = pd.DataFrame({
        'review': train_reviews,
        'sentiment': train_labels,
        'rating': train_ratings,
        'split': 'train'
    })

    test_df = pd.DataFrame({
        'review': test_reviews,
        'sentiment': test_labels,
        'rating': test_ratings,
        'split': 'test'
    })

    df = pd.concat([train_df, test_df], ignore_index=True)

    print(f"✅ Dataset loaded successfully!")
    print(f"Total reviews: {len(df):,}")
    print(f"Training reviews: {len(train_df):,}")
    print(f"Test reviews: {len(test_df):,}")

    return df

def preprocess_for_tfidf(df, preprocessor, output_file="imdb_tfidf.csv"):
    """
    Run the preprocessor over ``df['review']`` and persist the result as CSV.

    A ``processed_review`` column is added to ``df`` in place. Rows whose
    processed text is the 'EMPTY_REVIEW' sentinel are then dropped, the
    ``processed_review``/``sentiment``/``split`` columns of the remaining
    rows are written to ``output_file``, and the filtered copy is returned.
    """
    print("\n🚀 Applying TF-IDF Preprocessing")
    print("=" * 50)

    # Clean every raw review through the supplied preprocessor.
    raw_reviews = df['review'].tolist()
    df['processed_review'] = preprocessor.preprocess_corpus(raw_reviews)

    # Discard rows that collapsed to the sentinel value.
    is_sentinel = df['processed_review'].str.strip() == 'EMPTY_REVIEW'
    print(f"⚠️ Empty reviews after preprocessing: {is_sentinel.sum()}")
    kept = df[~is_sentinel].copy()

    # Write only the columns the model needs.
    export_columns = ['processed_review', 'sentiment', 'split']
    kept[export_columns].copy().to_csv(output_file, index=False)
    print(f"✅ Preprocessed data saved to {output_file}")

    return kept

def vectorize_tfidf(df, max_features=5000):
    """
    Build TF-IDF features (unigrams and bigrams) from ``processed_review``.

    The vectorizer is fitted on rows whose ``split`` is 'train' and then
    applied to the 'test' rows. Returns
    ``(X_train, X_test, y_train, y_test, vectorizer)``.
    """
    print("\n📊 Converting to TF-IDF Vectors")
    print("=" * 50)

    # Split once up front instead of re-filtering for every output.
    train_rows = df[df['split'] == 'train']
    test_rows = df[df['split'] == 'test']

    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(train_rows['processed_review'])
    X_test = vectorizer.transform(test_rows['processed_review'])
    y_train = train_rows['sentiment']
    y_test = test_rows['sentiment']

    print(f"✅ TF-IDF vectors created:")
    print(f"  Training shape: {X_train.shape}")
    print(f"  Test shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test, vectorizer

def main_tfidf_workflow(dataset_path):
    """
    Run the full TF-IDF preparation: load, preprocess, vectorize.

    Returns ``(df, X_train, X_test, y_train, y_test, vectorizer)``, or
    ``None`` when the dataset could not be loaded.
    """
    print("🧹 Preprocessing for TF-IDF + Logistic Regression")
    print("=" * 50)

    # Step 1: read the raw reviews; bail out if the dataset is missing.
    raw_df = load_imdb_dataset(dataset_path)
    if raw_df is None:
        return None

    # Step 2: clean the text.
    preprocessor = SentimentAwarePreprocessorTFIDF()
    print("✅ Preprocessor initialized for TF-IDF")
    df = preprocess_for_tfidf(raw_df, preprocessor)

    # Step 3: turn the cleaned text into feature matrices.
    X_train, X_test, y_train, y_test, vectorizer = vectorize_tfidf(df)

    # Final summary for the notebook reader.
    summary_lines = (
        "\n🎉 Preprocessing Completed!",
        "✅ Data is ready for Logistic Regression training",
        f"  Saved preprocessed data to: imdb_tfidf.csv",
        f"  X_train shape: {X_train.shape}",
        f"  X_test shape: {X_test.shape}",
    )
    for summary_line in summary_lines:
        print(summary_line)

    return df, X_train, X_test, y_train, y_test, vectorizer

if __name__ == "__main__":
    # Folder created by extracting aclImdb_v1.tar into /content/ in the first
    # cell. Change this if the dataset lives elsewhere.
    dataset_path = "/content/aclImdb"

    result = main_tfidf_workflow(dataset_path)
    if result is None:
        # The workflow returns None when the dataset folders are missing;
        # fail with a clear message instead of an unpacking TypeError.
        raise FileNotFoundError(f"IMDb dataset could not be loaded from '{dataset_path}'")
    df, X_train, X_test, y_train, y_test, vectorizer = result

    # You can now train Logistic Regression using X_train, y_train
    # Example:
    # from sklearn.linear_model import LogisticRegression
    # model = LogisticRegression(max_iter=1000)
    # model.fit(X_train, y_train)
    # y_pred = model.predict(X_test)

🧹 Preprocessing for TF-IDF + Logistic Regression
📁 Loading IMDb Movie Reviews Dataset
✅ Dataset loaded successfully!
Total reviews: 50,000
Training reviews: 25,000
Test reviews: 25,000
✅ Preprocessor initialized for TF-IDF

🚀 Applying TF-IDF Preprocessing
⚠️ Empty reviews after preprocessing: 0
✅ Preprocessed data saved to imdb_tfidf.csv

📊 Converting to TF-IDF Vectors
✅ TF-IDF vectors created:
  Training shape: (25000, 5000)
  Test shape: (25000, 5000)

🎉 Preprocessing Completed!
✅ Data is ready for Logistic Regression training
  Saved preprocessed data to: imdb_tfidf.csv
  X_train shape: (25000, 5000)
  X_test shape: (25000, 5000)


### Training TF-IDF + Logistic Regression model



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
# Fit logistic regression on the TF-IDF training matrix and predict the test split.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Score the predictions; precision, recall and F1 use weighted averaging.
accuracy_lr = accuracy_score(y_test, y_pred)
precision_lr = precision_score(y_test, y_pred, average="weighted")
recall_lr = recall_score(y_test, y_pred, average="weighted")
f1_lr = f1_score(y_test, y_pred, average="weighted")

# Report in the same order as before: accuracy, precision, F1, recall.
for _metric_label, _metric_value in (
    ("Accuracy", accuracy_lr),
    ("Precision", precision_lr),
    ("F1 Score", f1_lr),
    ("Recall", recall_lr),
):
    print(f"{_metric_label}= {_metric_value: .4f}")

Accuracy=  0.8833
Precision=  0.8834
F1 Score=  0.8833
Recall=  0.8833


### Saving model

In [None]:
import joblib
# Persist the fitted classifier and the TF-IDF vectorizer it was trained with;
# both files are needed to score new text later.
joblib.dump(lr_model, 'logistic_regression_model.pkl')
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

## LSTM model

### Preprocessing data

In [None]:
import pandas as pd
import numpy as np
import re
import string
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Make sure every NLTK resource used below is installed; a resource is only
# downloaded when the local lookup fails.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name, quiet=True)

class SentimentAwarePreprocessorLSTM:
    """
    Text preprocessor optimized for LSTM: preserves emoticons, emphasis, and sentiment.

    Pipeline applied by ``preprocess_text``: lowercase -> strip URLs, HTML tags
    and e-mail addresses -> replace emoticons with EMOTICON_POSITIVE /
    EMOTICON_NEGATIVE -> collapse elongated words and tag them with EMPHASIS ->
    expand contractions -> drop one-character tokens -> lemmatize.
    Punctuation is kept.
    """

    # An emoticon is "eyes, optional nose, mouth" or its mirror image. The
    # text is lowercased before matching, so letter mouths are lowercase.
    # The look-arounds stop the letter mouths from firing inside ordinary
    # words ("video:", ":point").
    emoticon_pattern = (
        r'[:;=]-?[\)\(\[\]dpo\|\\/\{\}@><\*](?!\w)'
        r'|(?<!\w)[\)\(\[\]dpo\|\\/\{\}@><\*]-?[:;=]'
    )

    # A letter repeated three or more times ("sooo").
    emphasis_pattern = r'([a-zA-Z])\1{2,}'

    # Emoticons (nose removed, lowercased) that count as positive; every other
    # match is tagged negative, mirroring the original four-item list.
    # NOTE(review): faces such as ";)" or "(:" therefore map to NEGATIVE.
    _positive_emoticons = frozenset({':)', ':d', '=)'})

    # Irregular forms must come before the generic "n't" suffix, otherwise
    # "won't" would become "wo not". Regular forms ("don't", "isn't", ...)
    # are covered by the generic suffix rules.
    _contraction_fixes = {
        "won't": "will not", "can't": "can not",
        "n't": " not", "'re": " are", "'ve": " have", "'ll": " will",
        "'d": " would", "'m": " am",
    }

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.base_stopwords = set(stopwords.words('english'))
        self.sentiment_preservers = {
            'not', 'no', 'never', 'none', 'neither', 'nobody', 'nothing', 'nowhere',
            'without', 'barely', 'hardly', 'scarcely', 'seldom', 'rarely',
            'very', 'really', 'extremely', 'quite', 'rather', 'too', 'so', 'such',
            'more', 'most', 'much', 'many', 'few', 'little', 'less', 'least',
            'but', 'however', 'although', 'though', 'despite', 'yet',
            'good', 'bad', 'best', 'worst', 'better', 'worse'
        }
        # Kept for parity with the TF-IDF preprocessor; this class does not
        # remove stopwords in tokenize_and_filter.
        self.stopwords = self.base_stopwords - self.sentiment_preservers

    def _emoticon_token(self, match):
        """Return the space-padded sentiment token for one emoticon match."""
        face = match.group(0).replace('-', '')
        polarity = 'POSITIVE' if face in self._positive_emoticons else 'NEGATIVE'
        return f' EMOTICON_{polarity} '

    def _mark_emphasis(self, match):
        """Collapse the letter runs in one elongated word and tag it with EMPHASIS."""
        collapsed = re.sub(self.emphasis_pattern, r'\1', match.group(0))
        return f'{collapsed} EMPHASIS'

    def clean_text_efficiently(self, text):
        """
        Clean text for LSTM: moderate cleaning, preserve emoticons and emphasis.

        Args:
            text: Raw review text; NaN/None and the empty string yield ''.

        Returns:
            A lowercase, single-spaced string that still contains punctuation,
            with emoticons replaced by EMOTICON_POSITIVE / EMOTICON_NEGATIVE
            and elongated words followed by the token EMPHASIS.
        """
        if pd.isna(text) or text == '':
            return ''
        text = str(text).lower()

        # Treat typographic apostrophes like ASCII ones so contractions expand.
        text = text.replace('\u2019', "'")

        # Strip URLs, HTML tags and e-mails first: otherwise "http://" is read
        # as the emoticon ":/" and "www." is rewritten by the emphasis step
        # before the URL patterns can see it. Replace with a space so the
        # words on either side of a tag do not fuse together.
        text = re.sub(r'https?://[^\s]+', ' ', text)
        text = re.sub(r'www\.[^\s]+', ' ', text)
        text = re.sub(r'<[^>]+>', ' ', text)
        text = re.sub(r'\S+@\S+', ' ', text)

        # Preserve emoticons as tokens
        text = re.sub(self.emoticon_pattern, self._emoticon_token, text)

        # Preserve emphasis: match the whole word that contains the repeated
        # letters so the marker lands after the word ("loooove" ->
        # "love EMPHASIS") instead of splitting it in the middle.
        elongated_word = r'[a-zA-Z]*?(?:' + self.emphasis_pattern + r')[a-zA-Z]*'
        text = re.sub(elongated_word, self._mark_emphasis, text)

        # Handle contractions
        for contraction, expansion in self._contraction_fixes.items():
            text = text.replace(contraction, expansion)

        # Keep punctuation for context
        return re.sub(r'\s+', ' ', text).strip()

    def tokenize_and_filter(self, text):
        """
        Tokenize and filter: drop one-character tokens and lemmatize.

        Sentiment preservers, EMOTICON_* tokens and EMPHASIS are always kept,
        and the two marker kinds are not lemmatized. Stopwords are not removed.
        """
        if not text:
            return []
        tokens = text.split()
        tokens = [token for token in tokens if len(token) >= 2 or token in self.sentiment_preservers or token.startswith('EMOTICON_') or token == 'EMPHASIS']
        tokens = [self.lemmatizer.lemmatize(token) if not (token.startswith('EMOTICON_') or token == 'EMPHASIS') else token for token in tokens]
        return tokens

    def preprocess_text(self, text):
        """
        Complete preprocessing pipeline for a single text.

        Returns the space-joined tokens, or the sentinel 'EMPTY_REVIEW' when
        nothing survives filtering.
        """
        cleaned_text = self.clean_text_efficiently(text)
        tokens = self.tokenize_and_filter(cleaned_text)
        return ' '.join(tokens) if tokens else 'EMPTY_REVIEW'

    def preprocess_corpus(self, texts):
        """
        Preprocess entire corpus.

        Returns a list with one processed string per input text, in order.
        """
        return [self.preprocess_text(text) for text in texts]

def load_imdb_dataset(dataset_path):
    """
    Load IMDb dataset from folder structure.

    Expects ``<dataset_path>/{train,test}/{pos,neg}/*.txt`` where each file
    name has the form ``<id>_<rating>.txt``.

    Returns:
        DataFrame with columns ``review``, ``sentiment`` (1 = pos, 0 = neg),
        ``rating`` and ``split`` ('train' / 'test'), or ``None`` when the
        dataset folders cannot be found.
    """
    print("📁 Loading IMDb Dataset")
    dataset_path = Path(dataset_path)
    if not dataset_path.exists():
        print(f"❌ Path '{dataset_path}' not found!")
        return None
    train_path = dataset_path / "train"
    test_path = dataset_path / "test"
    if not train_path.exists() or not test_path.exists():
        print("❌ Train/test folders not found!")
        return None

    def load_reviews_from_folder(folder_path):
        """Read the pos/ and neg/ reviews below ``folder_path`` into parallel lists."""
        reviews, labels, ratings = [], [], []
        for sentiment, sent_label in [('pos', 1), ('neg', 0)]:
            sent_path = folder_path / sentiment
            if not sent_path.exists():
                print(f"⚠️ {sent_path} not found. Skipping.")
                continue
            # Sorted so the row order is reproducible across runs and machines.
            files = sorted(sent_path.glob("*.txt"))
            if not files:
                print(f"⚠️ No .txt files in {sent_path}.")
                continue
            for file_path in files:
                try:
                    review = file_path.read_text(encoding='utf-8').strip()
                    if not review:
                        print(f"⚠️ Empty file {file_path}")
                        continue
                    # "<id>_<rating>" -> rating; a name without "_" raises IndexError.
                    rating = int(file_path.stem.split('_')[1])
                except (UnicodeDecodeError, ValueError, IndexError) as e:
                    print(f"⚠️ Failed to process {file_path}: {e}")
                    continue
                # Append only once everything parsed, so the three lists can
                # never end up with different lengths.
                reviews.append(review)
                labels.append(sent_label)
                ratings.append(rating)
        return reviews, labels, ratings

    train_reviews, train_labels, train_ratings = load_reviews_from_folder(train_path)
    test_reviews, test_labels, test_ratings = load_reviews_from_folder(test_path)
    train_df = pd.DataFrame({'review': train_reviews, 'sentiment': train_labels, 'rating': train_ratings, 'split': 'train'})
    test_df = pd.DataFrame({'review': test_reviews, 'sentiment': test_labels, 'rating': test_ratings, 'split': 'test'})
    df = pd.concat([train_df, test_df], ignore_index=True)
    print(f"✅ Loaded {len(df):,} reviews (Train: {len(train_df):,}, Test: {len(test_df):,})")
    return df

def preprocess_for_lstm(df, preprocessor, output_file="imdb_lstm.csv", max_words=10000, max_len=200):
    """
    Clean the reviews, convert them to padded integer sequences and save a CSV.

    A ``processed_review`` column is added to ``df`` in place and rows equal
    to the 'EMPTY_REVIEW' sentinel are dropped. The tokenizer is fitted on the
    'train' rows only, and both splits are padded to ``max_len``. Returns
    ``(df, X_train, X_test, y_train, y_test, tokenizer)``.
    """
    print("\n🚀 Applying LSTM Preprocessing")
    raw_reviews = df['review'].tolist()
    df['processed_review'] = preprocessor.preprocess_corpus(raw_reviews)

    # Drop rows that collapsed to the sentinel value.
    is_sentinel = df['processed_review'].str.strip() == 'EMPTY_REVIEW'
    print(f"⚠️ Empty reviews: {is_sentinel.sum()}")
    df = df[~is_sentinel].copy()

    # Split once instead of re-filtering for each output.
    train_rows = df[df['split'] == 'train']
    test_rows = df[df['split'] == 'test']

    # The vocabulary comes from the training text only.
    tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_rows['processed_review'])

    train_sequences = tokenizer.texts_to_sequences(train_rows['processed_review'])
    X_train = pad_sequences(train_sequences, maxlen=max_len)
    test_sequences = tokenizer.texts_to_sequences(test_rows['processed_review'])
    X_test = pad_sequences(test_sequences, maxlen=max_len)

    y_train = train_rows['sentiment'].values
    y_test = test_rows['sentiment'].values

    # Persist the cleaned text with its label and split.
    export_columns = ['processed_review', 'sentiment', 'split']
    df[export_columns].copy().to_csv(output_file, index=False)
    print(f"✅ Saved to {output_file}")
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    return df, X_train, X_test, y_train, y_test, tokenizer

def main_lstm_workflow(dataset_path):
    """
    Run the full LSTM preparation: load the dataset, then preprocess and tokenize.

    Returns ``(df, X_train, X_test, y_train, y_test, tokenizer)``, or ``None``
    when the dataset could not be loaded.
    """
    print("🧹 Preprocessing for LSTM")

    # Step 1: read the raw reviews; bail out if the dataset is missing.
    raw_df = load_imdb_dataset(dataset_path)
    if raw_df is None:
        return None

    # Step 2: clean, tokenize and pad.
    outputs = preprocess_for_lstm(raw_df, SentimentAwarePreprocessorLSTM())
    df, X_train, X_test, y_train, y_test, tokenizer = outputs

    for status_line in ("\n🎉 Preprocessing Completed!", "✅ Data ready for LSTM training"):
        print(status_line)
    return df, X_train, X_test, y_train, y_test, tokenizer

if __name__ == "__main__":
    dataset_path = "/content/aclImdb"  # Update if needed
    result = main_lstm_workflow(dataset_path)
    if result is None:
        # The workflow returns None when the dataset folders are missing;
        # fail with a clear message instead of an unpacking TypeError.
        raise FileNotFoundError(f"IMDb dataset could not be loaded from '{dataset_path}'")
    df, X_train, X_test, y_train, y_test, tokenizer = result

🧹 Preprocessing for LSTM
📁 Loading IMDb Dataset
✅ Loaded 50,000 reviews (Train: 25,000, Test: 25,000)

🚀 Applying LSTM Preprocessing
⚠️ Empty reviews: 0
✅ Saved to imdb_lstm.csv
X_train shape: (25000, 200), X_test shape: (25000, 200)

🎉 Preprocessing Completed!
✅ Data ready for LSTM training


### Training model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Baseline LSTM classifier: trainable 128-d embedding -> LSTM(64) -> sigmoid.
# input_dim / input_length mirror the max_words=10000 / max_len=200 defaults
# of preprocess_for_lstm.
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=200),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Without a stopping rule the model keeps fitting the training set while
# validation loss climbs. Stop once val_loss has not improved for 3 epochs
# and roll back to the best weights seen.
lstm_early_stop = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# NOTE(review): the test split doubles as the validation set here, so the
# accuracy printed below is selected on the same data it is measured on.
# Consider holding out a shuffled slice of the training data instead.
model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks=[lstm_early_stop]
)
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1/30




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.7323 - loss: 0.5137 - val_accuracy: 0.8562 - val_loss: 0.3402
Epoch 2/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 15ms/step - accuracy: 0.8959 - loss: 0.2665 - val_accuracy: 0.8638 - val_loss: 0.3202
Epoch 3/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.9325 - loss: 0.1825 - val_accuracy: 0.8616 - val_loss: 0.3594
Epoch 4/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 17ms/step - accuracy: 0.9568 - loss: 0.1245 - val_accuracy: 0.8487 - val_loss: 0.3772
Epoch 5/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.9679 - loss: 0.0925 - val_accuracy: 0.8490 - val_loss: 0.4422
Epoch 6/30
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.9790 - loss: 0.0638 - val_accuracy: 0.8561 - val_loss: 0.4822
Epoch 7/30
[1m782/782[0m 

Old snippet. Optimizing the model further.

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Callbacks, both driven by validation loss:
#   - reduce_lr multiplies the learning rate by 0.5 after one epoch without improvement
#   - early_stop ends training after two such epochs and restores the best weights
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)

# Recompile the existing model with a smaller learning rate for fine-tuning.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

# Continue training for at most five more epochs.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
    callbacks=[reduce_lr, early_stop],
)

# Re-evaluate on the test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Optimized Test Accuracy: {accuracy:.4f}")


Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 1.0000 - loss: 7.6909e-06 - val_accuracy: 0.8540 - val_loss: 1.4509 - learning_rate: 1.0000e-04
Epoch 2/5
[1m779/782[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 3.2222e-06
Epoch 2: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-05.
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 1.0000 - loss: 3.2206e-06 - val_accuracy: 0.8538 - val_loss: 1.5411 - learning_rate: 1.0000e-04
Epoch 3/5
[1m778/782[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 1.6961e-06
Epoch 3: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 17ms/step - accuracy: 1.0000 - loss: 1.6960e-06 - val_accuracy: 0.8538 - val_loss: 1.5774 - learning_rate: 5.0000e-05
Epoch 3: early stopping
Restoring m

# NEW: Optimised LSTM using GloVe

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-09-12 13:08:22--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-09-12 13:08:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-09-12 13:08:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np

# Read the GloVe text file: the first whitespace-separated field on each line
# is the word, the remaining fields are its vector components.
embedding_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding_index[word] = np.asarray(values[1:], dtype='float32')

print(f" Loaded {len(embedding_index):,} word vectors from GloVe")

embedding_dim = 100
max_words = 10000  # Same as your tokenizer's num_words

# Row i holds the GloVe vector of the word with tokenizer index i; words that
# GloVe does not contain, and unused rows, stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


✅ Loaded 400,000 word vectors from GloVe


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Frozen embedding layer initialised from the GloVe matrix.
# NOTE(review): some Keras releases do not accept ``weights=`` on Embedding —
# confirm against the installed version.
glove_embedding = Embedding(
    input_dim=max_words,
    output_dim=embedding_dim,
    input_length=200,
    weights=[embedding_matrix],
    trainable=False,
)
recurrent_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)
output_layer = Dense(1, activation='sigmoid')

model = Sequential([glove_embedding, recurrent_layer, output_layer])

model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Both callbacks watch validation loss: stop after 3 epochs without
# improvement (restoring the best weights), multiply the learning rate by 0.5
# after 2.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1),
]

model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=30,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)


NameError: name 'max_words' is not defined

Training this model required more time per epoch and needed GPU access.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
# Score the LSTM itself: the sigmoid outputs are thresholded at 0.5 to get
# hard 0/1 labels. The previous version reused ``y_pred`` from the logistic
# regression cell, so it reported that model's metrics instead.
y_pred_lstm = (model.predict(X_test) > 0.5).astype(int).ravel()

accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
precision_lstm = precision_score(y_test, y_pred_lstm, average="weighted")
f1_lstm = f1_score(y_test, y_pred_lstm, average="weighted")
recall_lstm = recall_score(y_test, y_pred_lstm, average="weighted")
print(f"Accuracy= {accuracy_lstm: .4f}")
print(f"Precision= {precision_lstm: .4f}")
print(f"F1 Score= {f1_lstm: .4f}")
print(f"Recall= {recall_lstm: .4f}")