<a href="https://colab.research.google.com/github/Harivamsh2005/NLP/blob/main/NLP_M_15_09_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import io
import re
import urllib.request
import zipfile

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout
from tensorflow.keras.models import Sequential

# --- 1. Setup and Data Loading ---
# Fetch the English stopword list, downloading the NLTK corpus on first use.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    # The corpus is not installed locally yet: download it, then retry once.
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords')
    english_stopwords = stopwords.words('english')

# A set gives O(1) membership tests during per-token filtering.
stop_words = set(english_stopwords)

# Load the dataset (using a standard Kaggle disaster tweets dataset)
# This dataset is very similar in structure to what you might find on Reddit.
# A local 'train.csv' is preferred; the remote archive is only a fallback.
try:
    df = pd.read_csv('train.csv')
    print("Dataset loaded successfully from local file 'train.csv'.")
except FileNotFoundError:
    print("Local file not found. Loading from a public URL...")
    # NOTE(review): confirm this URL still serves the competition archive.
    url = 'https://storage.googleapis.com/bert_models/2020_07_23/nlp-getting-started.zip'
    # pd.read_csv(..., compression='zip') refuses archives holding more than
    # one file, so the archive is opened explicitly and the training CSV is
    # selected by name.
    with urllib.request.urlopen(url) as response:
        archive_bytes = io.BytesIO(response.read())
    with zipfile.ZipFile(archive_bytes) as archive:
        members = [m for m in archive.namelist() if not m.endswith('/')]
        member = next(
            (m for m in members if m.rsplit('/', 1)[-1] == 'train.csv'),
            None,
        )
        if member is None:
            # No train.csv: only a single-file archive is unambiguous.
            if len(members) != 1:
                raise ValueError(
                    f"Cannot pick a CSV from archive members: {members}"
                )
            member = members[0]
        with archive.open(member) as csv_file:
            df = pd.read_csv(csv_file, header=0, sep=',', quotechar='"')
    print("Dataset loaded from URL.")


# --- 2. Text Preprocessing ---
# Compiled once at import time because preprocess_text runs on every row.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_DIGIT_PATTERN = re.compile(r'\d+')


def preprocess_text(text, stopword_set=None):
    """
    Cleans and preprocesses a single text entry.

    Steps, in order: lowercase, strip URLs, strip every character that is
    neither a word character nor whitespace, strip digit runs, then drop
    whitespace-separated tokens found in the stopword set.

    Args:
        text: The raw text. ``None``, float NaN and ``pd.NA`` are treated as
            missing and yield ``''``; any other non-string value is converted
            with ``str()`` first.
        stopword_set: Optional container of tokens to drop. When omitted,
            the module-level ``stop_words`` is used.

    Returns:
        The cleaned text with the remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Missing cells arrive from pandas as None / NaN / pd.NA; calling
        # .lower() on them would raise AttributeError.
        is_nan = isinstance(text, float) and text != text
        if text is None or text is pd.NA or is_nan:
            return ''
        text = str(text)

    # `is None` (not truthiness) so an explicit empty set disables filtering.
    if stopword_set is None:
        stopword_set = stop_words

    # Convert to lowercase
    text = text.lower()
    # Remove URLs (before punctuation removal, which would break them apart)
    text = _URL_PATTERN.sub('', text)
    # Remove punctuation
    text = _PUNCTUATION_PATTERN.sub('', text)
    # Remove numbers
    text = _DIGIT_PATTERN.sub('', text)
    # Remove stopwords; split()/join also collapses runs of whitespace
    return ' '.join(word for word in text.split() if word not in stopword_set)

print("\nPreprocessing text data...")
# Clean every tweet and keep the result alongside the raw text.
df['clean_text'] = df['text'].map(preprocess_text)


# --- 3. Data Splitting ---
# Features are the cleaned tweets; labels come from the 'target' column.
X, y = df['clean_text'], df['target']
# Hold out 20% for testing; stratify on y and fix the seed so the split is
# class-balanced and repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# --- 4. Model Training and Evaluation ---
# Training parameters shared by every model / n-gram combination.
BATCH_SIZE = 32
EPOCHS = 10

# EarlyStopping watches validation loss, stops after 2 epochs without
# improvement and restores the best weights, limiting overfitting and
# training time.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# The three n-gram configurations under test: label -> ngram_range.
ngram_configs = dict([
    ("Unigrams Only", (1, 1)),
    ("Unigrams + Bigrams", (1, 2)),
    ("Unigrams + Bigrams + Trigrams", (1, 3)),
])

# Filled by the training loop: "<model> - <config>" -> accuracy metrics.
results = {}


# Keyword arguments shared by every fit() call below.
fit_options = dict(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    verbose=0,  # Set to 1 if you want to see training progress
)

for name, ngrams in ngram_configs.items():
    print(f"\n{'='*20}\nProcessing: {name}\n{'='*20}")

    # Step 4a: TF-IDF Vectorization
    print(f"Creating TF-IDF vectors with {name}...")
    # The feature cap keeps memory and runtime manageable, especially once
    # trigrams are included.
    vectorizer = TfidfVectorizer(max_features=15000, ngram_range=ngrams)
    # Fit on the training split only; the test split is merely transformed.
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = vectorizer.transform(X_test).toarray()

    vocab_size = X_train_tfidf.shape[1]
    print(f"Vocabulary size: {vocab_size}")

    # --- 4b: ANN Model ---
    print(f"\nTraining ANN with {name}...")
    ann_layers = [
        Input(shape=(vocab_size,)),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    ann_model = Sequential(ann_layers)
    ann_model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    history_ann = ann_model.fit(
        X_train_tfidf,
        y_train,
        callbacks=[early_stopping],
        **fit_options,
    )

    train_loss_ann, train_acc_ann = ann_model.evaluate(
        X_train_tfidf, y_train, verbose=0
    )
    test_loss_ann, test_acc_ann = ann_model.evaluate(
        X_test_tfidf, y_test, verbose=0
    )
    results[f"ANN - {name}"] = {
        'Train Accuracy': train_acc_ann,
        'Test Accuracy': test_acc_ann,
    }
    print(f"ANN Results -> Train Acc: {train_acc_ann:.4f}, Test Acc: {test_acc_ann:.4f}")

    # --- 4c: LSTM Model ---
    # LSTM input must be (samples, timesteps, features).
    # Note: Using TF-IDF with LSTM is unconventional. LSTMs prefer sequences.
    # Each document's whole TF-IDF vector becomes a single timestep, so a
    # length-1 axis is inserted between samples and features.
    X_train_lstm = np.expand_dims(X_train_tfidf, axis=1)
    X_test_lstm = np.expand_dims(X_test_tfidf, axis=1)

    print(f"\nTraining LSTM with {name}...")
    lstm_layers = [
        Input(shape=(1, vocab_size,)),
        LSTM(64),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    lstm_model = Sequential(lstm_layers)
    lstm_model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    history_lstm = lstm_model.fit(
        X_train_lstm,
        y_train,
        callbacks=[early_stopping],
        **fit_options,
    )

    train_loss_lstm, train_acc_lstm = lstm_model.evaluate(
        X_train_lstm, y_train, verbose=0
    )
    test_loss_lstm, test_acc_lstm = lstm_model.evaluate(
        X_test_lstm, y_test, verbose=0
    )
    results[f"LSTM - {name}"] = {
        'Train Accuracy': train_acc_lstm,
        'Test Accuracy': test_acc_lstm,
    }
    print(f"LSTM Results -> Train Acc: {train_acc_lstm:.4f}, Test Acc: {test_acc_lstm:.4f}")

# --- 5. Final Comparison ---
print("\n\n--- Final Model Comparison ---")
# One row per "<model> - <config>" entry, ranked best test accuracy first.
results_df = pd.DataFrame(results).transpose()
results_df = results_df.sort_values(by='Test Accuracy', ascending=False)
print(results_df)
print("\n--- End of Analysis ---")

Dataset loaded successfully from local file 'train.csv'.

Preprocessing text data...

Processing: Unigrams Only
Creating TF-IDF vectors with Unigrams Only...
Vocabulary size: 14888

Training ANN with Unigrams Only...
ANN Results -> Train Acc: 0.9253, Test Acc: 0.8109

Training LSTM with Unigrams Only...
LSTM Results -> Train Acc: 0.9286, Test Acc: 0.8030

Processing: Unigrams + Bigrams
Creating TF-IDF vectors with Unigrams + Bigrams...
Vocabulary size: 15000

Training ANN with Unigrams + Bigrams...
ANN Results -> Train Acc: 0.9177, Test Acc: 0.8083

Training LSTM with Unigrams + Bigrams...
LSTM Results -> Train Acc: 0.8997, Test Acc: 0.8122

Processing: Unigrams + Bigrams + Trigrams
Creating TF-IDF vectors with Unigrams + Bigrams + Trigrams...
Vocabulary size: 15000

Training ANN with Unigrams + Bigrams + Trigrams...
ANN Results -> Train Acc: 0.9061, Test Acc: 0.8096

Training LSTM with Unigrams + Bigrams + Trigrams...
LSTM Results -> Train Acc: 0.8898, Test Acc: 0.8162


--- Final Mod