In [7]:
# --- Imports ---
import pandas as pd
from ast import literal_eval
from gensim import corpora, models
from gensim.matutils import sparse2full
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
import numpy as np
import pyLDAvis.gensim_models
import pyLDAvis

# --- Load Preprocessed Data ---
df = pd.read_csv("preprocessed.csv")

# --- Tokenize the 'preprocessed_body' Column ---
df['tokens'] = df['preprocessed_body'].apply(literal_eval)

# --- Create Bag of Words (BoW) ---
dictionary = corpora.Dictionary(df['tokens'])
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=2000)
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

# --- Train LDA Model with 3 Topics ---
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=10
)

# --- LDA Vectors (Topic Distribution per Document) ---
lda_vectors = []
for bow in corpus:
    doc_topics = lda_model.get_document_topics(bow)
    dense_vector = sparse2full(doc_topics, lda_model.num_topics)
    lda_vectors.append(dense_vector)

# --- BoW Vectors (Dense Format) ---
bow_vectors = [sparse2full(doc_bow, len(dictionary)) for doc_bow in corpus]

# --- Sentiment Scores ---
df['sentiment'] = df['preprocessed_body'].apply(lambda text: TextBlob(text).sentiment.polarity)

# --- Combine Features: [BoW + LDA + Sentiment] ---
combined_features = []
for bow_vec, lda_vec, sentiment in zip(bow_vectors, lda_vectors, df['sentiment']):
    feature_vector = np.concatenate([bow_vec, lda_vec, [sentiment]])
    combined_features.append(feature_vector)

X = np.array(combined_features)

# --- Generate Binary Labels from Sentiment ---
df['label'] = df['sentiment'].apply(lambda s: 1 if s > 0 else 0)
y = df['label'].values

# Step 1: Split 10% for Unseen Validation Set ---
X_remaining, X_unseen, y_remaining, y_unseen, df_remaining, df_unseen = train_test_split(
    X, y, df, test_size=0.10, random_state=42, stratify=y
)

# Split Remaining 90% into 80% Train, 20% Test ---
X_train, X_test, y_train, y_test = train_test_split(
    X_remaining, y_remaining, test_size=0.20, random_state=42, stratify=y_remaining
)

# --- Train Classifier ---
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# --- Predict on Test Set ---
y_pred = clf.predict(X_test)

# --- Function to Display Performance Summary ---
def print_performance_report(title, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n {title}")
    print("=" * (len(title) + 4))
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 Score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print("\nDetailed Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['Negative', 'Positive'], zero_division=0))

# --- Evaluate on Test Set ---
print_performance_report("Test Set Performance", y_test, y_pred)

# --- Evaluate on Unseen Validation Set ---
print_performance_report("Unseen Validation Set Performance", y_unseen, y_unseen_pred)


 Test Set Performance
Accuracy : 0.9989
Precision: 0.9989
Recall   : 0.9989
F1 Score : 0.9989

Confusion Matrix:
[[4585    5]
 [   5 4405]]

Detailed Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00      4590
    Positive       1.00      1.00      1.00      4410

    accuracy                           1.00      9000
   macro avg       1.00      1.00      1.00      9000
weighted avg       1.00      1.00      1.00      9000


 Unseen Validation Set Performance
Accuracy : 0.9976
Precision: 0.9967
Recall   : 0.9984
F1 Score : 0.9976

Confusion Matrix:
[[2542    8]
 [   4 2446]]

Detailed Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00      2550
    Positive       1.00      1.00      1.00      2450

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1