In [None]:
# ------------------- Imports -------------------
import pandas as pd
import ast
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from textblob import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# ------------------- STEP 1: Load Dataset -------------------
# The "preprocessed_body" column is parsed with ast.literal_eval while loading,
# so each cell becomes a Python object (presumably a list of tokens) instead of
# its string representation.
df = pd.read_csv(
    "LDA_Result.csv",
    converters={"preprocessed_body": ast.literal_eval},
)

# ------------------- STEP 2: Sentiment Scoring from Preprocessed -------------------
# Re-join every token sequence into one space-separated string.
df['preprocessed_text'] = df['preprocessed_body'].apply(" ".join)
# Score each joined string with TextBlob's polarity value.
df['sentiment_score'] = df['preprocessed_text'].apply(
    lambda joined: TextBlob(joined).sentiment.polarity
)

def label_sentiment(score):
    """Map a numeric sentiment score to a class name.

    Returns 'Positive' for scores above zero, 'Negative' for scores below
    zero, and 'Neutral' for everything else (exactly zero, or a value such
    as NaN for which both comparisons are false).
    """
    if score > 0:
        return 'Positive'
    return 'Negative' if score < 0 else 'Neutral'

# Derive the categorical label from the numeric polarity score.
df['sentiment_label'] = df['sentiment_score'].apply(label_sentiment)

# ------------------- STEP 3: Split Data -------------------
# First hold out 10% as an "unseen" set, then split the remainder 80/20 into
# train and test. Both splits are stratified on the sentiment label and use a
# fixed random_state so the partition is reproducible.
df_main, df_unseen = train_test_split(df, test_size=0.10, random_state=42, stratify=df['sentiment_label'])
df_train, df_test = train_test_split(df_main, test_size=0.20, random_state=42, stratify=df_main['sentiment_label'])

# New columns are assigned to these frames later on. Taking explicit copies
# makes them independent of `df`, so those assignments do not raise pandas'
# SettingWithCopyWarning and can never be mistaken for writes into `df`.
df_train = df_train.copy()
df_test = df_test.copy()
df_unseen = df_unseen.copy()

print(f" Dataset split complete:\n→ Train: {len(df_train)}\n→ Test: {len(df_test)}\n→ Unseen: {len(df_unseen)}")

# ------------------- STEP 4: Extract Top Keywords Per Sentiment -------------------
# Pool all training tokens per sentiment label. Walking the two columns in
# lockstep visits the rows in the same order as a row-by-row iteration.
sentiment_words = defaultdict(list)
for sentiment, tokens in zip(df_train['sentiment_label'], df_train['preprocessed_body']):
    sentiment_words[sentiment].extend(tokens)

# For every label keep only the five most frequent tokens (counts discarded).
top_keywords_per_sentiment = {
    label: [token for token, _count in Counter(bag).most_common(5)]
    for label, bag in sentiment_words.items()
}

# ------------------- STEP 5: Rule-Based Sentiment Classification -------------------
def predict_sentiment_by_keywords(tokens, keywords_by_sentiment=None):
    """Predict a sentiment label by counting keyword overlaps.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    keywords_by_sentiment : dict, optional
        Mapping of sentiment label -> iterable of keywords. Defaults to the
        module-level ``top_keywords_per_sentiment``.

    Returns
    -------
    str
        The label whose keyword list shares the most distinct tokens with
        ``tokens`` (the first such label in mapping order on ties), or
        ``'Neutral'`` when the mapping is empty or no keyword matches at all.
    """
    if keywords_by_sentiment is None:
        keywords_by_sentiment = top_keywords_per_sentiment

    # Build the token set once instead of once per sentiment.
    token_set = set(tokens)
    scores = {
        sentiment: len(token_set.intersection(keywords))
        for sentiment, keywords in keywords_by_sentiment.items()
    }

    # Without any matching keyword every score is 0 and max() would just
    # return whichever label happened to be inserted first; treat "no
    # evidence" as Neutral instead, like the empty-mapping case.
    if not any(scores.values()):
        return 'Neutral'
    return max(scores, key=scores.get)

def rule_explanation(tokens, label, keywords_by_sentiment=None):
    """Build a human-readable rule describing why ``label`` was chosen.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    label : str
        The sentiment label to explain.
    keywords_by_sentiment : dict, optional
        Mapping of sentiment label -> iterable of keywords. Defaults to the
        module-level ``top_keywords_per_sentiment``.

    Returns
    -------
    str
        ``If text contains "a", "b" → <label>`` listing each matched keyword
        once, in order of first appearance in ``tokens``; otherwise a
        message stating that no sentiment-specific keyword was found.
    """
    if keywords_by_sentiment is None:
        keywords_by_sentiment = top_keywords_per_sentiment

    # .get() keeps labels without a keyword list (e.g. the 'Neutral'
    # fallback) from raising KeyError.
    sentiment_keywords = set(keywords_by_sentiment.get(label, ()))
    # dict.fromkeys drops repeated tokens while preserving their order, so a
    # keyword occurring several times is listed only once in the rule.
    matched = list(dict.fromkeys(word for word in tokens if word in sentiment_keywords))
    if matched:
        keyword_list = ', '.join(f'"{word}"' for word in matched)
        return f'If text contains {keyword_list} → {label}'
    return f'No strong sentiment-specific keywords found → {label}'

# Apply to test and unseen sets
# Both held-out frames get the same two derived columns: the keyword-based
# prediction, then the textual rule explaining that prediction.
for held_out in (df_test, df_unseen):
    held_out['predicted_sentiment'] = held_out['preprocessed_body'].apply(predict_sentiment_by_keywords)
    held_out['rule_explanation'] = held_out.apply(
        lambda record: rule_explanation(record['preprocessed_body'], record['predicted_sentiment']),
        axis=1,
    )

# ------------------- STEP 6: Evaluation -------------------
def evaluate_sentiment(df_subset, set_name):
    """Report how well the keyword rules reproduce the sentiment labels.

    Prints accuracy and macro-averaged precision, recall and F1, shows a
    confusion-matrix heatmap, and prints scikit-learn's classification report.

    Parameters
    ----------
    df_subset : DataFrame
        Must provide the columns 'sentiment_label' (reference labels) and
        'predicted_sentiment' (rule-based predictions).
    set_name : str
        Name of the split; used in the printed header and the plot title.
    """
    actual = df_subset['sentiment_label']
    predicted = df_subset['predicted_sentiment']

    # Shared settings for the three macro-averaged metrics.
    macro = {'average': 'macro', 'zero_division': 0}
    results = [
        ("Accuracy", accuracy_score(actual, predicted)),
        ("Precision", precision_score(actual, predicted, **macro)),
        ("Recall", recall_score(actual, predicted, **macro)),
        ("F1 Score", f1_score(actual, predicted, **macro)),
    ]

    print(f"\n Rule-Based Sentiment Evaluation ({set_name}):")
    for metric_name, value in results:
        # Left-pad the name to nine characters so the colons line up.
        print(f"{metric_name:<9}: {value:.4f}")

    # Matrix rows/columns follow the sorted labels of the full module-level
    # `df`, not just the labels present in this subset.
    class_labels = sorted(df['sentiment_label'].unique())
    cm = confusion_matrix(actual, predicted, labels=class_labels)

    plt.figure(figsize=(8, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.title(f"Confusion Matrix - {set_name}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.show()

    print("\nClassification Report:")
    print(classification_report(actual, predicted, zero_division=0))

# Evaluate on test and unseen sets
for held_out_frame, split_title in (
    (df_test, "TEST SET"),
    (df_unseen, "UNSEEN VALIDATION SET"),
):
    evaluate_sentiment(held_out_frame, split_title)

# ------------------- STEP 7: Preview -------------------
# Show the first ten rows of each held-out set: text, reference label,
# predicted label and the rule that explains the prediction.
preview_columns = ['preprocessed_text', 'sentiment_label', 'predicted_sentiment', 'rule_explanation']

print("\n Preview of Classified TEST Set:")
display(df_test[preview_columns].head(10))

print("\n Preview of UNSEEN VALIDATION Set:")
display(df_unseen[preview_columns].head(10))

Unnamed: 0,body,labeled topic,extracted_rule
0,People need to do this kind of thing more ofte...,Political Views and Climate Policy,"If comment contains ""people"", ""people"", ""chang..."
1,Thats cute if things dont reverse course in ou...,Environmental Impact and Global Warming,"If comment contains ""climate"", ""change"" → Envi..."
2,Whats interesting is that you are arguing with...,Scientific Discussions on Climate Change,"If comment contains ""time"", ""climate"", ""change..."
3,i can agree with that a lot of the media is ow...,Political Views and Climate Policy,"If comment contains ""climate"", ""change"" → Poli..."
4,The rising seas are due to global warming but ...,Environmental Impact and Global Warming,"If comment contains ""global"", ""climate"", ""chan..."
5,It goes even further than that If some scienti...,Scientific Discussions on Climate Change,"If comment contains ""time"", ""climate"", ""change..."
6,Maybe this little bird that only sings a song ...,Environmental Impact and Global Warming,"If comment contains ""climate"", ""change"" → Envi..."
7,Funny how all climate change deniers use the e...,Scientific Discussions on Climate Change,"If comment contains ""climate"", ""change"", ""peop..."
8,If were going to start talking long term then ...,Environmental Impact and Global Warming,"If comment contains ""climate"", ""change"" → Envi..."
9,Its like when climate change is mentioned they...,Scientific Discussions on Climate Change,"If comment contains ""climate"", ""change"" → Scie..."
