# 04 – Model Training & Evaluation

This notebook trains and evaluates **Naive Bayes**, **Logistic Regression**, and **Random Forest** on TF-IDF features.

**Goals:**
- Train all three models; build a **model comparison table** (accuracy, precision, recall, F1, confusion matrix).
- **Error analysis**: sample false positives and false negatives.
- **Confidence scoring**: e.g. "Fake with 76% confidence".
- **Custom input testing**: paste your own text and get prediction + confidence + top keywords.

## Load data and models

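Either run `python src/train.py` first to train and save the models, or let this notebook train them. The cells below load each saved artifact if it is present; anything missing is fitted here in memory and is not written to disk.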

In [None]:
import os
import sys
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from IPython.display import display

def find_project_root(start_dir):
    """Return the nearest ancestor of ``start_dir`` that looks like the project root.

    A directory qualifies when it contains both a ``data`` and a ``src``
    sub-directory. The search starts at ``start_dir`` itself (made absolute)
    and moves one level up at a time.

    Raises:
        FileNotFoundError: if the filesystem root is reached without a match.
    """
    candidate = os.path.abspath(start_dir)
    while not (
        os.path.isdir(os.path.join(candidate, "data"))
        and os.path.isdir(os.path.join(candidate, "src"))
    ):
        above = os.path.dirname(candidate)
        # dirname() of the filesystem root is the root itself: nowhere left to climb.
        if above == candidate:
            raise FileNotFoundError("Run Jupyter from inside misinformation-detection-engine.")
        candidate = above
    return candidate

# Resolve every project-relative location from the detected repository root.
PROJECT_ROOT = find_project_root(os.getcwd())
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")
PROCESSED_PATH = os.path.join(PROJECT_ROOT, "data", "processed", "processed_fake_news.csv")

df = pd.read_csv(PROCESSED_PATH)

# Prefer the pre-cleaned column when the processed CSV provides one.
if "clean_text" in df.columns:
    tcol = "clean_text"
else:
    tcol = "text"

# NOTE(review): astype(str) turns missing values into the literal string "nan";
# confirm the processed CSV has no empty text rows.
X, y = df[tcol].astype(str), df["label"].astype(int)

# Stratified 80/20 hold-out with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Data loaded. Train size:", len(X_train), "Val size:", len(X_val))

In [None]:
# Reuse the saved TF-IDF vectorizer when one exists; otherwise fit a fresh one
# on the training split only (it is kept in memory, not written to disk).
vec_path = os.path.join(MODELS_DIR, "tfidf_vectorizer.joblib")
if not os.path.exists(vec_path):
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words="english")
    vectorizer.fit(X_train)
    print("Fitted TF-IDF vectorizer (not saved). Run src/train.py to save.")
else:
    vectorizer = joblib.load(vec_path)
    print("Loaded saved TF-IDF vectorizer.")

# Project both splits into the same TF-IDF feature space.
X_train_vec, X_val_vec = (vectorizer.transform(split) for split in (X_train, X_val))

In [None]:
# Load or train models.
#
# A saved model is only reused when it was fitted on the same number of
# features the current vectorizer produces. Without that check, a stale model
# file combined with a freshly fitted vectorizer would either crash in
# predict() or silently score against the wrong feature columns.
model_factories = {
    "naive_bayes": lambda: MultinomialNB(),
    "log_reg": lambda: LogisticRegression(max_iter=1000, n_jobs=-1),
    "random_forest": lambda: RandomForestClassifier(
        n_estimators=200, max_depth=None, n_jobs=-1, random_state=42
    ),
}
n_features = X_train_vec.shape[1]

models = {}
for name, make_model in model_factories.items():
    full_path = os.path.join(MODELS_DIR, f"{name}.joblib")
    if os.path.exists(full_path):
        loaded = joblib.load(full_path)
        # Estimators without n_features_in_ (e.g. saved by an old scikit-learn)
        # cannot be checked, so they are accepted as before.
        saved_features = getattr(loaded, "n_features_in_", n_features)
        if saved_features == n_features:
            models[name] = loaded
            print(f"Loaded {name}.")
            continue
        print(
            f"Saved {name} expects {saved_features} features but the vectorizer "
            f"produces {n_features}; retraining instead of loading."
        )
    models[name] = make_model()
    models[name].fit(X_train_vec, y_train)
    print(f"Trained {name} (not saved). Run src/train.py to save.")

## Model comparison table

Accuracy, Precision, Recall, F1, Confusion matrix per model.

In [None]:
# Score every model on the validation split: one summary row per model for the
# comparison table, plus the per-class report and confusion matrix as text.
extra_metrics = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))

rows = []
for name, model in models.items():
    y_pred = model.predict(X_val_vec)
    summary = {"model": name, "accuracy": accuracy_score(y_val, y_pred)}
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    summary.update(
        (label, scorer(y_val, y_pred, zero_division=0)) for label, scorer in extra_metrics
    )
    rows.append(summary)
    print(f"\n=== {name} ===")
    print(classification_report(y_val, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

comparison_df = pd.DataFrame(rows)
display(comparison_df)

## Error analysis

Sample false positives (real misclassified as fake) and false negatives (fake misclassified as real).

In [None]:
# Error analysis with log_reg (swap model_name to inspect another model).
model_name = "log_reg"
y_pred = models[model_name].predict(X_val_vec)
X_val_arr = X_val.values
y_true = y_val.values

# Positions (within the validation split) of the first five errors of each kind.
fp_idx = np.flatnonzero((y_true == 0) & (y_pred == 1))[:5]
fn_idx = np.flatnonzero((y_true == 1) & (y_pred == 0))[:5]

for heading, sample_idx in (
    ("False positives (true=real, pred=fake):", fp_idx),
    ("\nFalse negatives (true=fake, pred=real):", fn_idx),
):
    print(heading)
    for i in sample_idx:
        # Show only the first 200 characters of each misclassified text.
        print("-", X_val_arr[i][:200], "...")

## Confidence scoring & custom input

Predict with confidence (e.g. "Fake with 76% confidence") and show top keywords.

In [None]:
def predict_with_confidence(text, model_name="log_reg", vectorizer=vectorizer, models=models):
    """Classify one piece of text and print the verdict, with confidence if available.

    Args:
        text: The raw string to classify; it is vectorized as a one-item batch.
        model_name: Key into ``models`` selecting the classifier to use.
        vectorizer: Fitted vectorizer used to transform ``text``. The default is
            the notebook-level ``vectorizer`` as it existed when this cell ran.
        models: Mapping of model name to fitted classifier. The default is the
            notebook-level ``models`` dict as it existed when this cell ran.

    Returns:
        The predicted label as returned by the model's ``predict``. A value of
        1 is printed as FAKE; any other value is printed as REAL.

    Raises:
        KeyError: if ``model_name`` is not a key of ``models``.
    """
    clf = models[model_name]
    features = vectorizer.transform([text])
    pred = clf.predict(features)[0]

    verdict = ("fake" if pred == 1 else "real").upper()
    message = f"Prediction: {verdict}"
    if hasattr(clf, "predict_proba"):
        # Confidence is the largest class probability for this single sample.
        conf = float(np.max(clf.predict_proba(features)[0]))
        message += f" with {conf*100:.2f}% confidence"
    print(message)
    return pred

# Example: classify a hand-written snippet with the default model (log_reg).
custom_text = (
    "The president announced a new policy today. "
    "Officials confirmed the details."
)
print("Custom input:", custom_text)
predict_with_confidence(custom_text)

In [None]:
# Top keywords influencing prediction (Logistic Regression).
# The ten largest coefficients are reported as FAKE indicators and the ten
# smallest as REAL indicators.
# NOTE(review): this presumes label 1 means "fake" and is the positive class
# of the fitted model — confirm against lr.classes_.
lr = models["log_reg"]
vocab = vectorizer.get_feature_names_out()
coef = lr.coef_[0]

ranked = np.argsort(coef)  # feature indices, ascending by coefficient
top_real = ranked[:10]
top_fake = ranked[::-1][:10]
print("Top keywords for FAKE:", list(vocab[top_fake]))
print("Top keywords for REAL:", list(vocab[top_real]))