In [1]:
# 0. Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# 1. Load data
# Read both raw CSVs and tag every row with its class label.
# (Adjust the relative paths if necessary.)
fake = pd.read_csv('../data/raw/Fake.csv').assign(label='fake')
true = pd.read_csv('../data/raw/True.csv').assign(label='real')

# Stack the two sources into one frame; ignore_index gives a fresh 0..n-1 index.
df = pd.concat([fake, true], ignore_index=True)

# 2. Review data: overall (rows, columns) and per-class row counts
print(df.shape)
print(df['label'].value_counts())


(44898, 5)
label
fake    23481
real    21417
Name: count, dtype: int64


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, make_scorer, f1_score
import joblib

# 1. Load and label data
# Paths are relative to the notebook folder; adjust them if the notebook moves.
df_fake = pd.read_csv('../data/raw/Fake.csv').assign(label='fake')
df_true = pd.read_csv('../data/raw/True.csv').assign(label='real')
df = pd.concat([df_fake, df_true], ignore_index=True)

# Build the single text field used for training: "<title> <text>".
# Missing titles/bodies are replaced by empty strings so the concatenation
# never produces NaN.
title_text = df['title'].fillna('')
body_text = df['text'].fillna('')
df['content'] = title_text + ' ' + body_text

# 2. Split into features/labels and train/test sets
X, y = df['content'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the fake/real
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Build a pipeline: TF-IDF vectorizer + LogisticRegression
# Keeping the vectorizer inside the pipeline means it is re-fit together with
# the classifier every time the pipeline itself is fit.
tfidf_step = TfidfVectorizer(stop_words='english', max_features=5000)
clf_step = LogisticRegression(max_iter=2000, class_weight='balanced')
pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', clf_step)])

# 4. Set up GridSearch over ngram_range and regularization C
# Keys follow the '<step name>__<parameter>' convention so each value list is
# routed to the matching pipeline step.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__C=[0.01, 0.1, 1, 10],
)
# Define a scorer for F1 of the 'real' class
def real_f1(y_true, y_pred):
    """Return the F1 score computed with 'real' as the positive label.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels.
    """
    score = f1_score(y_true, y_pred, pos_label='real')
    return score


# Wrap the metric so it can be handed to GridSearchCV's `scoring` argument.
real_f1_scorer = make_scorer(real_f1)

# Search every combination in param_grid with 5-fold cross-validation on the
# training split, ranking candidates by F1 on the 'real' class.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring=real_f1_scorer,
    cv=5,
    n_jobs=-1,            # run the fits in parallel
    error_score='raise',  # let a failing fit raise instead of being scored
)
print("Running grid search... this may take a while.")
grid.fit(X_train, y_train)

# Keep the winning pipeline for calibration and saving below.
best_pipeline = grid.best_estimator_
print("Best parameters found:", grid.best_params_)

# 5. Calibrate with Platt scaling (sigmoid) using 5-fold CV on training data
# The wrapped estimator is passed positionally rather than through the
# `base_estimator` keyword, so the call does not depend on that keyword's name.
calibrator = CalibratedClassifierCV(
    best_pipeline, cv=5, method='sigmoid'
).fit(X_train, y_train)

# 6. Evaluate on test set
# Look up which predict_proba column holds P('real') instead of assuming an order.
idx_real = list(calibrator.classes_).index('real')
test_proba = calibrator.predict_proba(X_test)
probs_real = test_proba[:, idx_real]

# Hard labels at the default 0.5 cut-off.
preds = np.where(probs_real >= 0.5, 'real', 'fake')

# Binary target (1 = 'real') for the threshold-free ROC AUC metric.
y_test_is_real = (y_test == 'real').astype(int)

# NOTE(review): the recorded run scored ~0.99 on every metric; results this
# high are worth a leakage check (e.g. duplicated articles across the split or
# source-specific tokens in the text) — TODO confirm.
print("\nTest Set Evaluation (0.5 threshold):")
print(classification_report(y_test, preds))
print("ROC AUC:", roc_auc_score(y_test_is_real, probs_real))

# 7. Optional: find threshold that maximizes F1
# NOTE(review): the threshold is selected on the test set itself, so the F1 it
# achieves there is optimistic. Treat it as exploratory, or pick it on a
# separate validation split before relying on it.
precision, recall, thresholds = precision_recall_curve((y_test=='real').astype(int), probs_real)

# Small epsilon avoids 0/0 where precision and recall are both zero.
f1_scores = 2 * precision * recall / (precision + recall + 1e-8)

# precision/recall contain one more point than thresholds: the final
# (precision=1, recall=0) entry has no associated threshold. Restrict the
# argmax to the entries that do, so best_idx is always a valid index into
# `thresholds`.
best_idx = f1_scores[:-1].argmax()
best_threshold = thresholds[best_idx]
print(f"Optimal threshold for max F1: {best_threshold:.3f}")

# 8. Save model and vectorizer (paths adjusted)
# Create the output folder if it is missing; exist_ok makes re-runs harmless.
os.makedirs('../models', exist_ok=True)

# NOTE(review): the calibrated model was fit directly on raw text (X_train),
# so it presumably vectorizes internally at predict time. The standalone TF-IDF
# file comes from best_pipeline, not from the calibrator — confirm that a
# downstream consumer actually needs it.
artifacts = [
    (calibrator, '../models/final_fake_news_model.joblib'),
    (best_pipeline.named_steps['tfidf'], '../models/tfidf_vectorizer.joblib'),
]
for artifact, destination in artifacts:
    joblib.dump(artifact, destination)

print("Training, calibration, and saving completed.")


Running grid search... this may take a while.
Best parameters found: {'clf__C': 10, 'tfidf__ngram_range': (1, 2)}

Test Set Evaluation (0.5 threshold):
              precision    recall  f1-score   support

        fake       0.99      0.99      0.99      4696
        real       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

ROC AUC: 0.9996937020123211
Optimal threshold for max F1: 0.573
Training, calibration, and saving completed.
