In [None]:
# 01. Exploratory Data Analysis (EDA)
#**Dataset:** Fake and Real News  
#**Author:** _Lucas Garcia_  
#**Date:** _2025-07-04_


In [2]:
# General libraries
import os
import pandas as pd
import numpy as np

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
# NOTE(review): the 'ignore' filter is installed without a category or module
# restriction, so it silences every warning raised by later cells, including
# deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Plot settings
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set(style='whitegrid')


In [3]:
# Locations of the two raw CSV exports, relative to the notebook
DATA_DIR = "../data/raw"
FAKE_CSV = os.path.join(DATA_DIR, "Fake.csv")
REAL_CSV = os.path.join(DATA_DIR, "True.csv")

# Load each file and tag its rows with the class they belong to
fake_df = pd.read_csv(FAKE_CSV).assign(label='fake')
real_df = pd.read_csv(REAL_CSV).assign(label='real')

# Stack both frames into one, renumbering the index from 0
df = pd.concat((fake_df, real_df), ignore_index=True)


In [None]:
# Report the (rows, columns) shape of each source frame and of the combined one
for _name, _frame in (("Fake", fake_df), ("Real", real_df), ("Total", df)):
    print(f"{_name} shape:", _frame.shape)

# Preview the first five rows (head() defaults to n=5)
df.head()


Fake shape: (23481, 5)
Real shape: (21417, 5)
Total shape: (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


In [None]:
# Per-column tally of missing cells
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)

# Rows that exactly repeat an earlier row
dup_count = df.duplicated().sum()
print("Number of duplicate rows: {}".format(dup_count))


In [None]:
# Number of articles in each class
label_counts = df['label'].value_counts()
print(label_counts)

# Bar chart of the class balance, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
ax.set_title("Fake vs Real News Distribution")
ax.set_xlabel("Label")
ax.set_ylabel("Number of Articles")
plt.show()


In [None]:
# Word count per article: coerce each cell to str, then split on whitespace
df['text_len'] = [len(str(doc).split()) for doc in df['text']]

# Summary statistics of the word counts
df['text_len'].describe()


In [None]:
# Text cleaning: lowercasing, remove punctuation, stopwords, tokenization
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download the NLTK resources (skipped automatically when already up to date)
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads its tokenizer models from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')

# English stopword set (set gives O(1) membership tests) and Snowball stemmer
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def clean_text(text):
    """Normalize one article body into a space-separated string of stems.

    Steps: lowercase, drop every character other than a-z and whitespace,
    tokenize with NLTK, discard English stopwords, and stem what remains.

    Args:
        text: Raw article text. Missing values (None/NaN) are accepted; any
            other non-string value is converted with ``str`` first.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` for missing
        input or when no token survives the cleaning.
    """
    if not isinstance(text, str):
        # pandas reads a blank CSV cell as NaN (a float), which has no .lower()
        if pd.isna(text):
            return ''
        text = str(text)
    # Lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords & stem
    tokens = [stemmer.stem(t) for t in tokens if t not in stop_words]
    # Rejoin
    return ' '.join(tokens)

# Build the cleaned column by running every article through clean_text
df['clean_text'] = df['text'].map(clean_text)

# Quick look at the raw text next to its cleaned version
df[['text', 'clean_text']].head(3)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Encode the target: fake -> 0, real -> 1
y = df['label'].map({'fake': 0, 'real': 1})

# Split train/validation/test: 70/15/15
# The cleaned text is split BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only. Fitting on the full corpus
# would leak validation/test statistics into the features. The seeds, sizes and
# stratification are unchanged, so the row partition is the same as before.
text_train, text_temp, y_train, y_temp = train_test_split(
    df['clean_text'], y, test_size=0.30, random_state=42, stratify=y)

text_val, text_test, y_val, y_test = train_test_split(
    text_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

# TF-IDF vectorization: fit on train, then only transform the held-out splits
tfidf = TfidfVectorizer(max_features=5000)   # tune max_features as needed
X_train = tfidf.fit_transform(text_train)
X_val = tfidf.transform(text_val)
X_test = tfidf.transform(text_test)

# Whole corpus in the train-fitted feature space; kept so the name X stays defined
X = tfidf.transform(df['clean_text'])

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)


In [None]:
# Baseline model: Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Fit the classifier on the training split (fit returns the estimator itself)
nb = MultinomialNB().fit(X_train, y_train)

# Score the validation split
y_pred_nb = nb.predict(X_val)

# Print each metric under its caption, in the same order as a manual report
for _caption, _value in (
    ("Naive Bayes Validation Accuracy:", accuracy_score(y_val, y_pred_nb)),
    ("Naive Bayes Validation F1-score:", f1_score(y_val, y_pred_nb)),
    ("\nClassification Report:\n", classification_report(y_val, y_pred_nb)),
    ("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_nb)),
):
    print(_caption, _value)


In [None]:
# Baseline model: Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the training split (fit returns the estimator itself)
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the validation split
y_pred_lr = lr.predict(X_val)

# Print each metric under its caption, in the same order as a manual report
for _caption, _value in (
    ("Logistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_lr)),
    ("Logistic Regression Validation F1-score:", f1_score(y_val, y_pred_lr)),
    ("\nClassification Report:\n", classification_report(y_val, y_pred_lr)),
    ("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_lr)),
):
    print(_caption, _value)


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: regularization strength x penalty type
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],   # liblinear supports both l1 and l2
)

# 5-fold cross-validated search scored by F1, run with n_jobs=-1
grid_lr = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    scoring='f1', cv=5, n_jobs=-1, verbose=1,
)

# Fit every combination on the training split
grid_lr.fit(X_train, y_train)

# Winning combination and its mean cross-validated score
print("Best params:", grid_lr.best_params_)
print("Best CV F1-score:", grid_lr.best_score_)


In [None]:
# Estimator refit by the grid search with its best parameter combination
best_lr = grid_lr.best_estimator_

# Validation split
y_val_pred = best_lr.predict(X_val)
print("Tuned LR Validation F1-score:", f1_score(y_val, y_val_pred))

# Test split
y_test_pred = best_lr.predict(X_test)
for _caption, _value in (
    ("Tuned LR Test Accuracy:", accuracy_score(y_test, y_test_pred)),
    ("Tuned LR Test F1-score:", f1_score(y_test, y_test_pred)),
    ("\nTest Classification Report:\n", classification_report(y_test, y_test_pred)),
):
    print(_caption, _value)


In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, auc

final_model = best_lr  # or any other fitted classifier you choose

# Plot confusion matrix on test set (normalize='true' scales each true-label row)
cm_display = ConfusionMatrixDisplay.from_estimator(
    final_model, X_test, y_test,
    display_labels=['fake', 'real'],
    cmap=plt.cm.Blues,
    normalize='true')
# Set the title on the display's own Axes rather than on the "current" one
cm_display.ax_.set_title("Normalized Confusion Matrix")
plt.show()

# ROC Curve, built from the predicted probability of the positive class (1 = real)
y_prob = final_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()


In [None]:
import joblib

# joblib.dump does not create parent directories, so make sure the target exists
os.makedirs('../models', exist_ok=True)

# Save TF-IDF vectorizer
joblib.dump(tfidf, '../models/tfidf_vectorizer.joblib')
# Save final model
# Note: at inference time, new text must go through clean_text before
# tfidf.transform, because the vectorizer was fitted on the cleaned column.
joblib.dump(final_model, '../models/final_fake_news_model.joblib')
