<a href="https://colab.research.google.com/github/Gogi0121/-Final-Year-Project-Yogesh-Yadav/blob/main/02_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 2: Preprocessing

# Fake News Detection — Data Preprocessing

---
**This notebook covers data loading, the train/validation/test split, TF-IDF feature extraction, and tokenization for transformer models.**


## Step 1: Imports

In [4]:
# Install (run once): pip install transformers datasets scikit-learn pandas numpy matplotlib seaborn shap torch wordcloud scipy statsmodels

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import re
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             precision_recall_fscore_support, roc_curve, auc, precision_recall_curve, average_precision_score)

# scipy.stats does NOT provide mcnemar; use statsmodels instead
from statsmodels.stats.contingency_tables import mcnemar

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Seed NumPy and PyTorch (CPU and, when present, every CUDA device) with one
# shared value so the splits and model runs in this notebook are repeatable.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)

# Prefer the seaborn whitegrid look; fall back to 'ggplot' when the installed
# matplotlib does not ship that style name. plt.style.use raises OSError for an
# unknown style, so only that error is caught -- a bare `except:` would also
# swallow KeyboardInterrupt and unrelated failures.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('ggplot')

# Report the runtime so saved outputs record which hardware produced them.
print(f"PyTorch: {torch.__version__}")
print(f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

PyTorch: 2.9.0+cu128
Device: Tesla T4


## Step 2: Load Dataset (DO NOT MODIFY)

In [5]:
# Raw CSV locations for the two classes (fetched over the network by pandas).
url_fake = "https://raw.githubusercontent.com/laxmimerit/fake-real-news-dataset/main/data/Fake.csv"
url_true = "https://raw.githubusercontent.com/laxmimerit/fake-real-news-dataset/main/data/True.csv"

print("Downloading and Reconstructing Dataset...")

# Load each class into its own frame, then attach the target column.
df_fake = pd.read_csv(url_fake)
df_true = pd.read_csv(url_true)
# Label encoding: fake = 1, true = 0.
df_fake['label'] = 1
df_true['label'] = 0

# Stack both classes, shuffle every row with a fixed seed (reproducible order),
# and renumber the index from 0.
df_combined = pd.concat([df_fake, df_true], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)
# Merge headline and body into a single input string per article.
# NOTE(review): if 'title' or 'text' is missing for a row, this concatenation
# yields NaN for that row -- confirm the source CSVs have no null values.
df_combined['text'] = df_combined['title'] + " " + df_combined['text']
# Keep only the model input and the target; all other CSV columns are dropped.
df_combined = df_combined[['text', 'label']]

print(f"✓ Dataset: {len(df_combined)} samples")

# Hold out 20% as the test set, stratified on the label so both splits keep the
# same class balance. The seed is the literal 42 here (same value as RANDOM_STATE).
train_df, test_df = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['label'])
# Wrap both splits as Hugging Face datasets.
# NOTE(review): train_df/test_df keep their shuffled pandas index, so from_pandas
# presumably stores it as an extra index column alongside 'text'/'label' -- confirm.
ds = DatasetDict({'train': Dataset.from_pandas(train_df), 'test': Dataset.from_pandas(test_df)})

print(f"Train: {len(ds['train'])}, Test: {len(ds['test'])}")

Downloading and Reconstructing Dataset...
✓ Dataset: 44898 samples
Train: 35918, Test: 8980


## Step 4: Preprocessing

In [6]:
# --- Raw texts and labels from the Hugging Face splits -----------------------
X_train_text, y_train = list(ds['train']['text']), np.array(ds['train']['label'])
X_test_text, y_test = list(ds['test']['text']), np.array(ds['test']['label'])

# --- Carve a stratified 10% validation set out of the training split ---------
# Used for model selection and early stopping; the test split stays untouched.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_text,
    y_train,
    test_size=0.1,
    random_state=RANDOM_STATE,
    stratify=y_train,
)
print(f"Train: {len(X_tr)}, Val: {len(X_val)}, Test: {len(X_test_text)}")

# --- TF-IDF features ----------------------------------------------------------
# The vocabulary is learned from the training portion only; validation and test
# texts are projected onto it with transform().
tfidf_params = {
    'max_features': 5000,
    'stop_words': 'english',
    'ngram_range': (1, 2),
}
tfidf = TfidfVectorizer(**tfidf_params)
X_train_tfidf = tfidf.fit_transform(X_tr)
X_val_tfidf, X_test_tfidf = (tfidf.transform(texts) for texts in (X_val, X_test_text))
print(f"TF-IDF: {X_train_tfidf.shape}")

# --- Hugging Face datasets for the transformer models -------------------------
# Training rows come first, validation rows after them, in one frame that is
# then cut back apart at the training length.
n_train_rows = len(X_tr)
train_val_df = pd.DataFrame({
    'text': [*X_tr, *X_val],
    'label': [*y_tr, *y_val],
})
train_small = train_val_df.iloc[:n_train_rows]
val_small = train_val_df.iloc[n_train_rows:]

ds_train_val = DatasetDict({
    'train': Dataset.from_pandas(train_small.reset_index(drop=True)),
    'validation': Dataset.from_pandas(val_small.reset_index(drop=True)),
    'test': ds['test'],
})

# --- Tokenization ---------------------------------------------------------------
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tok_fn(ex):
    """Tokenize a batch of examples for the transformer.

    Args:
        ex: Batch mapping with a 'text' entry holding the raw strings.

    Returns:
        The tokenizer output for ex['text'], with every sequence truncated or
        padded to exactly 256 tokens.
    """
    return tokenizer(ex['text'], padding='max_length', truncation=True, max_length=256)


tokenized_ds = ds_train_val.map(tok_fn, batched=True)
# Expose only the model inputs and the target as torch tensors.
tokenized_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
print("✓ Preprocessing done")

Train: 32326, Val: 3592, Test: 8980
TF-IDF: (32326, 5000)


Map:   0%|          | 0/32326 [00:00<?, ? examples/s]

Map:   0%|          | 0/3592 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

✓ Preprocessing done
