# AI520: Natural Language Processing for Artificial Intelligence
Term: Summer 2025 \
Author: David Hiltzman \
Assignment: Team Project \
Team 01 \
Authors: \
Mitch Fade, David Hiltzman, Tyler Kepler, Jeff Nelson


## Abstract
This project will create a false news story detector using a publicly available, labeled Kaggle dataset of known real and false stories. The data will first be preprocessed, and then feature extraction will be performed. Classical machine learning (ML) methods and deep learning methods will be used to train classifiers on the preprocessed dataset, and each classifier will be evaluated for effectiveness. Additionally, the explainability of the results will be analyzed to show which words contributed the most to the predictions. A false news detector may help bring clarity to a world of ever-growing misinformation and may yield interesting results about which words are most common in fake news stories.

## Imports

In [None]:
# Import required libraries
import pandas as pd
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Fetch the NLTK resources this notebook requests, one nltk.download()
# call per resource name, in the order listed.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

## Text Preprocessing Functions

In [None]:
def preprocess_text(text):
    """
    Clean a raw text value into a space-separated string of lemmas.

    The value is converted to a lowercase string, every character that is
    neither a word character nor whitespace is replaced by a space, and the
    result is tokenized. Tokens found in ``stop_words`` are dropped and the
    remaining tokens are lemmatized.

    Relies on the module-level ``tokenizer``, ``stop_words`` and
    ``lemmatizer`` objects, which must exist before this function is called.

    Args:
        text: Value to clean; it is passed through ``str()`` first.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lowered = str(text).lower()
    # Punctuation becomes a space (not an empty string) so that words
    # separated only by punctuation are not glued together.
    without_punct = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(without_punct)
        if word not in stop_words
    )

In [None]:
def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Strip surrounding whitespace from the column names and lowercase them.

    The frame is modified in place and the same object is returned.
    """
    stripped = df.columns.str.strip()
    df.columns = stripped.str.lower()
    return df

In [None]:
def ensure_date_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Give the frame a unified 'date' column holding 'YYYY-MM-DD' strings.

    The first name from a fixed list of likely date column names that is
    present in ``df`` is parsed with ``pd.to_datetime`` (unparseable values
    are coerced to missing). If none of the names is present, the 'date'
    column is filled with missing values instead.

    The frame is modified in place and the same object is returned.
    """
    source_col = None
    for name in (
        "date", "published", "publish_date", "publication_date",
        "pub_date", "created_at", "time", "timestamp",
    ):
        if name in df.columns:
            source_col = name
            break

    if source_col is not None:
        df["date"] = pd.to_datetime(df[source_col], errors="coerce")
    else:
        df["date"] = pd.NaT

    # Render as ISO date strings; rows with no valid date stay missing.
    iso_dates = df["date"].dt.strftime("%Y-%m-%d")
    df["date"] = iso_dates
    return df

In [None]:
# Shared preprocessing objects, created once at module level so that
# preprocess_text() can reuse them on every call.
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
# Held as a set so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

## Load and Preprocess Data

In [None]:
# Read each raw CSV, normalize its headers (strip + lowercase) so later
# column lookups do not depend on case or stray spaces, and make sure a
# unified 'date' column is present.
df_true = ensure_date_column(normalize_columns(pd.read_csv("True.csv")))
df_fake = ensure_date_column(normalize_columns(pd.read_csv("Fake.csv")))

# Report the size and the column names of both frames before any text
# preprocessing is applied.
print("Unprocessed True dataset shape:", df_true.shape)
print("Unprocessed Fake dataset shape:", df_fake.shape)
print("\nTrue dataset columns:", df_true.columns.tolist())
print("Fake dataset columns:", df_fake.columns.tolist())

## Data Validation and Preprocessing

In [None]:
# Fail fast with a clear KeyError if either dataset lacks a column that the
# preprocessing below reads.
required = {"title", "text"}
missing_true = required - set(df_true.columns)
missing_fake = required - set(df_fake.columns)

if missing_true:
    raise KeyError(f"True.csv missing required columns: {missing_true}")
if missing_fake:
    # The message must interpolate `missing_fake` (the set computed above);
    # referencing any other name here would raise NameError instead of the
    # intended KeyError.
    raise KeyError(f"Fake.csv missing required columns: {missing_fake}")

# Apply preprocessing to title and text columns. Values are cast to str
# first so missing/non-string cells do not break the text cleaning.
print("Preprocessing text data...")
df_true["processed_title"] = df_true["title"].astype(str).apply(preprocess_text)
df_fake["processed_title"] = df_fake["title"].astype(str).apply(preprocess_text)
df_true["processed_text"] = df_true["text"].astype(str).apply(preprocess_text)
df_fake["processed_text"] = df_fake["text"].astype(str).apply(preprocess_text)

print("Preprocessing complete!")

## Save Preprocessed Data and Display Sample

In [None]:
# Columns that are written to disk and shown in the previews below.
output_columns = ["processed_title", "processed_text", "date"]

# Persist the cleaned frames without the index column.
true_out = df_true[output_columns]
true_out.to_csv("preprocessed_True.csv", index=False)
fake_out = df_fake[output_columns]
fake_out.to_csv("preprocessed_Fake.csv", index=False)

# Show the first rows of each cleaned frame as a quick sanity check.
print("Processed True dataset sample:")
print(true_out.head())
print("\nProcessed Fake dataset sample:")
print(fake_out.head())

## Prepare Data for Classification

In [None]:
# Reload the cleaned frames from disk and tag every row with its class:
# 0 = fake news, 1 = true news.
df_fake = pd.read_csv("preprocessed_Fake.csv").assign(label=0)
df_true = pd.read_csv("preprocessed_True.csv").assign(label=1)

# Stack both classes into a single frame with a fresh 0..n-1 index.
df_combined = pd.concat([df_fake, df_true], ignore_index=True)

# Build one text field per row from title + body. Missing values are
# replaced by empty strings first so the concatenation never yields NaN.
title_part = df_combined['processed_title'].fillna('')
text_part = df_combined['processed_text'].fillna('')
df_combined['combined'] = title_part + ' ' + text_part

print(f"Combined dataset shape: {df_combined.shape}")
print(f"Label distribution:\n{df_combined['label'].value_counts()}")

## Feature Extraction with TF-IDF

In [None]:
# Initialize TF-IDF vectorizer; max_features caps the vocabulary at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform data.
# NOTE(review): the vectorizer is fitted on the full dataset here, while the
# train/test split is made afterwards in the classifier cells, so held-out
# rows contribute to the vocabulary and IDF weights. Consider fitting on the
# training split only.
X = vectorizer.fit_transform(df_combined['combined'])  # Feature matrix
y = df_combined['label']  # Target labels

print("TF-IDF matrix shape:", X.shape)
print("Sample features:", vectorizer.get_feature_names_out()[:20])

# Sparsity is the fraction of ZERO entries. non-zero / total is the density,
# so it has to be subtracted from 1 to match the printed label.
total_cells = X.shape[0] * X.shape[1]
density = X.count_nonzero() / total_cells
print("Feature matrix sparsity:", 1.0 - density)

## Classifier 1 - Logistic Regression

In [None]:
print("LOGISTIC REGRESSION CLASSIFIER")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build the classifier and fit it in one step (fit() returns the estimator).
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))

## Classifier 2 - Random Forest

In [None]:
print("RANDOM FOREST CLASSIFIER")

# Same test_size and seed as the other classifier cells, so every model is
# evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 100 trees, fixed seed for reproducibility, n_jobs=-1 to use all CPU cores.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100)
rf_model = rf_model.fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))

## Classifier 3 - Linear SVM

In [None]:
print("LINEAR SVM CLASSIFIER")

# Same test_size and seed as the other classifier cells, so every model is
# evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Scaling step followed by the linear SVM. with_mean=False scales without
# centering, because the features are a sparse matrix.
feature_scaler = StandardScaler(with_mean=False)
svm_classifier = LinearSVC(random_state=42, max_iter=5000, tol=1e-3, C=1.0, dual="auto")

# Chain both steps and fit them together (fit() returns the pipeline).
svm_model = make_pipeline(feature_scaler, svm_classifier).fit(X_train, y_train)

# Predict the held-out rows and print the classification report.
y_pred_svm = svm_model.predict(X_test)
print("Linear SVM Results:")
print(classification_report(y_test, y_pred_svm))

## Model Comparison and Summary

In [None]:
# Recap of the experimental setup shared by all three classifiers.
print("MODEL COMPARISON SUMMARY")
print("All models trained on the same train/test split (random_state=42)")
print("Dataset size:", len(df_combined), "samples")
_, feature_count = X.shape
print("Feature dimensions:", feature_count, "TF-IDF features")
print("Train/Test split: 80/20")
print("\nRerun the classification cells above to see detailed results for each model.")