# Model 0: Baseline Model with Standardized Preprocessing + Logistic Regression

Set up a basic pipeline using **standardized preprocessing from preprocess.py** and Logistic Regression.

## 🔧 Steps:
1. Import libraries and load data using **preprocess.py functions**
2. Preprocessing: Use **clean_text()** for standardized text cleaning
3. Vectorization: CountVectorizer (Bag-of-Words) - maintaining original approach
4. Model: LogisticRegression
5. Evaluation: Accuracy, confusion matrix, classification report

## ✅ Purpose:
Establish a working pipeline using **standardized preprocessing functions** and maintain the baseline score (~70-80% accuracy expected).

## 1. Import Libraries and Load Data

In [None]:
# Standard libraries
import pandas as pd
import numpy as np

# Sklearn libraries for modeling and evaluation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Import our standardized preprocessing functions
from preprocess import load_and_parse_data, create_train_validation_split, clean_text

In [None]:
# Load the training set through the shared loader from preprocess.py.
print("Loading training data using standardized preprocessing...")
train_data = load_and_parse_data('data/training_data_lowercase.csv')

# Display basic information about the loaded data
print("\nData loaded successfully!")
print(f"Total articles: {len(train_data)}")
# Only inspect the first record when one exists; indexing an empty load
# would fail with an IndexError before any useful output is shown.
if len(train_data) > 0:
    print(f"Sample article structure: {list(train_data[0].keys())}")

# Show first few examples. zip() stops at whichever runs out first, so a
# dataset with fewer than three records is previewed without an IndexError.
print("\nFirst 3 articles:")
for i, article in zip(range(3), train_data):
    text = article['text']
    # Truncate long texts to 100 characters and mark the cut with "...".
    text_preview = text[:100] + "..." if len(text) > 100 else text
    print(f"  Article {i+1}: Label={article['label']}, Text='{text_preview}'")

# Check label distribution
labels = [item['label'] for item in train_data]
unique_labels, counts = np.unique(labels, return_counts=True)
print("\nLabel distribution:")
for label, count in zip(unique_labels, counts):
    # Label 0 is reported as "Fake"; every other label value as "Real".
    label_name = "Fake" if label == 0 else "Real"
    print(f"  {label_name} (label {label}): {count} articles ({count/len(labels)*100:.1f}%)")

## 2. Preprocessing Using Standardized Functions

In [None]:
def _preview(text, limit=100):
    """Return *text* cut to *limit* characters, with "..." appended if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


# Apply standardized text cleaning to all articles; the cleaned version is
# stored next to the raw text under the 'clean_text' key.
print("Applying standardized text cleaning...")
for item in train_data:
    item['clean_text'] = clean_text(item['text'])

# Show before/after cleaning examples. zip() bounds the preview by the data,
# so fewer than three loaded records no longer raises an IndexError.
print("\nText cleaning examples:")
for i, article in zip(range(3), train_data):
    original = _preview(article['text'])
    cleaned = _preview(article['clean_text'])
    print(f"\nExample {i+1}:")
    print(f"  Original: '{original}'")
    print(f"  Cleaned:  '{cleaned}'")

## 3. Train/Validation Split and Vectorization

In [None]:
# Split the cleaned articles into training and validation sets with the
# shared helper from preprocess.py.
print("Creating train/validation split using standardized function...")

# Build the records handed to the split helper: each keeps its label, and the
# 'text' slot carries the cleaned text so modeling uses the cleaned version.
data_for_split = [
    {'label': item['label'], 'text': item['clean_text']}
    for item in train_data
]

# 80/20 split with a fixed seed.
X_train, X_val, y_train, y_val = create_train_validation_split(
    data_for_split, test_size=0.2, random_state=42
)

print("\nTrain/validation split complete:")
print(f"  Training set: {len(X_train)} samples")
print(f"  Validation set: {len(X_val)} samples")

In [None]:
# Bag-of-words features via CountVectorizer (same approach as the original
# baseline).
print("Applying CountVectorizer...")

# NOTE(review): lowercase=False presumably relies on the input already being
# lowercased upstream - confirm against preprocess.clean_text.
vectorizer = CountVectorizer(
    max_features=10000, stop_words='english', lowercase=False, min_df=2, max_df=0.95
)

# Fit the vocabulary on the training texts only, then reuse it for validation.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

print("\nVectorization complete:")
vectorization_summary = (
    ("Training matrix shape", X_train_vec.shape),
    ("Validation matrix shape", X_val_vec.shape),
    ("Vocabulary size", len(vectorizer.vocabulary_)),
)
for summary_name, summary_value in vectorization_summary:
    print(f"  {summary_name}: {summary_value}")

## 4. Logistic Regression Model

In [None]:
# Model training: fit Logistic Regression on the bag-of-words features and
# record how long the fit takes (in minutes) for the later results summary.
import time

print("Training Logistic Regression model...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start_time = time.perf_counter()
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train)
training_time_minutes = (time.perf_counter() - start_time) / 60

print("Model training complete!")
print(f"Training time: {training_time_minutes:.2f} minutes")

## 5. Evaluation

In [None]:
# Score the fitted model on the held-out validation split.
print("Evaluating model performance...")
y_pred = model.predict(X_val_vec)

# Overall accuracy, shown both as a fraction and as a percentage.
accuracy = accuracy_score(y_val, y_pred)
print("\n=== MODEL PERFORMANCE ===")
print(f"Validation Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Per-class precision / recall / F1, printed with four decimal places.
print("\n=== DETAILED CLASSIFICATION REPORT ===")
report = classification_report(
    y_val,
    y_pred,
    target_names=['Fake News (0)', 'Real News (1)'],
    digits=4,
)
print(report)

## 6. Confusion Matrix

In [None]:
# Confusion matrix for the validation split.
# Create the axes explicitly and hand them to from_estimator: when no `ax` is
# given it draws on a brand-new figure, which left the figure created by
# plt.figure(figsize=...) empty and meant the requested size was ignored.
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(
    model, X_val_vec, y_val,
    cmap='Blues',
    display_labels=['Fake News', 'Real News'],
    ax=ax,
)
ax.set_title("Confusion Matrix - Logistic Regression with Standardized Preprocessing")
plt.show()

## 7. Save Results for Model Comparison

In [None]:
# Import the evaluation module
from model_eval import save_model_results

# Read the recorded hyperparameters from the estimators themselves, so the
# saved record cannot drift from the configuration that was actually used
# if the vectorizer or model settings are edited later.
vectorizer_params = vectorizer.get_params()
model_params = model.get_params()
recorded_hyperparameters = {
    **{key: vectorizer_params[key] for key in ("max_features", "stop_words", "min_df", "max_df")},
    **{key: model_params[key] for key in ("max_iter", "random_state")},
}

# Save results using clean keyword arguments
save_model_results(
    model_name="baseline_lr",
    display_name="Baseline LogisticRegression",
    accuracy=accuracy,
    training_time_minutes=training_time_minutes,
    model_architecture="LogisticRegression with CountVectorizer (10k features)",
    preprocessing_type="standardized_clean_text",
    hyperparameters=recorded_hyperparameters,
    dataset_info={
        "training_samples": len(X_train),
        "validation_samples": len(X_val),
        "vocabulary_size": len(vectorizer.vocabulary_)
    }
)

print("\nModel results saved successfully for comparison!")