# Notebook 1: Baseline Models (Logistic Regression & BiLSTM)

This notebook establishes baseline performance using traditional machine learning and simple deep learning approaches.

## Contents
1. Load and Split Data
2. Logistic Regression Baseline (TF-IDF)
3. BiLSTM Baseline
4. Compare Baselines
5. Qualitative Examples


In [None]:
import sys
sys.path.append('..')

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from src.config import DATA_DIR, MODELS_DIR, REPORTS_DIR, LABELS
from src.data_utils import load_raw_jigsaw, train_valid_test_split, build_dataloaders_rnn, basic_text_clean
from src.metrics import compute_classification_metrics
from src.training.bilstm_utils import train_bilstm_model, train_epoch_bilstm, eval_epoch_bilstm
from src.models.rnn_models import BiLSTMClassifier


## 1. Load and Split Data


In [None]:
# Read the raw Jigsaw training CSV and carve it into train/valid/test frames.
df = load_raw_jigsaw(DATA_DIR / "jigsaw_train.csv")
train_df, valid_df, test_df = train_valid_test_split(df)

# Report the size of each split so the reader can sanity-check the partition.
for split_name, split_frame in (("Train", train_df), ("Valid", valid_df), ("Test", test_df)):
    print(f"{split_name}: {len(split_frame)}")


## 2. Logistic Regression Baseline

Logistic Regression with TF-IDF features serves as a strong traditional ML baseline.


In [None]:
# Run every raw comment through the shared cleaning helper before vectorizing.
train_texts = list(map(basic_text_clean, train_df["comment_text"].tolist()))
valid_texts = list(map(basic_text_clean, valid_df["comment_text"].tolist()))

# Multi-label targets: the LABELS columns of each split as a 2-D array.
y_train = train_df[LABELS].to_numpy()
y_valid = valid_df[LABELS].to_numpy()

# Unigram + bigram TF-IDF, capped at 10k features; terms seen in fewer than
# 3 training documents are dropped.
print("Fitting TF-IDF vectorizer...")
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=3,
)

# Fit on the training texts only; the validation texts are projected onto the
# vocabulary learned from training so no validation statistics leak in.
X_train_tfidf = tfidf.fit_transform(train_texts)
X_valid_tfidf = tfidf.transform(valid_texts)

print(f"TF-IDF shape: {X_train_tfidf.shape}")


In [None]:
# One independent LogisticRegression is fitted per label column via
# MultiOutputClassifier.
# NOTE(review): max_iter=100 may not be enough for lbfgs to converge on these
# features — watch for a ConvergenceWarning and raise it if one appears.
print("Training Logistic Regression...")
base_estimator = LogisticRegression(max_iter=100, C=4.0, solver='lbfgs')
logreg = MultiOutputClassifier(base_estimator)
logreg.fit(X_train_tfidf, y_train)

# Collect, for each per-label estimator, column 1 of predict_proba and place
# the results side by side -> array of shape (n_valid, n_labels).
# Assumes each label has both classes in training so column 1 exists and is
# the positive class — TODO confirm.
positive_probs = [
    label_clf.predict_proba(X_valid_tfidf)[:, 1]
    for label_clf in logreg.estimators_
]
y_prob_logreg = np.column_stack(positive_probs)

# Score the probabilities at a fixed 0.5 decision threshold.
metrics_logreg = compute_classification_metrics(
    y_valid,
    y_prob_logreg,
    threshold=0.5,
    label_names=LABELS,
)

print("\nLogistic Regression Results:")
for metric_title, metric_key in (("Macro F1", "macro_f1"), ("Micro F1", "micro_f1")):
    print(f"{metric_title}: {metrics_logreg[metric_key]:.4f}")


## 3. BiLSTM Baseline

Train a BiLSTM model using our reusable training utilities.


In [None]:
# Option 1: Train from scratch (uncomment to run full training)
# model, vocab, metrics_bilstm = train_bilstm_model(epochs=3)

# Option 2: Quick demo with a small, explicit number of epochs.
# The epoch count lives in one place so the banner below, the loop, and the
# reader's expectations cannot drift apart.
NUM_DEMO_EPOCHS = 2

# Seed PyTorch's RNG so weight initialisation (and any torch-driven shuffling
# inside the data loaders, if build_dataloaders_rnn shuffles — not visible
# here) is repeatable across Restart & Run All. GPU kernels may still be
# non-deterministic.
SEED = 42
torch.manual_seed(SEED)

print(f"Training BiLSTM for {NUM_DEMO_EPOCHS} epochs (demo)...")
train_loader, valid_loader, vocab = build_dataloaders_rnn(train_df, valid_df)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

vocab_size = len(vocab)
num_labels = len(LABELS)
model_bilstm = BiLSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=128,
    num_labels=num_labels,
    pad_idx=vocab["<pad>"],
).to(device)

# BCEWithLogitsLoss applies the sigmoid internally, one independent binary
# decision per label.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model_bilstm.parameters(), lr=1e-3)

# Train and evaluate once per epoch. After the loop, `metrics_bilstm` and
# `y_prob_bilstm` hold the values from the final epoch; later cells read them.
for epoch in range(1, NUM_DEMO_EPOCHS + 1):
    train_loss = train_epoch_bilstm(model_bilstm, train_loader, criterion, optimizer, device)
    y_true, y_prob_bilstm = eval_epoch_bilstm(model_bilstm, valid_loader, device)
    metrics_bilstm = compute_classification_metrics(y_true, y_prob_bilstm, threshold=0.5, label_names=LABELS)
    print(f"Epoch {epoch} - Loss: {train_loss:.4f}, Macro F1: {metrics_bilstm['macro_f1']:.4f}")


## 4. Compare Baselines


In [None]:
# Build one record per label with both models' F1 and the BiLSTM-minus-LogReg gap.
logreg_per_label = metrics_logreg["per_label"]
bilstm_per_label = metrics_bilstm["per_label"]
comparison_data = [
    {
        "Label": label,
        "LogReg F1": logreg_per_label[label]["f1"],
        "BiLSTM F1": bilstm_per_label[label]["f1"],
        "Improvement": bilstm_per_label[label]["f1"] - logreg_per_label[label]["f1"],
    }
    for label in LABELS
]

comparison_df = pd.DataFrame(comparison_data)
print("\nPer-Label F1 Comparison:")
print(comparison_df.to_string(index=False))

# Grouped bar chart: two bars per label, centred on the label's tick.
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(LABELS))
width = 0.35

# (horizontal offset, DataFrame column, legend entry, bar colour)
bar_specs = (
    (-width / 2, "LogReg F1", 'Logistic Regression', 'steelblue'),
    (+width / 2, "BiLSTM F1", 'BiLSTM', 'coral'),
)
for offset, column, legend_name, bar_colour in bar_specs:
    ax.bar(x + offset, comparison_df[column], width, label=legend_name, color=bar_colour)

ax.set(
    xlabel='Label',
    ylabel='F1 Score',
    title='Baseline Model Comparison: F1 Score per Label',
)
ax.set_xticks(x)
ax.set_xticklabels(LABELS, rotation=45)
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Headline aggregate scores for each model, one line per model.
print("\nOverall Comparison:")
for model_name, model_metrics in (("LogReg", metrics_logreg), ("BiLSTM", metrics_bilstm)):
    print(f"{model_name} - Macro F1: {model_metrics['macro_f1']:.4f}, Micro F1: {model_metrics['micro_f1']:.4f}")


## 5. Qualitative Examples


In [None]:
# Print the text, gold labels, and both models' per-label probabilities for a
# few hand-picked validation rows.
# NOTE(review): assumes row i of y_prob_logreg / y_prob_bilstm corresponds to
# valid_df.iloc[i] (i.e. the validation loader preserves order) — confirm.
sample_indices = [0, 10, 20]
separator = '=' * 80
model_outputs = (("LogReg", y_prob_logreg), ("BiLSTM", y_prob_bilstm))

for idx in sample_indices:
    row = valid_df.iloc[idx]
    gold_labels = [name for name in LABELS if row[name] == 1]

    print(f"\n{separator}")
    print(f"Example {idx}:")
    # Only the first 150 characters of the comment are shown.
    print(f"Text: {row['comment_text'][:150]}...")
    print(f"\nTrue labels: {gold_labels}")

    for model_name, probs in model_outputs:
        print(f"\n{model_name} predictions:")
        for col, label in enumerate(LABELS):
            print(f"  {label:15s}: {probs[idx, col]:.3f}")


## Summary

**Key Findings:**
1. **Logistic Regression**: Strong traditional baseline, especially for common labels
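2. **BiLSTM**: Models word order and context that bag-of-n-gram features cannot; see the per-label comparison above for where this actually improves on Logistic Regression after the short demo training run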
3. **Rare Labels**: Both models struggle with `threat` and `identity_hate` due to class imbalance

**Next Steps:**
- Try transformer models (BERT, DistilBERT) for better context understanding
- Add lexicon features to help with rare labels  
- Experiment with class balancing techniques
