# PubMed RCT - Baseline Model (TF-IDF + Naive Bayes)

This notebook builds a baseline classifier using TF-IDF features and Multinomial Naive Bayes.

**Approach:**
- TF-IDF vectorization (unigrams + bigrams)
- Multinomial Naive Bayes classifier
- scikit-learn `Pipeline`, so vectorization and classification are fit and applied as one estimator

In [None]:
# ============================================================
# Setup: auto-download dataset (works on Colab & locally)
# ============================================================
import os
import subprocess

DATA_DIR = "../data/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

if not os.path.exists(DATA_DIR):
    print("Dataset not found locally. Downloading...")
    os.makedirs("../data", exist_ok=True)
    # check=True raises CalledProcessError when git exits non-zero, so a failed
    # clone stops the notebook here instead of printing a success message and
    # failing later when the data files are opened.
    # --depth 1 fetches only the latest snapshot; the history is not needed.
    subprocess.run(
        [
            "git", "clone", "--depth", "1",
            "https://github.com/Franck-Dernoncourt/pubmed-rct.git",
            "../data/pubmed-rct",
        ],
        check=True,
    )
    # Guard against a clone that succeeded but does not contain the expected
    # dataset folder (e.g. the repository layout changed).
    if not os.path.isdir(DATA_DIR):
        raise FileNotFoundError(f"Clone finished but {DATA_DIR!r} was not found.")
    print("Dataset downloaded!")
else:
    print("Dataset already available.")

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

np.random.seed(42)
%matplotlib inline

## Load Data

In [None]:
def _abstract_to_samples(abstract_lines):
    """Convert the raw lines of one abstract into per-sentence sample dicts."""
    last_index = len(abstract_lines) - 1
    samples = []
    for i, raw in enumerate(abstract_lines):
        # Split on the first tab only, so a sentence that itself contains a
        # tab is kept instead of being silently skipped.
        target, tab, text = raw.partition("\t")
        if not tab:
            continue  # no label/text separator on this line
        samples.append({
            "target": target,
            "text": text.lower(),
            "line_number": i,
            "total_lines": last_index,
        })
    return samples


def load_pubmed_data(filepath):
    """Load and preprocess PubMed RCT data from a text file.

    Expected layout: a header line starting with ``###``, then one
    ``LABEL<TAB>sentence`` line per sentence, then a whitespace-only line that
    ends the abstract.

    Args:
        filepath: Path of the text file; it is read as UTF-8.

    Returns:
        A list of dicts, one per sentence, with keys:
            target: the label before the first tab.
            text: the lower-cased text after the first tab.
            line_number: 0-based index of the line within its abstract.
            total_lines: number of lines in the abstract minus one.
    """
    samples = []
    pending = []  # lines of the abstract currently being read

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("###") or line.isspace():
                # Abstract boundary: emit what was collected, then clear the
                # buffer so a second blank line cannot emit the same abstract
                # again and an unterminated abstract is not lost at the next
                # header.
                samples.extend(_abstract_to_samples(pending))
                pending = []
            else:
                pending.append(line.rstrip("\n"))

    # The last abstract may not be followed by a blank line.
    samples.extend(_abstract_to_samples(pending))
    return samples

In [None]:
DATA_DIR = "../data/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"
CLASS_NAMES = ["BACKGROUND", "OBJECTIVE", "METHODS", "RESULTS", "CONCLUSIONS"]

# Parse the three split files (train / dev / test) into lists of sample dicts.
train_samples, val_samples, test_samples = (
    load_pubmed_data(os.path.join(DATA_DIR, split_file))
    for split_file in ("train.txt", "dev.txt", "test.txt")
)

# One DataFrame per split, one row per sentence.
train_df, val_df, test_df = (
    pd.DataFrame(split_samples)
    for split_samples in (train_samples, val_samples, test_samples)
)

# Features are the sentence texts; labels are the section names.
X_train, X_val, X_test = (
    split_df["text"].to_numpy() for split_df in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    split_df["target"].to_numpy() for split_df in (train_df, val_df, test_df)
)

print(f"Train: {len(X_train):,}  |  Val: {len(X_val):,}  |  Test: {len(X_test):,}")

## Build and Train the Model

We use a Pipeline that combines:
1. **TfidfVectorizer**: converts text to TF-IDF vectors (unigrams + bigrams, max 10k features)
2. **MultinomialNB**: Naive Bayes classifier

In [None]:
# Two-stage pipeline: TF-IDF features (unigrams + bigrams, capped at 10,000
# terms) followed by a Multinomial Naive Bayes classifier.
model = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=10000,
            min_df=2,
            max_df=0.95,
            sublinear_tf=True,
        ),
    ),
    ("clf", MultinomialNB(alpha=1.0)),
])

# Fitting the pipeline fits the vectorizer and the classifier in sequence.
model.fit(X_train, y_train)

print(
    f"Vocabulary size: {len(model.named_steps['tfidf'].vocabulary_):,}",
    "Training done.",
    sep="\n",
)

## Evaluation

In [None]:
# Validation set
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f"Validation accuracy: {val_acc*100:.2f}%")
print()
# labels= pins the report rows to the CLASS_NAMES order. Without it,
# scikit-learn orders rows by sorted label and applies target_names
# positionally; CLASS_NAMES is not alphabetical, so rows would carry the
# wrong class name.
print(classification_report(y_val, val_preds, labels=CLASS_NAMES,
                            target_names=CLASS_NAMES, digits=4))

In [None]:
# Test set
test_preds = model.predict(X_test)
test_acc = accuracy_score(y_test, test_preds)
print(f"Test accuracy: {test_acc*100:.2f}%")
print()
# labels= pins the report rows to the CLASS_NAMES order. Without it,
# scikit-learn orders rows by sorted label and applies target_names
# positionally; CLASS_NAMES is not alphabetical, so rows would carry the
# wrong class name.
print(classification_report(y_test, test_preds, labels=CLASS_NAMES,
                            target_names=CLASS_NAMES, digits=4))

## Confusion Matrix

In [None]:
# Rows are true labels and columns are predictions, both in CLASS_NAMES order.
cm = confusion_matrix(y_test, test_preds, labels=CLASS_NAMES)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=CLASS_NAMES,
    yticklabels=CLASS_NAMES,
    ax=ax,
)
ax.set_title("Confusion Matrix - Baseline Model")
ax.set_ylabel("True Label")
ax.set_xlabel("Predicted Label")
fig.tight_layout()
plt.show()

## Error Analysis

In [None]:
# Keep only the test sentences whose predicted label differs from the true one.
errors = pd.DataFrame(
    {"text": X_test, "true": y_test, "pred": test_preds}
).loc[lambda frame: frame["true"] != frame["pred"]]

error_rate = len(errors) / len(X_test) * 100
print(f"Total errors: {len(errors)} / {len(X_test)}  ({error_rate:.2f}%)")
print()

# Count each (true, predicted) pair, most frequent first.
print("Most frequent confusions:")
confusion_counts = (
    errors
    .groupby(["true", "pred"])
    .size()
    .sort_values(ascending=False)
)
print(confusion_counts.head(5))
print()

# Show the first three misclassified sentences, truncated to 100 characters.
print("Sample misclassified sentences:")
for row in errors.head(3).itertuples(index=False):
    print(f"  True: {row.true}  |  Pred: {row.pred}")
    print(f"  Text: {row.text[:100]}...")
    print()

## Save Results

In [None]:
# Make sure the output folder exists before writing.
os.makedirs("../results", exist_ok=True)

# Cast to plain Python floats so the values serialise as JSON numbers.
results = dict(
    model_name="Baseline (TF-IDF + Naive Bayes)",
    val_accuracy=float(val_acc),
    test_accuracy=float(test_acc),
)

with open("../results/baseline_results.json", "w") as f:
    f.write(json.dumps(results, indent=2))

print(f"Results saved. Test accuracy = {test_acc*100:.2f}%")