In [1]:
from sklearn.datasets import fetch_rcv1
# Load the RCV1 dataset object; later cells slice its `.data` and `.target` attributes.
rcv1 = fetch_rcv1()


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import average_precision_score, classification_report

In [3]:
# Restrict the experiment to the first 10,000 rows of features and labels.
X, y = rcv1.data[:10000], rcv1.target[:10000]

# Hold out 20% of those rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Baseline model: One-vs-Rest wraps a separate binary logistic regression per label.
# NOTE(review): the recorded classification report shows recall near 1.0 with
# precision below 0.2 on several low-support labels; balanced class weights
# combined with the fixed 0.5 cut-off used later may be contributing — confirm.
logreg = OneVsRestClassifier(
    LogisticRegression(
        solver="saga",            # stochastic solver; supports L1/L2 penalties
        penalty="l2",
        C=1.0,
        max_iter=200,
        class_weight="balanced",  # re-weight classes to counter label imbalance
        random_state=42,          # saga shuffles the data; seed it (same value as the split)
    ),
    n_jobs=-1,  # parallelism is applied here, across the per-label fits
)
logreg.fit(X_train, y_train)

# Per-label scores, used for PR-AUC and for thresholding into predictions.
y_score_lr = logreg.predict_proba(X_test)

In [6]:
# Densify the labels and the scores: anything exposing `.toarray()` is converted,
# anything else is passed through untouched.
y_test_dense, y_score_lr_dense = (
    matrix.toarray() if hasattr(matrix, "toarray") else matrix
    for matrix in (y_test, y_score_lr)
)

In [8]:
from sklearn.metrics import f1_score, average_precision_score, classification_report
import numpy as np

# Micro-averaged average precision (reported as PR-AUC) computed from the raw scores.
pr_auc_micro_lr = average_precision_score(
    y_test_dense,
    y_score_lr_dense,
    average="micro",
)

# Hard predictions: a label counts as predicted when its score is at least 0.5.
y_pred_lr = (y_score_lr_dense >= 0.5).astype(int)

# F1 under both averaging schemes, computed from the same thresholded predictions.
f1_micro_lr, f1_macro_lr = (
    f1_score(y_test_dense, y_pred_lr, average=averaging, zero_division=0)
    for averaging in ("micro", "macro")
)

print(f"LR One-vs-Rest — PR-AUC (micro): {pr_auc_micro_lr:.4f}")
print(f"LR One-vs-Rest — F1 micro: {f1_micro_lr:.4f} | F1 macro: {f1_macro_lr:.4f}")

# Per-label precision / recall / F1 table (optional detail).
print(classification_report(y_test_dense, y_pred_lr, zero_division=0))

LR One-vs-Rest — PR-AUC (micro): 0.1376
LR One-vs-Rest — F1 micro: 0.3064 | F1 macro: 0.2892
              precision    recall  f1-score   support

           0       0.16      0.78      0.26        58
           1       0.15      0.93      0.26        28
           2       0.17      0.84      0.29        89
           3       1.00      0.06      0.12        16
           4       0.91      0.90      0.91       397
           5       0.84      0.93      0.88       232
           6       0.13      0.97      0.23        34
           7       0.70      0.85      0.77       178
           8       0.03      1.00      0.06         5
           9       0.88      0.16      0.27        95
          10       0.80      0.32      0.45        38
          11       0.13      0.85      0.22        20
          12       0.02      1.00      0.04         5
          13       0.15      0.95      0.27        20
          14       0.59      0.90      0.71       119
          15       0.73      0.54      0.6

## Logistic Regression Results vs. Neural Network Results

| Metric         | Logistic Regression         | Neural Network                                 |
| -------------- | --------------------------- | ---------------------------------------------- |
| PR-AUC (micro) | **0.14**                    | **~0.58**                                      |
| F1 (micro)     | **0.31**                    | **~0.82**                                      |
| F1 (macro)     | **0.29**                    | **~0.45**                                      |
| Interpretation | Learns only linear patterns | Learns non-linear and contextual relationships |

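A One-vs-Rest Logistic Regression was used as a baseline for multi-label classification. It achieved high recall on many topics, but its precision and PR-AUC were low (PR-AUC = 0.14, F1 micro = 0.31). In contrast, the neural network achieved significantly higher performance (PR-AUC ≈ 0.58, F1 micro ≈ 0.82), which suggests that non-linear modeling captures more complex relationships between features and topic labels. One caveat: the per-class report shows several low-support labels with recall near 1.0 but precision below 0.2, a pattern consistent with `class_weight="balanced"` combined with a fixed 0.5 threshold, so part of the gap may reflect how the baseline was configured rather than the limits of a linear model alone.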

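**NOTE:** One-vs-Rest (OvR), also called One-vs-All (OvA), is a strategy for extending Logistic Regression, which is naturally a binary classifier (i.e., it predicts between two classes), so that it can handle multi-class or multi-label problems. It trains one independent binary classifier per class (or label), each separating that class from all the others; in the multi-label setting used here, every label whose classifier fires is predicted.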

