# BIOSTAT 826 - Assignment 1
Mortality prediction in MIMIC-IV using ICD-10 diagnosis and procedure categories.

## Setup

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from utils.data import assemble_dataset, load_code_descriptions
from utils.evaluation import sigmoid
from utils.training import (
    LogisticRegressionModel,
    build_dataloader,
    coefficient_table,
    extract_linear_weights,
    set_seed,
    train_lr_sweep_with_models,
    tune_l1_strength,
)

In [None]:
# Seed first, then assemble the dataset from the MIMIC-IV `hosp` directory
# using the same seed value.
set_seed(826)
data_dir = Path('/home/rl/mimic-iv-3.1/mimic-iv-3.1/hosp')
bundle = assemble_dataset(data_dir, min_count=10, seed=826)
desc = load_code_descriptions(data_dir)

# Unpack the bundle; the labels are cast to float32.
X, y = bundle.X, bundle.y.astype(np.float32)
splits, feature_names = bundle.splits, bundle.feature_names

# Index features and labels with the entry stored for each split.
train_idx, val_idx, test_idx = splits['train'], splits['val'], splits['test']
X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Quick summary of the assembled data.
for label, value in (
    ('n_admissions:', X.shape[0]),
    ('n_features:', X.shape[1]),
    ('mortality_rate:', float(y.mean())),
):
    print(label, value)

## Part 2 - Logistic Regression

In [None]:
# Build the three loaders in train/val/test order with a shared batch size;
# only the training loader is built with shuffle=True.
train_loader, val_loader, test_loader = (
    build_dataloader(features, labels, batch_size=200, shuffle=reshuffle)
    for features, labels, reshuffle in (
        (X_train, y_train, True),
        (X_val, y_val, False),
        (X_test, y_test, False),
    )
)

### Part 2.1/2.2: baseline + learning-rate sweep

In [None]:
lrs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]


def make_logreg():
    """Construct a new LogisticRegressionModel sized by X.shape[1]."""
    return LogisticRegressionModel(X.shape[1])


# Run the sweep over every learning rate in `lrs`.
sweep = train_lr_sweep_with_models(
    make_logreg,
    train_loader,
    val_loader,
    lrs,
    max_epochs=40,
    patience=6,
)

# One summary row per learning rate: length of the recorded training-loss
# history, the reported best epoch, and the lowest recorded validation loss.
summary = [
    {
        'lr': rate,
        'epochs': len(run.result.history['train_loss']),
        'best_epoch': run.result.best_epoch,
        'best_val_loss': min(run.result.history['val_loss']),
    }
    for rate, run in sweep.items()
]
summary_df = pd.DataFrame(summary)
summary_df = summary_df.sort_values('lr')
summary_df = summary_df.reset_index(drop=True)
summary_df

In [None]:
# One training-loss curve per learning rate, all drawn on a single axis.
fig, ax = plt.subplots(figsize=(8, 5))
for rate, run in sweep.items():
    ax.plot(run.result.history['train_loss'], label=f'lr={rate:g}')
ax.set_xlabel('Epoch')
ax.set_ylabel('Mini-batch training loss')
ax.set_title('Logistic regression learning-rate sweep')
ax.legend()
fig.tight_layout()
plt.show()

### Part 2.3: top/bottom 20 log-odds categories with text descriptions

In [None]:
# Pick the sweep entry whose summary row has the lowest best validation loss.
ranked_by_val = summary_df.sort_values('best_val_loss')
best_row = ranked_by_val.iloc[0]
best_lr = float(best_row['lr'])
best_model = sweep[best_lr].model

# Pull the linear weights and build the top-k / bottom-k coefficient tables.
best_weights = extract_linear_weights(best_model)
top20, bottom20 = coefficient_table(
    best_weights,
    feature_names,
    desc,
    top_k=20,
)

print(f'Best learning rate by validation loss: {best_lr:g}')
for heading, table in (
    ('Top 20 categories increasing log-odds', top20),
    ('Top 20 categories decreasing log-odds', bottom20),
):
    print(heading)
    display(table)

### Part 2.4: L1 regularization and sparse coefficients

In [None]:
l1_grid = [0.0, 1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3]


def make_logreg():
    """Construct a new LogisticRegressionModel sized by X.shape[1]."""
    return LogisticRegressionModel(X.shape[1])


# Settings passed to the L1 search; the learning rate is fixed at 1e-3 here.
l1_search_kwargs = {
    'target_sparsity': 0.85,
    'lr': 1e-3,
    'max_epochs': 40,
    'patience': 6,
}
chosen_l1, l1_trained, l1_table = tune_l1_strength(
    make_logreg,
    train_loader,
    val_loader,
    l1_grid,
    **l1_search_kwargs,
)

print('Chosen L1 lambda:', chosen_l1)
l1_table

In [None]:
# Same coefficient tables as for the unregularized model, now for the model
# returned by the L1 search.
l1_model = l1_trained.model
l1_weights = extract_linear_weights(l1_model)
l1_top20, l1_bottom20 = coefficient_table(
    l1_weights,
    feature_names,
    desc,
    top_k=20,
)

for heading, table in (
    ('L1 model top 20 categories increasing log-odds', l1_top20),
    ('L1 model top 20 categories decreasing log-odds', l1_bottom20),
):
    print(heading)
    display(table)

## Notes for Remaining Parts
Part 2 is fully wired. The cells for Parts 3 and 4 will be added next, reusing the same utility modules.