# Train risk classification model

Load the dataset (or generate a synthetic one), build features, train a Random Forest model, and save both the model and the dataset.

In [None]:
import sys
# Put the parent of the working directory at the front of the import path.
# NOTE(review): presumably this is what lets the `ml` package imports below
# resolve when the notebook runs from its own folder — confirm the layout.
sys.path.insert(0, "..")

from pathlib import Path
import pandas as pd

# Project-local training helpers and the feature-name list used for plotting.
from ml.train_model import generate_synthetic_data, build_features, train
from ml.preprocessing import ALL_FEATURES

In [None]:
# Load the triage dataset from disk, or generate and persist it if missing.
# Paths are relative to the notebook's working directory.
DATA_DIR = Path("../data")
CSV_PATH = DATA_DIR / "triage_dataset.csv"

if CSV_PATH.exists():
    # Option A: reuse the dataset that is already on disk.
    df = pd.read_csv(CSV_PATH)
    print(f"Loaded {len(df)} rows from {CSV_PATH}")
else:
    # Option B: generate 2500 synthetic rows and save them for later runs.
    df = generate_synthetic_data(2500)
    # to_csv does not create missing parent directories, so on a fresh
    # checkout (no ../data folder yet) the write would fail without this.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    df.to_csv(CSV_PATH, index=False)
    print(f"Generated and saved {len(df)} rows to {CSV_PATH}")

In [None]:
# Train the model by calling train() with 2500 samples and save_dataset=True.
# NOTE(review): the `df` prepared in the previous cell is not passed in here;
# train() presumably generates its own synthetic data and writes it to data/
# — confirm against ml.train_model.
training_result = train(n_samples=2500, save_dataset=True)
model, scaler, summary = training_result
print("Summary:", summary)

In [None]:
# Plot the 15 largest feature importances as a horizontal bar chart.
import matplotlib.pyplot as plt

# Label each importance score with its feature name.
# assumes ALL_FEATURES is in the same order as the model's input columns — TODO confirm
fi = pd.Series(data=model.feature_importances_, index=ALL_FEATURES)
fi = fi.sort_values()  # ascending order, so the largest scores come last
top_importances = fi.tail(15)
top_importances.plot(kind="barh", figsize=(8, 6), title="Top feature importance")
plt.tight_layout()
plt.show()