# Colab: Train the Breast Cancer Diagnosis Predictor

## 1) Setup

In [None]:

!pip -q install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.5.2 matplotlib==3.9.2 joblib==1.4.2


## 2) Load or upload processed data
- Upload `X_train.csv`, `y_train.csv`, `X_test.csv`, `y_test.csv` to `/content/data/processed/`

In [None]:

import os, pandas as pd, pathlib
# Root of the Colab workspace; the processed CSVs are expected under data/processed.
base = pathlib.Path('/content')
processed_dir = base / 'data/processed'
# Create the folder (and any missing parents); a no-op when it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)
# from google.colab import files  # Uncomment if you want to upload manually
# files.upload()  # then move files into data/processed


## 3) Copy in minimal training code

In [None]:

import argparse, pathlib, joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

def main(data_dir, outdir):
    """Train the scaler + MLP pipeline and save it with joblib.

    Args:
        data_dir: Directory containing ``X_train.csv`` and ``y_train.csv``.
        outdir: Directory that receives ``model.joblib``; created if missing.

    Prints a classification report for a stratified 20% hold-out taken from
    the training data, then the path of the saved model.
    """
    source_dir = pathlib.Path(data_dir)
    features = pd.read_csv(source_dir / "X_train.csv")
    labels = pd.read_csv(source_dir / "y_train.csv").squeeze()

    # Hold out 20% of the rows for validation, keeping the label proportions.
    feat_fit, feat_holdout, lab_fit, lab_holdout = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    # Standardise the features, then fit a two-hidden-layer MLP.
    scaler = StandardScaler()
    network = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)
    model = Pipeline([("scaler", scaler), ("clf", network)])
    model.fit(feat_fit, lab_fit)

    holdout_pred = model.predict(feat_holdout)
    print(classification_report(lab_holdout, holdout_pred))

    # Persist the whole fitted pipeline (scaler + classifier) as a single artifact.
    out = pathlib.Path(outdir)
    out.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, out / "model.joblib")
    print(f"Saved model to {out / 'model.joblib'}")

if __name__ == "__main__":
    # Command-line entry point used when this cell's source is saved and run as a script.
    ap = argparse.ArgumentParser()
    ap.add_argument("--data", required=True, help="directory containing X_train.csv and y_train.csv")
    ap.add_argument("--outdir", default="models")
    try:
        get_ipython  # noqa: F821 - this name exists only inside an IPython/Jupyter kernel
    except NameError:
        # Plain `python <script> ...`: parse the real command line and train.
        args = ap.parse_args()
        main(args.data, args.outdir)
    else:
        # Inside the notebook kernel, sys.argv holds the kernel's own arguments, so
        # parse_args() would exit with "--data is required" and abort Run All.
        # Only define main() here; the script is run from a later cell.
        print("Notebook kernel detected: main() is defined; skipping command-line parsing.")


## 4) Train

In [None]:

# Save the train.py content to a file then run it
with open('/content/train.py', 'w') as f:
    f.write(_ih[-3])  # writes the previous code cell content (train_py)
!python /content/train.py --data /content/data/processed --outdir /content/models


## 5) Evaluate

In [None]:

import argparse, joblib, pathlib
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

def main(data_dir, model_path):
    """Evaluate a saved model on the test split and save diagnostic plots.

    Args:
        data_dir: Directory containing ``X_test.csv`` and ``y_test.csv``.
        model_path: Path to the joblib file holding the fitted model.

    Prints a classification report and, when probabilities are available,
    the ROC AUC. Saves ``roc_curve.png`` and ``confusion_matrix.png`` under
    ``assets/screenshots/`` relative to the current working directory,
    creating that directory when it does not exist.
    """
    import numpy as np

    data_dir = pathlib.Path(data_dir)
    X = pd.read_csv(data_dir / "X_test.csv")
    y = pd.read_csv(data_dir / "y_test.csv").squeeze()

    # NOTE(security): joblib.load unpickles the file; only load model files you trust.
    model = joblib.load(model_path)
    y_pred = model.predict(X)

    print("Classification report:")
    print(classification_report(y, y_pred))

    # savefig does not create missing directories, so make sure the target exists
    # before either plot is written.
    plot_dir = pathlib.Path("assets/screenshots")
    plot_dir.mkdir(parents=True, exist_ok=True)

    try:
        # If the classifier supports predict_proba
        y_prob = model.predict_proba(X)[:, 1]
        auc = roc_auc_score(y, y_prob)
        print(f"ROC AUC: {auc:.4f}")
        RocCurveDisplay.from_predictions(y, y_prob)
        plt.title("ROC Curve")
        plt.savefig(plot_dir / "roc_curve.png", bbox_inches="tight")
        plt.close()  # release the ROC figure once it is on disk
        print("Saved ROC curve to assets/screenshots/roc_curve.png")
    except Exception as e:
        # Best effort: the ROC plot is optional, so report the problem and continue.
        print("Could not compute ROC curve (no predict_proba?):", e)

    # Confusion matrix
    cm = confusion_matrix(y, y_pred)
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap="Blues")
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    # Write each count into its cell (column j on the x axis, row i on the y axis).
    for (i, j), val in np.ndenumerate(cm):
        ax.text(j, i, int(val), ha="center", va="center")
    fig.savefig(plot_dir / "confusion_matrix.png", bbox_inches="tight")
    plt.close(fig)
    print("Saved confusion matrix to assets/screenshots/confusion_matrix.png")

if __name__ == "__main__":
    # Command-line entry point used when this cell's source is saved and run as a script.
    ap = argparse.ArgumentParser()
    ap.add_argument("--data", required=True, help="directory containing X_test.csv and y_test.csv")
    ap.add_argument("--model", required=True, help="path to model.joblib")
    try:
        get_ipython  # noqa: F821 - this name exists only inside an IPython/Jupyter kernel
    except NameError:
        # Plain `python <script> ...`: parse the real command line and evaluate.
        args = ap.parse_args()
        main(args.data, args.model)
    else:
        # Inside the notebook kernel, sys.argv holds the kernel's own arguments, so
        # parse_args() would exit on the missing required options and abort Run All.
        # Only define main() here; the script is run from a later cell.
        print("Notebook kernel detected: main() is defined; skipping command-line parsing.")


In [None]:

with open('/content/evaluate.py', 'w') as f:
    f.write(_ih[-2])  # writes evaluate_py
!python /content/evaluate.py --data /content/data/processed --model /content/models/model.joblib


## 6) Inference (optional)

In [None]:

import argparse, joblib, pandas as pd

def main(model_path, input_csv, output_csv):
    """Predict with a saved model and write the results to a CSV file.

    Args:
        model_path: Path to the joblib file holding the fitted model.
        input_csv: CSV file with the feature rows to score.
        output_csv: Destination CSV; it gets a single ``prediction`` column
            and no index column.
    """
    estimator = joblib.load(model_path)
    features = pd.read_csv(input_csv)

    # One prediction per input row, stored under a single named column.
    results = pd.DataFrame({"prediction": estimator.predict(features)})
    results.to_csv(output_csv, index=False)

    print(f"Saved predictions to {output_csv}")

if __name__ == "__main__":
    # Command-line entry point used when this cell's source is saved and run as a script.
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--input", required=True)
    ap.add_argument("--output", default="predictions.csv")
    try:
        get_ipython  # noqa: F821 - this name exists only inside an IPython/Jupyter kernel
    except NameError:
        # Plain `python <script> ...`: parse the real command line and run inference.
        args = ap.parse_args()
        main(args.model, args.input, args.output)
    else:
        # Inside the notebook kernel, sys.argv holds the kernel's own arguments, so
        # parse_args() would exit on the missing required options and abort Run All.
        # Only define main() here; the script can be run from a later cell.
        print("Notebook kernel detected: main() is defined; skipping command-line parsing.")


In [None]:

# `_ih[-2]` is the source of the cell executed just before this one (the inference
# code); save it as /content/infer.py so it can be run as a script.
infer_src = _ih[-2]
with open('/content/infer.py', 'w') as script:
    script.write(infer_src)
# Example: create a dummy sample (replace with real features)
# import pandas as pd
# pd.DataFrame({...}).to_csv('/content/sample.csv', index=False)
# !python /content/infer.py --model /content/models/model.joblib --input /content/sample.csv --output /content/predictions.csv
