# Myc/Max — Random Forest Reproducible Notebook (v2)

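Repo layout assumed (relative to the directory the notebook is launched from, because all paths are resolved from `Path.cwd()`):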

```
ML_models/ML_RF/
├─ Data/                 # outputs + torch_prep_kfold.py (as in your tree)
├─ Inputs/               # exp_data_all.csv, rawdat.csv, ...
├─ Model/                # RF models
├─ run_model.py
└─ hyperopt.py (optional)
```

All subprocesses run with `cwd=ML_models/ML_RF` so outputs land in `ML_models/ML_RF/Data/`.

## 0) Paths, config, and helpers

In [None]:
from pathlib import Path
import sys, subprocess, json, os

# Every path below is resolved from the directory the notebook was launched
# in, so start Jupyter from the repository root.
REPO = Path.cwd()
MLRF = REPO / "ML_models" / "ML_RF"
DATA = MLRF / "Data"
MODEL = MLRF / "Model"
INPUTS = MLRF / "Inputs"

# Pipeline scripts that the later cells drive through subprocesses.
TORCH_PREP = MLRF / "Data" / "torch_prep_kfold.py"
RUN_MODEL  = MLRF / "run_model.py"

# Create the working directories up front so no later step fails on a
# missing folder.
for p in [DATA, MODEL, INPUTS]:
    p.mkdir(parents=True, exist_ok=True)

# Quick sanity report: a False here usually means the notebook was started
# from the wrong directory.
print("Notebook CWD:", REPO)
print("MLRF exists:", MLRF.exists())
print("Scripts exist:", TORCH_PREP.exists(), RUN_MODEL.exists())

# Column names shared by the reference table and the feature table(s).
SEQUENCE_ID = "sequence"
LABEL_COL   = "bind_avg"

REFERENCE_FILE = INPUTS / "exp_data_all.csv"
# rawdat.csv lives in Inputs/ next to the reference file (see the repo layout
# at the top of the notebook), not directly under ML_RF/.
FEATURE_FILES  = [INPUTS / "rawdat.csv"]

print("Reference exists:", REFERENCE_FILE.exists())
print("All features exist:", all(Path(f).exists() for f in FEATURE_FILES))

# Split / cross-validation settings; each is stringified and forwarded to the
# pipeline scripts on the command line.
TEST_PERCENT   = 0.15
KFOLD          = 5
NUM_REPEATS    = 1
KEEP_LAST_PCT  = 0
NAVG           = 50
SCR_FRACTIONS  = "0.0"
DATA_SCALE     = "log"

# Random-forest hyperparameters, passed to run_model.py as strings.
# NOTE(review): scikit-learn >= 1.3 no longer accepts max_features="auto" for
# forests; confirm how run_model.py interprets "auto" and "None" before
# upgrading scikit-learn.
RF = dict(
    n_estimators=300,
    max_depth="None",
    max_features="auto",
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

def run(args, cwd=MLRF):
    """Run a command, echoing the command line and its captured output.

    Args:
        args: Sequence of command-line tokens; each is shown via ``str``.
        cwd: Working directory for the child process (defaults to ``MLRF``).

    Raises:
        subprocess.CalledProcessError: When the command exits non-zero. The
            captured stdout/stderr are printed before the error is re-raised.
    """
    print(">>>", " ".join(map(str, args)))
    try:
        completed = subprocess.run(
            args,
            check=True,
            capture_output=True,
            text=True,
            cwd=str(cwd),
        )
    except subprocess.CalledProcessError as failure:
        # Show what the child printed before it failed, then propagate.
        print("[stdout]", failure.stdout)
        print("[stderr]", failure.stderr)
        raise
    else:
        if completed.stdout:
            print(completed.stdout)
        if completed.stderr:
            print("[stderr]", completed.stderr)


## 1) Initial split

In [None]:
# Step 1: call torch_prep_kfold.py with --initial_split, handing it the
# reference table, the feature table(s) and the test fraction.
args = [sys.executable, str(TORCH_PREP), "--initial_split"]
args += ["--reference_file", str(REFERENCE_FILE)]
args += ["--ref_id_col", SEQUENCE_ID, "--ref_label_col", LABEL_COL]
args += ["--filenames", *map(str, FEATURE_FILES)]
args += ["--feature_id_col", SEQUENCE_ID]
args += ["--prefix", "mycmax", "--model_type", "reg"]
args += ["--test_percentage", str(TEST_PERCENT)]
args += ["--scramble_fractions", SCR_FRACTIONS]
run(args)

# Report whether the two split files this notebook expects are present in Data/.
for p in map(
    DATA.joinpath,
    ("mycmax_reg_scr0p00_trn_final.csv", "mycmax_reg_scr0p00_tst_preprocess.csv"),
):
    print(p.name, "→", p.exists(), "|", p)

## 2) Build repeated K-folds

In [None]:
# Step 2: call torch_prep_kfold.py in "train" mode with the fold/repeat
# settings from the config cell.
# NOTE(review): the seed passed here is a literal "42", independent of
# RF["random_state"]; confirm that is intended.
args = [sys.executable, str(TORCH_PREP), "--process", "train"]
args.extend(["--keep_last_percent", str(KEEP_LAST_PCT)])
args.extend(["--navg", str(NAVG)])
args.extend(["--kfold", str(KFOLD), "--num_repeats", str(NUM_REPEATS)])
args.extend(["--prefix", "mycmax", "--model_type", "reg"])
args.extend(["--scramble_fractions", SCR_FRACTIONS])
args.extend(["--random_state", "42"])
run(args)

## 3) Train RF with CV

In [None]:
# Step 3: call run_model.py with --mode 0, forwarding the CV settings and
# every random-forest hyperparameter from the RF dict.
args = [
    sys.executable, str(RUN_MODEL),
    "--mode", "0",
    "--model_type", "reg",
    "--data_scale", DATA_SCALE,
    "--kfold", str(KFOLD),
    "--num_repeats", str(NUM_REPEATS),
    "--model_dir", "Model",
    "--data_dir", "Data",
    "--output_file", "predictions",
    "--ref_id_col", SEQUENCE_ID,
    "--ref_label_col", LABEL_COL,
    # One "--<name> <value>" pair per hyperparameter, in a fixed order.
    *(
        token
        for key in (
            "n_estimators",
            "max_depth",
            "max_features",
            "min_samples_split",
            "min_samples_leaf",
            "random_state",
        )
        for token in (f"--{key}", str(RF[key]))
    ),
    "--prefix", "mycmax",
    "--scramble_fractions", SCR_FRACTIONS,
]
run(args)

## 4) Prepare test set

In [None]:
# Step 4: call torch_prep_kfold.py in "test" mode with the same prefix,
# model type and scramble fraction used for the training data.
# NOTE(review): the seed passed here is a literal "42", independent of
# RF["random_state"]; confirm that is intended.
args = [sys.executable, str(TORCH_PREP)] + [
    "--process", "test",
    "--keep_last_percent", str(KEEP_LAST_PCT),
    "--navg", str(NAVG),
] + [
    "--prefix", "mycmax",
    "--model_type", "reg",
    "--scramble_fractions", SCR_FRACTIONS,
    "--random_state", "42",
]
run(args)

## 5) Evaluate on test

In [None]:
# Step 5: call run_model.py with --mode 1 on the prepared test set.
# The seed comes from RF["random_state"], exactly as in the training call, so
# changing the seed in the config cell cannot leave evaluation on a stale value.
args = [
    sys.executable, str(RUN_MODEL),
    "--mode", "1",
    "--model_type", "reg", "--data_scale", DATA_SCALE,
    "--kfold", str(KFOLD), "--num_repeats", str(NUM_REPEATS),
    "--model_dir", "Model", "--data_dir", "Data",
    "--output_file", "predictions",
    "--ref_id_col", SEQUENCE_ID, "--ref_label_col", LABEL_COL,
    "--random_state", str(RF["random_state"]),
    "--prefix", "mycmax", "--scramble_fractions", SCR_FRACTIONS,
]
run(args)

# Report whether the prediction files this notebook expects are present in Data/.
for p in [
    DATA / "predictions_reg_final_avg_scr0p00.csv",
    DATA / "predictions_test_reg_scr0p00.csv",
]:
    print(p.name, "→", p.exists(), "|", p)

## 6) Plots

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, mean_squared_error
from scipy.stats import pearsonr

def _find_one(patterns, root=None):
    """Return one file matching the first glob pattern that has any hit.

    Patterns are tried in the order given. ``Path.glob`` yields matches in
    no guaranteed order, so when a pattern matches several files the
    smallest path is returned to keep the choice stable between runs.

    Args:
        patterns: Iterable of glob patterns, most specific first.
        root: Directory to search; defaults to the notebook's ``DATA`` folder.

    Returns:
        The selected ``Path``, or ``None`` when no pattern matches anything.
    """
    base = DATA if root is None else Path(root)
    for pat in patterns:
        first = min(base.glob(pat), default=None)
        if first is not None:
            return first
    return None

# Locate the prediction CSVs: try the exact expected file name first, then a
# looser wildcard in case the naming differs slightly.
cv_path = _find_one(
    (
        "predictions_reg_final_avg_scr0p00.csv",
        "predictions_*reg*final*avg*scr0p00*.csv",
    )
)
tst_path = _find_one(
    (
        "predictions_test_reg_scr0p00.csv",
        "predictions_*test*reg*scr0p00*.csv",
    )
)

print("CV predictions:", cv_path)
print("Test predictions:", tst_path)

def load_preds(path):
    """Load a predictions CSV as a two-column frame: ``y_true`` and ``y_pred``.

    Column names are matched case-insensitively against a short list of
    known aliases. If either column cannot be identified that way, the last
    two numeric columns are taken as (true, predicted).

    Args:
        path: ``Path`` to the CSV, or ``None``.

    Returns:
        A DataFrame with columns ``y_true`` and ``y_pred``, or ``None`` (after
        printing a warning) when ``path`` is ``None`` or does not exist.

    Raises:
        ValueError: When no alias matches and fewer than two numeric columns
            are available.
    """
    if path is None or not path.exists():
        print("[WARN] Missing predictions:", path)
        return None

    frame = pd.read_csv(path)

    # Lower-cased name -> original name; the first occurrence wins.
    by_lower = {}
    for column in frame.columns:
        by_lower.setdefault(column.lower(), column)

    def first_match(aliases):
        return next((by_lower[a] for a in aliases if a in by_lower), None)

    true_col = first_match(("y_true", "true", "label", "bind_avg", "exp"))
    pred_col = first_match(("y_pred", "pred", "prediction", "yhat", "y_hat", "predicted"))

    resolved = true_col is not None and pred_col is not None and true_col != pred_col
    if not resolved:
        # Fall back to position: the last two numeric columns.
        numeric = frame.select_dtypes(include=[np.number])
        if numeric.shape[1] < 2:
            raise ValueError("Rename y_true/y_pred in this cell.")
        true_col, pred_col = numeric.columns[-2], numeric.columns[-1]

    selected = frame[[true_col, pred_col]]
    return selected.rename(columns={true_col: "y_true", pred_col: "y_pred"})

# Load CV predictions first, then test predictions; either is None when its
# CSV could not be found.
cv_df, tst_df = (load_preds(found) for found in (cv_path, tst_path))

def plot_scatter(df, title):
    """Scatter predicted against experimental values, with Pearson r and MSE.

    Args:
        df: Frame with ``y_true`` and ``y_pred`` columns, or ``None`` to skip.
        title: First line of the plot title; the metrics go on the second.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    if df is None:
        return

    observed = df["y_true"].values
    predicted = df["y_pred"].values
    pcc, _pvalue = pearsonr(observed, predicted)
    mse = mean_squared_error(observed, predicted)

    plt.figure(figsize=(6, 6))
    plt.scatter(observed, predicted, alpha=0.7)
    # Dashed guides through zero on both axes.
    for draw_guide in (plt.axhline, plt.axvline):
        draw_guide(0, ls='--')
    plt.title(f"{title}\nPearson={pcc:.2f}, MSE={mse:.3f}")
    plt.xlabel("Experimental ΔΔG_bind")
    plt.ylabel("Predicted ΔΔG_bind")
    plt.show()

# One scatter per prediction set; a missing set (None) is skipped inside
# plot_scatter.
plot_scatter(df=cv_df, title="Training CV (avg)")
plot_scatter(df=tst_df, title="Held-out Test")

def plot_binary_cm(df, thr=0.0, title="Binary @ ΔΔG=0"):
    """Show a 2x2 confusion matrix after thresholding both columns at ``thr``.

    Values strictly greater than ``thr`` become class 1 (shown as
    "binding"); everything else becomes class 0 ("non-binding").

    Args:
        df: Frame with ``y_true`` and ``y_pred`` columns, or ``None`` to skip.
        thr: Cut-off applied to both true and predicted values.
        title: Plot title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    if df is None:
        return

    def binarize(column):
        return (df[column].values > thr).astype(int)

    matrix = confusion_matrix(binarize("y_true"), binarize("y_pred"), labels=[0, 1])
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=["non-binding", "binding"],
    )
    display.plot(values_format='d')
    plt.title(title)
    plt.show()

# Confusion matrices at a threshold of 0 for both prediction sets; a missing
# set (None) is skipped inside plot_binary_cm.
plot_binary_cm(cv_df, thr=0.0, title="Training CV (ΔΔG=0)")
plot_binary_cm(tst_df, thr=0.0, title="Test (ΔΔG=0)")


## 7) Reproducibility manifest

In [None]:
# Record the interpreter, paths and configuration used for this run so the
# results can be traced back to their settings.
manifest = {
    "python": sys.version,
    "cwd": str(REPO),
    "mlrf": str(MLRF),
    "config": {
        "SEQUENCE_ID": SEQUENCE_ID,
        "LABEL_COL": LABEL_COL,
        "TEST_PERCENT": TEST_PERCENT,
        "KFOLD": KFOLD,
        "NUM_REPEATS": NUM_REPEATS,
        "KEEP_LAST_PCT": KEEP_LAST_PCT,
        "NAVG": NAVG,
        "SCR_FRACTIONS": SCR_FRACTIONS,
        "DATA_SCALE": DATA_SCALE,
        "RF": RF,
    },
    "inputs": {
        "reference": str(REFERENCE_FILE),
        "features": [str(p) for p in FEATURE_FILES],
    }
}
# Save next to the pipeline outputs. An absolute location such as /mnt/data
# only exists on some hosted sandboxes and makes the write fail elsewhere.
out = DATA / "mycmax_rf_repro_manifest.json"
out.parent.mkdir(parents=True, exist_ok=True)
out.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print("Saved manifest:", out)