In [6]:
import os
import subprocess
import math
from pathlib import Path

# --- Configuration ---
# RDIR holds the SMM scripts that the main loop launches
# (smm_gradient_descent.py, pep2score.py); DDIR holds one sub-directory per
# allele containing the per-fold files f00<n> and c00<n>.
RDIR = "/mnt/c/Users/Administrator/Documents/unixdir/exercises/SMM/"
DDIR = "/mnt/c/Users/Administrator/Documents/unixdir/exercises/data/SMM/"

# Search grid: every allele is combined with every lambda (passed as "-l")
# and every epsilon (passed as "-epi"), and each combination is run once per
# cross-validation fold.
alleles = ["B4002"]
lambdas = [0, 0.02, 0.04, 0.08]
epis = [0, 0.01, 0.02, 0.04]
folds = range(5)

# Running "best so far" state. The sentinel sits far below any Pearson
# coefficient, so the first evaluated grid point always replaces it.
best_pcc, best_model = -1000, ""

In [7]:
# --- Pearson correlation calculator ---
def pearson_from_pairs(pairs):
    """Return ``(pcc, mse)`` for a sequence of two-value records.

    Args:
        pairs: Sized, re-iterable collection whose items expose the two
            values to compare at index 0 and index 1.

    Returns:
        A tuple ``(pcc, mse)``:
        * ``pcc`` - Pearson correlation between the index-0 and index-1
          values; ``0.0`` when either series has zero variance.
        * ``mse`` - mean of the squared differences between the two values.
        For an empty input the result is ``(0.0, inf)``.
    """
    count = len(pairs)
    if not count:
        return 0.0, float("inf")

    xs = [pair[0] for pair in pairs]
    ys = [pair[1] for pair in pairs]

    mean_x = sum(xs) / count
    mean_y = sum(ys) / count

    # Plain left-to-right accumulation, one pass over both series.
    cross = spread_x = spread_y = sq_err = 0.0
    for xv, yv in zip(xs, ys):
        dev_x = xv - mean_x
        dev_y = yv - mean_y
        cross += dev_x * dev_y
        spread_x += dev_x * dev_x
        spread_y += dev_y * dev_y
        sq_err += (xv - yv) ** 2

    # A constant series makes the denominator zero; report "no correlation"
    # instead of dividing by zero.
    denom = spread_x * spread_y
    pcc = cross / math.sqrt(denom) if denom != 0 else 0.0

    return pcc, sq_err / count

In [8]:
# --- Main loop ---
# Grid search: for every allele / lambda / epsilon combination, run training
# and evaluation once per fold, pool the parsed predictions of all folds, and
# report one PCC / MSE per grid point. Results are cached on disk under
# <allele>.res/l.<lambda>/epi.<epsilon>/ so a re-run only computes what is
# missing.

def _run_to_file(cmd, out_path, cwd):
    """Run ``cmd`` inside ``cwd`` with its stdout captured in ``out_path``.

    Args:
        cmd: Argument list for ``subprocess.run``; ``cmd[1]`` is the script
            path and is only used for the warning message.
        out_path: ``Path`` of the file that receives the command's stdout.
        cwd: Working directory for the child process.

    Returns:
        True if ``out_path`` is non-empty afterwards, False otherwise. An
        empty output file is deleted so that the ``exists()`` cache checks in
        the loop below retry the step on the next run instead of treating the
        empty file as a finished result.
    """
    # The context manager guarantees the output handle is closed even if the
    # child cannot be started.
    with open(out_path, "w") as fout:
        result = subprocess.run(
            cmd,
            stdout=fout,
            stderr=subprocess.DEVNULL,
            cwd=cwd,
            env={**os.environ, "QT_QPA_PLATFORM": "offscreen"},
        )

    # stderr is discarded, so the exit status is the only visible failure signal.
    if result.returncode != 0:
        print(f"WARNING: {cmd[1]} exited with status {result.returncode} "
              f"(output: {out_path})")

    if out_path.stat().st_size == 0:
        out_path.unlink()
        return False
    return True


# Reset the search state in this cell too, so that re-running the cell on its
# own starts a fresh search instead of comparing against a previous run.
best_pcc = -1000
best_model = ""

# All result directories are addressed through this anchor rather than via
# os.chdir(): the kernel's working directory is never changed, so an
# exception or interrupt cannot leave it stranded in a nested directory.
base_dir = Path.cwd()

for allele in alleles:
    for l in lambdas:
        for epi in epis:
            epi_dir = base_dir / f"{allele}.res" / f"l.{l}" / f"epi.{epi}"
            epi_dir.mkdir(parents=True, exist_ok=True)

            preds = []

            for n in folds:
                mat_file = f"mat.{n}"
                pred_file = f"c00{n}.pred"
                train_file = f"{DDIR}/{allele}/f00{n}"
                eval_file = f"{DDIR}/{allele}/c00{n}"

                mat_path = epi_dir / mat_file
                pred_path = epi_dir / pred_file

                # Run training (skipped when a cached matrix file exists)
                if not mat_path.exists():
                    trained = _run_to_file(
                        ["python", f"{RDIR}/smm_gradient_descent.py",
                         "-l", str(l), "-epi", str(epi), "-t", train_file, "-e", eval_file],
                        mat_path,
                        epi_dir,
                    )
                    if not trained:
                        # No matrix to score with: this fold contributes nothing.
                        continue

                # Run evaluation (skipped when cached predictions exist).
                # The matrix is passed by its relative name; the child runs
                # inside epi_dir, so it resolves to mat_path.
                if not pred_path.exists():
                    scored = _run_to_file(
                        ["python", f"{RDIR}/pep2score.py",
                         "-mat", mat_file, "-f", eval_file],
                        pred_path,
                        epi_dir,
                    )
                    if not scored:
                        continue

                # Parse predictions: lines containing '#' and blank lines are
                # skipped; fields 1 and 2 (0-based) are the two numeric columns.
                with open(pred_path) as pf:
                    for line in pf:
                        if "#" not in line and line.strip():
                            try:
                                parts = line.strip().split()
                                preds.append((float(parts[1]), float(parts[2])))
                            except (IndexError, ValueError):
                                # Too few fields or non-numeric text: ignore
                                # this line only. Anything else (including
                                # KeyboardInterrupt) propagates.
                                continue

            # Compute PCC and MSE over the pooled predictions of all folds
            pcc, mse = pearson_from_pairs(preds)
            eval_output = f"{allele} lambda {l} epsilon {epi} PCC {pcc:.5f} MSE {mse:.5f}"
            print(eval_output)

            # A grid point without any predictions reports PCC 0.0 as a
            # placeholder; it must not be able to win against real results.
            if preds and pcc > best_pcc:
                best_pcc = pcc
                best_model = f"{allele} lambda {l} epsilon {epi}"

# Final result
print("\nBest model:", best_model, "with correlation", f"{best_pcc:.5f}\n")

B4002 lambda 0 epsilon 0 PCC -0.30920 MSE 0.24208
B4002 lambda 0 epsilon 0.01 PCC 0.50703 MSE 0.03308
B4002 lambda 0 epsilon 0.02 PCC 0.52491 MSE 0.03292
B4002 lambda 0 epsilon 0.04 PCC 0.54164 MSE 0.03279
B4002 lambda 0.02 epsilon 0 PCC -0.30920 MSE 0.24208
B4002 lambda 0.02 epsilon 0.01 PCC 0.50671 MSE 0.03308
B4002 lambda 0.02 epsilon 0.02 PCC 0.52457 MSE 0.03290
B4002 lambda 0.02 epsilon 0.04 PCC 0.54119 MSE 0.03275
B4002 lambda 0.04 epsilon 0 PCC -0.30920 MSE 0.24208
B4002 lambda 0.04 epsilon 0.01 PCC 0.50640 MSE 0.03307
B4002 lambda 0.04 epsilon 0.02 PCC 0.52415 MSE 0.03289
B4002 lambda 0.04 epsilon 0.04 PCC 0.54070 MSE 0.03271
B4002 lambda 0.08 epsilon 0 PCC -0.30920 MSE 0.24208
B4002 lambda 0.08 epsilon 0.01 PCC 0.50572 MSE 0.03307
B4002 lambda 0.08 epsilon 0.02 PCC 0.52334 MSE 0.03287
B4002 lambda 0.08 epsilon 0.04 PCC 0.53967 MSE 0.03263

Best model: B4002 lambda 0 epsilon 0.04 with correlation 0.54164

