In [44]:
import os
import subprocess
import math
from pathlib import Path
import shutil
import numpy as np
import random
import copy
from scipy.stats import pearsonr
from pprint import pprint
from argparse import ArgumentParser

In [45]:
# --- Configuration ---
# RDIR: working directory of the experiment. The helper scripts
# (smm_gradient_descent.py, pep2score.py) are invoked from here and one
# "{allele}.res" results tree per allele is created below it.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
RDIR = "/home/luis_ubuntu/unixdir/Peptide_Binding/smm_unnested_cv"
# DDIR: data directory. Per-fold files are read from "{DDIR}/{allele}/f00{n}"
# (passed to the training script) and "{DDIR}/{allele}/c00{n}" (passed to the
# scoring script).
DDIR = "/home/luis_ubuntu/unixdir/Peptide_Binding/Data"

# Alleles to process; each one gets its own "{allele}.res" directory.
alleles = ["A0101"]
# Hyperparameter grid: every (lambda, epsilon) combination is trained and scored.
# Values are forwarded to the training script as "-l" and "-epi" respectively.
lambdas = [0.02, 0.08, 0.1]
epis = [0.01, 0.04]
# Fold indices n; file names are built as "f00{n}" / "c00{n}", so the fixed
# "00" prefix presumably expects single-digit fold numbers -- TODO confirm.
folds = range(5)

In [46]:
# --- Pearson correlation calculator ---
def pearson_from_pairs(pairs):
    """Return ``(pcc, mse)`` for a sequence of value pairs.

    ``pcc`` is the Pearson correlation coefficient between element 0 and
    element 1 of every pair; ``mse`` is the mean of ``(x - y) ** 2`` over
    all pairs.

    An empty input yields ``(0.0, inf)``. If the product of the two sums of
    squared deviations is zero (e.g. one column is constant), the
    correlation is reported as ``0.0``.
    """
    count = len(pairs)
    if count == 0:
        return 0.0, float("inf")

    xs = [pair[0] for pair in pairs]
    ys = [pair[1] for pair in pairs]
    mean_x = sum(xs) / count
    mean_y = sum(ys) / count

    # Plain running totals (not sum()) so the rounding of the accumulated
    # values stays exactly as a straightforward left-to-right addition.
    cross = var_x = var_y = sq_err = 0.0
    for xv, yv in zip(xs, ys):
        off_x = xv - mean_x
        off_y = yv - mean_y
        cross += off_x * off_y
        var_x += off_x * off_x
        var_y += off_y * off_y
        sq_err += (xv - yv) ** 2

    denom = var_x * var_y
    pcc = cross / math.sqrt(denom) if denom != 0 else 0.0
    return pcc, sq_err / count

def make_and_enter(dir_path):
    """Ensure the directory ``dir_path`` exists, then make it the working directory.

    Only the last path component is created; an already existing directory
    is accepted as is, and missing parent directories are not created.
    """
    target = Path(dir_path)
    target.mkdir(exist_ok=True)  # no-op when the directory is already there
    os.chdir(target)

def _run_to_file(command, out_path):
    """Run ``command``, storing its stdout in ``out_path`` only if it exits with status 0."""
    out_path = Path(out_path)
    # Write next to the target so the final os.replace() stays on one filesystem.
    partial_path = out_path.with_name(out_path.name + ".partial")
    try:
        with open(partial_path, "w") as fout:
            result = subprocess.run(
                command,
                stdout=fout,
                # Captured instead of discarded: kept out of the notebook on
                # success, but available for the error message on failure.
                stderr=subprocess.PIPE,
                env={**os.environ, "QT_QPA_PLATFORM": "offscreen"},
            )
        if result.returncode != 0:
            details = result.stderr.decode(errors="replace").strip()
            message = f"{command[1]} failed with exit status {result.returncode}"
            if details:
                message += f":\n{details}"
            raise RuntimeError(message)
        os.replace(partial_path, out_path)
    finally:
        # After a successful os.replace() the partial file is gone; on any
        # failure this removes the incomplete output.
        if partial_path.exists():
            partial_path.unlink()


def run_training_and_evaluation(RDIR, train_file, eval_file, mat_file, pred_file, _lambda, _epsilon):
    """Train a model on ``train_file`` and score ``eval_file`` with it.

    Both steps are cached by output file: a step is skipped when its output
    (``mat_file`` for training, ``pred_file`` for scoring) already exists.

    Parameters:
        RDIR: directory containing ``smm_gradient_descent.py`` and ``pep2score.py``.
        train_file: path handed to the training script via ``-t``.
        eval_file: path handed to the scoring script via ``-f``.
        mat_file: file receiving the training script's stdout; passed to the
            scoring script via ``-mat``.
        pred_file: file receiving the scoring script's stdout.
        _lambda: value forwarded to the training script as ``-l``.
        _epsilon: value forwarded to the training script as ``-epi``.

    Raises:
        RuntimeError: if either script exits with a non-zero status. In that
            case no output file is left behind, so the existence-based cache
            cannot pick up an empty or truncated result on the next run.
    """
    # Run training
    if not Path(mat_file).exists():
        _run_to_file(
            [
                "python", f"{RDIR}/smm_gradient_descent.py",
                "-l", str(_lambda),
                "-epi", str(_epsilon),
                "-t", train_file,
            ],
            mat_file,
        )

    # Run evaluation
    if not Path(pred_file).exists():
        _run_to_file(
            [
                "python", f"{RDIR}/pep2score.py",
                "-mat", mat_file,
                "-f", eval_file,
            ],
            pred_file,
        )

def save_final_predictions(allele, best_lambda, best_epsilon):
    """
    Concatenate the per-fold predictions of the best hyperparameter setting.

    Reads every ``*.pred`` file in
    ``{RDIR}/{allele}.res/l.{best_lambda}/epi.{best_epsilon}`` (in sorted file
    name order), drops blank lines and lines starting with ``#``, and writes
    the remaining lines to ``{RDIR}/{allele}.res/{allele}_prediction``.
    """
    # Anchor input and output at the same results directory so the function
    # does not depend on the caller's current working directory.
    results_dir = Path(RDIR) / f"{allele}.res"
    best_pred_dir = results_dir / f"l.{best_lambda}" / f"epi.{best_epsilon}"

    pred_lines = []
    # sorted(): glob() yields entries in filesystem order, which would make
    # the order of the concatenated output vary between machines/runs.
    for pred_file in sorted(best_pred_dir.glob("*.pred")):
        with open(pred_file) as pf:
            for line in pf:
                if line.startswith("#") or not line.strip():
                    continue
                # A final line without newline must not fuse with the first
                # line of the next file.
                pred_lines.append(line if line.endswith("\n") else line + "\n")

    output_file = results_dir / f"{allele}_prediction"
    with open(output_file, "w") as final_out:
        final_out.writelines(pred_lines)

In [48]:
os.chdir(RDIR)
# --- Main loop ---
# Grid search: for every allele, each (lambda, epsilon) combination is trained
# and scored on all folds inside its own directory
# "{allele}.res/l.{lambda}/epi.{epsilon}". Navigation is done by changing the
# working directory, so every make_and_enter() is paired with a chdir("..").
for allele in alleles:
    best_pcc = -1000  # below any possible correlation, so the first result wins
    best_model = ""
    best_lambda = ""
    best_epsilon = ""

    make_and_enter(f"{allele}.res")

    for l in lambdas:
        make_and_enter(f"l.{l}")

        for epi in epis:
            make_and_enter(f"epi.{epi}")

            # Value pairs pooled over all folds of this (lambda, epsilon).
            preds = []

            for n in folds:
                # Outputs are relative paths: they land in the current epi.X directory.
                mat_file = f"mat.{n}"
                pred_file = f"c00{n}.pred"
                
                train_file = f"{DDIR}/{allele}/f00{n}"
                eval_file = f"{DDIR}/{allele}/c00{n}"

                # Run training and evaluation
                run_training_and_evaluation(RDIR, train_file, eval_file, mat_file=mat_file, pred_file=pred_file, _lambda=l, _epsilon=epi)

                # Parse predictions: columns 2 and 3 of every non-empty line
                # that contains no "#".
                with open(pred_file) as pf:
                    for line in pf:
                        if "#" not in line and line.strip():
                            try:
                                parts = line.strip().split()
                                preds.append((float(parts[1]), float(parts[2])))
                            except (IndexError, ValueError):
                                # Skip malformed lines only; anything else
                                # (e.g. a kernel interrupt) must propagate.
                                continue

            # Compute PCC and MSE on the pooled pairs
            pcc, mse = pearson_from_pairs(preds)
            eval_output = f"{allele} lambda {l} epsilon {epi} PCC {pcc:.5f} MSE {mse:.5f}"
            print(eval_output)

            # Strict ">" keeps the first combination on ties.
            if pcc > best_pcc:
                best_pcc = pcc
                best_lambda = l
                best_epsilon = epi
                best_model = f"lambda {best_lambda} epsilon {best_epsilon}"

            os.chdir("..")  # up from epi.X

        os.chdir("..")  # up from l.X
    
    # Final result
    print("\nBest model for allele", allele,": ", best_model, "with correlation", f"{best_pcc:.5f}\n")

    save_final_predictions(allele, best_lambda, best_epsilon)

    # Re-read the concatenated predictions; the working directory is still
    # "{RDIR}/{allele}.res", which is where save_final_predictions wrote them.
    pairs = []
    with open(f"{allele}_prediction") as f:
        for line in f:
            parts = line.strip().split()
            try:
                x = float(parts[1])  # second column
                y = float(parts[2])  # third column
                pairs.append((x, y))
            except (IndexError, ValueError):
                continue
                
    pcc, mse = pearson_from_pairs(pairs)
    eval_output = f"Final prediciton for allele:{allele}; PCC {pcc:.5f} MSE {mse:.5f}"
    print(eval_output)
    
    os.chdir("..")  # up from allele.res

A0101 lambda 0.02 epsilon 0.01 PCC 0.62780 MSE 0.01752
A0101 lambda 0.02 epsilon 0.04 PCC 0.55790 MSE 0.02024
A0101 lambda 0.08 epsilon 0.01 PCC 0.62779 MSE 0.01751
A0101 lambda 0.08 epsilon 0.04 PCC 0.55795 MSE 0.02024
A0101 lambda 0.1 epsilon 0.01 PCC 0.62780 MSE 0.01751
A0101 lambda 0.1 epsilon 0.04 PCC 0.55794 MSE 0.02023

Best model for allele A0101 : lambda 0.02 epsilon 0.01 with correlation 0.62780

Final prediciton for allele:A0101; PCC 0.62780 MSE 0.01752
