In [1]:
import sys
import os


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
print(sys.path[-1])

/home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali


In [2]:
BASE_DIR = os.path.dirname(os.getcwd())
META_DIR = os.path.join(BASE_DIR, "meta")
META_QUERIES_DIR = os.path.join(META_DIR, "queries")
DATA_DIR = os.path.join(BASE_DIR, "data")
DATA_REDUCED_DIR = os.path.join(BASE_DIR, "data_reduced")
RESULTS_DIR = os.path.join(BASE_DIR, "results")
IMGS_DIR = os.path.join(BASE_DIR, "imgs")

os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(IMGS_DIR, exist_ok=True)

print(f"{'BASE_DIR':<20}{BASE_DIR}")
print(f"{'META_DIR':<20}{META_DIR}")
print(f"{'META_QUERIES_DIR':<20}{META_QUERIES_DIR}")
print(f"{'DATA_DIR':<20}{DATA_DIR}")
print(f"{'DATA_REDUCED_DIR':<20}{DATA_REDUCED_DIR}")
print(f"{'RESULTS_DIR':<20}{RESULTS_DIR}")
print(f"{'IMGS_DIR':<20}{IMGS_DIR}")

BASE_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali
META_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta
META_QUERIES_DIR    /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries
DATA_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/data
DATA_REDUCED_DIR    /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/data_reduced
RESULTS_DIR         /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
IMGS_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/imgs


In [3]:
import pandas as pd
from tqdm import tqdm

from src.utils import get_coords, quick_load, calculate_best_rmsd
from src.monte_carlo import MonteCarloAligner

In [4]:
QUERIES = os.listdir(META_QUERIES_DIR)
QUERIES_PATHS = [os.path.join(META_QUERIES_DIR, query) for query in QUERIES]

In [None]:
ITERATION_LIMIT_ = 10_000
# BETAS_MODE_ = "V"
# BETAS_RANGE_ = (
#     0.01,   # min_beta
#     10.0,   # start_beta
#     100.0,  # end_beta
#     0.25,   # min_beta_position
# )
BETAS_MODE_ = "exponential"
BETAS_RANGE_ = [1.0, 100.0]
REHEAT_ = True

In [7]:
dfs = {}
for query, query_path in zip(QUERIES, QUERIES_PATHS):
    query = query.replace(".csv", "")

    # Skip processed queries
    if os.path.exists(os.path.join(RESULTS_DIR, f"{query}.csv")):
        print(f"Skipping already processed query: {query}")
        continue

    print(f"{'Query:':<20}{query_path}")


    df = pd.read_csv(query_path, index_col=None)
    df = df[["domain_id", "class", "fold", "superfamily", "family"]]

    rmsds = []
    mca_score = []
    query_pdb = os.path.join(
        DATA_DIR, query, f"{query}.pdb"
    )
    query_reduced = os.path.join(
        DATA_REDUCED_DIR, query, f"{query}.pkl.gz"
    )

    query_coords = get_coords(query_pdb, "Q")
    query_reduced_mat = quick_load(query_reduced)
    for i, row in tqdm(
        df.iterrows(),
        total=len(df),
        desc=f"Processing {query}",
        unit="comparison",
    ):
        if i == 0:
            # Comparison with self, used as baseline - query vs query
            ref_id = query
            ref_pdb = query_pdb
            ref_reduced = query_reduced
            iteration_limit = 1_000
            betas_mode = "exponential"
            betas_range = [50, 100]
            reheat = True
        else:
            ref_id = row["domain_id"]
            ref_pdb = os.path.join(
                "/".join(query_pdb.split("/")[:-1]),
                "references",
                f"{ref_id}.pdb"
            )
            ref_reduced = os.path.join(
                "/".join(query_reduced.split("/")[:-1]),
                "references",
                f"{ref_id}.pkl.gz"
            )
            iteration_limit = ITERATION_LIMIT_
            betas_mode = BETAS_MODE_
            betas_range = BETAS_RANGE_
            reheat = REHEAT_

        # RMSD calculation
        ref_coords = get_coords(ref_pdb, "R")
        rmsd, n = calculate_best_rmsd(query_coords, ref_coords)
        rmsds.append((rmsd, n))

        # Monte Carlo Alignment
        img_path = os.path.join(IMGS_DIR, query)
        os.makedirs(img_path, exist_ok=True)
        try:
            ref_reduced_mat = quick_load(ref_reduced)
        except FileNotFoundError:
            print(f"Reference reduced matrix not found: {ref_reduced}")
            mca_score.append(None)
            continue
        algn = MonteCarloAligner(
            query_reduced_mat, ref_reduced_mat,
            iteration_limit=iteration_limit,
            betas_mode=betas_mode,
            betas_range=betas_range,
            reheat=reheat,
        )
        _, score, _ = algn.run_simulation()
        algn.plot_convergence(
            title=f"Monte Carlo Alignment Convergence for {ref_id}",
            show=False,
            filename=os.path.join(img_path, f"{ref_id}.png")
        )
        mca_score.append(score)

    df["rmsd"] = rmsds
    df["mca_score"] = mca_score
    dfs[query] = df

Skipping already processed query: d3jbra3
Skipping already processed query: d6pzna1
Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d1ijqa2.csv


Processing d1ijqa2:  23%|██▎       | 26/112 [00:47<02:48,  1.96s/comparison]

Reference reduced matrix not found: /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/data_reduced/d1ijqa2/references/g1bom.1.pkl.gz


Processing d1ijqa2: 100%|██████████| 112/112 [02:37<00:00,  1.41s/comparison]


Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d3jcla3.csv


Processing d3jcla3: 100%|██████████| 51/51 [21:50<00:00, 25.70s/comparison]


Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d1j1va_.csv


Processing d1j1va_: 100%|██████████| 79/79 [06:42<00:00,  5.10s/comparison]


In [8]:
for query, df in dfs.items():
    df.to_csv(
        os.path.join(RESULTS_DIR, f"{query}.csv"),
        index=False
    )
    print(f"Saved results for {query} to {RESULTS_DIR}")

Saved results for d1ijqa2 to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
Saved results for d3jcla3 to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
Saved results for d1j1va_ to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
