In [1]:
import sys
import os


# Make the project root (the parent of the notebook's working directory)
# importable, e.g. for the `src` package imported further down.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path on every execution.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
print(PROJECT_ROOT)

/home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali


In [2]:
# Project layout, resolved relative to the parent of the current working
# directory (the notebook is expected to be run from a sub-folder of the repo).
BASE_DIR = os.path.dirname(os.getcwd())

# Inputs: metadata tables and structure data.
META_DIR = os.path.join(BASE_DIR, "meta")
META_QUERIES_DIR = os.path.join(BASE_DIR, "meta", "queries")
DATA_DIR = os.path.join(BASE_DIR, "data")
DATA_REDUCED_DIR = os.path.join(BASE_DIR, "data_reduced")

# Outputs: result tables and figures. Only these two are created on demand.
RESULTS_DIR = os.path.join(BASE_DIR, "results")
IMGS_DIR = os.path.join(BASE_DIR, "imgs")
for _out_dir in (RESULTS_DIR, IMGS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Echo the resolved paths as a two-column table (name padded to 20 chars).
for _label, _location in (
    ("BASE_DIR", BASE_DIR),
    ("META_DIR", META_DIR),
    ("META_QUERIES_DIR", META_QUERIES_DIR),
    ("DATA_DIR", DATA_DIR),
    ("DATA_REDUCED_DIR", DATA_REDUCED_DIR),
    ("RESULTS_DIR", RESULTS_DIR),
    ("IMGS_DIR", IMGS_DIR),
):
    print(f"{_label:<20}{_location}")

BASE_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali
META_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta
META_QUERIES_DIR    /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries
DATA_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/data
DATA_REDUCED_DIR    /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/data_reduced
RESULTS_DIR         /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
IMGS_DIR            /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/imgs


In [3]:
import pandas as pd
from tqdm import tqdm

from src.utils import get_coords, quick_load, calculate_best_rmsd
from src.monte_carlo import MonteCarloAligner

In [4]:
# Collect the per-query metadata tables.
# os.listdir returns every directory entry in arbitrary order, so keep only the
# CSV files and sort them: the processing order becomes reproducible and stray
# entries (e.g. editor/OS artifacts) are never handed to pd.read_csv later on.
QUERIES = sorted(
    entry for entry in os.listdir(META_QUERIES_DIR) if entry.endswith(".csv")
)
QUERIES_PATHS = [os.path.join(META_QUERIES_DIR, query) for query in QUERIES]

In [5]:
# Default Monte Carlo settings applied to every query-vs-reference comparison.
# (The query-vs-itself baseline uses its own values inside the main loop.)
REHEAT_ = True
ITERATION_LIMIT_ = 10_000

# Beta schedule forwarded to MonteCarloAligner as betas_mode / betas_range.
BETAS_MODE_, BETAS_RANGE_ = "exponential", [1.0, 100.0]

# Alternative "U" schedule, kept for reference only (currently disabled):
# BETAS_MODE = "U"
# BETAS_RANGE = (
#     0.01,   # min_beta
#     10.0,   # start_beta
#     100.0,  # end_beta
#     0.25,   # min_beta_position
# )

In [6]:
# For every query: load its metadata table, compare the query structure with
# each listed domain (best RMSD + Monte Carlo alignment score) and store the
# augmented table in `dfs`, keyed by the query id.
# NOTE(review): no random seed is set in this cell, so Monte Carlo scores may
# differ between executions — confirm whether MonteCarloAligner seeds itself.
dfs = {}
for query, query_path in zip(QUERIES, QUERIES_PATHS):
    # Drop only the file extension to obtain the query id.
    query = os.path.splitext(query)[0]
    print(f"{'Query:':<20}{query_path}")

    df = pd.read_csv(query_path, index_col=None)
    # reset_index(drop=True) yields a new frame labelled 0..n-1, so the
    # `i == 0` check below always designates the first row of the table.
    df = df[
        ["domain_id", "class", "fold", "superfamily", "family"]
    ].reset_index(drop=True)

    rmsds = []
    mca_score = []

    # Per-query folders; reference files live in a "references" sub-folder.
    query_data_dir = os.path.join(DATA_DIR, query)
    query_reduced_dir = os.path.join(DATA_REDUCED_DIR, query)
    query_pdb = os.path.join(query_data_dir, f"{query}.pdb")
    query_reduced = os.path.join(query_reduced_dir, f"{query}.pkl.gz")

    query_coords = get_coords(query_pdb, "Q")
    query_reduced_mat = quick_load(query_reduced)

    # Folder for this query's convergence plots; it does not change between
    # comparisons, so it is created once here instead of on every iteration.
    img_path = os.path.join(IMGS_DIR, query)
    os.makedirs(img_path, exist_ok=True)

    for i, row in tqdm(
        df.iterrows(),
        total=len(df),
        desc=f"Processing {query}",
        unit="comparison",
    ):
        if i == 0:
            # Comparison with self, used as baseline - query vs query.
            # It runs with its own, shorter schedule rather than the
            # notebook-level defaults.
            ref_id = query
            ref_pdb = query_pdb
            ref_reduced = query_reduced
            iteration_limit = 1_000
            betas_mode = "exponential"
            betas_range = [50, 100]
            reheat = True
        else:
            # Regular comparison against a reference domain, using the
            # notebook-level default settings.
            ref_id = row["domain_id"]
            ref_pdb = os.path.join(
                query_data_dir, "references", f"{ref_id}.pdb"
            )
            ref_reduced = os.path.join(
                query_reduced_dir, "references", f"{ref_id}.pkl.gz"
            )
            iteration_limit = ITERATION_LIMIT_
            betas_mode = BETAS_MODE_
            betas_range = BETAS_RANGE_
            reheat = REHEAT_

        # RMSD calculation
        ref_coords = get_coords(ref_pdb, "R")
        rmsd, n = calculate_best_rmsd(query_coords, ref_coords)
        rmsds.append((rmsd, n))

        # Monte Carlo Alignment
        ref_reduced_mat = quick_load(ref_reduced)
        algn = MonteCarloAligner(
            query_reduced_mat, ref_reduced_mat,
            iteration_limit=iteration_limit,
            betas_mode=betas_mode,
            betas_range=betas_range,
            reheat=reheat,
        )
        # Only the second element of the returned triple (the score) is kept.
        _, score, _ = algn.run_simulation()
        algn.plot_convergence(
            title=f"Monte Carlo Alignment Convergence for {ref_id}",
            show=False,
            filename=os.path.join(img_path, f"{ref_id}.png")
        )
        mca_score.append(score)

    # Each "rmsd" cell holds the (rmsd, n) pair returned by
    # calculate_best_rmsd.
    # NOTE(review): a tuple column is written to CSV as its string form;
    # consider separate columns if downstream code needs numeric values.
    df["rmsd"] = rmsds
    df["mca_score"] = mca_score
    dfs[query] = df

Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d7dhfa1.csv


Processing d7dhfa1: 100%|██████████| 24/24 [06:14<00:00, 15.61s/comparison]


Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d4bcda1.csv


Processing d4bcda1: 100%|██████████| 22/22 [05:42<00:00, 15.56s/comparison]


Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d1fi6a_.csv


Processing d1fi6a_: 100%|██████████| 22/22 [02:09<00:00,  5.90s/comparison]


Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d2gjva1.csv


Processing d2gjva1: 100%|██████████| 12/12 [02:24<00:00, 12.01s/comparison]


Query:              /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/meta/queries/d3brma_.csv


Processing d3brma_: 100%|██████████| 17/17 [04:52<00:00, 17.19s/comparison]


In [7]:
# Write each query's result table to RESULTS_DIR as "<query>.csv",
# without the DataFrame index, and report where it went.
for query, df in dfs.items():
    results_csv = os.path.join(RESULTS_DIR, f"{query}.csv")
    df.to_csv(results_csv, index=False)
    print(f"Saved results for {query} to {RESULTS_DIR}")

Saved results for d7dhfa1 to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
Saved results for d4bcda1 to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
Saved results for d1fi6a_ to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
Saved results for d2gjva1 to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
Saved results for d3brma_ to /home/cotsios/dsit/2nd-semester/algos-in-mol-bio/py-dali/results
