In [44]:
import pandas as pd
import os
import time
from tqdm import tqdm
import datetime
import argparse


# Directory that is scanned for checkpoint files and that also holds the
# table of planned model combinations.
output_folder = "/data/home/natant/Negatives/Runs/debug"

# Planned model combinations, loaded from the CSV stored in the output folder.
file_path = os.path.join(output_folder, "model_combinations.csv")
model_combinations_df = pd.read_csv(file_path)

# Underscore spelling -> hyphen spelling of the negative-sampling modes.
# The hyphen spelling keeps the mode as a single field when a checkpoint
# filename is split on "_".
correct_neg_mode_names = dict(
    dinucl_shuffled="dinucl-shuffled",
    dinucl_sampled="dinucl-sampled",
)
# Inverse lookup: hyphen spelling -> underscore spelling.
correct_neg_mode_names_reverse = {
    hyphenated: underscored
    for underscored, hyphenated in correct_neg_mode_names.items()
}

def get_current_df():
    """Build a table describing the checkpoints currently in ``output_folder``.

    Lists the ``*.ckpt`` files in the module-level ``output_folder``, rewrites
    every key of ``correct_neg_mode_names`` found in a filename to its value
    (so the negative-sampling mode contains no underscore), and splits each
    name on ``_``. Fields are read from the right-hand end of the name, which
    keeps a TF that itself contains underscores (e.g. ``YY1_(SC-281)``) intact.

    Expected filename layout::

        <cell type>_<TF>_<neg mode>_<CV fold>_<date>_<time>_epoch=.._val_loss=.._AUROC=...ckpt

    Files with too few ``_``-separated fields to fit this layout are skipped
    instead of raising ``IndexError`` or producing a row with an empty TF.

    Returns:
        pandas.DataFrame: one row per parsed checkpoint with the columns
        "Cell Type", "TF", "Negative Sampling Mode", "Cross Val Fold",
        "Date" and "Time" (all strings taken verbatim from the filename).
        The columns are present even when no checkpoint is found, so callers
        can index them without a ``KeyError``.
    """
    columns = [
        "Cell Type",
        "TF",
        "Negative Sampling Mode",
        "Cross Val Fold",
        "Date",
        "Time",
    ]
    # cell type + at least one TF field + neg mode + the seven trailing fields
    # (CV, date, time, epoch=.., val, loss=.., AUROC=...ckpt).
    min_fields = 10

    # A dict is used as an insertion-ordered set: two files that normalise to
    # the same name are only counted once.
    normalised_names = {}
    for filename in os.listdir(output_folder):
        if not filename.endswith('.ckpt'):
            continue
        for old, new in correct_neg_mode_names.items():
            filename = filename.replace(old, new)
        normalised_names[filename] = None

    rows = []
    for filename in normalised_names:
        fields = filename.split("_")
        if len(fields) < min_fields:
            # Not a checkpoint written with the expected naming scheme.
            continue
        rows.append({
            "Cell Type": fields[0],
            "TF": "_".join(fields[1:-8]),
            "Negative Sampling Mode": fields[-8],
            "Cross Val Fold": fields[-7],
            "Date": fields[-6],
            # Stored under "Time"; the local is not called `time` so the
            # `time` module is not shadowed inside this function.
            "Time": fields[-5],
        })
    return pd.DataFrame(rows, columns=columns)


In [40]:
# All checkpoint files currently present in the output folder.
ckpt_files = list(
    filter(lambda name: name.endswith('.ckpt'), os.listdir(output_folder))
)
# Rewrite underscore mode spellings to their hyphen spellings. A filename that
# contains none of the known spellings is emitted unchanged once per known
# spelling, so dict.fromkeys() is used to collapse those repeats while keeping
# the first-seen order.
updated_ckpt_files = list(dict.fromkeys(
    name.replace(old, new) if old in name else name
    for name in ckpt_files
    for old, new in correct_neg_mode_names.items()
    if old in name or not any(key in name for key in correct_neg_mode_names)
))

In [41]:
# Display the normalised checkpoint filenames computed in the previous cell.
updated_ckpt_files

['MCF-7_CTCF_dinucl-sampled_CV-3_20250523_09:56_epoch=13_val_loss=0.45_AUROC=0.87.ckpt',
 'GM12878_CTCF_neighbors_CV-4_20250523_12:46_epoch=14_val_loss=0.33_AUROC=0.97.ckpt',
 'GM12878_YY1_(SC-281)_shuffled_CV-0_20250523_13:05_epoch=06_val_loss=0.43_AUROC=0.93.ckpt',
 'MCF-7_ZNF217_dinucl-sampled_CV-2_20250523_10:58_epoch=24_val_loss=0.67_AUROC=0.67.ckpt',
 'MCF-7_TCF7L2_dinucl-shuffled_CV-3_20250523_10:36_epoch=17_val_loss=0.69_AUROC=0.68.ckpt',
 'MCF-7_TCF7L2_dinucl-shuffled_CV-0_20250523_10:32_epoch=19_val_loss=0.67_AUROC=0.69.ckpt',
 'MCF-7_CTCF_dinucl-sampled_CV-4_20250523_10:01_epoch=14_val_loss=0.55_AUROC=0.82.ckpt',
 'MCF-7_CTCF_dinucl-shuffled_CV-1_20250523_10:02_epoch=14_val_loss=0.52_AUROC=0.82.ckpt',
 'GM12878_YY1_(SC-281)_dinucl-shuffled_CV-0_20250523_12:59_epoch=09_val_loss=0.53_AUROC=0.81.ckpt',
 'MCF-7_CTCF_neighbors_CV-5_20250523_10:23_epoch=11_val_loss=0.47_AUROC=0.88.ckpt',
 'MCF-7_ZNF217_dinucl-shuffled_CV-3_20250523_11:05_epoch=23_val_loss=0.70_AUROC=0.70.ckpt',
 '

In [45]:
# Parse the checkpoint filenames currently in the output folder into a DataFrame.
df = get_current_df()


In [50]:
# Restrict both tables to the MCF-7 cell type: the checkpoints found on disk
# and the planned model combinations.
df_mcf7 = df.loc[df['Cell Type'].eq('MCF-7')]
model_combinations_df_mcf7 = model_combinations_df.loc[
    model_combinations_df['Cell Type'].eq('MCF-7')
]

In [51]:
# Display the MCF-7 rows of the parsed checkpoint table.
df_mcf7

Unnamed: 0,Cell Type,TF,Negative Sampling Mode,Cross Val Fold,Date,Time
0,MCF-7,CTCF,dinucl-sampled,CV-3,20250523,09:56
3,MCF-7,ZNF217,dinucl-sampled,CV-2,20250523,10:58
4,MCF-7,TCF7L2,dinucl-shuffled,CV-3,20250523,10:36
5,MCF-7,TCF7L2,dinucl-shuffled,CV-0,20250523,10:32
6,MCF-7,CTCF,dinucl-sampled,CV-4,20250523,10:01
...,...,...,...,...,...,...
125,MCF-7,ZNF217,neighbors,CV-4,20250523,11:19
126,MCF-7,GATA3_(SC-268),neighbors,CV-3,20250523,11:47
127,MCF-7,ZNF217,dinucl-shuffled,CV-0,20250523,11:02
131,MCF-7,ZNF217,neighbors,CV-1,20250523,11:15


In [52]:
# Display the MCF-7 rows of the planned model combinations table.
model_combinations_df_mcf7

Unnamed: 0,Cell Type,TF,Negative Sampling Mode,Cross Val Fold
0,MCF-7,CTCF,dinucl_sampled,0
1,MCF-7,CTCF,dinucl_sampled,1
2,MCF-7,CTCF,dinucl_sampled,2
3,MCF-7,CTCF,dinucl_sampled,3
4,MCF-7,CTCF,dinucl_sampled,4
...,...,...,...,...
91,MCF-7,GATA3_(SC-268),neighbors,1
92,MCF-7,GATA3_(SC-268),neighbors,2
93,MCF-7,GATA3_(SC-268),neighbors,3
94,MCF-7,GATA3_(SC-268),neighbors,4


In [22]:
# Display the MCF-7 rows of the planned model combinations table.
# NOTE(review): this repeats the display expression of an earlier cell and
# carries an older execution count; it looks like a leftover that can be removed.
model_combinations_df_mcf7

Unnamed: 0,Cell Type,TF,Negative Sampling Mode,Cross Val Fold
0,MCF-7,CTCF,dinucl_sampled,0
1,MCF-7,CTCF,dinucl_sampled,1
2,MCF-7,CTCF,dinucl_sampled,2
3,MCF-7,CTCF,dinucl_sampled,3
4,MCF-7,CTCF,dinucl_sampled,4
...,...,...,...,...
91,MCF-7,GATA3_(SC-268),neighbors,1
92,MCF-7,GATA3_(SC-268),neighbors,2
93,MCF-7,GATA3_(SC-268),neighbors,3
94,MCF-7,GATA3_(SC-268),neighbors,4


In [None]:
    # Progress monitor: once a minute, re-scan the output folder and print how
    # many of the planned model combinations already have a checkpoint, overall
    # and per cell type. Runs until interrupted.
    while True:
        # Clears a terminal screen. NOTE(review): this has no effect on a
        # notebook output area, so output accumulates when run as a cell.
        os.system('clear')
        df = get_current_df()
        print("Model Training Progress Monitor")
        print("=====================================")
        print("Tracking:", output_folder)
        print("Last Update:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

        if df.empty:
            # Before the first checkpoint is written there are no timestamps
            # (and possibly no columns) to work with; indexing 'Date' or
            # 'Cell Type' here would raise KeyError.
            print("Approximate Running Time:", "n/a (no checkpoints found yet)")
            celltype_completions = {}
        else:
            # Combine 'Date' and 'Time' into one datetime per checkpoint; the
            # span between the earliest and latest approximates the run time.
            datetimes = pd.to_datetime(df['Date'] + ' ' + df['Time'])
            earliest_timepoint = datetimes.min()
            latest_timepoint = datetimes.max()
            time_difference = latest_timepoint - earliest_timepoint
            print("Approximate Running Time:", time_difference)
            celltype_completions = df['Cell Type'].value_counts().to_dict()
        print("=====================================")

        completed = len(df)
        total = len(model_combinations_df)
        # Guard against an empty combinations table (ZeroDivisionError).
        overall_fraction = completed / total if total else 0.0
        print(f"Overall Progress: {completed}/{total} ({overall_fraction:.2%})")
        with tqdm(total=total, desc="Overall Progress") as pbar:
            pbar.update(completed)

        celltype_totals = model_combinations_df['Cell Type'].value_counts().to_dict()
        for celltype, celltype_total in celltype_totals.items():
            # Separate names so the overall `completed`/`total` are not clobbered.
            celltype_completed = celltype_completions.get(celltype, 0)
            celltype_fraction = celltype_completed / celltype_total if celltype_total else 0.0
            print(f"{celltype} Progress: {celltype_completed}/{celltype_total} ({celltype_fraction:.2%})")
            with tqdm(total=celltype_total, desc=f"{celltype} Progress") as pbar:
                pbar.update(celltype_completed)
        time.sleep(60)