In [12]:
import itertools
import os
import pathlib
import sys

import numpy as np
import pandas as pd
from arg_parsing_utils import check_for_missing_args, parse_args
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path(os.path.expanduser("~/mnt/bandicoot/NF1_organoid_data")).resolve(),
    root_dir,
)

from file_checking import check_number_of_files

In [13]:
patients_file_path = pathlib.Path("../../data/patient_IDs.txt").resolve(strict=True)
patients = pd.read_csv(patients_file_path, header=None)[0].tolist()
patients

['NF0014_T1',
 'NF0014_T2',
 'NF0016_T1',
 'NF0018_T6',
 'NF0021_T1',
 'NF0030_T1',
 'NF0035_T1',
 'NF0037_T1',
 'NF0037_T1_CQ1',
 'NF0040_T1',
 'SARCO219_T2',
 'SARCO361_T1']

In [14]:
features_to_check_for = {
    "patient": [],
    "well_fov": [],
    "file_path": [],
    "exists": [],
}
for patient in patients:
    extracted_features_dir = pathlib.Path(
        f"{profile_base_dir}/data/{patient}/extracted_features/"
    ).resolve(strict=True)
    # get all of the well_fov directories
    well_fov_dirs = [d for d in extracted_features_dir.iterdir() if d.is_dir()]
    well_fovs = [d.name for d in well_fov_dirs if "run_stats" not in d.name]
    print(f"Patient {patient} has {len(well_fovs)} well_fovs to check.")
    for well_fov in well_fovs:
        converted_profile_dir = pathlib.Path(
            f"{profile_base_dir}/data/{patient}/image_based_profiles/0.converted_profiles/{well_fov}/{well_fov}.duckdb"
        ).resolve()
        features_to_check_for["patient"].append(patient)
        features_to_check_for["well_fov"].append(well_fov)
        features_to_check_for["file_path"].append(str(converted_profile_dir))
        features_to_check_for["exists"].append(converted_profile_dir.exists())


features_to_check_for_df = pd.DataFrame(features_to_check_for)
# print the number of total, present, and missing files
total_files = len(features_to_check_for_df)
present_files = features_to_check_for_df["exists"].sum()
missing_files = total_files - present_files
print(f"Total files to check: {total_files}")
print(f"Present files: {present_files}")
print(f"Missing files: {missing_files}")
features_to_check_for_df.head()

Patient NF0014_T1 has 104 well_fovs to check.
Patient NF0014_T2 has 350 well_fovs to check.
Patient NF0016_T1 has 123 well_fovs to check.
Patient NF0018_T6 has 160 well_fovs to check.
Patient NF0021_T1 has 348 well_fovs to check.
Patient NF0030_T1 has 207 well_fovs to check.
Patient NF0035_T1 has 349 well_fovs to check.
Patient NF0037_T1 has 420 well_fovs to check.
Patient NF0037_T1_CQ1 has 693 well_fovs to check.
Patient NF0040_T1 has 420 well_fovs to check.
Patient SARCO219_T2 has 199 well_fovs to check.
Patient SARCO361_T1 has 350 well_fovs to check.
Total files to check: 3723
Present files: 1894
Missing files: 1829


Unnamed: 0,patient,well_fov,file_path,exists
0,NF0014_T1,E5-2,/home/lippincm/mnt/bandicoot/NF1_organoid_data...,True
1,NF0014_T1,D5-2,/home/lippincm/mnt/bandicoot/NF1_organoid_data...,True
2,NF0014_T1,G6-1,/home/lippincm/mnt/bandicoot/NF1_organoid_data...,True
3,NF0014_T1,G8-1,/home/lippincm/mnt/bandicoot/NF1_organoid_data...,True
4,NF0014_T1,C9-2,/home/lippincm/mnt/bandicoot/NF1_organoid_data...,True


In [15]:
load_file_path = pathlib.Path("../load_data/load_file.txt").resolve()
load_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(load_file_path, "w") as f:
    for idx, row in features_to_check_for_df.iterrows():
        if row["exists"]:
            f.write(f"{row['patient']}\t{row['well_fov']}\n")