In [1]:
import argparse
import pathlib
import shutil
import sys

import numpy as np
import pandas as pd
import tqdm

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
    # check if in a jupyter notebook
try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

sys.path.append(str(pathlib.Path(f"{root_dir}/utils").resolve()))
from file_checking import check_number_of_files

In [2]:
patientIDS_file = pathlib.Path(f"{root_dir}/data/patient_IDs.txt").resolve(strict=True)
if not patientIDS_file.is_file():
    raise FileNotFoundError(f"File {patientIDS_file} not found.")
patientIDS = []
with open(patientIDS_file, "r") as f:
    for line in f:
        patientIDS.append(line.strip())
patientIDS

['NF0014',
 'NF0016',
 'NF0018',
 'NF0021',
 'NF0030',
 'NF0040',
 'SARCO219',
 'SARCO361']

In [3]:
# set the correct number of files to check for each directory
n_files = {
    "segmentation_data": 16,
    "zstack_data": 5,
    "profiling_input_images": 9,
}

In [4]:
rerun_dict = {
    "patient": [],
    "well_fov": [],
    "zstack_counts": [],
    "segmentation_counts": [],
    "profiling_input_images_counts": [],
}
for patient in patientIDS:
    if not patient.isalnum():
        raise ValueError(f"Patient ID {patient} is not alphanumeric.")

    # set path to the processed data dir
    segmentation_data_dir = pathlib.Path(
        f"{root_dir}/data/{patient}/segmentation_masks/"
    ).resolve()
    zstack_dir = pathlib.Path(f"{root_dir}/data/{patient}/zstack_images/").resolve(
        strict=True
    )
    profiling_input_images_dir = pathlib.Path(
        f"{root_dir}/data/{patient}/profiling_input_images/"
    ).resolve()
    profiling_input_images_dir.mkdir(parents=True, exist_ok=True)
    well_fovs = [d.name for d in zstack_dir.glob("*") if d.is_dir()]

    for well_fov in well_fovs:
        rerun_dict["patient"].append(patient)
        rerun_dict["well_fov"].append(well_fov)
        if (
            check_number_of_files(zstack_dir / well_fov, n_files["zstack_data"])
            is not None
        ):
            rerun_dict["zstack_counts"].append(
                check_number_of_files(zstack_dir / well_fov, n_files["zstack_data"])[1]
            )
        else:
            rerun_dict["zstack_counts"].append(n_files["zstack_data"])
        if (
            check_number_of_files(
                segmentation_data_dir / well_fov, n_files["segmentation_data"]
            )
            is not None
        ):
            rerun_dict["segmentation_counts"].append(
                check_number_of_files(
                    segmentation_data_dir / well_fov, n_files["segmentation_data"]
                )[1]
            )
        else:
            rerun_dict["segmentation_counts"].append(n_files["segmentation_data"])
        if (
            check_number_of_files(
                profiling_input_images_dir / well_fov, n_files["profiling_input_images"]
            )
            is not None
        ):
            rerun_dict["profiling_input_images_counts"].append(
                check_number_of_files(
                    profiling_input_images_dir / well_fov,
                    n_files["profiling_input_images"],
                )[1]
            )
        else:
            rerun_dict["profiling_input_images_counts"].append(
                n_files["profiling_input_images"]
            )

G8-1 expected 16 files, but found 0 files.
G8-1 expected 16 files, but found 0 files.
D5-1 expected 16 files, but found 0 files.
D5-1 expected 16 files, but found 0 files.
F8-1 expected 16 files, but found 0 files.
F8-1 expected 16 files, but found 0 files.
G11-1 expected 16 files, but found 0 files.
G11-1 expected 16 files, but found 0 files.
G7-1 expected 16 files, but found 0 files.
G7-1 expected 16 files, but found 0 files.
E3-2 expected 16 files, but found 0 files.
E3-2 expected 16 files, but found 0 files.
C11-2 expected 16 files, but found 0 files.
C11-2 expected 16 files, but found 0 files.
E11-1 expected 16 files, but found 0 files.
E11-1 expected 16 files, but found 0 files.
D11-3 expected 16 files, but found 0 files.
D11-3 expected 16 files, but found 0 files.
E7-2 expected 16 files, but found 0 files.
E7-2 expected 16 files, but found 0 files.
C3-2 expected 16 files, but found 0 files.
C3-2 expected 16 files, but found 0 files.
D4-1 expected 16 files, but found 0 files.
D4-

In [5]:
rerun_df = pd.DataFrame(rerun_dict)
rerun_df["rerun_boolean"] = np.where(
    # (rerun_df["segmentation_counts"] != n_files["segmentation_data"])
    # |
    (rerun_df["zstack_counts"] != n_files["zstack_data"])
    | (rerun_df["profiling_input_images_counts"] != n_files["profiling_input_images"]),
    True,
    False,
)
rerun_df.head()

Unnamed: 0,patient,well_fov,zstack_counts,segmentation_counts,profiling_input_images_counts,rerun_boolean
0,NF0014,G8-1,5,0,9,False
1,NF0014,D5-1,5,0,9,False
2,NF0014,F8-1,5,0,9,False
3,NF0014,G11-1,5,0,9,False
4,NF0014,G7-1,5,0,9,False


In [6]:
rerun_df = rerun_df.loc[rerun_df["rerun_boolean"] == True]
rerun_df.head()

Unnamed: 0,patient,well_fov,zstack_counts,segmentation_counts,profiling_input_images_counts,rerun_boolean
722,NF0030,D5-1,5,4,5,True
736,NF0030,F11-4,5,0,0,True
741,NF0030,F2-4,5,0,0,True
748,NF0030,E11-2,5,0,5,True
755,NF0030,D3-2,5,1,5,True


In [7]:
# write the patient and well_fov to a file to be pared by a shell script
# This will be used to rerun the segmentation and zstack processing
rerun_file = pathlib.Path(f"{root_dir}/2.segment_images/rerun_jobs.txt").resolve()
with open(rerun_file, "w") as f:
    for index, row in rerun_df.iterrows():
        if row["rerun_boolean"]:
            f.write(f"{row['patient']}\t{row['well_fov']}\n")

In [8]:
print(f"""
For {len((rerun_df["patient"].unique()))} patients,
{len(rerun_df)} wells/fovs were checked,
{len(rerun_df.loc[rerun_df["rerun_boolean"]])} wells/fovs need to be rerun.
This is determined by the number of files in the segmentation, 
zstack and profiling input images directories.
""")


For 2 patients,
36 wells/fovs were checked,
36 wells/fovs need to be rerun.
This is determined by the number of files in the segmentation, 
zstack and profiling input images directories.



In [9]:
for patient in rerun_df["patient"].unique():
    print(f"""
    Patient: {patient}
    {len(rerun_df.loc[rerun_df["patient"] == patient])} wells/fovs need to be rerun.""")


    Patient: NF0030
    26 wells/fovs need to be rerun.

    Patient: NF0040
    10 wells/fovs need to be rerun.
