In [1]:
import argparse
import os
import pathlib
import sys
import time

import numpy as np
import pandas as pd
import psutil
import skimage

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False
if in_notebook:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

import gc

# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")
sys.path.append(str(root_dir / "3.cellprofiling" / "featurization_utils"))
from featurization_parsable_arguments import parse_featurization_args

In [2]:
if not in_notebook:
    arguments_dict = parse_featurization_args()
    patient = arguments_dict["patient"]
    well_fov = arguments_dict["well_fov"]


else:
    well_fov = "C4-2"
    patient = "NF0014"


output_parent_path = pathlib.Path(
    f"{root_dir}/data/{patient}/extracted_features/{well_fov}/"
).resolve(strict=True)

In [3]:
features_dict = {
    "feature_name": [],
    "feature_processor": [],
    "file_path": [],
}
dict_of_dfs = {}
# get each of the features
feature_files = list(output_parent_path.glob("*parquet"))
feature_files = [f for f in feature_files if f.is_file()]
for file in feature_files:
    if "Area" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Area")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Area_CPU"] = pd.read_parquet(file)
    elif "Area" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Area")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Area_GPU"] = pd.read_parquet(file)
    elif "Coloc" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Coloc")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Coloc_CPU"] = pd.read_parquet(file)
    elif "Coloc" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Coloc")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Coloc_GPU"] = pd.read_parquet(file)
    elif "Intensity" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Intensity")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Intensity_CPU"] = pd.read_parquet(file)
    elif "Intensity" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Intensity")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Intensity_GPU"] = pd.read_parquet(file)
    elif "Gran" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Granularity")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Granularity_CPU"] = pd.read_parquet(file)
    elif "Gran" in file.name and "GPU" in file.name:
        features_dict["feature_name"].append("Granularity")
        features_dict["feature_processor"].append("GPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Granularity_GPU"] = pd.read_parquet(file)
    elif "Neighbors" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Neighbors")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Neighbors_CPU"] = pd.read_parquet(file)
    elif "Texture" in file.name and "CPU" in file.name:
        features_dict["feature_name"].append("Texture")
        features_dict["feature_processor"].append("CPU")
        features_dict["file_path"].append(file)
        dict_of_dfs["Texture_CPU"] = pd.read_parquet(file)
    else:
        print(f"Unknown feature file: {file.name}")

In [4]:
import hashlib


def get_file_hash(file_path):
    """Calculate SHA256 hash of a file without loading it as an image."""
    try:
        with open(file_path, "rb") as f:
            return hashlib.sha256(f.read()).hexdigest()
    except Exception as e:
        return f"Error: {e}"

In [5]:
features_df = pd.DataFrame.from_dict(features_dict)

features_df.rename(columns={"index": "feature_processor"}, inplace=True)
# get the file size in KB
features_df["file_size_KB"] = features_df["file_path"].apply(
    lambda x: x.stat().st_size / (1024)
)
# get the sha256 hash of the file
features_df["sha256"] = features_df["file_path"].apply(get_file_hash)
features_df

Unnamed: 0,feature_name,feature_processor,file_path,file_size_KB,sha256
0,Intensity,CPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,28.285156,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...
1,Coloc,GPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,31.249023,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...
2,Coloc,CPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,31.249023,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...
3,Texture,CPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,15.901367,2a691c2c3c902f5aca484ab5ae31faed3f7b1ad733b771...
4,Granularity,CPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,15.851562,5daf9ed504427870bc1c486428963f11ebda8ce5598cb6...
5,Granularity,GPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,18.351562,b59d2af983c8317aa1b892f73153a32a68127ee21342e2...
6,Neighbors,CPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,3.459961,5057656d8fa4d0330c74ea1d7de4d0c41ca3085f437dc8...
7,Area,GPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,14.212891,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...
8,Intensity,GPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,28.285156,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...
9,Area,CPU,/home/lippincm/Documents/GFF_3D_organoid_profi...,14.212891,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...


In [6]:
# pivot the dataframe to have one row per feature and processor type
features_df = features_df.pivot(
    index=["feature_name"], columns="feature_processor", values="file_path"
).reset_index()
features_df["CPU_file_size_KB"] = features_df["CPU"].apply(
    lambda x: x.stat().st_size / (1024) if isinstance(x, pathlib.Path) else None
)
features_df["GPU_file_size_KB"] = features_df["GPU"].apply(
    lambda x: x.stat().st_size / (1024) if isinstance(x, pathlib.Path) else None
)
features_df["CPU_sha256"] = features_df["CPU"].apply(get_file_hash)
features_df["GPU_sha256"] = features_df["GPU"].apply(get_file_hash)
features_df.insert(
    1,
    "sha256_match",
    features_df.apply(lambda row: row["CPU_sha256"] == row["GPU_sha256"], axis=1),
)
features_df

feature_processor,feature_name,sha256_match,CPU,GPU,CPU_file_size_KB,GPU_file_size_KB,CPU_sha256,GPU_sha256
0,Area,True,/home/lippincm/Documents/GFF_3D_organoid_profi...,/home/lippincm/Documents/GFF_3D_organoid_profi...,14.212891,14.212891,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...,63dc8d3f3049e77d6730f6b32376a74395df893df6b9a1...
1,Coloc,True,/home/lippincm/Documents/GFF_3D_organoid_profi...,/home/lippincm/Documents/GFF_3D_organoid_profi...,31.249023,31.249023,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...,a3d62069e0d87d2663a5c42969be44ba762ab77ab0e71b...
2,Granularity,False,/home/lippincm/Documents/GFF_3D_organoid_profi...,/home/lippincm/Documents/GFF_3D_organoid_profi...,15.851562,18.351562,5daf9ed504427870bc1c486428963f11ebda8ce5598cb6...,b59d2af983c8317aa1b892f73153a32a68127ee21342e2...
3,Intensity,True,/home/lippincm/Documents/GFF_3D_organoid_profi...,/home/lippincm/Documents/GFF_3D_organoid_profi...,28.285156,28.285156,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...,cf20b4ef5bf383b5b8f2b4a0b919454d96aaa85068cc71...
4,Neighbors,False,/home/lippincm/Documents/GFF_3D_organoid_profi...,,3.459961,,5057656d8fa4d0330c74ea1d7de4d0c41ca3085f437dc8...,"Error: expected str, bytes or os.PathLike obje..."
5,Texture,False,/home/lippincm/Documents/GFF_3D_organoid_profi...,,15.901367,,2a691c2c3c902f5aca484ab5ae31faed3f7b1ad733b771...,"Error: expected str, bytes or os.PathLike obje..."


In [7]:
# return all rows that do not match exactly between CPU and GPU
mismatched_rows = dict_of_dfs["Granularity_CPU"].compare(
    dict_of_dfs["Granularity_GPU"], keep_shape=True, keep_equal=True
)
pd.set_option("display.max_columns", None)  # Show all columns

In [8]:
# drop the multiindex
mismatched_rows.columns = [
    "_".join(col).strip() for col in mismatched_rows.columns.values
]
# rename the columns to be more descriptive self -> CPU, other -> GPU
mismatched_rows.columns = [
    x.replace("self", "_CPU").replace("other", "_GPU") for x in mismatched_rows.columns
]
mismatched_rows.head()

Unnamed: 0,image_set__CPU,image_set__GPU,object_id__CPU,object_id__GPU,Granularity_Nuclei_DNA_GRANULARITY.1__CPU,Granularity_Nuclei_DNA_GRANULARITY.1__GPU,Granularity_Nuclei_DNA_GRANULARITY.10__CPU,Granularity_Nuclei_DNA_GRANULARITY.10__GPU,Granularity_Nuclei_DNA_GRANULARITY.11__CPU,Granularity_Nuclei_DNA_GRANULARITY.11__GPU,Granularity_Nuclei_DNA_GRANULARITY.12__CPU,Granularity_Nuclei_DNA_GRANULARITY.12__GPU,Granularity_Nuclei_DNA_GRANULARITY.13__CPU,Granularity_Nuclei_DNA_GRANULARITY.13__GPU,Granularity_Nuclei_DNA_GRANULARITY.14__CPU,Granularity_Nuclei_DNA_GRANULARITY.14__GPU,Granularity_Nuclei_DNA_GRANULARITY.15__CPU,Granularity_Nuclei_DNA_GRANULARITY.15__GPU,Granularity_Nuclei_DNA_GRANULARITY.16__CPU,Granularity_Nuclei_DNA_GRANULARITY.16__GPU,Granularity_Nuclei_DNA_GRANULARITY.2__CPU,Granularity_Nuclei_DNA_GRANULARITY.2__GPU,Granularity_Nuclei_DNA_GRANULARITY.3__CPU,Granularity_Nuclei_DNA_GRANULARITY.3__GPU,Granularity_Nuclei_DNA_GRANULARITY.4__CPU,Granularity_Nuclei_DNA_GRANULARITY.4__GPU,Granularity_Nuclei_DNA_GRANULARITY.5__CPU,Granularity_Nuclei_DNA_GRANULARITY.5__GPU,Granularity_Nuclei_DNA_GRANULARITY.6__CPU,Granularity_Nuclei_DNA_GRANULARITY.6__GPU,Granularity_Nuclei_DNA_GRANULARITY.7__CPU,Granularity_Nuclei_DNA_GRANULARITY.7__GPU,Granularity_Nuclei_DNA_GRANULARITY.8__CPU,Granularity_Nuclei_DNA_GRANULARITY.8__GPU,Granularity_Nuclei_DNA_GRANULARITY.9__CPU,Granularity_Nuclei_DNA_GRANULARITY.9__GPU
0,C4-2,C4-2,6,6,49.763809,41.296987,100.0,99.8333,100.0,99.8333,100.0,99.8333,100.0,99.8333,100.0,99.8333,100.0,99.8333,100.0,99.8333,49.763809,42.745195,49.763809,43.776853,49.763809,46.181723,49.763809,47.15696,100.0,99.8333,100.0,99.8333,100.0,99.8333,100.0,99.8333
1,C4-2,C4-2,11,11,16.304822,77.661164,100.0,98.828821,100.0,98.828821,100.0,98.828821,100.0,98.828821,100.0,98.828821,100.0,98.828821,100.0,98.828821,25.285376,78.212269,25.573586,78.60486,42.509758,79.520016,47.161103,79.891136,100.0,98.828821,100.0,98.828821,100.0,98.828821,100.0,98.828821
2,C4-2,C4-2,21,21,8.373193,72.950514,100.0,99.001959,100.0,99.001959,100.0,99.001959,100.0,99.001959,100.0,99.001959,100.0,99.001959,100.0,99.001959,11.592796,73.617835,14.261925,74.093213,32.36435,75.201355,37.095237,75.650735,100.0,99.001959,100.0,99.001959,100.0,99.001959,100.0,99.001959
3,C4-2,C4-2,25,25,7.414486,74.236864,100.0,98.844995,100.0,98.844995,100.0,98.844995,100.0,98.844995,100.0,98.844995,100.0,98.844995,100.0,98.844995,18.939889,74.872454,21.149156,75.325228,33.827861,76.380676,39.013175,76.808688,100.0,98.844995,100.0,98.844995,100.0,98.844995,100.0,98.844995
4,C4-2,C4-2,49,49,8.882276,74.470715,100.0,98.883709,100.0,98.883709,100.0,98.883709,100.0,98.883709,100.0,98.883709,100.0,98.883709,100.0,98.883709,15.610779,75.100545,18.894252,75.549216,35.260924,76.595101,40.154917,77.019235,100.0,98.883709,100.0,98.883709,100.0,98.883709,100.0,98.883709


In [9]:
# check if for each column the sub columns are equal
prior_col_main = None
prior_col_sub = None
for col in mismatched_rows.columns:
    col_main = col.split("__")[0]
    col_sub = col.split("__")[1]
    if prior_col_main == col_main:
        prior_col_CPU_series = mismatched_rows[f"{prior_col_main}__{prior_col_sub}"]
        prior_col_GPU_series = mismatched_rows[f"{col_main}__{col_sub}"]

        if not prior_col_CPU_series.equals(prior_col_GPU_series):
            print(f"Mismatch found in {col_main}")
            # print the mismatched rows
    prior_col_main = col_main
    prior_col_sub = col_sub

Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.1
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.10
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.11
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.12
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.13
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.14
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.15
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.16
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.2
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.3
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.4
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.5
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.6
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.7
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.8
Mismatch found in Granularity_Nuclei_DNA_GRANULARITY.9


In [10]:
for index, row in mismatched_rows.iterrows():
    if (
        row["Granularity_Nuclei_DNA_GRANULARITY.1__CPU"]
        != row["Granularity_Nuclei_DNA_GRANULARITY.1__GPU"]
    ):
        print(
            f"Mismatch at index {index}: CPU = {row['Granularity_Nuclei_DNA_GRANULARITY.1__CPU']}, GPU = {row['Granularity_Nuclei_DNA_GRANULARITY.1__GPU']}"
        )

Mismatch at index 0: CPU = 49.76380889992856, GPU = 41.296987473748125
Mismatch at index 1: CPU = 16.304822274446035, GPU = 77.66116379268162
Mismatch at index 2: CPU = 8.373193091030767, GPU = 72.95051442092803
Mismatch at index 3: CPU = 7.4144856962143875, GPU = 74.23686447462342
Mismatch at index 4: CPU = 8.882275713684164, GPU = 74.47071539183337
Mismatch at index 5: CPU = 5.878367058835491, GPU = 79.63978708270518
Mismatch at index 6: CPU = 13.71810466519803, GPU = 78.41548454007116
Mismatch at index 7: CPU = 67.30074073415102, GPU = 77.21263277100502
Mismatch at index 8: CPU = 65.20871920198711, GPU = 46.22642210922139
Mismatch at index 9: CPU = 10.883911586393564, GPU = 75.90219582227566
Mismatch at index 10: CPU = 4.127074522217701, GPU = 82.54402514602421
Mismatch at index 11: CPU = 25.07892500908911, GPU = 18.196963640102954
Mismatch at index 12: CPU = 6.20709840383785, GPU = 79.9871722098714
Mismatch at index 13: CPU = 9.406922460495814, GPU = 80.05292758324508
Mismatch at i