In [1]:
import glob
import pathlib
import sqlite3

import pandas as pd
from pycytominer import aggregate, annotate, feature_select, normalize

try:
    cfg = get_ipython().config
    in_notebook = True
except NameError:
    in_notebook = False

## Set paths and variables

In [2]:
sqlite_path = pathlib.Path(
    "../../5.cellprofiler_analysis/analysis_output/endpoint_whole_image"
).resolve()
# get the files in the children directories
sqlite_files = glob.glob(f"{sqlite_path}/**/*.sqlite", recursive=True)

In [3]:
# load in platemap file as a pandas dataframe
platemap_path = pathlib.Path("../../data/platemap_AnnexinV_2ch.csv").resolve()
platemap_df = pd.read_csv(platemap_path)

# directory where the annotated parquet files are saved to
output_dir = pathlib.Path("../data/endpoint_whole_image/").resolve()
output_dir.mkdir(exist_ok=True, parents=True)

normalized_data_dir = pathlib.Path(
    output_dir, "normalized_whole_image.parquet"
).resolve()
feature_selected_data_dir = pathlib.Path(
    output_dir, "feature_selected_whole_image.parquet"
).resolve()
aggregated_data_dir = pathlib.Path(
    output_dir, "aggregated_whole_image.parquet"
).resolve()

## Convert

In [4]:
preset = """SELECT * FROM Per_Image;"""

In [5]:
blacklist_keywords = [
    "Skeleton",
    "URL",
    "ExecutionTime",
    "Frame",
    "Group",
    "Height",
    "Width",
    "MD5",
    "Scaling",
    "Series",
]

In [6]:
list_of_dfs = []
for file in sqlite_files:
    source_path = pathlib.Path(file)
    output_file_dir = output_dir / source_path.stem
    # get the path to the sqlite file
    with sqlite3.connect(source_path) as conn:
        query = "SELECT * FROM Per_Image;"
        df = pd.read_sql_query(query, conn)
    list_of_dfs.append(df)

df = pd.concat(list_of_dfs, ignore_index=True)
df = df.drop_duplicates()

list_of_col_to_remove = []
for col in df.columns:
    for keyword in blacklist_keywords:
        if keyword in col:
            list_of_col_to_remove.append(col)
df.drop(columns=list_of_col_to_remove, inplace=True)

for col in df.columns:
    if col.startswith("Image_"):
        df.rename(columns={col: col.replace("Image_", "")}, inplace=True)
print(df.shape)

(117, 137)


## Annotate

In [7]:
# add metadata from platemap file to extracted single cell features
annotated_df = annotate(
    profiles=df,
    platemap=platemap_df,
    join_on=["Metadata_well", "Metadata_Well"],
)
# drop duplicate columns
annotated_df.drop_duplicates(inplace=True)
columns_to_drop = [
    "ImageNumber",
    "FileName_AnnexinV",
    "FileName_DNA",
    "PathName_AnnexinV",
    "PathName_DNA",
]
annotated_df.drop(columns=columns_to_drop, inplace=True)
print(annotated_df.shape)
annotated_df.head()

(117, 136)


Unnamed: 0,Metadata_plate,Metadata_compound,Metadata_dose,Metadata_control,Metadata_Channel,Metadata_FOV,Metadata_FileLocation,Metadata_Time,Metadata_Well,Metadata_Z_slice,...,Texture_SumVariance_DNA_3_02_256,Texture_SumVariance_DNA_3_03_256,Texture_Variance_AnnexinV_3_00_256,Texture_Variance_AnnexinV_3_01_256,Texture_Variance_AnnexinV_3_02_256,Texture_Variance_AnnexinV_3_03_256,Texture_Variance_DNA_3_00_256,Texture_Variance_DNA_3_01_256,Texture_Variance_DNA_3_02_256,Texture_Variance_DNA_3_03_256
0,1,Staurosporine,1.22,test,,4,,14,C-04,1,...,0.187911,0.182091,0.257873,0.258048,0.25796,0.258048,0.053625,0.053661,0.053592,0.053661
1,1,Staurosporine,1.22,test,,2,,14,C-04,1,...,0.20865,0.200193,0.218253,0.218183,0.218197,0.218182,0.060559,0.060571,0.060572,0.06057
2,1,Staurosporine,1.22,test,,3,,14,C-04,1,...,0.204565,0.197763,0.254029,0.2541,0.253996,0.2541,0.058763,0.058764,0.058709,0.058764
3,1,Staurosporine,1.22,test,,1,,14,C-04,1,...,0.183499,0.176284,0.245752,0.24546,0.245324,0.24546,0.053496,0.053502,0.053459,0.053502
48,1,Staurosporine,2.44,test,,4,,14,E-05,1,...,0.212943,0.205753,0.307101,0.307107,0.307118,0.307107,0.059614,0.059568,0.059524,0.059568


## Normalize

In [8]:
metadata_columns = [x for x in annotated_df.columns if "Metadata_" in x]
feature_columns = [x for x in annotated_df.columns if "Metadata_" not in x]

In [9]:
normalized_df = normalize(
    # df with annotated raw merged single cell features
    profiles=annotated_df,
    # specify samples used as normalization reference (negative control)
    samples="Metadata_compound == 'Staurosporine' and Metadata_dose == 0.0",
    # normalization method used
    method="standardize",
    features=feature_columns,
    meta_features=metadata_columns,
)
normalized_df = normalized_df.drop_duplicates()
normalized_df = normalized_df.reset_index(drop=True)
print(normalized_df.shape)
normalized_df.to_parquet(normalized_data_dir, index=False)

(117, 136)


## Feature selection

In [10]:
# define operations to be performed on the data
# list of operations for feature select function to use on input profile
feature_select_ops = [
    "variance_threshold",
    "blocklist",
    "drop_na_columns",
    "correlation_threshold",
]

feature_select_df = feature_select(
    normalized_df,
    operation=feature_select_ops,
    # specify features to be used for feature selection
    features=feature_columns,
)


print(f"Number of features before feature selection: {normalized_df.shape[1]}")
print(f"Number of features after feature selection: {feature_select_df.shape[1]}")
feature_select_df.to_parquet(
    feature_selected_data_dir,
    index=False,
)
print(feature_select_df.shape)
feature_select_df.head()

Number of features before feature selection: 136
Number of features after feature selection: 30
(117, 30)


Unnamed: 0,Metadata_plate,Metadata_compound,Metadata_dose,Metadata_control,Metadata_Channel,Metadata_FOV,Metadata_FileLocation,Metadata_Time,Metadata_Well,Metadata_Z_slice,...,Texture_Correlation_DNA_3_02_256,Texture_DifferenceEntropy_DNA_3_02_256,Texture_DifferenceVariance_AnnexinV_3_02_256,Texture_DifferenceVariance_DNA_3_00_256,Texture_InfoMeas1_AnnexinV_3_03_256,Texture_InfoMeas1_DNA_3_01_256,Texture_InfoMeas2_AnnexinV_3_03_256,Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Texture_SumVariance_AnnexinV_3_03_256,Texture_SumVariance_DNA_3_02_256
0,1,Staurosporine,1.22,test,,4,,14,C-04,1,...,1.337356,0.368165,0.593027,-0.134154,-1.445089,-1.062122,0.221441,2.019266,-0.663868,1.129909
1,1,Staurosporine,1.22,test,,2,,14,C-04,1,...,0.139861,1.193812,1.66816,-1.100615,0.093066,0.978298,-0.947193,1.953388,-1.935278,1.637002
2,1,Staurosporine,1.22,test,,3,,14,C-04,1,...,0.910705,0.849906,0.840446,-0.660441,-0.424044,-0.519855,-0.109288,1.308669,-0.873052,1.537121
3,1,Staurosporine,1.22,test,,1,,14,C-04,1,...,-0.095736,0.794491,0.059917,-0.659922,-1.294545,1.519666,0.094697,2.002679,-1.055458,1.022017
4,1,Staurosporine,2.44,test,,4,,14,E-05,1,...,2.716246,0.178535,-0.713072,-0.61322,-0.987307,-2.79086,0.875835,-0.011052,0.687659,1.741973


## Aggregation

In [11]:
metadata_cols = feature_select_df.columns[
    feature_select_df.columns.str.contains("Metadata")
]
feature_cols = feature_select_df.columns[
    ~feature_select_df.columns.str.contains("Metadata")
].to_list()

aggregated_df = aggregate(
    feature_select_df,
    features=feature_cols,
    strata=["Metadata_Well", "Metadata_dose"],
    operation="median",
)

print(aggregated_df.shape)
aggregated_df.to_parquet(aggregated_data_dir)
print(aggregated_df.shape)
aggregated_df.head()

(30, 22)
(30, 22)


Unnamed: 0,Metadata_Well,Metadata_dose,Intensity_LowerQuartileIntensity_AnnexinV,Intensity_MADIntensity_AnnexinV,Intensity_MADIntensity_DNA,Intensity_MaxIntensity_AnnexinV,Intensity_MaxIntensity_DNA,Intensity_MeanIntensity_AnnexinV,Intensity_UpperQuartileIntensity_AnnexinV,Intensity_UpperQuartileIntensity_DNA,...,Texture_Correlation_DNA_3_02_256,Texture_DifferenceEntropy_DNA_3_02_256,Texture_DifferenceVariance_AnnexinV_3_02_256,Texture_DifferenceVariance_DNA_3_00_256,Texture_InfoMeas1_AnnexinV_3_03_256,Texture_InfoMeas1_DNA_3_01_256,Texture_InfoMeas2_AnnexinV_3_03_256,Texture_InverseDifferenceMoment_AnnexinV_3_00_256,Texture_SumVariance_AnnexinV_3_03_256,Texture_SumVariance_DNA_3_02_256
0,C-02,0.0,-0.040456,-0.266708,0.0,0.094972,0.306223,-1.092121,-1.184509,0.169031,...,0.165528,0.314377,-0.268905,-0.308198,0.765783,0.844632,-1.194412,1.109419,-0.624342,0.042757
1,C-03,0.61,-0.525924,0.070186,0.0,-0.320466,-0.208609,-0.615134,-0.796851,1.183216,...,-0.368861,-0.051352,0.220198,0.219561,0.770876,1.327008,-0.748198,0.563655,-1.127459,-0.354506
2,C-04,1.22,-0.525924,-0.940496,0.0,-0.5836,0.629764,-1.682439,-1.184509,2.197401,...,0.525283,0.822199,0.716736,-0.660181,-0.859295,0.229222,-0.007296,1.978034,-0.964255,1.333515
3,C-05,2.44,-0.525924,-0.856272,0.0,-0.851614,0.320392,-1.388464,-1.442947,0.169031,...,1.396388,-1.122557,1.171765,-0.279053,0.741473,-0.028035,-1.080175,0.776666,-1.133651,-0.212651
4,C-06,4.88,-0.525924,-0.940496,0.0,-0.690945,0.556554,-0.744168,-0.99068,0.169031,...,2.067098,-0.146865,0.883802,-0.545491,-0.164224,-0.301771,-0.110923,0.44786,-0.301524,0.804387
