This noteboook pre-processes the single cell morphology data to be ready for exploratory analysis and machine learning.

In [1]:
import pathlib

import numpy as np
import pandas as pd
import papermill as pm
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Parameters
cell_type = "SHSY5Y"

In [3]:
# Define inputs
feature_file = pathlib.Path(f"../data/{cell_type}_sc_norm_fs.parquet")
feature_df = pd.read_parquet(feature_file)

In [4]:
# replace all " " with "_" in all values of the dataframe
feature_df = feature_df.replace(to_replace=" ", value="_", regex=True)

In [5]:
# remove uM in each row of the Metadata_inducer1_concentration column
feature_df["Metadata_inducer1_concentration"] = feature_df[
    "Metadata_inducer1_concentration"
].str.replace("µM", "")

In [6]:
feature_df["Metadata_inducer1_concentration"].unique()

array([None, '0.100', '1.000', '10.000', '5.000', '20.000', '0.010',
       '100.000', '2.500'], dtype=object)

In [7]:
# define output file path
feature_df_out_path = pathlib.Path(f"../data/{cell_type}_preprocessed_sc_norm.parquet")

In [8]:
print(feature_df.shape)
feature_df.head()

(597902, 1270)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-1.841853,-0.597438,-1.295016,-3.127056,-3.004511,-1.481117,1.851482,0.024721,0.307472,0.092086
1,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-0.841272,-0.681935,-0.330951,-3.725197,-0.827474,-0.461348,0.897731,-0.041156,1.443262,0.009843
2,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,0.500885,-0.229003,-1.254209,-0.691997,-1.374967,-1.337252,0.82597,-0.044386,-0.020445,0.000848
3,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-0.560136,-0.276664,-0.313186,-0.844206,-0.798213,-1.032992,0.546308,-0.058328,-0.009632,-0.005811
4,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-0.82855,-0.761657,-0.309806,-0.5017,-0.616252,-0.681687,1.034724,-0.035518,-0.038205,0.01769


In [9]:
# removing costes features as they behave with great variance across all data
feature_df = feature_df.drop(feature_df.filter(regex="Costes").columns, axis=1)
print(feature_df.shape)
feature_df.head()

(597902, 1270)


Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_CorrER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_CorrMito_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_00_256,Nuclei_Texture_InverseDifferenceMoment_CorrPM_3_02_256,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrGasdermin_3_01_256,Nuclei_Texture_SumVariance_CorrMito_3_03_256,Nuclei_Texture_SumVariance_CorrPM_3_03_256
0,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-1.841853,-0.597438,-1.295016,-3.127056,-3.004511,-1.481117,1.851482,0.024721,0.307472,0.092086
1,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-0.841272,-0.681935,-0.330951,-3.725197,-0.827474,-0.461348,0.897731,-0.041156,1.443262,0.009843
2,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,0.500885,-0.229003,-1.254209,-0.691997,-1.374967,-1.337252,0.82597,-0.044386,-0.020445,0.000848
3,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-0.560136,-0.276664,-0.313186,-0.844206,-0.798213,-1.032992,0.546308,-0.058328,-0.009632,-0.005811
4,SH-SY5Y,B13,3765,6,Media_ctr,,,media_ctr,,,...,-0.82855,-0.761657,-0.309806,-0.5017,-0.616252,-0.681687,1.034724,-0.035518,-0.038205,0.01769


In [10]:
# replacing '/' in treatment dosage column to avoid errors in file interpolation including such strings
feature_df = feature_df.replace(to_replace="/", value="_per_", regex=True)

In [11]:
# replace nan values with 0

columns_to_fill = [
    "Metadata_inducer1_concentration",
    "Metadata_inducer2_concentration",
    "Metadata_inhibitor_concentration",
]
feature_df[columns_to_fill].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df[columns_to_fill].fillna(0, inplace=True)


In [12]:
# replace all None values with 0
feature_df["Metadata_inducer1_concentration"].fillna(0, inplace=True)

In [13]:
# create a list of columns to be converted to float
col_list = [
    "Metadata_inducer1_concentration",
    "Metadata_inducer2_concentration",
    "Metadata_inhibitor_concentration",
]
# loop through the list and convert each column to float
for i in col_list:
    feature_df[i] = feature_df[i].apply(
        lambda x: f"{float(x):.3f}" if float(x) != 0 else float(x)
    )

#### Combine Inducer1 and Inducer2 into one column

In [14]:
# treatment column merge
conditions = [
    (feature_df["Metadata_inducer2"].isnull()),
    feature_df["Metadata_inducer2"].notnull(),
]

results = [
    (feature_df["Metadata_inducer1"]).astype(str),
    (
        feature_df["Metadata_inducer1"]
        + "_"
        + feature_df["Metadata_inducer2"].astype(str)
    ),
]
feature_df["Metadata_Treatment"] = np.select(condlist=conditions, choicelist=results)


# dose column merge
results = [
    (
        feature_df["Metadata_inducer1_concentration"].astype(str)
        + "_"
        + feature_df["Metadata_inducer1_concentration_unit"].astype(str)
    ),
    (
        feature_df["Metadata_inducer1_concentration"].astype(str)
        + "_"
        + feature_df["Metadata_inducer1_concentration_unit"].astype(str)
        + "_"
        + feature_df["Metadata_inducer2_concentration"].astype(str)
        + "_"
        + feature_df["Metadata_inducer2_concentration_unit"].astype(str)
    ),
]
feature_df["Metadata_Dose"] = np.select(condlist=conditions, choicelist=results)

## N Beta Column condition generation
columns generated to used for linear modeling where terms separated by '__' will be a beta coefficient 

In [15]:
# one beta of inudcer1, inducer1 concentration, inhibitor, and inhibitor concentration all as 1 beta term
feature_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    feature_df["Metadata_Treatment"]
    + "_"
    + feature_df["Metadata_Dose"].astype(str)
    # + "_"
    # + feature_df['Metadata_inducer1_concentration_unit'].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor"].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor_concentration"].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor_concentration_unit"].astype(str)
).astype(str)


# two beta of inducer1, inhibitor, and inhibitor concentration all as 1 beta term + inducer1 concentration as 2nd beta term
feature_df["twob_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    feature_df["Metadata_Treatment"]
    + "_"
    + feature_df["Metadata_inhibitor"].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor_concentration"].astype(str)
    + "__"
    + feature_df["Metadata_Dose"].astype(str)
).astype(str)

# three beta of inducer 1 as 1 beta term, inducer1 concentration as 2nd beta term, inhibitor and inhibitor concentration as 3rd beta term
feature_df["threeb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    feature_df["Metadata_Treatment"]
    + "__"
    + feature_df["Metadata_Dose"].astype(str)
    + "__"
    + feature_df["Metadata_inducer1_concentration_unit"].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor"].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor_concentration"].astype(str)
).astype(str)

# four beta of inducer 1 as 1 beta term, inducer1 concentration as 2nd beta term, inhibitor as 3rd beta term, and inhibitor concentration as 4th beta term
feature_df["fourb_Metadata_Treatment_Dose_Inhibitor_Dose"] = (
    feature_df["Metadata_Treatment"]
    + "__"
    + feature_df["Metadata_Dose"].astype(str)
    + "__"
    + feature_df["Metadata_inducer1_concentration_unit"].astype(str)
    + "_"
    + feature_df["Metadata_inhibitor"].astype(str)
    + "__"
    + feature_df["Metadata_inhibitor_concentration"].astype(str)
).astype(str)

In [16]:
replacement_dict = {
    "None": "0",
    "µ": "u",
    "nan": "0",
}
for pattern, replacement in replacement_dict.items():
    print(pattern, replacement)
    feature_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = feature_df[
        "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
    ].replace(to_replace=str(pattern), value=str(replacement), regex=True)

None 0


µ u


nan 0


In [17]:
feature_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] = feature_df[
    "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"
].str.replace("media_ctr_0.0_0_Media_ctr_0_0", "media_ctr_0.0_0_Media_ctr_0.0_0")

In [18]:
# need to convert to strings to save as parquet
# if the column is an object then convert it to a string
for column in feature_df.columns:
    if feature_df[column].dtype == "object":
        feature_df[column] = feature_df[column].astype(str)

In [19]:
# write to parquet file
feature_df.to_parquet(feature_df_out_path)