In [1]:
import ast
import itertools
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
# Default run configuration. NOTE(review): this looks like a papermill-style
# parameters cell whose values may be overridden at execution time - confirm.
# cell type; used to locate the nELISA csv and to name the output directory
cell_type = "PBMC"
# when True, single-cell profiles are averaged per well before splitting
aggregation = True
# when True, the nELISA (nomic) measurements are merged onto the profiles
nomic = True
# when False, the values in this cell are replaced by the ones read from the
# toml config file in the next cell
flag = True
# the two oneb_Metadata_Treatment_Dose_Inhibitor_Dose labels kept for the split
control = "DMSO_0.100_DMSO_0.025"
treatment = "LPS_100.000_DMSO_0.025"

In [3]:
# name of the model family; used as a sub-directory of the saved split indexes
MODEL_TYPE = "regression"
if not flag:
    # the parameters cell was not trusted (flag is off): read every run
    # parameter from the shared toml config instead
    toml_path = pathlib.Path("../1.train_models/single_class_config.toml")
    # the context manager closes the file; no explicit close() is needed
    with open(toml_path, "r") as f:
        config = toml.load(f)
    params = config["logistic_regression_params"]
    control = params["control"]
    treatment = params["treatments"]
    # aggregation and nomic are parsed with literal_eval, i.e. they are
    # expected to be stored as python literals such as "True" / "False"
    aggregation = ast.literal_eval(params["aggregation"])
    nomic = ast.literal_eval(params["nomic"])
    cell_type = params["cell_type"]
    print(aggregation, nomic, cell_type)

In [11]:
# Load the normalized single-cell profiles for the selected cell type and
# control/treatment pair. The file name is built from the run parameters so
# that the data that is loaded always matches the labels that are filtered on
# and the directory the split indexes are saved to.
path = pathlib.Path(
    f"../../data/{cell_type}_subset_sc_norm_{control}_{treatment}.parquet"
)

data_df = pq.read_table(path).to_pandas()

data_df.head()

Unnamed: 0,Metadata_cell_type,Metadata_Well,Metadata_number_of_singlecells,Metadata_incubation inducer (h),Metadata_inhibitor,Metadata_inhibitor_concentration,Metadata_inhibitor_concentration_unit,Metadata_inducer1,Metadata_inducer1_concentration,Metadata_inducer1_concentration_unit,...,Nuclei_Texture_SumEntropy_CorrPM_3_01_256,Nuclei_Texture_SumVariance_CorrPM_3_01_256,Nuclei_Texture_Variance_CorrER_3_00_256,Nuclei_Texture_Variance_CorrGasdermin_3_00_256,Metadata_Treatment,Metadata_Dose,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,twob_Metadata_Treatment_Dose_Inhibitor_Dose,threeb_Metadata_Treatment_Dose_Inhibitor_Dose,fourb_Metadata_Treatment_Dose_Inhibitor_Dose
315708,PBMC,B06,25314,6,DMSO,0.025,%,DMSO,0.1,%,...,-0.29958,-0.137086,-0.111979,-0.115607,DMSO,0.1,DMSO_0.100_DMSO_0.025,DMSO_DMSO_0.025__0.100,DMSO__0.100__DMSO_0.025,DMSO__0.100__DMSO__0.025
315709,PBMC,B06,25314,6,DMSO,0.025,%,DMSO,0.1,%,...,-0.906742,-0.156287,-0.171005,-0.161728,DMSO,0.1,DMSO_0.100_DMSO_0.025,DMSO_DMSO_0.025__0.100,DMSO__0.100__DMSO_0.025,DMSO__0.100__DMSO__0.025
315710,PBMC,B06,25314,6,DMSO,0.025,%,DMSO,0.1,%,...,-0.392149,-0.146205,-0.16905,-0.166613,DMSO,0.1,DMSO_0.100_DMSO_0.025,DMSO_DMSO_0.025__0.100,DMSO__0.100__DMSO_0.025,DMSO__0.100__DMSO__0.025
315711,PBMC,B06,25314,6,DMSO,0.025,%,DMSO,0.1,%,...,-1.128252,-0.162255,-0.170928,-0.155808,DMSO,0.1,DMSO_0.100_DMSO_0.025,DMSO_DMSO_0.025__0.100,DMSO__0.100__DMSO_0.025,DMSO__0.100__DMSO__0.025
315712,PBMC,B06,25314,6,DMSO,0.025,%,DMSO,0.1,%,...,-0.815351,-0.142273,-0.192711,-0.19427,DMSO,0.1,DMSO_0.100_DMSO_0.025,DMSO_DMSO_0.025__0.100,DMSO__0.100__DMSO_0.025,DMSO__0.100__DMSO__0.025


In [12]:
# Load the nELISA (nomic) measurements when they are requested; df_nomic is
# None otherwise.
if nomic:
    # import nomic data (read once)
    nomic_df_path = pathlib.Path(
        f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_cleanup4correlation.csv"
    )
    df_nomic = pd.read_csv(nomic_df_path)
    # drop columns that contain [pgML]
    df_nomic = df_nomic.drop(
        columns=[col for col in df_nomic.columns if "[pgML]" in col]
    )
else:
    df_nomic = None

In [13]:
# every column whose name contains "Metadata"
metadata = data_df.filter(regex="Metadata")

# the remaining (non-metadata) columns
data = data_df.drop(columns=metadata.columns)

# only the well and the treatment label are carried along with the features
well_label_columns = ["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
metadata_well = metadata[well_label_columns]

# re-attach the two kept metadata columns to the features, row by row
data_df = pd.merge(data, metadata_well, left_index=True, right_index=True)

In [14]:
# Step 1 (optional): average the single-cell profiles per well.
# Step 2 (optional): attach the nELISA (nomic) measurements.
# The two steps are independent, so every aggregation/nomic combination runs
# the same code; aggregation without nomic no longer touches df_nomic (which is
# None in that case) and reads the metadata from data_df, where it lives.
if aggregation:
    # split off the metadata columns so the mean is taken over the
    # non-metadata columns only
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # keep a single metadata row per well
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )

if nomic:
    # match the nELISA rows on well position and treatment label
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
    # the well position duplicates Metadata_Well after the merge
    data_df = data_df.drop(columns=["Metadata_position_x"])

In [15]:
# treatment label of every row
labeled_data = data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
# drop all metadata columns; errors="ignore" because without aggregation
# `metadata` still lists metadata columns that were already removed from
# data_df, which would otherwise raise a KeyError
data_x = data_df.drop(columns=metadata.columns, errors="ignore")

This model and code are both inspired by and reused from: https://github.com/WayScience/phenotypic_profiling_model/blob/main/1.split_data/split_data.ipynb
The bulk of this work was done by **Roshan Kern**; I have only made minor changes to the code to make it more modular and easier to use for my purposes.

In [16]:
# keep only the rows whose oneb_Metadata_Treatment_Dose_Inhibitor_Dose label
# is either the control or the treatment of this run
is_selected_label = data_df["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
    [control, treatment]
)
data_df = data_df[is_selected_label]

In [17]:
# fraction of the rows held out for testing (e.g. 0.15 keeps 15% for testing)
test_ratio = 0.25

# stratified split on the treatment label, with a fixed seed
training_data, testing_data = train_test_split(
    data_df,
    test_size=test_ratio,
    stratify=data_df[["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]],
    random_state=1,
)

# row indexes of each split, as numpy arrays
train_indexes, test_indexes = (
    split.index.to_numpy() for split in (training_data, testing_data)
)

print(f"Training data has shape: {training_data.shape}")
print(f"Testing data has shape: {testing_data.shape}")

Training data has shape: (9, 1435)
Testing data has shape: (3, 1435)


In [18]:
# one record per row index, tagged with the split it belongs to
# (training rows first, then testing rows)
split_records = [
    {"labeled_data_index": row_index, "label": "train"} for row_index in train_indexes
] + [{"labeled_data_index": row_index, "label": "test"} for row_index in test_indexes]

# turn the records into a dataframe ordered by the labeled data index
index_data = pd.DataFrame(split_records).sort_values(["labeled_data_index"])

In [19]:
# set save path: the directory depends only on the cell type, the model type
# and the control/treatment pair, whatever the aggregation and nomic settings
# are (the file written into it is what differs between those settings)
save_path = pathlib.Path(f"./indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}")

print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)

indexes/PBMC/regression/DMSO_0.100_DMSO_0.025_LPS_100.000_DMSO_0.025


In [20]:
# save indexes as tsv file
# output file name for every (aggregation, nomic) combination
split_index_file_names = {
    (True, True): "aggregated_sc_and_nomic_data_split_indexes.tsv",
    (True, False): "aggregated_sc_data_split_indexes.tsv",
    (False, True): "sc_and_nomic_data_split_indexes.tsv",
    (False, False): "sc_split_indexes.tsv",
}

if aggregation in (True, False):
    split_index_file_name = split_index_file_names.get((aggregation, nomic))
    # nothing is written when nomic is neither True nor False
    if split_index_file_name is not None:
        index_data.to_csv(f"{save_path}/{split_index_file_name}", sep="\t")
else:
    print("Error")