In [1]:
import ast
import itertools
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV, LogisticRegression, MultiTaskElasticNetCV

# import mse
from sklearn.metrics import mean_squared_error, r2_score

# import RepeatedKFold
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedKFold,
    StratifiedKFold,
    train_test_split,
)
from sklearn.utils import parallel_backend, shuffle

In [2]:
# Parameters
cell_type = "PBMC"
aggregation = True
nomic = True
flag = True
control = "DMSO_0.100_DMSO_0.025"
treatment = "LPS_100.000_DMSO_0.025"

In [3]:
MODEL_TYPE = "regression"
if flag == False:
    # read in toml file and get parameters
    toml_path = pathlib.Path("single_class_config.toml")
    with open(toml_path, "r") as f:
        config = toml.load(f)
    control = config["logistic_regression_params"]["control"]
    treatment = config["logistic_regression_params"]["treatments"]
    aggregation = ast.literal_eval(config["logistic_regression_params"]["aggregation"])
    nomic = ast.literal_eval(config["logistic_regression_params"]["nomic"])
    cell_type = config["logistic_regression_params"]["cell_type"]

In [4]:
# load training data from indexes and features dataframe
# data_split_path = pathlib.Path(f"../0.split_data/indexes/data_split_indexes.tsv")
# data_path = pathlib.Path(f"../../data/{cell_type}_preprocessed_sc_norm.parquet")
data_path = pathlib.Path(
    "../../data/PBMC_subset_sc_norm_DMSO_0.100_DMSO_0.025_LPS_100.000_DMSO_0.025.parquet"
)

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
data_df = pq.read_table(data_path).to_pandas()

# import nomic data
nomic_df_path = pathlib.Path(
    f"../../2.Nomic_nELISA_Analysis/Data/clean/Plate2/nELISA_plate_430420_{cell_type}_cleanup4correlation.csv"
)
df_nomic = pd.read_csv(nomic_df_path)

# clean up nomic data
df_nomic = df_nomic.drop(columns=[col for col in df_nomic.columns if "[pgML]" in col])
# drop first 25 columns (Metadata that is not needed)
# df_nomic = df_nomic.drop(columns=df_nomic.columns[3:25])
# df_nomic = df_nomic.drop(columns=df_nomic.columns[0:2])

In [5]:
print(df_nomic["Activin A [NSU]"].std())
print(df_nomic["Activin A [NSU]"].mean())
print(df_nomic["Activin A [NSU]"].max())
print(df_nomic["Activin A [NSU]"].min())
# min max scale nomic data (0-1) using sklearn

0.27220830038007515
0.21036656278540763
1.0
0.0


In [6]:
if (aggregation == True) and (nomic == True):
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}/aggregated_sc_and_nomic_data_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # drop duplicate rows in the metadata_well column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )
    data_df = pd.merge(
        data_df,
        df_nomic,
        left_on=["Metadata_Well", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
        right_on=["Metadata_position_x", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
    )
    data_df = data_df.drop(columns=["Metadata_position_x"])
elif (aggregation == True) and (nomic == False):
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}/aggregated_sc_data_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
    # subset each column that contains metadata
    metadata = data_df.filter(regex="Metadata")
    data_df = data_df.drop(metadata.columns, axis=1)
    data_df = pd.concat([data_df, metadata["Metadata_Well"]], axis=1)
    # groupby well and take mean of each well
    data_df = data_df.groupby("Metadata_Well").mean()
    # drop duplicate rows in the metadata_well column
    metadata = metadata.drop_duplicates(subset=["Metadata_Well"])
    # get the metadata for each well
    data_df = pd.merge(
        data_df, metadata, left_on="Metadata_Well", right_on="Metadata_Well"
    )
elif (aggregation == False) and (nomic == True):
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}/sc_and_nomic_data_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
    data_df = pd.merge(
        data_df, df_nomic, left_on="Metadata_Well", right_on="Metadata_position_x"
    )
    data_df = data_df.drop(columns=["Metadata_position_x"])
elif aggregation == False and nomic == False:
    data_split_path = pathlib.Path(
        f"../0.split_data/indexes/{cell_type}/{MODEL_TYPE}/{control}_{treatment}/sc_split_indexes.tsv"
    )
    data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
else:
    print("Error")

In [7]:
# select tht indexes for the training and test set
train_indexes = data_split_indexes.loc[data_split_indexes["label"] == "train"]

In [8]:
# subset data_df by indexes in data_split_indexes
training_data = data_df.loc[train_indexes["labeled_data_index"]]

In [9]:
# get oneb_Metadata_Treatment_Dose_Inhibitor_Dose  =='DMSO_0.100_DMSO_0.025' and 'LPS_100.000_DMSO_0.025 and Thapsigargin_10.000_DMSO_0.025'
training_data = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
        [control, treatment]
    )
]
training_data

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,Cytoplasm_AreaShape_Zernike_2_0,Cytoplasm_AreaShape_Zernike_2_2,...,uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU],fourb_Metadata_Treatment_Dose_Inhibitor_Dose_y
0,B06,0.022371,-0.022614,-0.039573,-0.0493,-0.013386,-0.035579,-0.026341,-0.034624,-0.015618,...,0.469875,0.395392,0.560129,0.504521,0.490444,0.258834,0.238358,0.524276,0.25067,DMSO__0.100__DMSO__0.025
1,B07,-0.071753,0.049418,-0.080162,-0.110935,0.009102,0.018723,0.007016,0.008824,0.01246,...,0.493033,0.171562,0.615867,0.288153,0.506528,0.264141,0.296782,0.541689,0.167078,DMSO__0.100__DMSO__0.025
2,C06,0.061913,-0.047412,0.05225,0.083849,0.024151,-0.026974,-0.013448,-0.00572,-0.019584,...,0.570146,0.032391,0.476656,0.315426,0.589522,0.38117,0.168645,0.455092,0.228752,DMSO__0.100__DMSO__0.025
4,I06,0.048882,-0.014356,0.110362,0.115949,-0.005469,0.024033,0.043835,-0.037867,0.036271,...,0.374554,0.486915,0.389375,0.369421,0.680276,0.182956,0.263281,0.213596,0.064645,DMSO__0.100__DMSO__0.025
6,J02,-0.00417,0.009121,0.015575,-0.078282,0.009186,-0.100801,-0.090802,0.004174,-0.023955,...,0.428286,0.288884,0.527908,0.210755,0.448465,0.422773,0.535603,0.209011,0.170498,LPS__100.000__DMSO__0.025
8,J06,-0.012228,-0.005155,-0.048954,-0.03268,-0.012399,-0.034741,-0.04695,0.054012,-0.04809,...,0.630644,0.586271,0.258029,0.561051,0.551671,0.582053,0.087565,0.140992,0.234191,DMSO__0.100__DMSO__0.025
9,J07,-0.000175,0.007426,0.030708,0.024561,-0.001792,0.018262,0.021859,-0.007972,0.019015,...,0.46285,0.490826,0.466632,0.635065,0.333763,0.440537,0.218204,0.341123,0.263401,DMSO__0.100__DMSO__0.025
10,J08,-0.048434,0.044146,-0.002124,-0.099024,-0.014325,-0.072734,-0.081361,0.036256,-0.009928,...,0.315633,0.364173,0.607592,0.176816,0.37892,0.310344,0.651217,0.679571,0.222324,LPS__100.000__DMSO__0.025
11,J09,-0.001276,-0.002025,0.041489,-0.035258,-0.000377,-0.07968,-0.059874,0.012565,0.000216,...,0.527316,0.405934,0.619578,0.329964,0.57783,0.0,0.456104,0.255216,0.255703,LPS__100.000__DMSO__0.025


In [10]:
# at random downsample the DMSO treatment to match the number of wells in the LPS treatment
seed = 0
# get the number of wells in the LPS treatment
trt_wells = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == treatment
].shape[0]
# get the number of wells in the DMSO treatment
dmso_wells = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == control
].shape[0]
# downsample the DMSO treatment to match the number of wells in the LPS treatment
dmso_holdout = training_data[
    training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"] == control
].sample(n=trt_wells, random_state=seed)
# remove the downsampled DMSO wells from the data
training_data = training_data.drop(dmso_holdout.index)

In [11]:
# define metadata columns
# subset each column that contains metadata
metadata = training_data.filter(regex="Metadata")
# drop all metadata columns
data_x = training_data.drop(metadata.columns, axis=1)
labeled_data = training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
# get all columns that contain "NSU" in the column name
data_y_cols = data_x.filter(regex="NSU").columns
data_y = training_data[data_y_cols]
data_x = data_x.drop(data_y_cols, axis=1)

In [12]:
for col in data_y.columns:
    train_y = data_y[col]
    shuffles = ["final", "shuffled_baseline"]
    feature_types = ["CP"]
    # create stratified data sets for continuous labels

    model = ElasticNetCV(
        random_state=0,
        max_iter=1000000,
        cv=5,
        l1_ratio=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 0.99],
        alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        fit_intercept=True,
        selection="random",
    )
    # train model on training data on all combinations of model types, feature types, and phenotypic classes
    for shuffle, feature_type in itertools.product(shuffles, feature_types):
        if shuffle == "shuffled_baseline":
            print("Shuffling data")
            for column in data_x:
                np.random.shuffle(data_x[column].values)
        else:
            print("Not shuffling data")
        # define parameters to search over
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", category=ConvergenceWarning, module="sklearn"
                )
                # create a logistic regression model

                model.fit(data_x, train_y)
        # print(f"Best alpha: {regression_model_params.alpha_}\n Best l1_ratio: {regression_model_params.l1_ratio_}")
        score = model.score(data_x, train_y)
        print("Fold score:", score)
        preds = model.predict(data_x)
        print(f"RMSE: {np.sqrt(mean_squared_error(train_y, preds))}")

        if (aggregation == True) and (nomic == True):
            results_dir = f"./models/single_class/{cell_type}/aggregated_with_nomic/{MODEL_TYPE}/{control}__{treatment}"
        elif (aggregation == True) and (nomic == False):
            results_dir = f"./models/single_class/{cell_type}/aggregated/{MODEL_TYPE}/{control}__{treatment}"
        elif (aggregation == False) and (nomic == True):
            results_dir = f"./models/single_class/{cell_type}/sc_with_nomic/{MODEL_TYPE}/{control}__{treatment}"
        elif (aggregation == False) and (nomic == False):
            results_dir = f"./models/single_class/{cell_type}/sc/{MODEL_TYPE}/{control}__{treatment}"
        else:
            print("Error")

        # create results directory if it doesn't exist
        pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

        # print(f'Features: {model.feature_names_in_}')
        print(f"There were {len(model.feature_names_in_)} features selected")

        # save final estimator
        dump(
            model,
            f"{results_dir}/{shuffle}__{feature_type}_{col}.joblib",
        )

        # save condfig copy specific to this model to the folder with the results
        # use pathlib
        config_copy_path = pathlib.Path(
            f"{results_dir}/{shuffle}__{feature_type}_{col}.toml"
        )
        # write toml file with parameters used from injected parameters
        with open(config_copy_path, "a") as f:
            f.write(f"model_type='{shuffle}'\n")
            f.write(f"control='{control}'\n")
            f.write(f"treatments='{treatment}'\n")
            f.write(f"aggregation={aggregation}\n")
            f.write(f"nomic={nomic}\n")
            f.write(f"cell_type='{cell_type}'\n")
            f.write(f"feature='{col}'\n")

Not shuffling data
Fold score: 0.999999109936506
RMSE: 0.0003194747598252315
There were 1245 features selected
Shuffling data
Fold score: 0.9999984203237906
RMSE: 0.00042560830777799234
There were 1245 features selected
Not shuffling data
Fold score: 0.9749113679169215
RMSE: 0.026890452974576762
There were 1245 features selected
Shuffling data
Fold score: 0.998370504322631
RMSE: 0.006853086807743666
There were 1245 features selected
Not shuffling data
Fold score: 0.0
RMSE: 0.3043031161287454
There were 1245 features selected
Shuffling data
Fold score: 0.0
RMSE: 0.3043031161287454
There were 1245 features selected
Not shuffling data
Fold score: 0.0
RMSE: 0.083453242518188
There were 1245 features selected
Shuffling data
Fold score: 0.0
RMSE: 0.083453242518188
There were 1245 features selected
Not shuffling data
Fold score: 0.9992249405439338
RMSE: 0.004642508453024991
There were 1245 features selected
Shuffling data
Fold score: 0.9995961305097694
RMSE: 0.0033512414322143967
There were 1