In [1]:
import argparse
import itertools
import pathlib
import warnings

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import toml
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV, LogisticRegression, MultiTaskElasticNetCV

# import RepeatedKFold
from sklearn.model_selection import (
    GridSearchCV,
    LeaveOneOut,
    RepeatedKFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.utils import parallel_backend

In [2]:
argparser = argparse.ArgumentParser()
argparser.add_argument("--cell_type", type=str, default="all")
argparser.add_argument("--shuffle", type=str, default=False)
argparser.add_argument("--cytokine", type=str, default="cytokine")

args = argparser.parse_args()

cell_type = args.cell_type
cytokine = args.cytokine
shuffle = args.shuffle

print(cell_type, shuffle, cytokine)
if shuffle == "True":
    shuffle = True
elif shuffle == "False":
    shuffle = False
else:
    raise ValueError("shuffle must be True or False")
print(f"shuffle: {shuffle}")

usage: ipykernel_launcher.py [-h] [--cell_type CELL_TYPE] [--shuffle SHUFFLE]
                             [--cytokine CYTOKINE]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9028 --control=9026 --hb=9020 --Session.signature_scheme="hmac-sha256" --Session.key=b"0aa5bca1-3190-427b-b8a3-2b0e4ebc458b" --shell=9027 --transport="tcp" --iopub=9029 --f=/home/lippincm/.local/share/jupyter/runtime/kernel-v2-1787727kMwPdybh5b1u.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
aggregation = True
nomic = True

In [5]:
# set shuffle value
if shuffle:
    shuffle = "shuffled_baseline"
else:
    shuffle = "final"

In [6]:
MODEL_TYPE = "regression"

In [7]:
# load training data from indexes and features dataframe
data_split_path = pathlib.Path(
    f"../../0.split_data/indexes/{cell_type}/regression/aggregated_sc_and_nomic_data_split_indexes.tsv"
)
data_path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm_aggregated_nomic.parquet"
)

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
data_df = pd.read_parquet(data_path)

data_split_indexes = pd.read_csv(data_split_path, sep="\t")

In [8]:
data_df

Unnamed: 0,Metadata_Well,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_Extent,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,Cytoplasm_AreaShape_MinorAxisLength,Cytoplasm_AreaShape_Orientation,Cytoplasm_AreaShape_Zernike_0_0,Cytoplasm_AreaShape_Zernike_1_1,...,uPA [NSU],VCAM-1 [NSU],VEGF Receptor 2 (Flk-1) [NSU],VEGF-A (165) [NSU],VEGF-C [NSU],VEGF-D [NSU],VEGFR-1 [NSU],WISP-1 (CCN4) [NSU],XCL1 (Lymphotactin) [NSU],oneb_Treatment_Dose_Inhibitor_Dose
0,B13,0.000263,0.050292,0.011215,-0.032031,0.139148,0.092653,-0.022733,-0.004550,-0.019608,...,0.187623,0.147899,0.500155,0.064014,0.275746,0.385698,0.176153,0.727225,0.529847,media_ctr_0.0_0_Media_ctr_0.0_0
1,B14,-0.063223,-0.001418,0.035864,0.036794,0.037936,0.031201,-0.012884,0.028338,0.019985,...,0.114937,0.222551,0.534935,0.403588,0.213858,0.201769,0.577968,0.733987,0.763760,DMSO_0.100_%_DMSO_0.025_%
2,B15,-0.062009,0.001236,0.044042,0.030464,-0.002026,0.006311,0.010789,0.030538,0.022751,...,0.214721,0.723799,0.592903,0.287432,0.375582,0.577606,0.554267,0.457302,0.298807,DMSO_0.100_%_DMSO_0.025_%
3,B16,-0.031699,0.047344,-0.003990,0.002975,0.115183,0.070404,-0.007908,-0.010212,-0.004997,...,0.104669,0.483786,0.324065,0.174874,0.188023,0.427482,0.702465,0.502791,0.336315,LPS_0.010_ug_per_ml_DMSO_0.025_%
4,B17,-0.045468,0.038261,0.034279,0.023820,0.163262,0.120615,-0.000391,0.018250,-0.015776,...,0.105744,0.266150,0.396018,0.000000,0.203170,0.652367,0.510090,0.703775,0.283843,LPS_0.010_ug_per_ml_DMSO_0.025_%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,O17,-0.053652,0.141416,-0.037510,0.004477,0.173556,-0.014476,0.023686,-0.069350,-0.008961,...,0.090001,0.368463,1.000000,0.277555,0.315176,0.616905,0.560623,0.531130,0.511373,LPS_Nigericin_1.000_ug_per_ml_10.000_uM_Z-VAD-...
148,O20,-0.008031,0.149927,-0.087296,-0.039525,0.227378,-0.005209,0.012483,-0.121338,-0.041426,...,0.123652,0.448746,0.195383,0.073424,0.288437,0.665233,0.547869,0.580180,0.470265,LPS_Nigericin_1.000_ug_per_ml_10.000_uM_Z-VAD-...
149,O21,-0.039158,0.083196,-0.033546,-0.022291,0.151747,0.022673,0.001550,-0.070861,-0.024675,...,0.228478,0.610956,0.528192,0.291855,0.267683,0.696918,0.608740,0.592028,0.499456,LPS_Nigericin_1.000_ug_per_ml_10.000_uM_Z-VAD-...
150,O22,-0.078584,0.011797,0.072764,0.058422,0.033983,0.037911,0.019646,0.063310,0.006814,...,0.204027,0.525615,0.539442,0.577948,0.138297,0.509006,0.663276,0.597001,0.403857,media_ctr_0.0_0_Media_ctr_0.0_0


In [9]:
# select tht indexes for the training and test set
train_indexes = data_split_indexes.loc[data_split_indexes["label"] == "train"]

In [10]:
# subset data_df by indexes in data_split_indexes
training_data = data_df.loc[train_indexes["labeled_data_index"]]

In [11]:
# define metadata columns
# subset each column that contains metadata
metadata = training_data.filter(regex="Metadata")
# drop all metadata columns
data_x = training_data.drop(metadata.columns, axis=1)
labeled_data = training_data["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"]
# get all columns that contain "NSU" in the column name
data_y_cols = data_x.filter(regex="NSU").columns
train_y = training_data[data_y_cols]
train_x = data_x.drop(data_y_cols, axis=1)
train_x = train_x.drop(columns="oneb_Treatment_Dose_Inhibitor_Dose")

In [12]:
loo = LeaveOneOut()
loo.get_n_splits(train_x)
loo.get_n_splits(train_y)

70

In [13]:
train_data_y = train_y[cytokine]
model = ElasticNetCV(
    random_state=0,
    max_iter=100000,
    cv=loo,
    l1_ratio=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9, 0.99],
    alphas=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    fit_intercept=True,
    selection="random",
)
# train model on training data on all combinations of model types, feature types, and phenotypic classes

if shuffle == "shuffled_baseline":
    print("Shuffling data")
    for column in train_x:
        np.random.shuffle(train_x[column].values)
else:
    print("Not shuffling data")
# define parameters to search over
with parallel_backend("multiprocessing"):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
        # create a logistic regression model
        model.fit(train_x, train_data_y)
        scores = cross_val_score(
            model,
            train_x,
            train_data_y,
            scoring="neg_mean_absolute_error",
            cv=loo,
            n_jobs=-1,
        )
        print(scores)
        print(f"Mean MAE: {scores.mean()}")
        print(f"Std MAE: {scores.std()}")
        print(f"R2: {model.score(train_x, train_data_y)}")

if (aggregation == True) and (nomic == True):
    results_dir = f"../models/regression/{cell_type}/aggregated_with_nomic/"
elif (aggregation == True) and (nomic == False):
    results_dir = f"../models/regression/{cell_type}/aggregated/"
elif (aggregation == False) and (nomic == True):
    results_dir = f"../models/regression/{cell_type}/sc_with_nomic/"
elif (aggregation == False) and (nomic == False):
    results_dir = f"../models/regression/{cell_type}/sc/"
else:
    print("Error")

# create results directory if it doesn't exist
pathlib.Path(results_dir).mkdir(parents=True, exist_ok=True)

# save final estimator
if shuffle == "shuffled_baseline":
    dump(
        model,
        f"{results_dir}/{cytokine}_shuffled_baseline__all_nomic.joblib",
    )
elif shuffle == "final":
    dump(
        model,
        f"{results_dir}/{cytokine}_final__all_nomic.joblib",
    )
else:
    print("Error")

# save condfig copy specific to this model to the folder with the results
# use pathlib
if shuffle == "shuffled_baseline":
    config_copy_path = pathlib.Path(
        f"{results_dir}/{cytokine}_shuffled_baseline__all_nomic.toml"
    )
elif shuffle == "final":
    config_copy_path = pathlib.Path(f"{results_dir}/{cytokine}_final__all_nomic.toml")
else:
    print("Error")

# write toml file with parameters used from injected parameters

with open(config_copy_path, "w") as f:
    f.write(f"model_type='{shuffle}'\n")
    f.write(f"aggregation={aggregation}\n")
    f.write(f"nomic={nomic}\n")
    f.write(f"cell_type='{cell_type}'\n")
    f.write(f"feature=all\n")

Shuffling data
[-0.0407089  -0.12347824 -0.00417615 -0.14744677 -0.34745577 -0.17560153
 -0.07458241 -0.26174729 -0.00088164 -0.16937999 -0.13583144 -0.12370284
 -0.35494403 -0.25362918 -0.06605206 -0.32636196 -0.01176965 -0.51640401
 -0.08778198 -0.06337778 -0.29962796 -0.05372512 -0.27109809 -0.17944232
 -0.14730411 -0.07313888 -0.10181131 -0.20107807 -0.04060527 -0.19398017
 -0.39797726 -0.10059665 -0.04941206 -0.00262925 -0.02036221 -0.16035512
 -0.3243947  -0.29736133 -0.2734925  -0.01891354 -0.31561164 -0.10048033
 -0.2651683  -0.13510262 -0.18853725 -0.26944424 -0.04285145 -0.21054632
 -0.20890699 -0.12217288 -0.21647726 -0.13682001 -0.25357771 -0.06020021
 -0.16638644 -0.25655337 -0.25733518 -0.18493111 -0.04518186 -0.4013779
 -0.24796009 -0.24951825 -0.3093526  -0.22360432 -0.01814902 -0.19968379
 -0.09304058 -0.06921005 -0.32627076 -0.20466708]
Mean MAE: -0.17531055947145746
Std MAE: 0.11399477266707174
R2: 0.0
