## Hyperparameter tuning via Optuna

### Being a binary model this notebook will be limited to predicting one class 1 or 0, yes or no.
### Here I will be predicting if a cell received a treatment or not

In [1]:
import argparse
import json
import pathlib
import sys

import numpy as np
import optuna
import pandas as pd
import pyarrow.parquet as pq
import toml
import torch
from sklearn import preprocessing

MLP_parent_path = pathlib.Path("../../../utils/")
sys.path.append(str(MLP_parent_path.resolve()))
MLP_path = pathlib.Path("../../../utils/MLP_utils").resolve()

from MLP_utils.parameters import Parameters
from MLP_utils.utils import (
    Dataset_formatter,
    data_split,
    extract_best_trial_params,
    objective_model_optimizer,
    parameter_set,
    plot_metric_vs_epoch,
    results_output,
    test_optimized_model,
    train_optimized_model,
    un_nest,
)
from sklearn.model_selection import train_test_split

from utils import df_stats

In [None]:
# set up the parser
parser = argparse.ArgumentParser(description="Run hyperparameter optimization")
parser.add_argument(
    "--cell_type",
    type=str,
    default="all",
    help="Cell type to run hyperparameter optimization for",
)
parser.add_argument(
    "--model_name",
    type=str,
    default="all",
    help="Model name to run hyperparameter optimization for",
)

# parse arguments
args = parser.parse_args()

CELL_TYPE = args.cell_type
MODEL_NAME = args.model_name

# CELL_TYPE = "SHSY5Y"
# MODEL_NAME = "MLP"

In [3]:
ml_configs_file = pathlib.Path(MLP_path / "multi_class_config.toml").resolve(
    strict=True
)
ml_configs = toml.load(ml_configs_file)
params = Parameters()
mlp_params = parameter_set(params, ml_configs)

# overwrite params via command line arguments from papermill
mlp_params.CELL_TYPE = CELL_TYPE
mlp_params.MODEL_NAME = MODEL_NAME
MODEL_TYPE = mlp_params.MODEL_TYPE
HYPERPARAMETER_BATCH_SIZE = mlp_params.HYPERPARAMETER_BATCH_SIZE

In [4]:
# Import Data
# set data file path under pathlib path for multi-system use

file_path = pathlib.Path(
    f"../../../data/{mlp_params.CELL_TYPE}_preprocessed_sc_norm_aggregated.parquet"
).resolve(strict=True)

df1 = pd.read_parquet(file_path)

In [5]:
# get paths for toml files
ground_truth_file_path = pathlib.Path(MLP_path / "ground_truth.toml").resolve(
    strict=True
)
# read toml files
ground_truth = toml.load(ground_truth_file_path)

In [6]:
# get information from toml files
apoptosis_groups_list = ground_truth["Apoptosis"]["apoptosis_groups_list"]
pyroptosis_groups_list = ground_truth["Pyroptosis"]["pyroptosis_groups_list"]
healthy_groups_list = ground_truth["Healthy"]["healthy_groups_list"]

In [7]:
np.random.seed(0)
if mlp_params.DATA_SUBSET_OPTION == "True":
    df1 = df1.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose").apply(
        lambda x: x.sample(n=mlp_params.DATA_SUBSET_NUMBER, random_state=0)
    )
    print("Data Subset Is On")
    print(f"Data is subset to {mlp_params.DATA_SUBSET_NUMBER} per treatment group")
    print(df1.shape)
    df1.reset_index(drop=True, inplace=True)
else:
    print("Data Subset Is Off")

Data Subset Is Off


In [8]:
# add apoptosis, pyroptosis and healthy columns to dataframe
df1["apoptosis"] = df1["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
    apoptosis_groups_list
)
df1["pyroptosis"] = df1["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
    pyroptosis_groups_list
)
df1["healthy"] = df1["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"].isin(
    healthy_groups_list
)

# merge apoptosis, pyroptosis, and healthy columns into one column
conditions = [
    (df1["apoptosis"] == True),
    (df1["pyroptosis"] == True),
    (df1["healthy"] == True),
]
choices = ["apoptosis", "pyroptosis", "healthy"]
df1["labels"] = np.select(conditions, choices, default="healthy")

# drop apoptosis, pyroptosis, and healthy columns
df1.drop(columns=["apoptosis", "pyroptosis", "healthy"], inplace=True)

### Split said data

In [9]:
# randomly select wells to hold out for testing one per treatment group
# stratified by treatment group
np.random.seed(seed=0)
wells_to_hold = (
    df1.groupby("oneb_Metadata_Treatment_Dose_Inhibitor_Dose")
    .agg(np.random.choice)["Metadata_Well"]
    .to_list()
)
df_test = df1[df1["Metadata_Well"].isin(wells_to_hold)]
df_train = df1[~df1["Metadata_Well"].isin(wells_to_hold)]


print("Wells held out for testing:", df_test["Metadata_Well"].unique())
print("Wells to use for training and validation", df1["Metadata_Well"].unique())
print(df_test.shape, df_train.shape)

Wells held out for testing: ['B19' 'B20' 'C16' 'C19' 'D16' 'D18' 'E13' 'E14' 'E16' 'E23' 'F13' 'F20'
 'F23' 'G17' 'G23' 'H15' 'H19' 'H20' 'I14' 'I21' 'I22' 'J18' 'J21' 'K15'
 'K20' 'K23' 'L17' 'L23' 'M14' 'M16' 'M19' 'N15' 'N17' 'N23' 'O13' 'O17'
 'O22']
Wells to use for training and validation ['B13' 'B14' 'B15' 'B16' 'B17' 'B18' 'B19' 'B20' 'B21' 'B22' 'B23' 'C13'
 'C14' 'C15' 'C16' 'C17' 'C18' 'C19' 'C20' 'C21' 'C22' 'C23' 'D13' 'D14'
 'D15' 'D16' 'D17' 'D18' 'D19' 'D20' 'D21' 'D22' 'D23' 'E13' 'E14' 'E15'
 'E16' 'E17' 'E18' 'E19' 'E20' 'E21' 'E22' 'E23' 'F13' 'F14' 'F15' 'F16'
 'F17' 'F18' 'F19' 'F20' 'F21' 'F22' 'F23' 'G13' 'G14' 'G15' 'G16' 'G17'
 'G18' 'G19' 'G20' 'G21' 'G22' 'G23' 'H13' 'H14' 'H15' 'H16' 'H17' 'H18'
 'H19' 'H20' 'H21' 'H22' 'H23' 'I13' 'I14' 'I15' 'I16' 'I17' 'I18' 'I19'
 'I20' 'I21' 'I22' 'I23' 'J13' 'J14' 'J15' 'J16' 'J17' 'J18' 'J19' 'J20'
 'J21' 'J22' 'J23' 'K13' 'K14' 'K15' 'K16' 'K17' 'K18' 'K19' 'K20' 'K21'
 'K22' 'K23' 'L13' 'L14' 'L15' 'L16' 'L17' 'L18

In [10]:
df_train, df_val = train_test_split(
    df_train, test_size=0.2, random_state=0, stratify=df_train["labels"]
)

In [11]:
print(
    f"""
    Testing set length: {len(df_test)}\n
    Training set length: {len(df_train)}\n
    Validation set length: {len(df_val)}\n
    Added set length: {len(df_test) + len(df_train)}
    Total actual set length: {len(df1)}
"""
)


    Testing set length: 37

    Training set length: 93

    Validation set length: 24

    Added set length: 130
    Total actual set length: 154



In [12]:
# get the indexes for the training and testing sets

training_data_set_index = df_train.index
testing_data_set_index = df_test.index
validation_data_set_index = df_val.index


assert len(training_data_set_index) + len(validation_data_set_index) + len(
    testing_data_set_index
) == len(df1)

In [13]:
print(
    training_data_set_index.shape,
    validation_data_set_index.shape,
    testing_data_set_index.shape,
)
print(
    training_data_set_index.shape[0]
    + validation_data_set_index.shape[0]
    + testing_data_set_index.shape[0]
)

(93,) (24,) (37,)
154


In [14]:
# create pandas dataframe with all indexes and their respective labels, stratified by phenotypic class
index_data = []
for index in training_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "train"})

for index in testing_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "test"})
for index in validation_data_set_index:
    index_data.append({"labeled_data_index": index, "label": "val"})


# make index data a dataframe and sort it by labeled data index
index_data = pd.DataFrame(index_data)
index_data

Unnamed: 0,labeled_data_index,label
0,12,train
1,50,train
2,61,train
3,60,train
4,62,train
...,...,...
149,90,val
150,4,val
151,69,val
152,21,val


In [15]:
index_data["label"].unique()

array(['train', 'test', 'val'], dtype=object)

In [16]:
save_path = pathlib.Path(f"../indexes/{CELL_TYPE}/multi_class/")

print(save_path)
# create save path if it doesn't exist
save_path.mkdir(parents=True, exist_ok=True)
# save indexes as tsv file
index_data.to_csv(
    f"{save_path}/{params.CELL_TYPE}_data_split_indexes.tsv", sep="\t", index=False
)

../indexes/SHSY5Y/multi_class


In [17]:
# get the class weights for the loss function to account for class imbalance
# get the number of samples in each class
targets, counts = np.unique(df1["labels"], return_counts=True)
print(targets, counts)
total_counts = np.sum(counts)
# get the class weights
class_weights = []
class_targets = []
for class_name in enumerate(targets):
    class_targets.append(class_name[1])
for count in enumerate(counts):
    class_weights.append(1 - (count[1] / total_counts))
print(class_targets, class_weights)
# write the class weights to a file for use in the model
class_weights_file = pathlib.Path(f"../class_weights/{CELL_TYPE}/multi_class/")
class_weights_file.mkdir(parents=True, exist_ok=True)
class_targets_dicts = {
    class_targets[i]: class_weights[i] for i in range(len(class_targets))
}
# write the file to json
class_weights_file = class_weights_file / "class_weights.json"
with open(class_weights_file, "w") as file:
    json.dump(class_targets_dicts, file)

['apoptosis' 'healthy' 'pyroptosis'] [ 8 74 72]
['apoptosis', 'healthy', 'pyroptosis'] [0.948051948051948, 0.5194805194805194, 0.5324675324675325]
