In [1]:
import pathlib
import warnings
from typing import List, Tuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.multioutput import MultiOutputRegressor

## Import the data 

In [2]:
# load the training data
train_data_file_path = pathlib.Path("../data_splits/train.parquet").resolve(strict=True)
test_data_file_path = pathlib.Path("../data_splits/test.parquet").resolve(strict=True)
model_dir = pathlib.Path("../models/").resolve()
model_dir.mkdir(parents=True, exist_ok=True)
results_dir = pathlib.Path("../results/").resolve()
results_dir.mkdir(parents=True, exist_ok=True)
train_df = pd.read_parquet(train_data_file_path)
test_df = pd.read_parquet(test_data_file_path)
train_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Metadata_number_of_singlecells,Metadata_apoptosis_ground_truth,Metadata_plate,Metadata_compound,Metadata_control,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,...,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_00_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_01_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_02_256,Terminal_Nuclei_Texture_Correlation_AnnexinV_3_03_256,Terminal_Nuclei_Texture_Correlation_DNA_3_02_256,Terminal_Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Terminal_Nuclei_Texture_InverseDifferenceMoment_DNA_3_01_256,Terminal_Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Terminal_Nuclei_Texture_SumAverage_DNA_3_00_256
0,C-02,0.0,154.0,control,1,Staurosporine,negative,0.035033,0.554286,-0.355493,...,-0.4321,-0.587543,-0.440194,-0.486455,0.670119,-0.103772,0.463732,0.521993,-0.073682,0.416877
1,C-03,0.61,167.0,negative,1,Staurosporine,test,-0.370743,0.297396,0.328665,...,-0.739634,-0.648602,-0.791632,-0.665333,0.670119,-0.526875,0.332559,0.521993,-0.121325,0.416877
2,C-05,2.44,164.0,negative,1,Staurosporine,test,-0.540085,0.050681,0.477738,...,-0.458198,-0.648602,-0.647667,-0.665333,0.670119,-0.588308,0.165274,0.521993,-0.104207,0.416877
3,C-08,19.53,133.0,negative,1,Staurosporine,test,-0.946101,-0.547828,1.173377,...,-0.5842,-0.580215,-0.466295,-0.665333,0.670119,0.048387,0.436127,0.521993,-0.099644,0.416877
4,C-09,39.06,88.0,positive,1,Staurosporine,positive,-0.983683,-0.845926,1.188321,...,-0.223819,-0.343047,-0.069238,-0.485761,-0.68228,0.087037,0.501674,0.11508,0.019048,1.008354


In [3]:
metadata_columns = [x for x in train_df.columns if "Metadata" in x]
terminal_columns = [
    x for x in train_df.columns if "Terminal" in x and "Metadata" not in x
]


def shuffle_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Shuffle the data in the DataFrame.
    """
    df_shuffled = df.copy()
    for col in df_shuffled.columns:
        # permute the columns
        df_shuffled[col] = np.random.permutation(df_shuffled[col])
    return df_shuffled


def x_y_data_separator(
    df: pd.DataFrame,
    y_columns: list,
    metadata_columns: list,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Separate the data into X, y, and metadata.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to separate. ASSUMPTION:
            The metadata columns contain the string "Metadata" and the y columns contain the string "Terminal".
            The column names are passed in as lists.
    y_columns : list
        The y columns to separate.
    metadata_columns : list
        The metadata columns to separate.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
        Three DataFrames: X, y, and metadata.
    """
    metadata = df[metadata_columns]
    df.drop(columns=metadata_columns, inplace=True)
    X = df.drop(columns=y_columns)
    y = df[y_columns]
    return X, y, metadata


shuffled_train_df = train_df.copy()
shuffled_train_df = shuffle_data(shuffled_train_df)
shuffled_test_df = test_df.copy()
shuffled_test_df = shuffle_data(shuffled_test_df)

# split the data into train and test sets
# train
(train_X, train_y, train_metadata) = x_y_data_separator(
    df=train_df, y_columns=terminal_columns, metadata_columns=metadata_columns
)
(train_shuffled_X, train_shuffled_y, train_metadata_shuffled) = x_y_data_separator(
    df=shuffled_train_df, y_columns=terminal_columns, metadata_columns=metadata_columns
)

# test
(test_X, test_y, test_metadata) = x_y_data_separator(
    df=test_df, y_columns=terminal_columns, metadata_columns=metadata_columns
)
(test_shuffled_X, test_shuffled_y, test_metadata_shuffled) = x_y_data_separator(
    df=shuffled_test_df, y_columns=terminal_columns, metadata_columns=metadata_columns
)


# check the shape of the data
print(f"train_X shape: {train_X.shape}, train_y shape: {train_y.shape}")
print(
    f"train_shuffled_X shape: {train_shuffled_X.shape}, train_shuffled_y shape: {train_shuffled_y.shape}"
)

print(f"test_X shape: {test_X.shape}, test_y shape: {test_y.shape}")
print(
    f"test_shuffled_X shape: {test_shuffled_X.shape}, test_shuffled_y shape: {test_shuffled_y.shape}"
)

train_X shape: (19, 2389), train_y shape: (19, 517)
train_shuffled_X shape: (19, 2389), train_shuffled_y shape: (19, 517)
test_X shape: (10, 2389), test_y shape: (10, 517)
test_shuffled_X shape: (10, 2389), test_shuffled_y shape: (10, 517)


In [4]:
model_features = [
    "Terminal_Cytoplasm_Intensity_MaxIntensity_AnnexinV",
    "Terminal_Cells_Intensity_MaxIntensityEdge_AnnexinV",
]

In [5]:
dict_of_train_tests = {
    "train": {
        "X": train_X,
        "y": train_y,
        "metadata": train_metadata,
    },
    "train_shuffled": {
        "X": train_shuffled_X,
        "y": train_shuffled_y,
        "metadata": train_metadata_shuffled,
    },
    "test": {
        "X": test_X,
        "y": test_y,
        "metadata": test_metadata,
    },
    "test_shuffled": {
        "X": test_shuffled_X,
        "y": test_shuffled_y,
        "metadata": test_metadata_shuffled,
    },
}

## Model training

In [6]:
# Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=0)  # 5-fold cross-validation
# elastic net parameters
elastic_net_params = {
    "alpha": [0.1, 1.0, 10.0, 100.0, 1000.0],  # Regularization strength
    "l1_ratio": [0.1, 0.25, 0.5, 0.75, 0.9, 1.0],  # l1_ratio = 1.0 is Lasso
    "max_iter": 10000,  # Increase max_iter for convergence
}
elastic_net_all_terminal_features_model = MultiOutputRegressor(
    ElasticNetCV(
        alphas=elastic_net_params["alpha"],
        l1_ratio=elastic_net_params["l1_ratio"],
        cv=cv,
        random_state=0,
        max_iter=elastic_net_params["max_iter"],
    )
)

elastic_net_single_terminal_features_model = ElasticNetCV(
    alphas=elastic_net_params["alpha"],
    l1_ratio=elastic_net_params["l1_ratio"],
    cv=cv,
    random_state=0,
    max_iter=elastic_net_params["max_iter"],
)

# train the model
for train_test_key, train_test_data in tqdm.tqdm(dict_of_train_tests.items()):
    if "test" in train_test_key:
        print(f"Skipping {train_test_key} as it is a test set.")
        continue
    X = train_test_data["X"]
    y = train_test_data["y"]
    metadata = train_test_data["metadata"]

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        elastic_net_all_terminal_features_model.fit(X, y)

    # save the model
    model_path = (
        model_dir / f"{train_test_key}_elastic_net_model_all_terminal_features.joblib"
    )
    joblib.dump(elastic_net_all_terminal_features_model, model_path)
    dict_of_train_tests[train_test_key]["model_path"] = model_path

    for single_feature in model_features:
        # Fit the model with a single terminal feature
        y_single_feature = y[[single_feature]]

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            elastic_net_single_terminal_features_model.fit(X, y_single_feature)

        # Save the model
        single_feature_model_path = (
            model_dir
            / f"{train_test_key}_elastic_net_model_singlefeature_{single_feature}.joblib"
        )
        joblib.dump(
            elastic_net_single_terminal_features_model, single_feature_model_path
        )
        dict_of_train_tests[train_test_key][
            f"model_path_{single_feature}"
        ] = single_feature_model_path

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
100%|██████████| 4/4 [04:57<00:00, 74.30s/it] 

Skipping test as it is a test set.
Skipping test_shuffled as it is a test set.





In [7]:
train_test_data["X"]

Unnamed: 0,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Eccentricity_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Orientation_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,Cells_AreaShape_Zernike_2_0_CP,Cells_AreaShape_Zernike_2_2_CP,Cells_AreaShape_Zernike_3_1_CP,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,-0.029311,0.368518,1.032952,-0.505968,-0.153106,0.6493,-0.379808,0.613683,0.359134,-0.071198,...,-0.06909,0.140661,0.158417,0.244612,-0.618422,-0.130119,0.052955,0.244138,-0.201412,0.051745
1,-0.154453,0.46039,0.908984,-0.441659,-0.037214,0.134407,-0.333315,0.714689,0.635781,-0.99347,...,-0.055269,0.271085,0.263656,0.43219,-0.739533,0.225114,-0.438132,0.353043,0.063429,-0.045407
2,-1.095522,0.434519,-0.025407,-0.848466,0.107389,0.735313,-0.558056,0.185645,-0.007981,-1.018237,...,-0.302253,0.16266,0.244134,0.357114,-0.752717,0.263188,-0.543274,0.26162,-0.032073,0.048291
3,-0.840086,0.122046,1.371185,-1.427888,-0.005201,0.675363,-0.356183,0.138672,0.566413,-0.329281,...,-0.061838,0.088992,-0.239292,0.23021,-0.879316,0.255927,0.082148,0.28011,0.296713,0.011998
4,-0.91413,-1.124097,-0.086369,0.288439,0.083404,0.457687,-0.419879,0.333136,0.788437,-0.792329,...,-0.113154,0.12943,0.431402,0.288943,-0.554971,-0.246784,0.113832,0.198005,0.018578,0.417572
5,-0.991112,-0.496237,1.132921,0.157144,0.114886,0.768676,-0.411011,0.30275,0.549401,-0.442568,...,-0.048388,0.108033,-0.060511,0.18103,1.537154,0.20147,0.087626,-0.029892,0.072032,-0.057601
6,-1.098514,-0.967699,0.282078,0.088974,-0.303916,0.802432,-0.223932,0.096589,-0.144728,-0.721924,...,-0.2148,-0.07412,0.388052,0.214409,1.107605,0.438399,0.175131,0.316955,-0.243856,0.157681
7,-0.239434,-0.335576,0.960856,-2.046775,0.002362,0.475614,-0.306384,0.449131,0.19576,-0.296414,...,-1.19613,1.062278,0.367917,0.228027,-0.540242,0.254555,0.048784,0.07716,0.038163,0.052481
8,-0.853027,-1.224798,1.405449,-2.19959,-0.162683,0.638544,-0.187887,0.266551,0.688064,-0.310619,...,-0.686129,0.087457,-0.068446,0.518685,0.336981,0.318545,0.078862,0.36965,-0.007682,-0.031621
9,-0.447601,-0.32812,0.366804,0.215604,0.336271,0.296295,-0.480925,0.610814,0.515855,-1.037232,...,-0.166547,1.127019,0.451985,0.029158,-0.666231,0.232722,0.043801,-0.140186,0.198029,0.229739


In [8]:
# test the model
for train_test_key, train_test_data in tqdm.tqdm(dict_of_train_tests.items()):
    if "train" in train_test_key:
        print(f"Skipping {train_test_key} as it is a training set.")
        continue
    X = train_test_data["X"]
    y = train_test_data["y"]
    metadata = train_test_data["metadata"]
    if "shuffled" in train_test_key:
        model_path = dict_of_train_tests["train_shuffled"]["model_path"]
    else:
        model_path = dict_of_train_tests["train"]["model_path"]

    # load the model
    model = joblib.load(model_path)

    # make predictions
    y_pred = model.predict(X)

    alphas = model.estimators_[0].alpha_
    l1_ratios = model.estimators_[0].l1_ratio_
    print(f"Model parameters for {train_test_key}:")
    print(f"Alphas: {alphas}, L1 Ratios: {l1_ratios}")

    # calculate metrics
    metrics = {
        "explained_variance": explained_variance_score(y, y_pred),
        "mean_absolute_error": mean_absolute_error(y, y_pred),
        "mean_squared_error": mean_squared_error(y, y_pred),
        "r2_score": r2_score(y, y_pred),
    }

  0%|          | 0/4 [00:00<?, ?it/s]

Skipping train as it is a training set.
Skipping train_shuffled as it is a training set.


 75%|███████▌  | 3/4 [00:02<00:00,  1.10it/s]

Model parameters for test:
Alphas: 0.1, L1 Ratios: 0.1


100%|██████████| 4/4 [00:07<00:00,  1.97s/it]

Model parameters for test_shuffled:
Alphas: 1.0, L1 Ratios: 0.5





In [9]:
terminal_columns_file_path = results_dir / "terminal_columns.txt"
with open(terminal_columns_file_path, "w") as f:
    for col in terminal_columns:
        f.write(f"{col}\n")