In [39]:
import pathlib
import warnings
from typing import List, Tuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.multioutput import MultiOutputRegressor

In [40]:
def shuffle_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Shuffle the data in the DataFrame.
    """
    df_shuffled = df.copy()
    for col in df_shuffled.columns:
        # permute the columns
        df_shuffled[col] = np.random.permutation(df_shuffled[col])
    return df_shuffled

In [41]:
# read in the data
sc_file_path = pathlib.Path("../results/cleaned_sc_profile.parquet").resolve(
    strict=True
)
sc_endpoint_file_path = pathlib.Path(
    "../results/cleaned_endpoint_sc_profile.parquet"
).resolve(strict=True)

train_test_wells_file_path = pathlib.Path(
    "../../5.bulk_timelapse_model/data_splits/train_test_wells.parquet"
).resolve(strict=True)
model_dir = pathlib.Path("../models").resolve()
model_dir.mkdir(parents=True, exist_ok=True)
sc_profile = pd.read_parquet(sc_file_path)
sc_endpoint_profile = pd.read_parquet(sc_endpoint_file_path)
train_test_wells = pd.read_parquet(train_test_wells_file_path)
print(f"sc_profile shape: {sc_profile.shape}")
print(f"sc_endpoint_profile shape: {sc_endpoint_profile.shape}")
data_split_file_path = pathlib.Path("../results/data_splits.parquet").resolve()
data_splits = pd.read_parquet(data_split_file_path)

sc_profile shape: (188065, 2381)
sc_endpoint_profile shape: (5767, 545)


In [42]:
# get the training data
training_df_X = sc_profile.iloc[
    data_splits["index"][data_splits["data_split"] == "train"]
]
training_df_y = sc_endpoint_profile.loc[
    sc_endpoint_profile["Metadata_sc_unique_track_id"].isin(
        training_df_X["Metadata_sc_unique_track_id"]
    )
]
assert (
    training_df_X["Metadata_sc_unique_track_id"].nunique()
    == training_df_y["Metadata_sc_unique_track_id"].nunique()
)
training_df_X_shuffled = training_df_X.copy()
training_df_X_shuffled = shuffle_data(training_df_X_shuffled)

In [49]:
train_x_metadata = [x for x in training_df_X.columns if "Metadata" in x]
train_y_metadata = [y for y in training_df_y.columns if "Metadata" in y]
training_X_features = [x for x in training_df_X.columns if x not in train_x_metadata]
training_y_features = [y for y in training_df_y.columns if y not in train_y_metadata]

train_x_shuffled_metadata = [
    x for x in training_df_X_shuffled.columns if "Metadata" in x
]
train_y_shuffled_metadata = [y for y in training_df_y.columns if "Metadata" in y]
train_x_shuffled_features = [
    x for x in training_df_X_shuffled.columns if x not in train_x_shuffled_metadata
]

train_df_x_metadata = training_df_X[train_x_metadata]
train_df_y_metadata = training_df_y[train_y_metadata]
train_df_x_features = training_df_X[training_X_features]
train_df_y_features = training_df_y[training_y_features]
train_df_x_shuffled_metadata = training_df_X_shuffled[train_x_shuffled_metadata]
train_df_x_shuffled_features = training_df_X_shuffled[train_x_shuffled_features]

In [50]:
annexin_feature = "Cytoplasm_Intensity_IntegratedIntensity_AnnexinV"

In [51]:
# Define cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=0)  # 5-fold cross-validation
# elastic net parameters
elastic_net_params = {
    "alpha": [0.1, 1.0, 10.0, 100.0, 1000.0],  # Regularization strength
    "l1_ratio": [0.1, 0.25, 0.5, 0.75, 1.0],  # l1_ratio = 1.0 is Lasso
    "max_iter": 10000,  # Increase max_iter for convergence
}
elastic_net_all_annexinv_features_model = MultiOutputRegressor(
    ElasticNetCV(
        alphas=elastic_net_params["alpha"],
        l1_ratio=elastic_net_params["l1_ratio"],
        cv=cv,
        random_state=0,
        max_iter=elastic_net_params["max_iter"],
    )
)
elastic_net_all_annexinv_features_model_shuffled = (
    elastic_net_all_annexinv_features_model
)
elastic_net_single_terminal_features_model = ElasticNetCV(
    alphas=elastic_net_params["alpha"],
    l1_ratio=elastic_net_params["l1_ratio"],
    cv=cv,
    random_state=0,
    max_iter=elastic_net_params["max_iter"],
)
elastic_net_single_terminal_features_model_shuffled = (
    elastic_net_single_terminal_features_model
)

In [53]:
dict_of_train_tests = {
    "single_feature": {
        "train": {
            "X": train_df_x_features,
            "y": train_df_y_features[annexin_feature],
            "x_metadata": train_df_x_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_single_terminal_features_model,
            "model_name": "elastic_net_single_terminal_features_model",
        },
        "train_shuffled": {
            "X": train_df_x_shuffled_features,
            "y": train_df_y_features[annexin_feature],
            "x_metadata": train_df_x_shuffled_metadata,
            "y_metadata": train_df_y_features,
            "model": elastic_net_single_terminal_features_model_shuffled,
            "model_name": "elastic_net_single_terminal_features_model_shuffled",
        },
    },
    "annexinV_features": {
        "train": {
            "X": train_df_x_features,
            "y": train_df_y_features,
            "x_metadata": train_df_x_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_all_annexinv_features_model,
            "model_name": "elastic_net_all_annexinv_features_model",
        },
        "train_shuffled": {
            "X": train_df_x_shuffled_features,
            "y": train_df_y_features,
            "x_metadata": train_df_x_shuffled_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_all_annexinv_features_model_shuffled,
            "model_name": "elastic_net_all_annexinv_features_model_shuffled",
        },
    },
}

In [54]:
# train the model
for model_type in dict_of_train_tests.keys():
    for train_test_key, train_test_data in tqdm.tqdm(
        dict_of_train_tests[model_type].items()
    ):
        if "test" in train_test_key:
            print(f"Skipping {train_test_key} as it is a test set.")
            continue
        print(f"Training model for {train_test_key}...{model_type}")
        X = train_test_data["X"]
        y = train_test_data["y"]
        metadata = train_test_data["metadata"]
        print(
            f"X shape: {X.shape}, y shape: {y.shape}, metadata shape: {metadata.shape}"
        )
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            train_test_data["model"].fit(X, y)

        # save the model
        model_path = (
            model_dir / f"{train_test_key}_{train_test_data['model_name']}.joblib"
        )
        joblib.dump(train_test_data["model"], model_path)
        dict_of_train_tests[model_type][train_test_key]["model_path"] = model_path

  0%|          | 0/2 [00:00<?, ?it/s]

Training model for train...single_feature





KeyError: 'metadata'

In [None]:
# test the model
for model_type in dict_of_train_tests.keys():
    for train_test_key, train_test_data in tqdm.tqdm(
        dict_of_train_tests[model_type].items()
    ):
        if "train" in train_test_key:
            print(f"Skipping {train_test_key} as it is a training set.")
            continue
        print(model_type, train_test_key)
        X = train_test_data["X"]
        y = train_test_data["y"]
        metadata = train_test_data["metadata"]
        if "shuffled" in train_test_key:
            model_path = dict_of_train_tests[model_type]["train_shuffled"]["model_path"]
        else:
            model_path = dict_of_train_tests[model_type]["train"]["model_path"]

        # load the model
        model = joblib.load(model_path)

        # make predictions
        y_pred = model.predict(X)
        if model_type == "single_feature":
            model.alpha_
            model.l1_ratio_
        else:

            alphas = model.estimators_[0].alpha_
            l1_ratios = model.estimators_[0].l1_ratio_
            print(f"Model parameters for {train_test_key}:")
            print(f"Alphas: {alphas}, L1 Ratios: {l1_ratios}")

        # calculate metrics
        metrics = {
            "explained_variance": explained_variance_score(y, y_pred),
            "mean_absolute_error": mean_absolute_error(y, y_pred),
            "mean_squared_error": mean_squared_error(y, y_pred),
            "r2_score": r2_score(y, y_pred),
        }

100%|██████████| 4/4 [00:00<00:00, 129.40it/s]


Skipping train as it is a training set.
Skipping train_shuffled as it is a training set.
single_feature test
single_feature test_shuffled


  0%|          | 0/4 [00:00<?, ?it/s]

Skipping train as it is a training set.
Skipping train_shuffled as it is a training set.
annexinV_features test


 75%|███████▌  | 3/4 [00:02<00:00,  1.10it/s]

Model parameters for test:
Alphas: 0.1, L1 Ratios: 0.1
annexinV_features test_shuffled


100%|██████████| 4/4 [00:13<00:00,  3.34s/it]

Model parameters for test_shuffled:
Alphas: 1000.0, L1 Ratios: 0.1



