In [1]:
import pathlib
from typing import List, Tuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

## Load the data

In [2]:
# Resolve the train/test parquet splits. strict=True makes resolution fail
# immediately if either file does not exist.
train_data_file_path = pathlib.Path("../data_splits/train.parquet").resolve(strict=True)
test_data_file_path = pathlib.Path("../data_splits/test.parquet").resolve(strict=True)

# Output locations for fitted models and evaluation results; created if absent.
model_dir = pathlib.Path("../models/").resolve()
results_dir = pathlib.Path("../results/").resolve()
for output_dir in (model_dir, results_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Read both splits, then preview the training frame as the cell output.
train_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (train_data_file_path, test_data_file_path)
)
train_df.head()

Unnamed: 0,Metadata_Well,Metadata_dose,Cells_AreaShape_BoundingBoxArea_CP,Metadata_apoptosis_ground_truth,Cells_AreaShape_Compactness_CP,Cells_AreaShape_Extent_CP,Cells_AreaShape_FormFactor_CP,Cells_AreaShape_MinorAxisLength_CP,Cells_AreaShape_Solidity_CP,Cells_AreaShape_Zernike_1_1_CP,...,Terminal_Intensity_MADIntensity_AnnexinV,Terminal_Intensity_MADIntensity_DNA,Terminal_Intensity_MaxIntensity_AnnexinV,Terminal_Intensity_MaxIntensity_DNA,Terminal_Intensity_MeanIntensity_AnnexinV,Terminal_Intensity_MeanIntensity_DNA,Terminal_Intensity_StdIntensity_AnnexinV,Terminal_Intensity_StdIntensity_DNA,Terminal_Intensity_UpperQuartileIntensity_AnnexinV,Terminal_Intensity_UpperQuartileIntensity_DNA
0,C-03,0.61,0.151874,negative,-0.375467,0.298199,0.231699,0.548092,0.458562,-0.351042,...,-0.151765,0.0,-0.453394,1.025694,-1.044321,0.626815,-1.099832,0.270661,-1.230912,0.901388
1,C-05,2.44,0.060478,negative,-0.471116,0.378293,0.38047,0.469752,0.4099,-0.187871,...,-1.138238,0.0,-0.712917,0.484793,-1.784165,-0.594062,-0.768595,0.250849,-1.685941,0.0
2,C-06,4.88,-0.017911,negative,-0.519354,0.420167,0.459129,0.463801,0.42887,-0.12883,...,-0.645001,0.0,-0.739316,0.757975,-0.777688,0.087873,-0.609561,0.862966,-1.079236,0.0
3,C-07,9.77,-0.371168,negative,-0.629097,0.656105,0.64796,0.227454,0.527867,-0.197815,...,-0.89162,0.0,-0.881986,-0.103914,-0.953797,0.808517,-0.614739,1.012193,-1.989294,0.0
4,C-10,78.13,-1.254695,positive,-0.853136,0.927698,1.082197,-1.138856,0.537082,-0.439466,...,-5.659572,0.0,-0.514538,1.085794,1.279305,1.296806,1.934838,6.683572,0.740881,-1.802776


In [3]:
# Columns are grouped by a substring of their name: "Metadata" columns are set
# aside from the model inputs, "Terminal" columns are the regression targets.
metadata_columns = [
    col_name for col_name in train_df.columns if "Metadata" in col_name
]
terminal_columns = [
    col_name for col_name in train_df.columns if "Terminal" in col_name
]


def x_y_data_separator(
    df: pd.DataFrame,
    y_columns: list,
    metadata_columns: list,
    random_state=None,
) -> Tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
]:
    """Split a frame into features, targets and metadata, plus a shuffled copy.

    The shuffled copy is built by permuting every column of ``df``
    independently, which breaks the row-wise relationship between columns
    while keeping each column's set of values.

    The input ``df`` is left untouched, so the call can be repeated safely
    (for example when a notebook cell is re-run).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing metadata, feature and target columns.
    y_columns : list
        Names of the target columns.
    metadata_columns : list
        Names of the metadata columns, excluded from both X and y.
    random_state : int, optional
        Seed for the column permutations. When ``None`` (default) the global
        NumPy random state is used, so ``np.random.seed`` still controls the
        result.

    Returns
    -------
    tuple of six pd.DataFrame
        ``(X, y, metadata, X_shuffled, y_shuffled, metadata_shuffled)``.
    """
    if random_state is None:
        # Keep using the global NumPy RNG so external seeding keeps working.
        permute = np.random.permutation
    else:
        permute = np.random.default_rng(random_state).permutation

    df_shuffled = df.copy()
    for col in df_shuffled.columns:
        # Permute each column on its own; the index of df_shuffled is kept.
        df_shuffled[col] = permute(df_shuffled[col].to_numpy())

    def _split(frame: pd.DataFrame):
        """Return (X, y, metadata) for one frame without modifying it."""
        metadata_part = frame[metadata_columns]
        non_metadata = frame.drop(columns=metadata_columns)
        return (
            non_metadata.drop(columns=y_columns),
            non_metadata[y_columns],
            metadata_part,
        )

    X, y, metadata = _split(df)
    X_shuffled, y_shuffled, metadata_shuffled = _split(df_shuffled)
    return X, y, metadata, X_shuffled, y_shuffled, metadata_shuffled


# Seed NumPy's global RNG so the column permutations performed inside
# x_y_data_separator (and therefore the shuffled baseline) are reproducible
# across kernel restarts.
np.random.seed(0)

# Split the training frame into real and column-shuffled features/targets/metadata.
(
    train_X,
    train_y,
    train_metadata,
    train_shuffled_X,
    train_shuffled_y,
    train_metadata_shuffled,
) = x_y_data_separator(
    df=train_df, y_columns=terminal_columns, metadata_columns=metadata_columns
)

# Same split for the held-out test frame.
(
    test_X,
    test_y,
    test_metadata,
    test_shuffled_X,
    test_shuffled_y,
    test_metadata_shuffled,
) = x_y_data_separator(
    df=test_df, y_columns=terminal_columns, metadata_columns=metadata_columns
)

# check the shape of the data
print(f"train_X shape: {train_X.shape}, train_y shape: {train_y.shape}")
print(
    f"train_shuffled_X shape: {train_shuffled_X.shape}, train_shuffled_y shape: {train_shuffled_y.shape}"
)

print(f"test_X shape: {test_X.shape}, test_y shape: {test_y.shape}")
print(
    f"test_shuffled_X shape: {test_shuffled_X.shape}, test_shuffled_y shape: {test_shuffled_y.shape}"
)

train_X shape: (20, 2338), train_y shape: (20, 11)
train_shuffled_X shape: (20, 2338), train_shuffled_y shape: (20, 11)
test_X shape: (10, 2338), test_y shape: (10, 11)
test_shuffled_X shape: (10, 2338), test_shuffled_y shape: (10, 11)


## Model training

In [4]:
# Both regressors use the same random-forest settings; only the training data
# differs (real vs. column-shuffled).
rf_settings = {"n_estimators": 1000, "random_state": 0}

# Model fitted on the real training features/targets, then saved to disk.
model = MultiOutputRegressor(RandomForestRegressor(**rf_settings))
model.fit(train_X, train_y)
model_file_path = pathlib.Path("../models/multi_regression_model.joblib").resolve()
joblib.dump(model, model_file_path)

# Baseline model fitted on the shuffled training data, saved alongside it.
shuffled_model = MultiOutputRegressor(RandomForestRegressor(**rf_settings))
shuffled_model.fit(train_shuffled_X, train_shuffled_y)
shuffled_model_file_path = pathlib.Path(
    "../models/shuffled_multi_regression_model.joblib"
).resolve()
joblib.dump(shuffled_model, shuffled_model_file_path)

['/home/lippincm/4TB_A/live_cell_timelapse_apoptosis_analysis/5.bulk_timelapse_model/models/shuffled_multi_regression_model.joblib']

## Model evaluation

In [5]:
# Evaluation plan: each split name maps to the features, targets and metadata
# to score, together with the path of the saved model used to score them.
# Shuffled splits are scored with the model trained on shuffled data.
dict_of_train_tests = dict(
    train=dict(
        X=train_X,
        y=train_y,
        metadata=train_metadata,
        model_path=model_file_path,
    ),
    train_shuffled=dict(
        X=train_shuffled_X,
        y=train_shuffled_y,
        metadata=train_metadata_shuffled,
        model_path=shuffled_model_file_path,
    ),
    test=dict(
        X=test_X,
        y=test_y,
        metadata=test_metadata,
        model_path=model_file_path,
    ),
    test_shuffled=dict(
        X=test_shuffled_X,
        y=test_shuffled_y,
        metadata=test_metadata_shuffled,
        model_path=shuffled_model_file_path,
    ),
)

In [6]:
# Score every split with its associated saved model, collecting one summary
# row of metrics and one frame of per-row predictions per split.
output_dict_of_dfs = {"prediction_stats_df": [], "predictions_df": []}
for split, split_data in dict_of_train_tests.items():
    # splits whose name contains "shuffle" are the shuffled baselines
    shuffle = "shuffle" in split
    X = split_data["X"]
    y = split_data["y"]
    metadata = split_data["metadata"]
    # NOTE: joblib.load unpickles the file; only load model files that were
    # written by this notebook.
    model = joblib.load(split_data["model_path"])

    # make predictions for this split
    y_pred = model.predict(X)
    # scalar metrics computed over all target columns at once
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    evs = explained_variance_score(y, y_pred)

    prediction_stats_df = pd.DataFrame(
        {"MAE": [mae], "MSE": [mse], "R2": [r2], "EVS": [evs]}
    )

    prediction_stats_df["data_split"] = split
    prediction_stats_df["shuffled"] = shuffle
    output_dict_of_dfs["prediction_stats_df"].append(prediction_stats_df)

    predictions_df = pd.DataFrame(y_pred, columns=terminal_columns)
    predictions_df.insert(0, "Metadata_data_split", split)
    predictions_df.insert(1, "Metadata_shuffled", shuffle)
    # add the metadata columns to the predictions_df
    for col in metadata.columns:
        # Insert by position, not by index label: predictions_df has a fresh
        # 0..n-1 index, while metadata keeps the index of the split frame.
        # Label alignment would yield NaN/misplaced metadata whenever the two
        # indexes differ; rows of y_pred and metadata correspond positionally.
        predictions_df.insert(2, col, metadata[col].to_numpy())
    output_dict_of_dfs["predictions_df"].append(predictions_df)


# Stack the per-split frames into the final summary and prediction tables.
prediction_stats_df = pd.concat(output_dict_of_dfs["prediction_stats_df"], axis=0)
predictions_df = pd.concat(output_dict_of_dfs["predictions_df"], axis=0)
print(prediction_stats_df.shape)
print(predictions_df.shape)

(4, 6)
(60, 16)


In [8]:
# Destinations for the evaluation outputs, all under results_dir.
prediction_stats_df_file_path = results_dir / "prediction_stats_df.parquet"
predictions_df_file_path = results_dir / "predictions_df_final_timepoint.parquet"
terminal_columns_file_path = results_dir / "terminal_columns.txt"

# Write the per-split metrics and the per-row predictions without the index.
prediction_stats_df.to_parquet(prediction_stats_df_file_path, index=False)
predictions_df.to_parquet(predictions_df_file_path, index=False)

# Record the target column names, one per line.
with open(terminal_columns_file_path, "w") as f:
    f.writelines(f"{col}\n" for col in terminal_columns)