In [1]:
import pathlib
import warnings
from typing import List, Tuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.multioutput import MultiOutputRegressor

In [2]:
def shuffle_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Shuffle the data in the DataFrame.
    """
    df_shuffled = df.copy()
    for col in df_shuffled.columns:
        # permute the columns
        df_shuffled[col] = np.random.permutation(df_shuffled[col])
    return df_shuffled

In [3]:
# read in the data
sc_train_final_time_point_file_path = pathlib.Path(
    "../results/train_sc_profile.parquet"
).resolve()
sc_test_final_time_point_file_path = pathlib.Path(
    "../results/test_sc_profile.parquet"
).resolve()

sc_train_endpoint_file_path = pathlib.Path(
    "../results/train_sc_profile_terminal_time.parquet"
).resolve()
sc_test_endpoint_file_path = pathlib.Path(
    "../results/test_sc_profile_terminal_time.parquet"
).resolve()

sc_train_X_df = pd.read_parquet(sc_train_final_time_point_file_path)
sc_test_X_df = pd.read_parquet(sc_test_final_time_point_file_path)
sc_train_y_df = pd.read_parquet(sc_train_endpoint_file_path)
sc_test_y_df = pd.read_parquet(sc_test_endpoint_file_path)

print(f"Training data shape: {sc_train_X_df.shape}, {sc_train_y_df.shape}")
print(f"Testing data shape: {sc_test_X_df.shape}, {sc_test_y_df.shape}")
print(f"Training data shape: {sc_train_X_df.shape}, {sc_train_y_df.shape}")
print(f"Testing data shape: {sc_test_X_df.shape}, {sc_test_y_df.shape}")

model_dir = pathlib.Path("../models").resolve()
model_dir.mkdir(parents=True, exist_ok=True)
results_dir = pathlib.Path("../results").resolve()
results_dir.mkdir(parents=True, exist_ok=True)

Training data shape: (23720, 2379), (2300, 541)
Testing data shape: (49544, 2379), (4910, 541)
Training data shape: (23720, 2379), (2300, 541)
Testing data shape: (49544, 2379), (4910, 541)


In [4]:
# load models
elastic_net_all_annexinv_features_model_path = pathlib.Path(
    "../models/train_elastic_net_all_annexinv_features_model.joblib"
).resolve(strict=True)
elastic_net_all_annexinv_features_model_shuffled_path = pathlib.Path(
    "../models/train_shuffled_elastic_net_all_annexinv_features_model_shuffled.joblib"
).resolve(strict=True)
elastic_net_single_terminal_features_model_path = pathlib.Path(
    "../models/train_elastic_net_single_terminal_features_model.joblib"
).resolve(strict=True)
elastic_net_single_terminal_features_model_shuffled_path = pathlib.Path(
    "../models/train_shuffled_elastic_net_single_terminal_features_model_shuffled.joblib"
).resolve(strict=True)


elastic_net_all_annexinv_features_model = joblib.load(
    elastic_net_all_annexinv_features_model_path
)
elastic_net_all_annexinv_features_model_shuffled = joblib.load(
    elastic_net_all_annexinv_features_model_shuffled_path
)
elastic_net_single_terminal_features_model = joblib.load(
    elastic_net_single_terminal_features_model_path
)
elastic_net_single_terminal_features_model_shuffled = joblib.load(
    elastic_net_single_terminal_features_model_shuffled_path
)

In [5]:
print(
    f"training_df_X shape: {sc_train_X_df.shape}",
    sc_train_X_df["Metadata_sc_unique_track_id"].nunique(),
)
print(
    f"training_df_y shape: {sc_train_y_df.shape}",
    sc_train_y_df["Metadata_sc_unique_track_id"].nunique(),
)
print(
    f"test_df_X shape: {sc_test_X_df.shape}",
    sc_test_X_df["Metadata_sc_unique_track_id"].nunique(),
)
print(
    f"test_df_y shape: {sc_test_y_df.shape}",
    sc_test_y_df["Metadata_sc_unique_track_id"].nunique(),
)
assert (
    sc_train_X_df["Metadata_sc_unique_track_id"].nunique()
    == sc_train_y_df["Metadata_sc_unique_track_id"].nunique()
)
assert (
    sc_train_X_df["Metadata_sc_unique_track_id"].nunique()
    == sc_train_y_df["Metadata_sc_unique_track_id"].nunique()
)
sc_train_X_df_shuffled = sc_train_X_df.copy()
sc_train_X_df_shuffled = shuffle_data(sc_train_X_df_shuffled)
sc_test_X_df_shuffled = sc_test_X_df.copy()
sc_test_X_df_shuffled = shuffle_data(sc_test_X_df_shuffled)

training_df_X shape: (23720, 2379) 2300
training_df_y shape: (2300, 541) 2300
test_df_X shape: (49544, 2379) 4910
test_df_y shape: (4910, 541) 4910


In [6]:
train_x_metadata = [x for x in sc_train_X_df.columns if "Metadata" in x]
train_y_metadata = [y for y in sc_train_y_df.columns if "Metadata" in y]
train_X_features = [x for x in sc_train_X_df.columns if x not in train_x_metadata]
train_y_features = [y for y in sc_train_y_df.columns if y not in train_y_metadata]

train_x_shuffled_metadata = [
    x for x in sc_train_X_df_shuffled.columns if "Metadata" in x
]
train_y_shuffled_metadata = [y for y in sc_train_y_df.columns if "Metadata" in y]
train_x_shuffled_features = [
    x for x in sc_train_X_df_shuffled.columns if x not in train_x_shuffled_metadata
]

train_df_x_metadata = sc_train_X_df[train_x_metadata]
train_df_y_metadata = sc_train_y_df[train_y_metadata]
train_df_x_features = sc_train_X_df[train_X_features]
train_df_y_features = sc_train_y_df[train_y_features]
train_df_x_shuffled_metadata = sc_train_X_df_shuffled[train_x_shuffled_metadata]
train_df_x_shuffled_features = sc_train_X_df_shuffled[train_x_shuffled_features]

In [7]:
test_x_metadata = [x for x in sc_test_X_df.columns if "Metadata" in x]
test_y_metadata = [y for y in sc_test_y_df.columns if "Metadata" in y]
testing_X_features = [x for x in sc_test_X_df.columns if x not in test_x_metadata]
testing_y_features = [y for y in sc_test_y_df.columns if y not in test_y_metadata]

test_x_shuffled_metadata = [x for x in sc_test_X_df_shuffled.columns if "Metadata" in x]
test_y_shuffled_metadata = [y for y in sc_test_y_df.columns if "Metadata" in y]
test_x_shuffled_features = [
    x for x in sc_test_X_df_shuffled.columns if x not in test_x_shuffled_metadata
]

test_df_x_metadata = sc_test_X_df[test_x_metadata]
test_df_y_metadata = sc_test_y_df[test_y_metadata]
test_df_x_features = sc_test_X_df[testing_X_features]
test_df_y_features = sc_test_y_df[testing_y_features]
test_df_x_shuffled_metadata = sc_test_X_df_shuffled[test_x_shuffled_metadata]
test_df_x_shuffled_features = sc_test_X_df_shuffled[test_x_shuffled_features]

In [8]:
annexin_feature = "Cytoplasm_Intensity_IntegratedIntensity_AnnexinV"

In [9]:
dict_of_train_tests = {
    "single_feature": {
        "train": {
            "X": train_df_x_features.to_numpy(),
            "y": train_df_y_features[annexin_feature].to_numpy(),
            "x_metadata": train_df_x_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_single_terminal_features_model,
            "model_name": "elastic_net_single_terminal_features_model",
            "y_column_names": [annexin_feature],
        },
        "train_shuffled": {
            "X": train_df_x_shuffled_features.to_numpy(),
            "y": train_df_y_features[annexin_feature].to_numpy(),
            "x_metadata": train_df_x_shuffled_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_single_terminal_features_model_shuffled,
            "model_name": "elastic_net_single_terminal_features_model_shuffled",
            "y_column_names": [annexin_feature],
        },
        "test": {
            "X": test_df_x_features.to_numpy(),
            "y": test_df_y_features[annexin_feature].to_numpy(),
            "x_metadata": test_df_x_metadata,
            "y_metadata": test_df_y_metadata,
            "model": elastic_net_single_terminal_features_model,
            "model_name": "elastic_net_single_terminal_features_model",
            "y_column_names": [annexin_feature],
        },
        "test_shuffled": {
            "X": test_df_x_shuffled_features.to_numpy(),
            "y": test_df_y_features[annexin_feature].to_numpy(),
            "x_metadata": test_df_x_shuffled_metadata,
            "y_metadata": test_df_y_metadata,
            "model": elastic_net_single_terminal_features_model_shuffled,
            "model_name": "elastic_net_single_terminal_features_model_shuffled",
            "y_column_names": [annexin_feature],
        },
    },
    "annexinV_features": {
        "train": {
            "X": train_df_x_features.to_numpy(),
            "y": train_df_y_features.to_numpy(),
            "x_metadata": train_df_x_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_all_annexinv_features_model,
            "model_name": "elastic_net_all_annexinv_features_model",
            "y_column_names": train_df_y_features.columns.tolist(),
        },
        "train_shuffled": {
            "X": train_df_x_shuffled_features.to_numpy(),
            "y": train_df_y_features.to_numpy(),
            "x_metadata": train_df_x_shuffled_metadata,
            "y_metadata": train_df_y_metadata,
            "model": elastic_net_all_annexinv_features_model_shuffled,
            "model_name": "elastic_net_all_annexinv_features_model_shuffled",
            "y_column_names": train_df_y_features.columns.tolist(),
        },
        "test": {
            "X": test_df_x_features.to_numpy(),
            "y": test_df_y_features.to_numpy(),
            "x_metadata": test_df_x_metadata,
            "y_metadata": test_df_y_metadata,
            "model": elastic_net_all_annexinv_features_model,
            "model_name": "elastic_net_all_annexinv_features_model",
            "y_column_names": test_df_y_features.columns.tolist(),
        },
        "test_shuffled": {
            "X": test_df_x_shuffled_features.to_numpy(),
            "y": test_df_y_features.to_numpy(),
            "x_metadata": test_df_x_shuffled_metadata,
            "y_metadata": test_df_y_metadata,
            "model": elastic_net_all_annexinv_features_model_shuffled,
            "model_name": "elastic_net_all_annexinv_features_model_shuffled",
            "y_column_names": test_df_y_features.columns.tolist(),
        },
    },
}

In [10]:
prediction_df_list = []
stats_df_list = []
# test the model
for model_type in tqdm.tqdm(
    dict_of_train_tests.keys(), desc="Model types", leave=False
):
    for train_test_key, train_test_data in tqdm.tqdm(
        dict_of_train_tests[model_type].items()
    ):
        print(f"Testing model for {train_test_key}...{model_type}")
        X = train_test_data["X"]
        y = train_test_data["y"]
        x_metadata = dict_of_train_tests[model_type][train_test_key]["x_metadata"]
        y_metadata = dict_of_train_tests[model_type][train_test_key]["y_metadata"]
        # find the number of NaNs
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            warnings.filterwarnings("ignore", category=UserWarning)
            y_pred = dict_of_train_tests[model_type][train_test_key]["model"].predict(X)

        if "shuffle" in train_test_key:
            shuffle = "Shuffled baseline"
        else:
            shuffle = "Model"

        predictions_df = pd.DataFrame(
            y_pred,
            columns=dict_of_train_tests[model_type][train_test_key]["y_column_names"],
        )
        predictions_df.insert(0, "Model_type", model_type)
        predictions_df.insert(1, "Train_test_key", train_test_key)
        predictions_df.insert(2, "Model_name", train_test_data["model_name"])
        predictions_df.insert(3, "dose", x_metadata["Metadata_dose"].values)
        predictions_df.insert(4, "time", x_metadata["Metadata_Time"].values)
        predictions_df.insert(
            5, "sc_unique_track_id", x_metadata["Metadata_sc_unique_track_id"].values
        )
        predictions_df.insert(6, "shuffle", shuffle)

        prediction_df_list.append(predictions_df)

Model types:   0%|          | 0/2 [00:00<?, ?it/s]

Testing model for train...single_feature
Testing model for train_shuffled...single_feature
Testing model for test...single_feature
Testing model for test_shuffled...single_feature


100%|██████████| 4/4 [00:00<00:00, 17.05it/s]
Model types:  50%|█████     | 1/2 [00:00<00:00,  4.23it/s]

Testing model for train...annexinV_features




Testing model for train_shuffled...annexinV_features




Testing model for test...annexinV_features




Testing model for test_shuffled...annexinV_features


100%|██████████| 4/4 [01:58<00:00, 29.55s/it]
                                                          

In [11]:
predictions_df = pd.concat(prediction_df_list, axis=0).reset_index(drop=True)
predictions_df.head()

Unnamed: 0,Model_type,Train_test_key,Model_name,dose,time,sc_unique_track_id,shuffle,Cytoplasm_Intensity_IntegratedIntensity_AnnexinV,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_Compactness,...,Nuclei_Texture_Correlation_AnnexinV_3_00_256,Nuclei_Texture_Correlation_AnnexinV_3_01_256,Nuclei_Texture_Correlation_AnnexinV_3_02_256,Nuclei_Texture_Correlation_AnnexinV_3_03_256,Nuclei_Texture_Correlation_DNA_3_02_256,Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Nuclei_Texture_SumAverage_DNA_3_01_256
0,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_9,Model,0.11835,,,...,,,,,,,,,,
1,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_15,Model,0.006795,,,...,,,,,,,,,,
2,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_26,Model,0.310491,,,...,,,,,,,,,,
3,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_40,Model,0.115992,,,...,,,,,,,,,,
4,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_47,Model,0.072022,,,...,,,,,,,,,,


In [12]:
# save the predictions
predictions_file_path = results_dir / "model_test_predictions_all_time_points.parquet"
predictions_df.to_parquet(predictions_file_path, index=False)

In [13]:
predictions_df.head()

Unnamed: 0,Model_type,Train_test_key,Model_name,dose,time,sc_unique_track_id,shuffle,Cytoplasm_Intensity_IntegratedIntensity_AnnexinV,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_Compactness,...,Nuclei_Texture_Correlation_AnnexinV_3_00_256,Nuclei_Texture_Correlation_AnnexinV_3_01_256,Nuclei_Texture_Correlation_AnnexinV_3_02_256,Nuclei_Texture_Correlation_AnnexinV_3_03_256,Nuclei_Texture_Correlation_DNA_3_02_256,Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Nuclei_Texture_SumAverage_DNA_3_01_256
0,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_9,Model,0.11835,,,...,,,,,,,,,,
1,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_15,Model,0.006795,,,...,,,,,,,,,,
2,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_26,Model,0.310491,,,...,,,,,,,,,,
3,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_40,Model,0.115992,,,...,,,,,,,,,,
4,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_47,Model,0.072022,,,...,,,,,,,,,,


## get approximated statistics for the plots

In [25]:
# File paths
predicted_results_file_path = pathlib.Path(
    "../results/model_test_predictions_all_time_points.parquet"
)
# load the final timepoint actual results
actual_results_file_path = pathlib.Path("../results/cleaned_endpoint_profiles.parquet")

predicted_results = pd.read_parquet(predicted_results_file_path)
actual_results = pd.read_parquet(actual_results_file_path)

# convert time from float to integer
predicted_results["time"] = predicted_results["time"].astype(int)


annexinv_feature = "Cytoplasm_Intensity_IntegratedIntensity_AnnexinV"

# rename some of the columns in the actual results for consistency
actual_results = actual_results.rename(
    columns={
        "Metadata_sc_unique_track_id": "sc_unique_track_id",
        "Metadata_dose": "dose",
        "Metadata_Time": "time",
    }
)

# select columns
columns_to_select = ["sc_unique_track_id", "dose", "time", annexinv_feature]
actual_results = actual_results[columns_to_select]

actual_results["Model_type"] = "single_feature"
actual_results["shuffle"] = "Model"
# assign a train_test_key column based on sc_unique_track_id that are in the predicted_results
actual_results["Train_test_key"] = actual_results["sc_unique_track_id"].apply(
    lambda x: (
        "train"
        if x
        in predicted_results[predicted_results["Train_test_key"] == "train"][
            "sc_unique_track_id"
        ].values
        else (
            "test"
            if x
            in predicted_results[predicted_results["Train_test_key"] == "test"][
                "sc_unique_track_id"
            ].values
            else (
                "train_shuffled"
                if x
                in predicted_results[
                    predicted_results["Train_test_key"] == "train_shuffled"
                ]["sc_unique_track_id"].values
                else "test_shuffled"
            )
        )
    )
)
print(predicted_results["dose"].unique())
print(predicted_results["time"].unique())
print(predicted_results["Model_type"].unique())
print(predicted_results["Train_test_key"].unique())
print(predicted_results["shuffle"].unique())
print(actual_results["Train_test_key"].unique())
print(actual_results["shuffle"].unique())

predicted_results.head()

['39.06' '4.88' '9.77' '0.61' '2.44' '156.25' '19.53' '1.22' '78.13' '0.0']
[ 0  2  3  4  5  6  8  9 10 11 12  1  7]
['single_feature' 'annexinV_features']
['train' 'train_shuffled' 'test' 'test_shuffled']
['Model' 'Shuffled baseline']
['test_shuffled' 'train' 'test']
['Model']


Unnamed: 0,Model_type,Train_test_key,Model_name,dose,time,sc_unique_track_id,shuffle,Cytoplasm_Intensity_IntegratedIntensity_AnnexinV,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_Compactness,...,Nuclei_Texture_Correlation_AnnexinV_3_00_256,Nuclei_Texture_Correlation_AnnexinV_3_01_256,Nuclei_Texture_Correlation_AnnexinV_3_02_256,Nuclei_Texture_Correlation_AnnexinV_3_03_256,Nuclei_Texture_Correlation_DNA_3_02_256,Nuclei_Texture_DifferenceVariance_AnnexinV_3_01_256,Nuclei_Texture_InverseDifferenceMoment_AnnexinV_3_03_256,Nuclei_Texture_InverseDifferenceMoment_DNA_3_03_256,Nuclei_Texture_SumAverage_AnnexinV_3_00_256,Nuclei_Texture_SumAverage_DNA_3_01_256
0,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_9,Model,0.11835,,,...,,,,,,,,,,
1,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_15,Model,0.006795,,,...,,,,,,,,,,
2,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_26,Model,0.310491,,,...,,,,,,,,,,
3,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_40,Model,0.115992,,,...,,,,,,,,,,
4,single_feature,train,elastic_net_single_terminal_features_model,39.06,0,C-09_0002_47,Model,0.072022,,,...,,,,,,,,,,


In [26]:
# copy all of the actual results
shuffled_actual_results = actual_results.copy()
shuffled_actual_results["shuffle"] = "Shuffled baseline"
shuffled_actual_results["Train_test_key"] = shuffled_actual_results[
    "Train_test_key"
].apply(
    lambda x: (
        "train_shuffled" if x == "train" else ("test_shuffled" if x == "test" else x)
    )
)
actual_results = pd.concat([actual_results, shuffled_actual_results], ignore_index=True)

# drop rows in actual_results that do have the sc_unique_track_id in the predicted_results
actual_results = actual_results[
    actual_results["sc_unique_track_id"].isin(predicted_results["sc_unique_track_id"])
]

# More efficient way to add missing columns
predicted_cols = set(predicted_results.columns)
actual_cols = set(actual_results.columns)
missing_cols = predicted_cols - actual_cols

if missing_cols:
    # Create a DataFrame with missing columns filled with NaN
    missing_data = pd.DataFrame(
        {col: [np.nan] * len(actual_results) for col in missing_cols}
    )
    # Concatenate horizontally to add missing columns at once
    actual_results = pd.concat([actual_results, missing_data], axis=1)

# Reorder columns to match
actual_results = actual_results[predicted_results.columns]

# Now concatenate the dataframes
merged_results = pd.concat([predicted_results, actual_results], ignore_index=True)

# replace '0' dose with '0.0' for consistency
merged_results["dose"] = merged_results["dose"].astype(str)
merged_results["dose"] = merged_results["dose"].replace("0", "0.0")

# Create ordered categorical for dose
dose_levels = [
    "0.0",
    "0.61",
    "1.22",
    "2.44",
    "4.88",
    "9.77",
    "19.53",
    "39.06",
    "78.13",
    "156.25",
]
merged_results["dose"] = pd.Categorical(
    merged_results["dose"], categories=dose_levels, ordered=True
)


merged_results["time"] = merged_results["time"] * 30  # minutes

# Select final columns
final_columns = [
    "sc_unique_track_id",
    "dose",
    "time",
    "Model_type",
    "Train_test_key",
    annexinv_feature,
    "shuffle",
]
merged_results = merged_results[final_columns]
merged_results.dropna(how="all", inplace=True)
merged_results

Unnamed: 0,sc_unique_track_id,dose,time,Model_type,Train_test_key,Cytoplasm_Intensity_IntegratedIntensity_AnnexinV,shuffle
0,C-09_0002_9,39.06,0.0,single_feature,train,0.118350,Model
1,C-09_0002_15,39.06,0.0,single_feature,train,0.006795,Model
2,C-09_0002_26,39.06,0.0,single_feature,train,0.310491,Model
3,C-09_0002_40,39.06,0.0,single_feature,train,0.115992,Model
4,C-09_0002_47,39.06,0.0,single_feature,train,0.072022,Model
...,...,...,...,...,...,...,...
309863,E-04_0003_179,1.22,390.0,single_feature,test_shuffled,0.083662,Shuffled baseline
309864,E-04_0003_64,1.22,390.0,single_feature,test_shuffled,-1.227703,Shuffled baseline
309865,E-04_0003_106,1.22,390.0,single_feature,test_shuffled,-0.565594,Shuffled baseline
309866,E-04_0003_148,1.22,390.0,single_feature,test_shuffled,-0.958674,Shuffled baseline


In [27]:
# find the final_timepoint splits based on the sc_unique_track_id

In [28]:
print(merged_results["dose"].unique())
print(merged_results["time"].unique())
print(merged_results["Model_type"].unique())
print(merged_results["Train_test_key"].unique())
print(merged_results["shuffle"].unique())

['39.06', '4.88', '9.77', '0.61', '2.44', '156.25', '19.53', '1.22', '78.13', '0.0']
Categories (10, object): ['0.0' < '0.61' < '1.22' < '2.44' ... '19.53' < '39.06' < '78.13' < '156.25']
[  0.  60.  90. 120. 150. 180. 240. 270. 300. 330. 360.  30. 210. 390.]
['single_feature' 'annexinV_features']
['train' 'train_shuffled' 'test' 'test_shuffled']
['Model' 'Shuffled baseline']


In [None]:
unique_cells = merged_results["sc_unique_track_id"].unique()
# predicted_results = predicted_results.loc[predicted_results['Model_type'] == 'single_feature']
# keep predicted results that contain all time points for each unique cell
# predicted_results = (
#     predicted_results.groupby('sc_unique_track_id')
#     .filter(lambda x: len(x) == predicted_results['time'].nunique())
# )

In [31]:
merged_results.groupby(
    [
        "dose",
        # "time",
        "Model_type",
        "Train_test_key",
        "shuffle",
        "sc_unique_track_id",
    ]
).count()

  merged_results.groupby([


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,Cytoplasm_Intensity_IntegratedIntensity_AnnexinV
dose,Model_type,Train_test_key,shuffle,sc_unique_track_id,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,annexinV_features,test,Model,C-02_0001_10,11,11
0.0,annexinV_features,test,Model,C-02_0001_100,12,12
0.0,annexinV_features,test,Model,C-02_0001_103,5,5
0.0,annexinV_features,test,Model,C-02_0001_104,4,4
0.0,annexinV_features,test,Model,C-02_0001_109,12,12
...,...,...,...,...,...,...
156.25,single_feature,train_shuffled,Shuffled baseline,E-11_0004_73,1,1
156.25,single_feature,train_shuffled,Shuffled baseline,E-11_0004_78,2,2
156.25,single_feature,train_shuffled,Shuffled baseline,E-11_0004_8,2,2
156.25,single_feature,train_shuffled,Shuffled baseline,E-11_0004_9,3,3


In [None]:
# calculate r2 and mse for each timepoint compared to the final timepoint actual results
stats_dict = {
    # "sc_unique_track_id": [],
    "dose": [],
    "time": [],
    "Model_type": [],
    "Train_test_key": [],
    "shuffle": [],
    "r2": [],
    "mse": [],
    "explained_variance": [],
}

for dose in merged_results["dose"].unique():
    print(dose)
    dose_data = merged_results[merged_results["dose"] == dose]
    for shuffle in dose_data["shuffle"].unique():
        shuffle_data = dose_data[dose_data["shuffle"] == shuffle]
        for model_type in shuffle_data["Model_type"].unique():
            print(model_type)
            model_type_data = shuffle_data[shuffle_data["Model_type"] == model_type]
            for train_test_key in model_type_data["Train_test_key"].unique():
                train_test_key_data = model_type_data[
                    model_type_data["Train_test_key"] == train_test_key
                ]
                gt_time = train_test_key_data["time"].max()
                gt_time_data = train_test_key_data[
                    train_test_key_data["time"] == gt_time
                ]
                for time in train_test_key_data["time"].unique():
                    if time == gt_time:
                        continue  # skip the final timepoint
                    time_data = train_test_key_data[train_test_key_data["time"] == time]
                    print(len(gt_time_data), len(time_data))
#                     r2 = r2_score(
#                         gt_time_data[annexinv_feature],
#                         time_data[annexinv_feature]
#                     )
#                     mse = mean_squared_error(
#                         gt_time_data[annexinv_feature],
#                         time_data[annexinv_feature]
#                     )
#                     explained_var = explained_variance_score(
#                         gt_time_data[annexinv_feature],
#                         time_data[annexinv_feature]
#                     )

#                     stats_dict['dose'].append(dose)
#                     stats_dict['time'].append(time)
#                     stats_dict['Model_type'].append(model_type)
#                     stats_dict['Train_test_key'].append(train_test_key)
#                     stats_dict['shuffle'].append(shuffle)
#                     stats_dict['r2'].append(r2)
#                     stats_dict['mse'].append(mse)

#                     stats_dict['explained_variance'].append(explained_var)
# stats_df = pd.DataFrame(stats_dict)
# stats_df.head()

39.06
single_feature
270 163


ValueError: Found input variables with inconsistent numbers of samples: [270, 163]

In [32]:
time_data

Unnamed: 0,sc_unique_track_id,dose,time,Model_type,Train_test_key,Cytoplasm_Intensity_IntegratedIntensity_AnnexinV,shuffle
0,C-09_0002_9,39.06,0.0,single_feature,train,0.118350,Model
1,C-09_0002_15,39.06,0.0,single_feature,train,0.006795,Model
2,C-09_0002_26,39.06,0.0,single_feature,train,0.310491,Model
3,C-09_0002_40,39.06,0.0,single_feature,train,0.115992,Model
4,C-09_0002_47,39.06,0.0,single_feature,train,0.072022,Model
...,...,...,...,...,...,...,...
21277,C-09_0003_126,39.06,0.0,single_feature,train,0.035861,Model
21278,C-09_0003_127,39.06,0.0,single_feature,train,0.030362,Model
21279,C-09_0003_136,39.06,0.0,single_feature,train,0.086294,Model
21280,C-09_0003_138,39.06,0.0,single_feature,train,0.056164,Model


In [None]:
g

In [None]:
y_pred