In [7]:
import pandas as pd
import scripts
from datetime import datetime
import torchmetrics
import torch

In [13]:
# Load the saved predictions of the five single-omics baseline models.
# The generator is consumed in order, so each frame lands in the matching name.
rnaseq, methyl, prot, muta, cnv = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/pred_baseline_rnaseq_20241206_18:25:44_b6ac866c-ead5-41d3-9dd3-30a418937e67.csv",
        "results/pred_baseline_methylations_20241206_18:26:54_46123574-5571-47f2-9b53-cf7964138929.csv",
        "results/pred_baseline_proteomics_20241206_18:28:16_970a0a96-885e-41cc-b81f-d99dff686f87.csv",
        "results/pred_baseline_muta_20241206_18:24:40_1c2580dd-a6f6-4d14-9dfd-f674cb162809.csv",
        "results/pred_baseline_cnvs_20241206_18:24:14_2a4a9716-3efd-4b43-9702-c6dbd53db681.csv",
    )
)
# The concatenated-omics predictions are currently not loaded:
#concat = pd.read_csv("results/pred_baseline_concatenated_20241123_010128_ffdb308e-5cff-448f-9dac-a9e99abc226b.csv")

In [14]:
# Row-wise averaging is only valid if row i of every prediction file refers to
# the same (cell_line, drug_id) pair, so verify that before averaging.
model_predictions = {"rnaseq": rnaseq, "methyl": methyl, "prot": prot, "muta": muta, "cnv": cnv}

# Files of different length would be padded with NaN by pd.concat(axis=1); check
# the lengths explicitly so that a truncated file cannot slip through.
rows_per_model = {name: len(frame) for name, frame in model_predictions.items()}
if len(set(rows_per_model.values())) != 1:
    raise ValueError(f"Prediction files differ in length: {rows_per_model}")

# One column per model, for each of the two identifier columns.
# NOTE(review): assumes every file has a "drug_id" column like rnaseq does - confirm.
check_cell_col = pd.concat(
    [frame["cell_line"] for frame in model_predictions.values()], axis=1, ignore_index=True
)
check_drug_col = pd.concat(
    [frame["drug_id"] for frame in model_predictions.values()], axis=1, ignore_index=True
)

# dropna=False makes a NaN identifier count as a distinct value instead of being ignored.
all_same = (check_cell_col.nunique(axis=1, dropna=False) == 1) & (
    check_drug_col.nunique(axis=1, dropna=False) == 1
)
rows_with_different_values = pd.concat(
    [check_cell_col, check_drug_col], axis=1, keys=["cell_line", "drug_id"]
)[~all_same]
print(rows_with_different_values)  # expected: an empty DataFrame

# Stop here instead of silently averaging misaligned predictions.
if not all_same.all():
    raise ValueError(
        f"{(~all_same).sum()} rows have mismatching cell_line/drug_id across the model outputs"
    )

Empty DataFrame
Columns: [0, 1, 2, 3, 4]
Index: []


In [18]:
# The rows of all five prediction files are aligned (checked in the previous cell),
# so the ensemble prediction is simply the row-wise mean of the model predictions.
pred_values = pd.concat(
    [rnaseq["prediction"], methyl["prediction"], prot["prediction"], muta["prediction"], cnv["prediction"]],
    axis=1,
    ignore_index=True,
)

# DataFrame.mean skips NaN by default, so a missing prediction in one model is
# averaged over the remaining models rather than propagating NaN.
mean_prediction = pred_values.mean(axis=1)

col = ["cell_line", "drug_id", "prediction", "target"]

# Build the result as a new frame (assign/astype return copies) instead of
# assigning into a slice of `rnaseq`, which triggers SettingWithCopyWarning and
# has ambiguous copy/view semantics.
result = (
    rnaseq[["cell_line", "drug_id", "target"]]
    .assign(prediction=mean_prediction.astype(float))
    .astype({"target": float})
    [col]
)

display(result)

Unnamed: 0,cell_line,drug_id,prediction,target
0,381.0,0.0,1.774348,0.954770
1,300.0,0.0,2.642107,2.223223
2,88.0,0.0,1.699847,3.415529
3,82.0,0.0,2.049446,1.369229
4,81.0,0.0,2.283061,2.151403
...,...,...,...,...
19951,238.0,361.0,3.807941,4.063567
19952,248.0,361.0,4.148027,4.012202
19953,257.0,361.0,4.571984,5.390047
19954,572.0,361.0,4.350806,4.509333


In [19]:
# Write the ensemble predictions to results/ under a name produced by
# scripts.generate_filename, then report where the file was written.
filename = scripts.generate_filename(
    model_name="baseline",
    dataset_name="ensemble",
    extension="csv",
)
output_path = "results/" + filename
result.to_csv(output_path, index=False)
print(f"Predictions saved to: results/{filename}")

Predictions saved to: results/pred_baseline_ensemble_20241206_19:07:26_5b9155bf-bfce-482f-9805-2271babcfeff.csv


In [21]:
# Evaluation of the ensemble predictions: two group-wise Pearson correlations
# (macro-averaged over groups) plus the overall mean squared error.
pearson = torchmetrics.functional.pearson_corrcoef

metric_collection = torchmetrics.MetricCollection(
    {
        # NOTE(review): this metric is named "cellwise" but groups by "drugs",
        # while "R_cellwise" groups by "cell_lines" - confirm this is intended.
        "R_cellwise_residuals": scripts.GroupwiseMetric(
            metric=pearson,
            grouping="drugs",
            average="macro",
            residualize=True,
        ),
        "R_cellwise": scripts.GroupwiseMetric(
            metric=pearson,
            grouping="cell_lines",
            average="macro",
            residualize=False,
        ),
        "MSE": torchmetrics.MeanSquaredError(),
    }
)
metrics = torchmetrics.MetricTracker(metric_collection)

# MetricTracker requires increment() before the first update().
metrics.increment()

# Convert each DataFrame column to a tensor so it can be passed to metrics.update().
predicted = torch.tensor(result["prediction"].values, dtype=torch.float32)
observed = torch.tensor(result["target"].values, dtype=torch.float32)
cell_line_ids = torch.tensor(result["cell_line"].values)
drug_ids = torch.tensor(result["drug_id"].values)

metrics.update(predicted, observed, cell_lines=cell_line_ids, drugs=drug_ids)

# Unwrap the computed values into plain Python numbers, keyed by metric name.
ens_final_metrics = {name: value.item() for name, value in metrics.compute().items()}
print(ens_final_metrics)

  return torch.linalg.solve(A, Xy).T


{'MSE': 1.6461766958236694, 'R_cellwise': 0.8975447416305542, 'R_cellwise_residuals': 0.32703089714050293}


In [23]:
# Add the ensemble's metrics as a row of the shared evaluation table.
model_name = "baseline_ensemble"
time = datetime.now().strftime("%Y%m%d_%H:%M")

# Build the row from a *new* dict: aliasing ens_final_metrics and adding keys to
# it would inject "Model"/"Time" into the metrics dict itself.
eresult = {**ens_final_metrics, "Model": model_name, "Time": time}

eresult_df = pd.DataFrame([eresult])

new_column_order = ["Model", "MSE", "R_cellwise", "R_cellwise_residuals", "Time"]
eresult_df = eresult_df[new_column_order]

# Append to the existing table; if this model already has a row (e.g. the cell
# was re-run), keep only the newest one.
ev_table = pd.read_csv("results/evalutation_table.csv")
ev_table = pd.concat([ev_table, eresult_df], ignore_index=True)
ev_table = ev_table.drop_duplicates(subset=["Model"], keep="last")
# drop_duplicates leaves gaps in the index; renumber so the displayed table is contiguous.
ev_table = ev_table.reset_index(drop=True)
display(ev_table)

Unnamed: 0,MSE,R_cellwise,R_cellwise_residuals,Model,Time
0,1.795264,0.890693,0.333604,baseline_rnaseq,20241206_19:25
1,1.977368,0.874748,0.23871,baseline_proteomics,20241206_19:26
2,2.400242,0.866647,0.093873,baseline_mutations,20241206_19:26
3,2.243853,0.868511,0.167871,baseline_methylations,20241206_19:26
4,1.810338,0.88623,0.320672,baseline_concat,20241206_19:26
5,1.646177,0.897545,0.327031,baseline_ensemble,20241206_19:26


In [24]:
# Write the updated evaluation table back to the CSV it was read from (overwrites
# the file); index=False keeps the row index out of the file.
ev_table.to_csv("results/evalutation_table.csv", index = False)

In [None]:
# Display the final evaluation table.
display(ev_table)

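Conclusion: the ensemble (row-wise mean of the five single-omics baseline models) performs well. In the evaluation table it has the lowest MSE (1.646) and the highest R_cellwise (0.898) of all listed models; its R_cellwise_residuals (0.327) is second only to baseline_rnaseq (0.334).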