# Deconvolution Task

We tackle the deconvolution task with a supervised approach. More specifically, we use a random forest regressor to predict the cell type proportions.


In [1]:
# import all libraries you need here
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Reproducibility: single seed passed as random_state to both the
# train/validation split and the random forest regressor.
seed = 42

In [5]:
# Load the deconvolution data and carve out a validation split


# Training inputs (bulk expression) and targets (true cell type proportions)
X_train_bulk_path = "data/deconv/train/train_bulk.csv"
y_train_bulk_path = "data/deconv/train/train_bulk_trueprops.csv"

# Test inputs
X_test_bulk_path = "data/deconv/test/bulkified_data.csv"

# Read the training tables and transpose them so that samples end up on the rows.
# These *_all frames hold the complete training data and are reused later to
# fit the final model.
X_train_bulk_all = pd.read_csv(X_train_bulk_path).set_index("Unnamed: 0").T
y_train_bulk_all = pd.read_csv(y_train_bulk_path).set_index("highLevelType").T

# Put inputs and targets on the same row index (sample labels)
X_train_bulk_all, y_train_bulk_all = X_train_bulk_all.align(y_train_bulk_all, axis=0)

# Read the test table in the same orientation
X_test_bulk = pd.read_csv(X_test_bulk_path).set_index("Unnamed: 0").T

# Hold out 20% of the training samples for validation. The split returns new
# objects, so the *_all frames above still refer to the full training data.
X_train_bulk, X_val_bulk, y_train_bulk, y_val_bulk = train_test_split(
    X_train_bulk_all,
    y_train_bulk_all,
    test_size=0.2,
    random_state=seed,
)

In [6]:
# Fit the model on the training split and evaluate it on the validation split


# Seeded random forest, fitted on the training split (fit returns the estimator)
model = RandomForestRegressor(random_state=seed).fit(X_train_bulk, y_train_bulk)

# Predicted proportions for the held-out validation samples
y_valid_pred = model.predict(X_val_bulk)

# One MSE per output column -> RMSE per cell type, then the plain mean across cell types
mse_per_cell_type = mean_squared_error(y_val_bulk, y_valid_pred, multioutput='raw_values')
rmse_per_cell_type = np.sqrt(mse_per_cell_type)
average_rmse = rmse_per_cell_type.mean()

print("RMSE per cell type (validation data):", rmse_per_cell_type)
print("Average RMSE (validation data):", average_rmse)

RMSE per cell type (validation data): [0.11548921 0.02488297 0.08492294 0.02329374 0.01461357 0.12096108
 0.02772031 0.03070315 0.03522043]
Average RMSE (validation data): 0.053089711441450346


In [10]:
# Final model for the submission: refit on all training samples, then predict the test set


# Refit the same estimator on the complete training data (training + validation
# splits) and predict the proportions for the test bulk samples in one go.
y_test_pred = model.fit(X_train_bulk_all, y_train_bulk_all).predict(X_test_bulk)

In [11]:
# Build the submission table: one row per cell type, one column per test bulk sample
initial_order = y_train_bulk.columns.tolist()  # Cell type order of the training targets
desired_order = ['T', 'Endothelial', 'Fibroblast', 'Plasmablast', 'B', 'Myofibroblast', 'NK', 'Myeloid', 'Mast']
bulk_samples = X_test_bulk.index.tolist()  # List of bulk sample names

# Predictions come out as (samples x cell types); transpose so that cell types
# are the rows and bulk samples the columns.
deconv_submission_df = pd.DataFrame(
    y_test_pred.T,
    columns=bulk_samples,
    index=initial_order,
)

# Put the rows into the required cell type order. Label-based selection with
# .loc raises a KeyError if a requested cell type is absent, whereas reindex
# would silently insert an all-NaN row for it.
# reset_index() then moves the cell type labels into an "index" column and
# leaves a fresh 0..n-1 row index.
deconv_submission_df = deconv_submission_df.loc[desired_order].reset_index()
deconv_submission_df.index.name = ""  # Ensure the index column has no name
deconv_submission_df

Unnamed: 0,index,s5_0,s5_1,s5_2,s5_3,s5_4,s6_0,s6_1,s6_2,s6_3,...,s9_0,s9_1,s9_2,s9_3,s9_4,s10_0,s10_1,s10_2,s10_3,s10_4
,,,,,,,,,,,,,,,,,,,,,
0.0,T,0.432315,0.446913,0.435657,0.443322,0.450747,0.514556,0.505531,0.476967,0.500694,...,0.514496,0.508823,0.515059,0.509013,0.518548,0.522823,0.515738,0.51473,0.521769,0.514889
1.0,Endothelial,0.028534,0.028657,0.027829,0.029241,0.029924,0.034818,0.034237,0.033363,0.034893,...,0.042083,0.041095,0.043893,0.043609,0.042822,0.044086,0.043269,0.042767,0.045505,0.041051
2.0,Fibroblast,0.085961,0.084727,0.085829,0.084821,0.083563,0.076775,0.083086,0.083901,0.078965,...,0.075573,0.074212,0.075215,0.075291,0.075208,0.077266,0.076106,0.076401,0.075367,0.075482
3.0,Plasmablast,0.048672,0.042119,0.046046,0.047311,0.040664,0.040232,0.045327,0.048832,0.042422,...,0.043997,0.0423,0.041977,0.044054,0.042946,0.043783,0.045225,0.047748,0.044412,0.041429
4.0,B,0.08112,0.079913,0.082986,0.080955,0.079814,0.083183,0.082397,0.083798,0.085277,...,0.084541,0.087634,0.08888,0.091609,0.084243,0.082471,0.083342,0.084475,0.084414,0.080278
5.0,Myofibroblast,0.057142,0.057991,0.059595,0.057263,0.057657,0.05839,0.056412,0.054317,0.054193,...,0.05261,0.055824,0.051984,0.051211,0.052869,0.052349,0.053686,0.052708,0.051061,0.057507
6.0,NK,0.181166,0.175108,0.182147,0.173983,0.169066,0.119574,0.125251,0.140683,0.129814,...,0.107867,0.115894,0.108276,0.110327,0.107424,0.109285,0.112765,0.109758,0.107619,0.119297
7.0,Myeloid,0.056807,0.056905,0.055345,0.056434,0.059611,0.046753,0.04211,0.047873,0.042973,...,0.04039,0.038549,0.038443,0.038066,0.040701,0.038679,0.040063,0.039068,0.040286,0.038705
8.0,Mast,0.028284,0.027666,0.024566,0.026671,0.028954,0.025718,0.025649,0.030267,0.03077,...,0.038441,0.035669,0.036274,0.036821,0.035238,0.029258,0.029804,0.032345,0.029567,0.031361


In [12]:
# Save the DataFrame to a CSV file in the required format
sub_decon_path = "submission/pred_props_deconv.csv"

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path(sub_decon_path).parent.mkdir(parents=True, exist_ok=True)

# index=True with an empty label writes the row index as an unnamed first column
deconv_submission_df.to_csv(sub_decon_path, index=True, index_label="")
print(f"Submission file saved to: {sub_decon_path}")

Submission file saved to: submission/pred_props_deconv.csv


# Submission Steps

Next, we run the provided sanity checks for the deconvolution task.


In [None]:
# Reference table for the sanity checks: the test bulk data, with the first
# CSV column used as the row index.
all_bulkified = pd.read_csv(
    "data/deconv/test/bulkified_data.csv",
    index_col=0,
)

In [20]:
# pred_props should be a DataFrame containing the estimated cell type proportions for the patients in all_bulkified
# pred_props.columns = ['index','s5_0','s5_1',...,'s10_3','s10_4'] = np.append(["index"],all_bulkified.columns)
# pred_props['index'] = ['T', 'Endothelial', 'Fibroblast', 'Plasmablast', 'B', 'Myofibroblast',
#       'NK', 'Myeloid', 'Mast']

In [21]:
# The submission columns must be "index" followed by the reference table's
# column names, in the same order. Plain lists are compared so that a length
# mismatch fails the assertion with its message instead of raising a
# ValueError from the element-wise comparison.
expected_columns = ["index", *all_bulkified.columns]
assert deconv_submission_df.columns.tolist() == expected_columns, "Wrong columns"

In [22]:
# The "index" column must list the cell types in exactly this order. Plain
# lists are compared so that a wrong number of rows fails the assertion with
# its message instead of raising a ValueError from the element-wise comparison.
expected_cell_types = ['T', 'Endothelial', 'Fibroblast', 'Plasmablast', 'B', 'Myofibroblast',
       'NK', 'Myeloid', 'Mast']
assert deconv_submission_df['index'].tolist() == expected_cell_types, "Wrong order for cell types"

In [23]:
# Every column holds the proportions of one sample and must add up to 1.
# A tolerance check is used because rounding the sum to the nearest integer
# would accept any total between 0.5 and 1.5.
proportion_sums = deconv_submission_df.drop("index", axis=1).sum()
assert np.allclose(proportion_sums, 1.0, atol=1e-3), "The proportions for a single patient must sum to 1"

# Add clustering part below before zipping everything up.


In [None]:
# cluster_labels should be a DataFrame containing the cluster labels for each cell
# cluster_labels.columns = ["index", "cluster"]
# cluster_labels["index"] = test_adata.obs_names

In [None]:
# cluster_labels must have exactly the columns "index" and "cluster", in that
# order. Plain lists are compared so that a wrong number of columns fails the
# assertion with its message instead of raising a ValueError.
# NOTE(review): cluster_labels is not defined in this notebook; it has to come
# from the clustering part that is added above this cell.
assert cluster_labels.columns.tolist() == ["index", "cluster"], "Wrong columns"

In [None]:
# The "index" column must contain the cell ids of test_adata, complete and in
# the same order. Plain lists are compared so that a length mismatch fails the
# assertion with its message instead of raising a ValueError.
# NOTE(review): cluster_labels and test_adata are not defined in this notebook;
# they have to come from the clustering part that is added above this cell.
assert cluster_labels["index"].tolist() == list(test_adata.obs_names), "The cell ids are either not all present or not in the right order"

In [None]:
# Bundle both result tables into a single zip archive for submission.
archive_name = "LastName_FirstName_Project2.zip" # TODO

# NOTE(review): results_path was never defined in this notebook. The archive is
# placed next to the deconvolution CSV saved earlier - adjust if it belongs
# somewhere else.
results_path = Path(sub_decon_path).parent
results_path.mkdir(parents=True, exist_ok=True)

# Mode "x" creates the archive exclusively: it raises FileExistsError rather
# than overwriting an archive from a previous run. The with-block closes the
# archive on exit, so no explicit close() is needed.
with zipfile.ZipFile(results_path / archive_name, "x") as zf:
    # The estimated proportions live in deconv_submission_df (the frame that
    # passed the sanity checks); no variable named pred_props exists here.
    with zf.open("pred_props.csv", "w") as buffer:
        deconv_submission_df.to_csv(buffer)
    # NOTE(review): cluster_labels must be produced by the clustering part
    # before this cell is run; it is not defined anywhere in this notebook.
    with zf.open("cluster_membership.csv", "w") as buffer:
        cluster_labels.to_csv(buffer)