In [1]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import os
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [161]:
### DATA PATHS

# NOTE: 
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you. 
# However, you can resplit the data in any way you want.

# Path for datasets
path_cwd = os.getcwd()
path_data = path_cwd+"/ML4G_Project_1_Data"

# Metadata for genes of cell lines X1 and X2
train_info_X1_path = path_data+"/CAGE-train/CAGE-train/X1_train_info.tsv"
train_info_X2_path = path_data+"/CAGE-train/CAGE-train/X2_train_info.tsv"
val_info_X1_path = path_data+"/CAGE-train/CAGE-train/X1_val_info.tsv"
val_info_X2_path = path_data+"/CAGE-train/CAGE-train/X2_val_info.tsv"

# Gene expression values for cell lines X1 and X2
train_y_X1_path = path_data+"/CAGE-train/CAGE-train/X1_train_y.tsv"
train_y_X2_path = path_data+"/CAGE-train/CAGE-train/X2_train_y.tsv"
val_y_X1_path = path_data+"/CAGE-train/CAGE-train/X1_val_y.tsv"
val_y_X2_path = path_data+"/CAGE-train/CAGE-train/X2_val_y.tsv"

# DNase and histone modification data for cell lines X1, X2 and X3
bed_files_X1 = ["/DNase-bed/X1.bed",
                "/H3K4me1-bed/X1.bed",
                "/H3K4me3-bed/X1.bed",
                "/H3K9me3-bed/X1.bed",
                "/H3K27ac-bed/X1.bed",
                "/H3K27me3-bed/X1.bed",
                "/H3K36me3-bed/X1.bed"]
bed_file_paths_X1 = [path_data+file for file in bed_files_X1]

bed_files_X2 = ["/DNase-bed/X2.bed",
                "/H3K4me1-bed/X2.bed",
                "/H3K4me3-bed/X2.bed",
                "/H3K9me3-bed/X2.bed",
                "/H3K27ac-bed/X2.bed",
                "/H3K27me3-bed/X2.bed",
                "/H3K36me3-bed/X2.bed"]
bed_file_paths_X2 = [path_data+file for file in bed_files_X1]

bed_files_X3 = ["/DNase-bed/X3.bed",
                "/H3K4me1-bed/X3.bed",
                "/H3K4me3-bed/X3.bed",
                "/H3K9me3-bed/X3.bed",
                "/H3K27ac-bed/X3.bed",
                "/H3K27me3-bed/X3.bed",
                "/H3K36me3-bed/X3.bed"]
bed_file_paths_X3 = [path_data+file for file in bed_files_X1]

# Small dataset for debugging
debug_info_path = path_data+"/info.tsv"
debug_bed_file = path_data+"/bed_file.tsv"

In [170]:
### FUNCTION FOR EXTRACTION OF FEATURES
def extract_features(bed_path, info_path, max_distance, resolution, stride):
    """
    Function extracting binary features from bed datasets
    :param bed_path: path to bed file of interest
    :param info_path: path to info file of interest
    :param max_distance: maximal distance from TSS that should be considered
    :param resolution: window size of aggregation for dimensionality reduction
    :param stride: stride for dimensionality reduction
    :return: pandas df of type int8 containing binary features
    """

    # Load data
    df_info = pd.read_csv(info_path, sep='\t', usecols=[0,1,4])
    df_peak_data = pd.read_csv(bed_path, sep='\t', usecols=[0,1,2], names = ["chromosome", "peak_start", "peak_end"])

    # Get genes and initialize features df with False as entries
    df_features = pd.DataFrame(data=0,columns=[i-max_distance-1 for i in range(1, 2*(max_distance+1))], index=df_info["gene_name"], dtype="int8")

    # Fill df according to info data
    for i in df_info.index:
        gene = df_info["gene_name"][i]
        tss = df_info["TSS_start"][i]
        chromosome = df_info["chr"][i]
        tss_l = tss - max_distance
        tss_r = tss + max_distance

        # Print progress
        if i == 0:
            print("Start preprocessing of:", "\n"+
                  "Dataset:", bed_path, "\n"+
                  "Infoset:", info_path)
        if i == df_info.index[-1]:
            print("Done!" + "\n" + "-----------------------------------")

        # Find relevant peaks
        peaks = df_peak_data.loc[(df_peak_data["peak_start"] <= tss_r) &
                                 (df_peak_data["peak_end"] >= tss_l)]

        # Fill features dataset
        for j in range(peaks.shape[0]):
            # Make sure that peak is on the same chromosome
            if peaks["chromosome"].iloc[j] != chromosome: continue

            # Get peak boundaries
            peak_l = peaks["peak_start"].iloc[j]
            peak_r = peaks["peak_end"].iloc[j]

            # Consider possible cases
            if (peak_l >= tss_l) and (peak_r <= tss_r):
                df_features.loc[[gene], peak_l-tss : peak_r-tss] = 1

            elif (peak_l <= tss_r) and (peak_r >= tss_r):
                df_features.loc[[gene], peak_l-tss : tss_r-tss] = 1

            elif (peak_l <= tss_l) and (peak_r <= tss_r):
                df_features.loc[[gene], tss_l-tss : peak_r-tss] = 1

            elif (peak_l <= tss_l) and (peak_r >= tss_r):
                df_features.loc[[gene], tss_l-tss : tss_r-tss] = 1

    # Introduce resolution (rather inefficient...)
    df_features=df_features.rolling(window=resolution,
                                      axis=1,
                                      step=stride,
                                      min_periods=1,
                                      center=True).mean()

    # df_features[df_features >= 0.5] = 1
    # df_features[df_features < 0.5] = 0

    return df_features#.astype("int8")


In [171]:
### FUNCTION FOR CREATING TRAINING DATASET
def create_set(bed_paths, df_info, max_distance, resolution, stride):
    """
    Create training dataset
    :param bed_paths:
    :param df_info:
    :param max_distance:
    :param resolution:
    :param stride:
    :return:
    """
    
    df_train = pd.concat([extract_features(path,df_info, max_distance, resolution, stride) for path in bed_paths], axis=1)
    df_train.columns = [i for i in range(df_train.columns.size)]

    return df_train

In [190]:
### PREPARE DATA FOR TRAINING
# set preprocessing parameter
# s

max_distance = 10000
resolution = 500
stride = 250

# load labels
train_y_X1 = pd.read_csv(train_y_X1_path, delimiter="\t")
train_x_X1 = create_set(bed_file_paths_X1, train_info_X1_path, max_distance, resolution, stride)

val_y_X1 = pd.read_csv(val_y_X1_path, delimiter="\t")
val_x_X1 = create_set(bed_file_paths_X1, val_info_X1_path, max_distance, resolution, stride)

val_y_X2 = pd.read_csv(val_y_X2_path, delimiter="\t")
val_x_X2 = create_set(bed_file_paths_X2, val_info_X2_path, max_distance, resolution, stride)

# store datasets
train_x_X1.to_csv("ML4G_Project_1_Data/Preprocessed-train/train_x_X1_"+str(max_distance)+"_"+str(resolution)+"_"+str(stride)+".csv", index=True)
val_x_X1.to_csv("ML4G_Project_1_Data/Preprocessed-train/val_x_X1_"+str(max_distance)+"_"+str(resolution)+"_"+str(stride)+".csv", index=True)
val_x_X2.to_csv("ML4G_Project_1_Data/Preprocessed-train/val_x_X2_"+str(max_distance)+"_"+str(resolution)+"_"+str(stride)+".csv", index=True)

Start preprocessing of: 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Infoset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/CAGE-train/CAGE-train/X1_train_info.tsv
Done!
-----------------------------------
Start preprocessing of: 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/H3K4me1-bed/X1.bed 
Infoset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/CAGE-train/CAGE-train/X1_train_info.tsv
Done!
-----------------------------------
Start preprocessing of: 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/H3K4me3-bed/X1.bed 
Infoset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/CAGE-train/CAGE-train/X1_train_info.tsv
Done!
-----------------------------------
Start preprocessing of: 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/H3K9me3-bed/X1.bed 
Infoset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/CAGE-train/CAGE-train/X1_train_info.tsv
Done!
-----------------------------------
Start 

In [191]:
### LOAD DATASETS
max_distance = 10000
resolution = 500
stride = 250

train_y_X1 = pd.read_csv(train_y_X1_path, delimiter="\t")
train_x_X1 = pd.read_csv("ML4G_Project_1_Data/Preprocessed-train/train_x_X1_"+
                         str(max_distance)+"_"+str(resolution)+"_"+str(stride)+".csv", index_col=0)

val_y_X1 = pd.read_csv(val_y_X1_path, delimiter="\t")
val_x_X1 = pd.read_csv("ML4G_Project_1_Data/Preprocessed-train/val_x_X1_"+
                       str(max_distance)+"_"+str(resolution)+"_"+str(stride)+".csv", index_col=0)

val_y_X2 = pd.read_csv(val_y_X2_path, delimiter="\t")
val_x_X2 = pd.read_csv("ML4G_Project_1_Data/Preprocessed-train/val_x_X2_"+
                       str(max_distance)+"_"+str(resolution)+"_"+str(stride)+".csv", index_col=0)


## Work Package 1.2 - Model Building

In [195]:
### XGBOOST MODEL
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from scipy import stats
import pandas as pd
from sklearn import preprocessing

# Create regression matrices
dtrain_reg = xgb.DMatrix(train_x_X1, train_y_X1["gex"], enable_categorical=True)
dtest_reg = xgb.DMatrix(val_x_X1, val_y_X1["gex"], enable_categorical=True)

# Define training parameters
params = {"objective": "rank:pairwise"}
evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]
n = 11

# Train xgboost model
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=1,
    # Activate early stopping
    early_stopping_rounds=1
)

# Make predictions and score
dtest_reg = xgb.DMatrix(val_x_X2, val_y_X2["gex"], enable_categorical=True)
preds = model.predict(dtest_reg)
rmse = mean_squared_error(val_y_X2["gex"], preds, squared=False)
spmc = stats.spearmanr(preds, val_y_X2["gex"]).statistic
print(f"RMSE of the base model: {rmse:.3f}")
print(f"SPMC of the base model: {spmc:.3f}")

[0]	validation-map:0.86467	train-map:0.85376
[1]	validation-map:0.88105	train-map:0.87631
[2]	validation-map:0.88482	train-map:0.88227
[3]	validation-map:0.89219	train-map:0.88665
[4]	validation-map:0.89748	train-map:0.89313
[5]	validation-map:0.89530	train-map:0.89369
[6]	validation-map:0.89662	train-map:0.89833
[7]	validation-map:0.89399	train-map:0.89918
[8]	validation-map:0.90167	train-map:0.90216
[9]	validation-map:0.90242	train-map:0.90350
[10]	validation-map:0.90306	train-map:0.90630
RMSE of the base model: 276.657
SPMC of the base model: 0.682


In [9]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer
import xgboost as xgb

def score_func(y, y_pred):
    return spearmanr(y,y_pred).statistic
scorer=make_scorer(score_func)

dtrain_reg = xgb.DMatrix(train_x_X1, train_y_X1["gex"], enable_categorical=True)
dtest_reg = xgb.DMatrix(val_x_X1, val_y_X1["gex"], enable_categorical=True)

model=xgb.XGBRegressor(booster='gbtree')
param_grid={
    'n_estimators': Integer(60,200),
    'learning_rate': Real(1e-5,1e-1)

                          ,prior='log-uniform'),
    'max_depth': Integer(1,10),
}


opt = BayesSearchCV(
    model,
    param_grid,
    scoring=scorer,
    n_iter=100,
    random_state=7,
    cv=5,
    verbose=3)

opt.fit(train_x_X1, train_y_X1["gex"])

print(f'Best CV on same Cell Line Score: {opt.best_score_}')
print(f'Best params: {opt.best_params_}')
y_hat_2= opt.predict(val_x_X2)
score=spearmanr(y_hat_2, val_y_X2)
print(f'Score on different Cell Line: {score}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.8336204337284793, max_depth=17, n_estimators=198;, score=0.483 total time=  36.3s
[CV 2/5] END learning_rate=0.8336204337284793, max_depth=17, n_estimators=198;, score=0.497 total time=  36.0s
[CV 3/5] END learning_rate=0.8336204337284793, max_depth=17, n_estimators=198;, score=0.475 total time=  35.8s
[CV 4/5] END learning_rate=0.8336204337284793, max_depth=17, n_estimators=198;, score=0.481 total time=  35.6s
[CV 5/5] END learning_rate=0.8336204337284793, max_depth=17, n_estimators=198;, score=0.461 total time=  36.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.05016886531089607, max_depth=20, n_estimators=151;, score=0.683 total time=  34.7s
[CV 2/5] END learning_rate=0.05016886531089607, max_depth=20, n_estimators=151;, score=0.683 total time=  34.7s
[CV 3/5] END learning_rate=0.05016886531089607, max_depth=20, n_estimators=151;, score=0.689 total time



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=178;, score=0.746 total time=   8.1s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=178;, score=0.754 total time=   8.2s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=178;, score=0.762 total time=   8.7s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=178;, score=0.759 total time=   8.2s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=178;, score=0.749 total time=   8.2s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.00337684870087572, max_depth=6, n_estimators=60;, score=0.747 total time=   2.9s
[CV 2/5] END learning_rate=0.00337684870087572, max_depth=6, n_estimators=60;, score=0.751 total time=   3.1s
[CV 3/5] END learning_rate=0.00337684870087572, max_depth=6, n_estimators=60;, score=0.763 total time=   2.9s
[CV 4/5] END learning_rate=0.00337684870087572, max_depth=6, n_esti



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=200;, score=0.747 total time=   9.1s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=200;, score=0.751 total time=   9.2s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=200;, score=0.763 total time=   9.3s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=200;, score=0.760 total time=   9.2s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=200;, score=0.749 total time=   9.3s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=162;, score=0.745 total time=   7.5s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=162;, score=0.759 total time=   7.6s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=162;, score=0.762 total time=   7.6s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=162;, score=0.759 total time=   7.5s
[CV 5/5] 



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=153;, score=0.744 total time=   7.0s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=153;, score=0.759 total time=   7.1s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=153;, score=0.762 total time=   7.2s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=153;, score=0.759 total time=   7.3s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=153;, score=0.750 total time=   7.3s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.0030824836405284406, max_depth=4, n_estimators=199;, score=0.746 total time=   6.9s
[CV 2/5] END learning_rate=0.0030824836405284406, max_depth=4, n_estimators=199;, score=0.737 total time=   6.9s
[CV 3/5] END learning_rate=0.0030824836405284406, max_depth=4, n_estimators=199;, score=0.748 total time=   6.9s
[CV 4/5] END learning_rate=0.0030824836405284406, max_dept



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=171;, score=0.746 total time=   8.0s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=171;, score=0.759 total time=   8.1s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=171;, score=0.762 total time=   8.1s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=171;, score=0.759 total time=   8.1s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=171;, score=0.750 total time=   8.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=170;, score=0.746 total time=   7.9s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=170;, score=0.759 total time=   8.0s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=170;, score=0.762 total time=   8.1s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=170;, score=0.759 total time=   8.0s
[CV 5/5] 



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=167;, score=0.745 total time=   7.6s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=167;, score=0.759 total time=   7.7s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=167;, score=0.762 total time=   7.7s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=167;, score=0.759 total time=   7.6s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=167;, score=0.750 total time=   7.6s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=16, n_estimators=200;, score=0.722 total time=  22.8s
[CV 2/5] END learning_rate=0.001, max_depth=16, n_estimators=200;, score=0.704 total time=  23.5s
[CV 3/5] END learning_rate=0.001, max_depth=16, n_estimators=200;, score=0.728 total time=  23.5s
[CV 4/5] END learning_rate=0.001, max_depth=16, n_estimators=200;, score=0.740 total time=  22.9s
[CV 5



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=166;, score=0.746 total time=   7.6s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=166;, score=0.759 total time=   7.6s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=166;, score=0.762 total time=   7.6s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=166;, score=0.759 total time=   7.7s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=166;, score=0.750 total time=   7.6s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.745 total time=   7.4s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.759 total time=   7.5s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.762 total time=   7.5s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.759 total time=   7.4s
[CV 5/5] 



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.745 total time=   7.7s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.759 total time=   7.7s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.762 total time=   7.6s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.759 total time=   7.6s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.750 total time=   7.6s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.006532215780503404, max_depth=11, n_estimators=60;, score=0.745 total time=   5.0s
[CV 2/5] END learning_rate=0.006532215780503404, max_depth=11, n_estimators=60;, score=0.739 total time=   4.8s
[CV 3/5] END learning_rate=0.006532215780503404, max_depth=11, n_estimators=60;, score=0.747 total time=   5.2s
[CV 4/5] END learning_rate=0.006532215780503404, max_depth=11



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.745 total time=   7.5s
[CV 2/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.759 total time=   7.7s
[CV 3/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.762 total time=   7.7s
[CV 4/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.759 total time=   7.8s
[CV 5/5] END learning_rate=0.001, max_depth=6, n_estimators=165;, score=0.750 total time=   7.8s
Best CV on same Cell Line Score: 0.7561436849401922
Best params: OrderedDict([('learning_rate', 0.001), ('max_depth', 5), ('n_estimators', 200)])
Score on different Cell Line: SignificanceResult(statistic=array([[1.        , 0.12929117, 0.66163834],
       [0.12929117, 1.        , 0.11570484],
       [0.66163834, 0.11570484, 1.        ]]), pvalue=array([[0.00000000e+000, 8.16852652e-009, 7.11751780e-249],
       [8.16852652e-009, 0.00000000e+000, 2.539

In [11]:
y_hat_2= opt.predict(val_x_X2)
score=spearmanr(y_hat_2, val_y_X2['gex'])
print(f'Score on different Cell Line: {score.statistic}')

Score on different Cell Line: 0.6616383443874877


In [None]:
y_hat_2.shape
val_y_X2.shape

In [None]:
### TEAPOT ANALYSIS
from tpot import TPOTRegressor

#Run teapot analysis
version = "1.1"
tpot = TPOTRegressor(generations=5, population_size=5, verbosity=2, random_state=42)
tpot.fit(train_x_X1.to_numpy(), train_y_X1["gex"].to_numpy())
tpot.export('teapots/teapot_'+version+'.py')

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from scipy import stats

preds = tpot.predict(val_x_X2.to_numpy())
rmse = mean_squared_error(val_y_X2["gex"], preds, squared=False)
spmc = stats.spearmanr(preds, val_y_X2["gex"]).statistic
print(f"RMSE of the base model: {rmse:.3f}")
print(f"SPMC of the base model: {spmc:.3f}")

## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.

pred = None
# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------

# Check if "pred" meets the specified constrains
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Store predictions in a ZIP. 
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO
save_path = f'{save_dir}/{zip_name}'
compression_options = dict(method="zip", archive_name=file_name)

test_genes['gex_predicted'] = pred.tolist()
test_genes[['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)