In [116]:
import os
import pandas as pd
import numpy as np
from scipy.stats.mstats import spearmanr
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer
import xgboost as xgb
import time
import pickle
import warnings

np.int = np.int64
warnings.simplefilter(action='ignore', category=FutureWarning)

In [117]:
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you.
# However, you can resplit the data in any way you want.

# Path for datasets
path_cwd = os.getcwd()
path_data = path_cwd+"/ML4G_Project_1_Data"

# Metadata for genes of cell lines X1 and X2
train_info_X1_path = path_data+"/CAGE-train/CAGE-train/X1_train_info.tsv"
train_info_X2_path = path_data+"/CAGE-train/CAGE-train/X2_train_info.tsv"
val_info_X1_path = path_data+"/CAGE-train/CAGE-train/X1_val_info.tsv"
val_info_X2_path = path_data+"/CAGE-train/CAGE-train/X2_val_info.tsv"

test_info_X3_path = path_data+"/CAGE-train/CAGE-train/X3_test_info.tsv"

# Gene expression values for cell lines X1 and X2
train_y_X1_path = path_data+"/CAGE-train/CAGE-train/X1_train_y.tsv"
train_y_X2_path = path_data+"/CAGE-train/CAGE-train/X2_train_y.tsv"
val_y_X1_path = path_data+"/CAGE-train/CAGE-train/X1_val_y.tsv"
val_y_X2_path = path_data+"/CAGE-train/CAGE-train/X2_val_y.tsv"

# DNase and histone modification data for cell lines X1, X2 and X3
bed_files_X1 = ["/DNase-bed/X1.bed",
                "/H3K4me1-bed/X1.bed",
                "/H3K4me3-bed/X1.bed",
                "/H3K9me3-bed/X1.bed",
                "/H3K27ac-bed/X1.bed",
                "/H3K27me3-bed/X1.bed",
                "/H3K36me3-bed/X1.bed"]
bed_file_paths_X1 = [path_data+file for file in bed_files_X1]

bed_files_X2 = ["/DNase-bed/X2.bed",
                "/H3K4me1-bed/X2.bed",
                "/H3K4me3-bed/X2.bed",
                "/H3K9me3-bed/X2.bed",
                "/H3K27ac-bed/X2.bed",
                "/H3K27me3-bed/X2.bed",
                "/H3K36me3-bed/X2.bed"]
bed_file_paths_X2 = [path_data+file for file in bed_files_X1]

bed_files_X3 = ["/DNase-bed/X3.bed",
                "/H3K4me1-bed/X3.bed",
                "/H3K4me3-bed/X3.bed",
                "/H3K9me3-bed/X3.bed",
                "/H3K27ac-bed/X3.bed",
                "/H3K27me3-bed/X3.bed",
                "/H3K36me3-bed/X3.bed"]
bed_file_paths_X3 = [path_data+file for file in bed_files_X1]

In [118]:
### FUNCTION FOR EXTRACTION OF FEATURES
def extract_features(bed_path, info_path, max_distance, resolution, stride, verbose=0, use_score=True):
    """
    Function extracting binary features from bed datasets
    :param bed_path: path to bed file of interest
    :param info_path: path to info file of interest
    :param max_distance: maximal distance from TSS that should be considered
    :param resolution: window size of aggregation for dimensionality reduction
    :param stride: stride for dimensionality reduction
    :return: pandas df of type int8 containing binary features
    """

    # Load data
    df_info = pd.read_csv(info_path, sep='\t', usecols=[0,1,4])

    # Get peak data with score column
    if ("DNase" in bed_path):
        score_col = 6
    else: score_col = 4

    df_peak_data = pd.read_csv(bed_path, sep='\t', usecols=[0,1,2,score_col], names = ["chromosome", "peak_start", "peak_end", "score"])
    # Get genes and initialize features df with False as entries
    if use_score: df_features = pd.DataFrame(data=0,columns=[i-max_distance-1 for i in range(1, 2*(max_distance+1))], index=df_info["gene_name"], dtype=float)
    else: df_features = pd.DataFrame(data=0,columns=[i-max_distance-1 for i in range(1, 2*(max_distance+1))], index=df_info["gene_name"])

    # Fill df according to info data
    for i in df_info.index:
        gene = df_info["gene_name"][i]
        tss = df_info["TSS_start"][i]
        chromosome = df_info["chr"][i]
        tss_l = tss - max_distance
        tss_r = tss + max_distance

        # Print progress
        if verbose:
            if i == 0:
                print("Start preprocessing of:", "\n"+
                      "Dataset:", bed_path, "\n"+
                      "Infoset:", info_path)
            if i == df_info.index[-1]:
                print("Done!" + "\n" + "-----------------------------------")

        # Find relevant peaks
        peaks = df_peak_data.loc[(df_peak_data["peak_start"] <= tss_r) &
                                 (df_peak_data["peak_end"] >= tss_l)]

        # Fill features dataset
        for j in range(peaks.shape[0]):
            # Make sure that peak is on the same chromosome
            if peaks["chromosome"].iloc[j] != chromosome: continue

            # Get peak boundaries
            peak_l = peaks["peak_start"].iloc[j]
            peak_r = peaks["peak_end"].iloc[j]

            if use_score:
                # Get score
                score = peaks["score"].iloc[j]
            else:
                score = 1

            # Consider possible cases
            if (peak_l >= tss_l) and (peak_r <= tss_r):
                df_features.loc[[gene], peak_l-tss : peak_r-tss] = score

            elif (peak_l <= tss_r) and (peak_r >= tss_r):
                df_features.loc[[gene], peak_l-tss : tss_r-tss] = score

            elif (peak_l <= tss_l) and (peak_r <= tss_r):
                df_features.loc[[gene], tss_l-tss : peak_r-tss] = score

            elif (peak_l <= tss_l) and (peak_r >= tss_r):
                df_features.loc[[gene], tss_l-tss : tss_r-tss] = score

    # Introduce resolution (rather inefficient...)
    df_features=df_features.rolling(window=resolution,
                                    step=stride,
                                    axis=1,
                                    min_periods=1,
                                    center=True).mean()

    return df_features


In [119]:
### FUNCTION FOR CREATING TRAINING DATASET

def create_set_np(bed_paths, df_info, max_distance, resolution, stride,verbose=0, use_score=True):
    '''
    create training validation and test dataset
    :param bed_paths:
    :param df_info:
    :param max_distance:
    :param resolution:
    :param stride:
    :param verbose:
    :return:
    '''
    for idx,path in enumerate(bed_paths):
        n=len(bed_paths)
        if idx==0:
            temp=extract_features(path,df_info, max_distance, resolution, stride, verbose, use_score)
            n_genes, n_timestamps=temp.shape
            features=np.zeros((n_genes, n_timestamps*n))
            features[:,:n_timestamps]=temp
        else:
            features[:,idx*n_timestamps: (idx+1)*n_timestamps]=extract_features(path,df_info, max_distance, resolution, stride,verbose).to_numpy()
    return features

In [120]:
# CREATION OF SCORE FUNCTION
def score_func(y, y_pred):
    return spearmanr(y,y_pred).statistic

scorer=make_scorer(score_func) #needed to be able to use spearmanr as score function in scikit-learn

In [121]:
# CREATION OF COMPLETE (HPO + TESTING) TRAINING AND TESTING LOOP
def Train_Test_loop(outer_params,model,inner_params,train_paths,val_paths,test_paths,mod_identifier_path='1to2',verbose=1):
    '''
    :param window_size:
    :param resolution:
    :param stride:
    :param model:
    :param inner_params: Dictionary describing the search space for the HPO
    :param train_paths: look above to see how to define this and the next two parameters
    :param val_paths:
    :param test_paths:
    :param verbose: regulate the printing
    :return: a pandas dataframe summarising the result for every combination of the outer parameters and the index for the best model in that dataframe
    '''
    results = {}
    results['score_test'], results['score_val'], results['time'], results['model'],results['n_features'] = [], [], [], [], []
    for a in inner_params.keys():
        results[a] = []
    best_score = 0
    best_model = None
    file_name = path_cwd+'/Models'
    counter, n_iter = 0, len(outer_params)
    for w, r ,s in outer_params:
        counter += 1
        print(f'Iteration {counter} out of {n_iter}\nWINDOW: {w}, RESOLUTION: {r}, STRIDE:{s}')
        start =time.time()
        #creation of datasets
        y_train = pd.read_csv(train_paths[2], delimiter="\t")['gex'].to_numpy()
        X_train = create_set_np(train_paths[0], train_paths[1], w, r, s)
        y_val = pd.read_csv(val_paths[2], delimiter="\t")['gex'].to_numpy()
        X_val = create_set_np(val_paths[0], val_paths[1], w, r, s)
        y_test = pd.read_csv(test_paths[2], delimiter="\t")['gex'].to_numpy()
        X_test = create_set_np(test_paths[0], test_paths[1], w, r, s)
        X_complete_train = np.concatenate([X_train,X_val])
        y_complete_train = np.concatenate([y_train,y_val])
        n_train = X_train.shape[0]
        CV = [([i for i in range(n_train)],[i for i in range(n_train, y_complete_train.shape[0])])]
        end = time.time()
        if verbose:
            print(f'Number of features: {X_complete_train.shape[1]}')
            print(f'\nPre-processing ended in: {round(end-start)} seconds')
        #model definition and training
        results['n_features'] += [X_complete_train.shape[1]]
        model = model
        opt = BayesSearchCV(
            model,
            inner_params,
            scoring = scorer,
            n_iter = 30,
            random_state = 7,
            cv = CV,
            verbose = 0)
        start = time.time()
        opt.fit(X_complete_train,y_complete_train)
        end = time.time()
        #results update
        score = spearmanr(opt.predict(X_test),y_test).statistic
        if verbose:
            print(f'Hyperparameter search ended in: {round(end-start)} seconds\n'
                    f'Optimal hyper parameters:{[(a,b) for a,b in opt.best_params_.items()]}\n'
                    f'Score in Validation: {round(opt.best_score_,4)}')
            print(f'Score in Test Set: {round(score,4)}\n------------------')
        results['score_test']+=[round(score,4)]
        results['score_val']+=[round(opt.best_score_,4)]
        results['time']+=[round(end-start)]
        for a,b in opt.best_params_.items():
            results[a]+=[b]
        results['model']=opt.best_estimator_
        if score> best_score:
            best_model_index = (w,r,s)
            best_score = score
            pickle.dump(opt,open(path_cwd+'/Results/best_model'+ mod_identifier_path+'.pickle','wb'))
        pickle.dump(results,open(path_cwd+'/Results/intermediate'+ mod_identifier_path+'.pickle','wb'))
    index = pd.MultiIndex.from_tuples(outer_params, names=['window_size','resolution','stride'])
    results = pd.DataFrame(results, index=index)
    pickle.dump(results, open(path_cwd + '/Results/DF' + mod_identifier_path+ '.pickle', 'wb'))
    return results, best_model_index

In [122]:
# SETTING OF PARAMETERS FOR THE LOOP FUNCTION

outer_params= [(100,10,1), (500,20,10), (1000,30,20), (2000,45,35), (3000,50,45)]

train_paths=[bed_file_paths_X1,train_info_X1_path, train_y_X1_path]
val_paths=[bed_file_paths_X1,val_info_X1_path, val_y_X1_path]
test_paths=[bed_file_paths_X2,val_info_X2_path, val_y_X2_path]


model=xgb.XGBRegressor(booster='gbtree')

inner_params={
    'n_estimators': Integer(100,300),
    'learning_rate': Real(1e-5, 3e-1,prior='log-uniform'),
    'max_depth': Integer(1,10),
}

In [123]:
results,best_model_index=Train_Test_loop(outer_params,
                                         model,
                                         inner_params,
                                         train_paths,
                                         val_paths,
                                         test_paths,
                                         mod_identifier_path='1to2',
                                         verbose=1)

Iteration 1 out of 5
WINDOW: 100, RESOLUTION: 10, STRIDE:1


Number of features: 1407

Pre-processing ended in: 65 seconds
Hyperparameter search ended in: 772 seconds
Optimal hyper parameters:[('learning_rate', 0.023960977249880475), ('max_depth', 8), ('n_estimators', 100)]
Score in Validation: 0.7945
Score in Test Set: 0.6982
------------------
Iteration 2 out of 5
WINDOW: 500, RESOLUTION: 20, STRIDE:10
Number of features: 707

Pre-processing ended in: 75 seconds
Hyperparameter search ended in: 324 seconds
Optimal hyper parameters:[('learning_rate', 0.006771133137623331), ('max_depth', 8), ('n_estimators', 300)]
Score in Validation: 0.8024
Score in Test Set: 0.7029
------------------
Iteration 3 out of 5
WINDOW: 1000, RESOLUTION: 30, STRIDE:20
Number of features: 707

Pre-processing ended in: 72 seconds
Hyperparameter search ended in: 285 seconds
Optimal hyper parameters:[('learning_rate', 0.010623311897245238), ('max_depth', 6), ('n_estimators', 166)]
Score in Validation: 0.795
Score in Test Set: 0.6979
------------------
Iteration 4 out of 5
