In [3]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import os
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [28]:
# TODO:
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

# NOTE: 
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you. 
# However, you can resplit the data in any way you want.

# Path for datasets
# The data folder is resolved relative to the current working directory, so the
# notebook has to be started from the directory containing ML4G_Project_1_Data.
path_cwd = os.getcwd()
path_data = path_cwd+"/ML4G_Project_1_Data"

# Metadata for genes of cell lines X1 and X2
train_info_X1 = path_data+"/CAGE-train/CAGE-train/X1_train_info.tsv"
train_info_X2 = path_data+"/CAGE-train/CAGE-train/X2_train_info.tsv"
val_info_X1 = path_data+"/CAGE-train/CAGE-train/X1_val_info.tsv"
val_info_X2 = path_data+"/CAGE-train/CAGE-train/X2_val_info.tsv"

# Gene expression values for cell lines X1 and X2
train_y_X1 = path_data+"/CAGE-train/CAGE-train/X1_train_y.tsv"
train_y_X2 = path_data+"/CAGE-train/CAGE-train/X2_train_y.tsv"
val_y_X1 = path_data+"/CAGE-train/CAGE-train/X1_val_y.tsv"
val_y_X2 = path_data+"/CAGE-train/CAGE-train/X2_val_y.tsv"

# DNase and histone modification data for cell lines X1, X2 and X3
# The assay list is written once and expanded per cell line, so the three
# path lists cannot drift apart through copy-paste.
bed_assays = ["DNase",
              "H3K4me1",
              "H3K4me3",
              "H3K9me3",
              "H3K27ac",
              "H3K27me3",
              "H3K36me3"]

bed_files_X1 = [f"/{assay}-bed/X1.bed" for assay in bed_assays]
bed_file_paths_X1 = [path_data+file for file in bed_files_X1]

# Each cell line must be built from its OWN file list; previously X2 and X3
# reused bed_files_X1 and therefore pointed at the X1 bed files.
bed_files_X2 = [f"/{assay}-bed/X2.bed" for assay in bed_assays]
bed_file_paths_X2 = [path_data+file for file in bed_files_X2]

bed_files_X3 = [f"/{assay}-bed/X3.bed" for assay in bed_assays]
bed_file_paths_X3 = [path_data+file for file in bed_files_X3]


In [295]:
def extract_features(bed_path, info_path, max_distance, resolution):
    """
    Build a per-base peak-coverage matrix around the TSS of every gene.

    For each row of the info table, the window
    [TSS_start - max_distance, TSS_start + max_distance] is compared against
    the peaks of the bed file that lie on the same chromosome. Every position
    of the window that is covered by at least one peak is marked True.

    :param bed_path: path to a tab-separated bed file without header; only the
        first three columns (chromosome, peak start, peak end) are read
    :param info_path: path to a tab-separated info file with header; columns
        0, 1 and 4 are read and must be named "gene_name", "chr" and
        "TSS_start"
    :param max_distance: number of positions to consider on each side of the TSS
    :param resolution: currently unused; accepted so that binning can be added
        later without changing the call signature
    :return: boolean pandas df with one row per row of the info file (indexed
        by gene_name) and one column per offset -max_distance..max_distance
    """

    # Load data
    df_info = pd.read_csv(info_path, sep='\t', usecols=[0,1,4])
    df_peak_data = pd.read_csv(bed_path, sep='\t', usecols=[0,1,2], names = ["chromosome", "peak_start", "peak_end"])

    # Coverage is collected in a plain numpy buffer addressed by row POSITION.
    # Writing by gene-name label would also modify every other row that shares
    # the same gene name.
    n_genes = df_info.shape[0]
    offsets = np.arange(-max_distance, max_distance + 1)
    covered = np.zeros((n_genes, offsets.size), dtype=bool)

    # Split the peaks by chromosome once, instead of re-checking the chromosome
    # of every candidate peak for every gene.
    peaks_by_chromosome = {
        chrom: (group["peak_start"].to_numpy(), group["peak_end"].to_numpy())
        for chrom, group in df_peak_data.groupby("chromosome")
    }

    genes = df_info["gene_name"].to_numpy()
    chromosomes = df_info["chr"].to_numpy()
    tss_sites = df_info["TSS_start"].to_numpy()

    for i in range(n_genes):
        gene = genes[i]
        tss = tss_sites[i]
        tss_l = tss - max_distance
        tss_r = tss + max_distance

        # Print progress
        if i % 2500 == 0:
            print("Progress:", i, "out of", df_info.shape[0], "\n" +
                  "Dataset:", bed_path, "\n" +
                  "Gene:", gene, "\n\n")

        # No peaks at all on this gene's chromosome
        if chromosomes[i] not in peaks_by_chromosome:
            continue
        starts, ends = peaks_by_chromosome[chromosomes[i]]

        # Find relevant peaks (same strict overlap test as before)
        overlapping = (starts < tss_r) & (ends > tss_l)

        # Clipping each peak to the window covers all four geometric cases
        # (inside, left overhang, right overhang, spanning the whole window)
        # with a single expression. Subtracting tss_l turns genomic positions
        # into column positions.
        # NOTE(review): peak_end is treated as inclusive, as in the previous
        # label-based slicing; BED ends are conventionally exclusive - confirm.
        first_cols = np.maximum(starts[overlapping], tss_l) - tss_l
        last_cols = np.minimum(ends[overlapping], tss_r) - tss_l
        for lo, hi in zip(first_cols, last_cols):
            covered[i, lo:hi + 1] = True

    # Introduce resolution
    # Something like this could work
    #features.astype(int).rolling(window=10,axis=1, step=10).sum()
    return pd.DataFrame(covered, index=pd.Index(df_info["gene_name"]), columns=offsets)


Unnamed: 0_level_0,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,...,91,92,93,94,95,96,97,98,99,100
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SLC20A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C11orf58,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
ZSCAN9,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
CD19,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
TMEM123,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACOX1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MLXIP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ASGR2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR5A1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [296]:
# DNase peak coverage within +/-100 positions of each TSS for the X1 training genes
features = extract_features(
    bed_path=bed_file_paths_X1[0],
    info_path=train_info_X1,
    max_distance=100,
    resolution=1,
)


Progress: 0 out of 14310 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Gene: SLC20A1 


Progress: 2500 out of 14310 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Gene: PPP1CA 


Progress: 5000 out of 14310 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Gene: TSSC4 


Progress: 7500 out of 14310 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Gene: WRNIP1 


Progress: 10000 out of 14310 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Gene: LHX2 


Progress: 12500 out of 14310 
Dataset: /home/mike/Masters_DS/ml4g_2023/ML4G_Project_1_Data/DNase-bed/X1.bed 
Gene: CHST12 




Unnamed: 0_level_0,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,...,91,92,93,94,95,96,97,98,99,100
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SLC20A1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C11orf58,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
ZSCAN9,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
CD19,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
TMEM123,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACOX1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
MLXIP,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
ASGR2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
OR5A1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Work Package 1.2 - Model Building

In [None]:
# TODO: 
# Select the best model to predict gene expression from the obtained features in WP 1.1.

# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------


## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.

# Placeholder: must be replaced by a numpy array of predictions in the section below.
pred = None
# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------

# Check if "pred" meets the specified constraints
# NOTE(review): `test_genes` is not defined in any visible cell; it has to be
# created in the section above (presumably the gene info table of the test
# set - confirm), otherwise the last assertion raises a NameError.
# While `pred` is still None, the first assertion fails by design.
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Store predictions in a ZIP. 
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

# Output location: the zip archive is written to <save_dir>/<zip_name>.
save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO
save_path = f'{save_dir}/{zip_name}'
# pandas writes the CSV straight into a zip archive; archive_name is the name
# of the CSV file stored inside that archive.
compression_options = dict(method="zip", archive_name=file_name)

# Attach the predictions as a new column and export only gene name + prediction.
# NOTE(review): relies on `test_genes` and `pred` from earlier cells, and
# `save_dir` must already exist - confirm before running.
test_genes['gex_predicted'] = pred.tolist()
test_genes[['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)