In [171]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import pyBigWig
from tqdm import tqdm

In [172]:
# Width (in base pairs) of one averaging bin used by build_features.
BIN_SIZE = 128
# Total width (in base pairs) of the window extracted around each TSS.
SEQ_LENGTH = 16384

# Gene info and expression target tables of the X1 training set.
PATH_TRAIN_X1_FEATURES = 'data/ML4G_Project_1_Data/CAGE-train/X1_train_info.tsv'
PATH_TRAIN_X1_TARGETS = 'data/ML4G_Project_1_Data/CAGE-train/X1_train_y.tsv'

# One bigWig signal track per histone mark for X1
# (note the mixed .bigwig / .bw file extensions).
PATH_H3K4me1_X1_BW = 'data/ML4G_Project_1_Data/H3K4me1-bigwig/X1.bigwig'
PATH_H3K4me3_X1_BW = 'data/ML4G_Project_1_Data/H3K4me3-bigwig/X1.bw'
PATH_H3K9me3_X1_BW = 'data/ML4G_Project_1_Data/H3K9me3-bigwig/X1.bw'
PATH_H3K27ac_X1_BW = 'data/ML4G_Project_1_Data/H3K27ac-bigwig/X1.bigwig'
PATH_H3K27me3_X1_BW = 'data/ML4G_Project_1_Data/H3K27me3-bigwig/X1.bw'
PATH_H3K36me3_X1_BW = 'data/ML4G_Project_1_Data/H3K36me3-bigwig/X1.bw'

In [None]:
# Tab-separated gene info table of the X1 training set; preview the first rows.
df_features = pd.read_csv(PATH_TRAIN_X1_FEATURES, sep='\t')
df_features.head(n=5)


In [None]:
# Tab-separated expression target table of the X1 training set; preview the first rows.
df_targets = pd.read_csv(PATH_TRAIN_X1_TARGETS, sep='\t')
df_targets.head(n=5)

In [177]:
# Features and targets are paired by row position further down, so both tables
# must list the same genes in the same order.
assert (df_features['gene_name'] == df_targets['gene_name']).all(), \
    'df_features and df_targets do not list the same genes in the same order'

# Expression targets as a 1-D array. (A np.zeros pre-allocation used to precede
# this line; it was overwritten immediately and has been removed.)
y = df_targets['gex'].values

In [None]:
# Seed df_Xy with a single placeholder row so that all columns exist up front:
# an 'id' string plus the same zero vector of length SEQ_LENGTH for every
# histone-mark column and for 'target'.
dummy_array = np.zeros(SEQ_LENGTH)
df_Xy = pd.DataFrame({
    'id': ['id'],
    **{
        column: [dummy_array]
        for column in (
            'H3K4me1',
            'H3K4me3',
            'H3K9me3',
            'H3K27ac',
            'H3K27me3',
            'H3K36me3',
            'target',
        )
    },
})
df_Xy.shape[0]

In [152]:
def build_features(chrom: str, TSS_start: int, strand: str, bw) -> np.ndarray:
    """Return the binned mean signal of one bigWig track around a TSS.

    The window starts at ``TSS_start - SEQ_LENGTH // 2`` and is cut into
    ``SEQ_LENGTH // BIN_SIZE`` consecutive bins of ``BIN_SIZE`` positions; each
    bin is reduced to the mean of its per-position values.

    Positions that lie outside the chromosome (the window overhangs either
    end) and positions for which the track reports NaN are counted as zero
    signal. Without this, genes close to a chromosome end trigger an
    out-of-bounds query and a single NaN base turns the whole bin into NaN.

    Parameters
    ----------
    chrom : str
        Chromosome name as used inside the bigWig file.
    TSS_start : int
        Position the window is centred on, in the coordinate system expected
        by ``bw.values``.
    strand : str
        ``'-'`` reverses the bin order; any other value leaves it unchanged
        (presumably so bins follow the direction of transcription).
    bw
        Object exposing ``chroms(name)`` and ``values(chrom, start, end)``,
        e.g. an open pyBigWig file.

    Returns
    -------
    np.ndarray
        1-D float array with ``SEQ_LENGTH // BIN_SIZE`` entries.

    Raises
    ------
    RuntimeError
        If ``chrom`` is not present in ``bw``.
    """
    n_bins = SEQ_LENGTH // BIN_SIZE
    span = n_bins * BIN_SIZE
    window_start = int(TSS_start) - SEQ_LENGTH // 2
    window_end = window_start + span

    chrom_length = bw.chroms(chrom)
    if chrom_length is None:
        raise RuntimeError(f'chromosome {chrom!r} is not present in the bigWig file')

    # Only the part of the window that lies on the chromosome can be queried.
    fetch_start = max(window_start, 0)
    fetch_end = min(window_end, chrom_length)

    # Everything that is not fetched below stays at zero signal.
    signal = np.zeros(span)
    if fetch_start < fetch_end:
        # One query for the whole window instead of one query per bin.
        values = np.asarray(bw.values(chrom, fetch_start, fetch_end), dtype=float)
        offset = fetch_start - window_start
        signal[offset:offset + values.size] = np.where(np.isnan(values), 0.0, values)

    # Row k holds the BIN_SIZE positions of bin k.
    features = signal.reshape(n_bins, BIN_SIZE).mean(axis=1)
    if strand == '-':
        features = features[::-1]
    return features


In [153]:

# Open one bigWig handle per histone-mark track of X1, in the same order as the
# path constants are listed.
(
    H3K4me1_X1,
    H3K4me3_X1,
    H3K9me3_X1,
    H3K27ac_X1,
    H3K27me3_X1,
    H3K36me3_X1,
) = [
    pyBigWig.open(track_path)
    for track_path in (
        PATH_H3K4me1_X1_BW,
        PATH_H3K4me3_X1_BW,
        PATH_H3K9me3_X1_BW,
        PATH_H3K27ac_X1_BW,
        PATH_H3K27me3_X1_BW,
        PATH_H3K36me3_X1_BW,
    )
]

In [154]:
# Both tables must list the same genes in the same order, because rows are
# paired by position when the feature table is assembled.
assert (df_targets['gene_name'] == df_features['gene_name']).all()

In [None]:
# Collect one record per gene and build df_Xy in a single call. Assigning
# `df_Xy.loc[i] = ...` inside the loop grows the frame one row at a time, which
# is slow, and it only produced a clean table because the placeholder row of
# df_Xy happened to be overwritten at i == 0.
histone_tracks = {
    'H3K4me1': H3K4me1_X1,
    'H3K4me3': H3K4me3_X1,
    'H3K9me3': H3K9me3_X1,
    'H3K27ac': H3K27ac_X1,
    'H3K27me3': H3K27me3_X1,
    'H3K36me3': H3K36me3_X1,
}

# Features and targets are paired by row position (gene order is asserted to
# match in an earlier cell).
gene_rows = zip(
    df_features['gene_name'],
    df_features['chr'],
    df_features['TSS_start'],
    df_features['strand'],
    df_targets['gex'],
)

records = []
for gene_name, chrom, TSS_start, strand, gex in tqdm(gene_rows, total=len(df_features)):
    record = {'id': gene_name + '_X1'}
    for mark, track in histone_tracks.items():
        record[mark] = build_features(chrom, TSS_start, strand, track)
    record['target'] = gex
    records.append(record)

# Explicit column order keeps the schema stable even if there are no genes.
# 'target' now holds plain numbers instead of sharing an object column with the
# placeholder array.
df_Xy = pd.DataFrame(records, columns=['id', *histone_tracks, 'target'])

In [156]:
# Persist the assembled feature/target table next to the notebook.
df_Xy.to_pickle(path='X1_train.pkl')


In [None]:
# Preview the first rows of the assembled table.
df_Xy.head(n=5)

In [None]:
# Inspect the binned H3K4me3 signal stored for the row labelled 1
# (single .loc lookup instead of chained indexing).
df_Xy.loc[1, 'H3K4me3']

In [None]:
# Preview the first rows of the assembled table once more.
df_Xy.head(n=5)

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [None]:
# TODO:
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

# NOTE:
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you.
# However, you can resplit the data in any way you want.

# Both paths below are still template placeholders; the read_csv call on
# path_test cannot succeed until path_test points at the real test info file.
path_data = "/path/to/your/data/files"  # TODO
path_test = "/path/to/test/info/file"   # X3_test_info.tsv ; TODO
# Tab-separated table of the test genes; its length and 'gene_name' column are
# used by the later prediction and submission cells.
test_genes = pd.read_csv(path_test, sep='\t')
# ---------------------------INSERT CODE HERE---------------------------



# ----------------------------------------------------------------------

## Work Package 1.2 - Model Building

In [None]:
# TODO: 
# Select the best model to predict gene expression from the obtained features in WP 1.1.

# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------


## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.

# Placeholder: as long as pred stays None, the first assertion below fails.
pred = None
# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------

# Check that "pred" meets the specified constraints:
# a numeric numpy array with one entry per row of test_genes.
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Store predictions in a ZIP.
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

# save_dir and zip_name are still template placeholders and must be set before running.
save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO
save_path = f'{save_dir}/{zip_name}'
# Passed to to_csv so the CSV is written into a zip archive at save_path,
# stored under the member name file_name.
compression_options = dict(method="zip", archive_name=file_name)

# Attach the predictions to the test gene table (one value per row) and export
# only the gene name and the predicted expression.
test_genes['gex_predicted'] = pred.tolist()
test_genes[['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)