In [2]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import pandas as pd
import numpy as np
from scipy.stats import spearmanr

## Work Package 1.1 - Modeling Choices & Data Pre-processing

In [None]:
# TODO: 
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

# NOTE: 
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation split based on chromosomes has been done for you. 
# However, you can resplit the data in any way you want.

#path_data = "/path/to/your/data/files"  # TODO
path_data = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/"
path_test = "/path/to/test/info/file"   # X3_test_info.tsv ; TODO
test_genes = pd.read_csv(path_test, sep='\t')
# ---------------------------INSERT CODE HERE---------------------------




# ---------------------------------------------------------------------- 

### 1.1.1 DATASET DESCRIPTION

In [45]:
tsv_path = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/CAGE-train/"
X1_train_info = pd.read_csv(tsv_path + "X1_train_info.tsv", sep= '\t')
X1_train_y = pd.read_csv(tsv_path + "X1_train_y.tsv", sep= '\t')
X1_val_info = pd.read_csv(tsv_path + "X1_val_info.tsv", sep= '\t')
X1_val_y = pd.read_csv(tsv_path + "X1_val_y.tsv", sep= '\t')
X3_test_info = pd.read_csv(tsv_path + "X3_test_info.tsv", sep= '\t')

### CAGE-train folder

We are given train_info and labels y for 3 different DNA strands X1, X2 & X3

### For X1 and X2

#### TRAIN DATA

##### train_info: 
contains the 14'310 entries each containing 7 values:

1. gene_name
2. chr number: contain all numbers [2, 22] \ {14, 19}
3. gene_start
4. gene_end
5. TSS_start
6. TSS_end
7. strand: if + we read from gene_start to gene_end, for - we read from gene_end to gene_start

##### train_y: 
contains also 14'310 entries with the label y representing GEX (gene expression)

#### VALIDATION DATA

The val_info and val_y follow the same procedure except that they only have 1974 entries and they only contain chromosom 19 & 14

### X3

For X3 we have 1984 entries again with 7 values given as in train_info except we only have chr 1.


## TODO:
1. What is difference between gene_starte/end and TSS_start/end.
2. What is strand +/- and do we need it?
3. You could have potentially multiple TSS in one gene, why not the case in the dataset?



In [75]:
X1_train_info.loc[0]

gene_name       SLC20A1
chr                chr2
gene_start    112645939
gene_end      112663825
TSS_start     112658362
TSS_end       112658412
strand                +
Name: 0, dtype: object

In [32]:
X1_train_y.loc[0]

gene_name    SLC20A1
gex              0.0
Name: 0, dtype: object

In [33]:
X1_val_info.loc[0]

gene_name         ECH1
chr              chr19
gene_start    38815422
gene_end      38831841
TSS_start     38816220
TSS_end       38816270
strand               -
Name: 0, dtype: object

In [34]:
X1_val_y.loc[0]

gene_name    ECH1
gex           0.0
Name: 0, dtype: object

In [49]:
X3_test_info.loc[0]

gene_name         CAPN9
chr                chr1
gene_start    230747384
gene_end      230802003
TSS_start     230791876
TSS_end       230791926
strand                +
Name: 0, dtype: object

## DNase-bed Folder

This is our chromatin accessibility dataset.


The Dnase Folder has 3 Files X1.bed, X2.bed and X3.bed

The X1.bed has 56'451 entries with 10 values each
X2.bed has 181'145
X3.bed has 71'811

We have genderchromosome chrX and chrY also included in this data.


1. chromosom name
2. gene_start/TSS_start
3. gene_end/TSS_end
4. unknown
5. unknown
6. unknown
7. unknown
8. unknown
9. unknown
10. unknown

## TODO:

1. Determine what these unknown values are
2. Do we need the gender chromosome chrX and chrY



In [85]:
Dnase_bed_path = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/DNase-bed/"
X1_dnase = pd.read_csv(Dnase_bed_path + "X1.bed", sep= '\t')
X2_dnase = pd.read_csv(Dnase_bed_path + "X2.bed", sep= '\t')
X3_dnase = pd.read_csv(Dnase_bed_path + "X3.bed", sep= '\t')

In [87]:
X1_dnase.loc[0]

chr1                       chr1
181405                   267990
181555                   268140
.                             .
0                             0
..1                           .
12.379542683931337    12.379543
-1                           -1
-1.1                         -1
75                           75
Name: 0, dtype: object

## H3* Folders

These are our Histone marks dataset

We have 6 .bed folder for different histones. In particular we have H3K4me1, H3K4me3, H3K9me3, H3K27ac, H3K27me3 & H3K36me3

H3K4me1 and H3K27ac are listed as active enhancers!

![alt text](histone.png "Title")

All the files have 3 .bed files for X1, X2 & X3 and are built the same for the expection of #entries

Example: H3k37ac.bed --> X1.bed


The X1_K27ac has 57'886, X2_K27ac has 76092, X3_K27ac has 54'473 entries each containing 10 values

1. chromosom name
2. gene_start/TSS_start
3. gene_end/TSS_end
4. Peak number
5. unknown
6. unknown
7. unknown
8. unknown
9. unknown
10. unknown


In [77]:
H3K27ac_bed_path = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/H3K27ac-bed/"
X1_K27ac = pd.read_csv(H3K27ac_bed_path + "X1.bed", sep= '\t')
X2_K27ac = pd.read_csv(H3K27ac_bed_path + "X2.bed", sep= '\t')
X3_K27ac = pd.read_csv(H3K27ac_bed_path + "X3.bed", sep= '\t')

In [88]:
X3_K27ac.loc[0]

chr10                      chr10
100001756              100002161
100002056              100002452
Peak_52027            Peak_53113
16                            16
.                              .
4.0293549562682225      3.971164
5.54052                  5.44367
3.67548                  3.58673
139                          126
Name: 0, dtype: object

In [83]:
H3K27me3_bed_path = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/H3K27me3-bed/"
X1_K27me3 = pd.read_csv(H3K27me3_bed_path + "X1.bed", sep= '\t')
X2_K27me3 = pd.read_csv(H3K27me3_bed_path + "X2.bed", sep= '\t')
X3_K27me3 = pd.read_csv(H3K27me3_bed_path + "X3.bed", sep= '\t')

In [84]:
X1_K27me3

Unnamed: 0,chr10,100100915,100101123,Peak_17277,107,.,31.155124653739612,4.07966,0.66916,75
0,chr10,100140031,100140219,Peak_14127,127,.,34.003416,4.41054,0.88574,77
1,chr10,100566712,100566879,Peak_36184,53,.,25.393536,2.92767,0.12659,59
2,chr10,100567356,100567545,Peak_13190,131,.,36.565097,4.54974,0.98555,101
3,chr10,100570317,100570551,Peak_29275,68,.,28.896399,3.22608,0.25694,216
4,chr10,100571000,100571280,Peak_3317,205,.,44.566205,6.02366,1.73814,235
...,...,...,...,...,...,...,...,...,...,...
26894,chrX,74423063,74423242,Peak_24560,88,.,32.981163,3.68802,0.42361,43
26895,chrX,8731525,8731685,Peak_24561,88,.,32.981163,3.68802,0.42361,47
26896,chrX,8880394,8880658,Peak_5203,195,.,41.900185,5.88995,1.70396,100
26897,chrX,9788557,9788731,Peak_20095,107,.,33.413850,4.07966,0.66916,83


## Work Package 1.2 - Model Building

In [None]:
# TODO: 
# Select the best model to predict gene expression from the obtained features in WP 1.1.

# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------


## Work Package 1.3 - Prediction on Test Data (Evaluation Metric)

In [None]:
# TODO:
# Using the model trained in WP 1.2, make predictions on the test data (chr 1 of cell line X3).
# Store predictions in a variable called "pred" which is a numpy array.

pred = None
# ---------------------------INSERT CODE HERE---------------------------




# ----------------------------------------------------------------------

# Check if "pred" meets the specified constrains
assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
assert pred.shape[0] == len(test_genes), 'Each gene should have a unique predicted expression'

#### Store Predictions in the Required Format

In [None]:
# Store predictions in a ZIP. 
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

save_dir = 'path/to/save/output/file'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "LastName_FirstName_Project1.zip" # TODO
save_path = f'{save_dir}/{zip_name}'
compression_options = dict(method="zip", archive_name=file_name)

test_genes['gex_predicted'] = pred.tolist()
test_genes[['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)