In [50]:
# Import libraries that are required to run your project
# You are allowed to add more libraries as you need

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import pyBigWig as pbw
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



In [64]:
# TODO: 
# Load your feature (bed and/or bigwig and/or fasta) and target files (tsv) here.
# Decide which features to use for training. Feel free to process them however you need.

# NOTE: 
# bed and bigwig files contain signals of all chromosomes (including sex chromosomes).
# Training and validation spl£it based on chromosomes has been done for you. 
# However, you can resplit the data in any way you want.

#path_data = "/path/to/your/data/files"  # TODO
path_data = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/"
path_test = "/Users/sidhu/Documents/GENOMICS/GENOMICS/Data/CAGE-train/"   # X3_test_info.tsv ; TODO
histone = ["DNase-bigwig/", "H3K4me1-bigwig/", "H3K4me3-bigwig/", "H3K9me3-bigwig/", "H3K27ac-bigwig/", "H3K27me3-bigwig/", "H3K36me3-bigwig/"]
#test_genes = pd.read_csv(path_test, sep='\t')
# ---------------------------INSERT CODE HERE---------------------------


### X1

# Training and validation set (with labels)
X1_train_info = pd.read_csv(path_test + "X1_train_info.tsv", sep= '\t')
X1_train_y = pd.read_csv(path_test + "X1_train_y.tsv", sep= '\t')
X1_val_info = pd.read_csv(path_test + "X1_val_info.tsv", sep= '\t')
X1_val_y = pd.read_csv(path_test + "X1_val_y.tsv", sep= '\t')


## Dataset

# DNase
Dnase = pbw.open(path_data + str(histone[0]) + str("X1.bw"))

# Histones
H1 = pbw.open(path_data + str(histone[1]) + str("X1.bw"))
H2 = pbw.open(path_data + str(histone[2]) + str("X1.bw"))
H3 = pbw.open(path_data + str(histone[3]) + str("X1.bw"))
H4 = pbw.open(path_data + str(histone[4]) + str("X1.bw"))
H5 = pbw.open(path_data + str(histone[5]) + str("X1.bw"))
H6 = pbw.open(path_data + str(histone[6]) + str("X1.bw"))

Dataset = [Dnase, H1, H2, H3, H4, H5, H6]
types = ["max", "min", "std", "coverage"]

# ---------------------------------------------------------------------- 

In [65]:
###
# DESCP: Extract a 40'000X1 image for CNN
# INPUT: Xbw is the tsv file
#        bins is the number of pixels we want
#        window is how much info around TSS_mid do we want wo capture
# OUTPUT: Images contains for each gene one image (bins, 28) with  4 channels (max, min , std, coverage) for each of the 7 datasets

def image_creator(Xbw, bins, window=40000):
    
    Images = np.zeros((Xbw.index[-1]+1, len(types)*len(Dataset), bins))
    
    
    for index in tqdm(Xbw.index):
        
       # Extract all information from the input file
        chrom = str(Xbw.loc[index][1])
        gene_start = int(Xbw.loc[index][2])
        gene_end = int(Xbw.loc[index][3])
        TSS_start = int(Xbw.loc[index][4])
        TSS_end = int(Xbw.loc[index][5])
        strand = Xbw.loc[index][6]
        
        # Calculate TSS_mid
        TSS_mid = int(TSS_start + (TSS_end-TSS_start)/2)
        
        
        # Calculate lenghts of each chromosom 
        Dnase.chroms("chr1")
        
        
        # Create an image which window large with TSS_mid in the middle
        
        for d, dataset in enumerate(Dataset):
            for t, typ in enumerate(types):
                if TSS_mid - int(window/2) < 0:
                    if TSS_mid + int(window/2) > dataset.chroms(chrom):
                        image_l = dataset.stats(chrom, 0, dataset.chroms(chrom), type=typ, nBins=bins)
                        Images[index][d+t] = np.array(image_l)
        
    return Images

In [66]:
# Preprocessed data
train_X = image_creator(X1_train_info, bins=1000, window=40000)

100%|███████████████████████████████████| 14310/14310 [00:04<00:00, 3404.30it/s]


In [67]:
train_X.shape

(14310, 28, 1000)

In [68]:
np.isnan(train_X[0:100]).any()

False

In [69]:
val_X = image_creator(X1_val_info, bins=1000, window=40000)

100%|█████████████████████████████████████| 1974/1974 [00:00<00:00, 3369.30it/s]


In [70]:
train_Y = X1_train_y["gex"].to_numpy()
val_Y = X1_val_y["gex"].to_numpy()

In [71]:
train_X = torch.from_numpy(train_X).double()
val_X = torch.from_numpy(val_X).to(torch.double)
train_Y = torch.from_numpy(train_Y).to(torch.double)
val_Y = torch.from_numpy(val_Y).to(torch.double)

In [75]:
# CNN MODEL
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=28, out_channels=56, kernel_size=5, stride=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, dilation=1)
        self.conv2 = nn.Conv1d(56, 112, 5, 1)
        self.fc1 = nn.Linear(112*5*5, 1000)
        self.fc2 = nn.Linear(1000, 100)
        self.fc3 = nn.Linear(100, 1)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()
net.float()

Net(
  (conv1): Conv1d(28, 56, kernel_size=(5,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(56, 112, kernel_size=(5,), stride=(1,))
  (fc1): Linear(in_features=2800, out_features=1000, bias=True)
  (fc2): Linear(in_features=1000, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=1, bias=True)
)

In [76]:
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [77]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i in range(0, train_X.size()[0]):

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(train_X[i].double())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

RuntimeError: expected scalar type Double but found Float