In [1]:
# Imports
import kipoi
import os
import numpy as np
import pandas as pd

import pickle
from importlib import reload

### Test run
Make a test run to check that everything works properly.

In [4]:
# Run kipoi's command-line test on the 5UTR_Model model as a sanity check before predicting
!kipoi test "5UTR_Model"

[32mINFO[0m [44m[kipoi.data][0m successfully loaded the dataloader 5UTR_Model/. from /data/ouga04b/ag_gagneur/home/karollus/5UTRModel/Collab/kipoi/5UTR_Model/dataloader.py::StrandedSequenceVariantDataloader[0m
Using TensorFlow backend.
2019-11-15 13:45:02.467640: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-11-15 13:45:02.481898: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2095185000 Hz
2019-11-15 13:45:02.484454: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x557fa01f0590 executing computations on platform Host. Devices:
2019-11-15 13:45:02.484529: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>
2019-11-15 13:45:03.992947: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x557fa0275590 executing computations on platform CUDA. Devices:
2019-11-15

### Load the model

In [6]:
# Load the model through kipoi. source="dir" selects kipoi's directory source,
# so "5UTR_Model" is presumably resolved as a path relative to the working
# directory -- TODO confirm against the kipoi documentation.
model = kipoi.get_model("5UTR_Model", source="dir")

### Provide the parameters
The dataloader needs a fasta file, a bgzip-compressed vcf file and a bed3+ file as input. One user-specified column (>3, 1-based) of the bed denotes the strand, and another user-specified column (>3, 1-based) denotes the transcript id (or some other id that indicates which exons in the bed belong together to form one UTR). Additionally, a file defining the chromosome order can be provided if the order is not alphanumeric (or lexicographic, in case there is no chr prefix). The ordering specified in this file must be valid for both the vcf and the bed! All columns of the bed except the first three, the id and the strand are ignored. The bed and the vcf must both be sorted (by position), and a tabix index must be present (it must lie in the same directory and have the same name + .tbi). The num_chr flag indicates whether chromosomes are listed numerically (without a prefix) or with a chr prefix. This must be consistent across all input files!

In [7]:
# --- Input files ---------------------------------------------------------
# Reference genome (fasta)
fasta_path = "/s/genomes/human/hg19/fasta/hg19.fa"
# Variants to score (vcf)
vcf_path = "TestFiles/patho.vcf.gz"
# Human 5'UTR annotation (bed). Pick the file matching the chromosome naming:
#   with chr prefix    -> gencodev19_5utr_sorted.bed
#   without chr prefix -> gencodev19_5utr_sorted_noprefix.bed
bed_path = "TestFiles/gencodev19_5utr_sorted.bed"
# Id map (tsv) that is merged onto the predictions further down
id_map_path = "TestFiles/hg19_idmap.tsv"

# --- Dataloader options --------------------------------------------------
# True if the fasta has no chr prefix, False otherwise
num_chr = False
# Column of the bed file that holds the id
id_column = 4
# Optional chromosome-order file (disabled; not passed to the dataloader in this run)
#chr_order_file = "TestFiles/chrom_order_noprefix.txt"

# --- Output --------------------------------------------------------------
# File the predictions are written to
output_file_path = "patho.tsv"

### Run Prediction

In [8]:
# Keyword arguments handed to the model's dataloader
dataloader_kwargs = {
    "intervals_file": bed_path,
    "fasta_file": fasta_path,
    "vcf_file": vcf_path,
    "id_column": id_column,
    "num_chr": num_chr,
}

# Run the pipeline in batches of 64 and write the predictions to output_file_path
model.pipeline.predict_to_file(
    output_file_path,
    dataloader_kwargs,
    batch_size=64,
)

  sep='\t')
  sep='\t')

  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:03<00:00,  3.81s/it][A

### Merge with Gene Names and show output

In [9]:
# Short column names for the "metadata/..." and "preds/..." columns of the prediction file
column_names = {
    "metadata/chr": "chr",
    "metadata/exon_positions": "exon_positions",
    "metadata/id": "transcript_id",
    "metadata/strand": "strand",
    "metadata/variants": "variants",
    "preds/mrl_fold_change": "mrl_fold_change",
    "preds/shift_1": "shift_1",
    "preds/shift_2": "shift_2",
}

# Read the predictions back from disk and apply the short column names
df = (
    pd.read_csv(output_file_path, sep="\t")
    .rename(index=str, columns=column_names)
)

# Read the id map and join it onto the predictions via transcript_id
df_map = pd.read_csv(id_map_path, sep="\t")
df = df.merge(df_map, on="transcript_id")

In [10]:
# Show the merged table of predictions together with gene ids and gene names
df

Unnamed: 0,chr,exon_positions,transcript_id,strand,variants,mrl_fold_change,shift_1,shift_2,gene_id,gene_name
0,chr7,19156944-19157295,ENST00000242261,-,chr7:19157207:G>T;chr7:19157225:C>A,-0.789764,-0.395907,-0.773174,ENSG00000122691,TWIST1
1,chr2,96931119-96931250;96931606-96931732,ENST00000258439,-,chr2:96931137:G>A,-1.285575,-1.012779,0.028387,ENSG00000135956,TMEM127
2,chr3,98312348-98312567,ENST00000264193,-,chr3:98312358:C>T,-1.006342,-0.834495,0.085337,ENSG00000080819,CPOX
3,chr5,147211140-147211349,ENST00000296695,-,chr5:147211193:G>A,-0.361473,-0.218929,-0.283831,ENSG00000164266,SPINK1
4,chr9,21974826-21975097,ENST00000304494,-,chr9:21974860:C>A,-0.910533,-0.525239,-1.060424,ENSG00000147889,CDKN2A
5,chr11,5248251-5248301,ENST00000335295,-,chr11:5248280:C>T,-0.828528,0.00658,-0.919977,ENSG00000244734,HBB
6,chr1,209974758-209974761;209975316-209975388;209979...,ENST00000367021,-,chr1:209975361:T>A,-1.068,-0.861996,0.055964,ENSG00000117595,IRF6
7,chr1,93297581-93297671,ENST00000370321,+,chr1:93297626:C>A,-0.799757,-0.670084,0.025305,ENSG00000122406,RPL5
8,chr17,66508542-66508720;66511534-66511540,ENST00000392711,+,chr17:66508599:G>A,-1.02586,0.015113,-1.124092,ENSG00000108946,PRKAR1A
9,chr2,96931119-96931227;96931606-96931732,ENST00000432959,-,chr2:96931137:G>A,-1.029096,-0.896816,-0.570133,ENSG00000135956,TMEM127
