In [1]:
# Imports
import kipoi
import os
import numpy as np
import pandas as pd

### Source Model

In [3]:
# Load the model stored under ../Xpresso_kipoi/human_median straight from that
# local directory (source="dir")
model = kipoi.get_model(
    "../Xpresso_kipoi/human_median",
    source="dir",
)

0.00B [00:00, ?B/s]

Downloading https://zenodo.org/record/4075690/files/humanMedian_trainepoch.11-0.426.h5 to /data/nasif12/home_if12/karollus/5UTRModel/xpresso/Xpresso_kipoi/Xpresso_kipoi/downloaded/model_files/human_median/weights/9d00a3bc614da81655328b6e278569e2


1.39MB [00:00, 1.84MB/s]                            

Instructions for updating:
Colocations handled automatically by placer.





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.




### Download and prepare example files (optional)

In [4]:
import urllib.request
import gzip
import shutil
import pyranges as pr

In [5]:
# Make sure the ExampleFiles directory exists.
# exist_ok=True turns this into a no-op when the directory is already present and
# avoids the check-then-create race of testing os.path.exists() beforehand.
os.makedirs("ExampleFiles", exist_ok=True)

In [6]:
# Zenodo locations of the example annotation (GTF) and sequence (fasta) files
gtf_url = "https://zenodo.org/record/1466102/files/example_files-gencode.v24.annotation_chr22.gtf?download=1"
fasta_url = "https://zenodo.org/record/1466102/files/example_files-hg38_chr22.fa?download=1"

# Fetch the annotation into the example directory
urllib.request.urlretrieve(gtf_url, 'ExampleFiles/chrom22.gtf')
# Fetch the sequence; kept as the last expression so the cell still displays
# the (local path, headers) tuple returned by urlretrieve
urllib.request.urlretrieve(fasta_url, 'ExampleFiles/chrom22.fa')

('ExampleFiles/chrom22.fa', <http.client.HTTPMessage at 0x2b89ab7aefd0>)

In [13]:
# Extract implied TSS sites from gtf

# Size of the window around each TSS, in bp, measured relative to the gene's own
# strand: 7000 bp on the upstream side and 3500 bp on the downstream side.
UPSTREAM_BP = 7000
DOWNSTREAM_BP = 3500

# Read in with pyranges
gr = pr.read_gtf('ExampleFiles/chrom22.gtf')
# Fetch the annotation table once and reuse it for both filter conditions
gtf_df = gr.df
# Extract protein coding genes.
# .copy() makes prot_genes an independent frame, so the column assignments below
# do not write into a slice of gtf_df (which raises pandas' SettingWithCopyWarning).
prot_genes = gtf_df[(gtf_df.Feature == 'gene') & (gtf_df.gene_type == 'protein_coding')].copy()

# Boolean strand masks; multiplying a value by a mask keeps it only on that strand
plus_strand = prot_genes.Strand == "+"
minus_strand = prot_genes.Strand == "-"

# Compute implied TSS: the gene Start on the + strand, the gene End on the - strand
prot_genes['TSS'] = (prot_genes.Start * plus_strand) + (prot_genes.End * minus_strand)
# Determine region around TSS. On the - strand the offsets are mirrored, so the
# larger (upstream) part of the window lies at higher coordinates.
prot_genes['region_start'] = prot_genes.TSS + (-UPSTREAM_BP * plus_strand) + (-DOWNSTREAM_BP * minus_strand)
prot_genes['region_end'] = prot_genes.TSS + (DOWNSTREAM_BP * plus_strand) + (UPSTREAM_BP * minus_strand)
# Add nuisance column to make bed6
prot_genes["score"] = "."

In [15]:
# Export the promoter windows as a tab-separated file without header or index;
# the six columns are written in this order
bed_columns = ['Chromosome', 'region_start', 'region_end', 'gene_id', 'score', 'Strand']
bed = prot_genes[bed_columns]
bed.to_csv(
    "ExampleFiles/chrom22.bed",
    sep='\t',
    header=False,
    index=False,
)

### Provide the Parameters

In [16]:
# --- Inputs ---
# Bed file listing the promoter regions to score
bed_path = "ExampleFiles/chrom22.bed"
# Fasta file the region sequences are read from
fasta_path = "ExampleFiles/chrom22.fa"
# Chromosome naming used in the fasta:
# False -> names carry a "chr" prefix, True -> they do not
num_chr = False

# --- Output ---
# Destination of the prediction table
output_file_path = "predictions.tsv"

### Run Prediction

In [17]:
# Arguments passed to the model's dataloader: the regions to score, the
# sequence file, and the fasta chromosome naming flag
dataloader_kwargs = {
    "intervals_file": bed_path,
    "fasta_file": fasta_path,
    "num_chr_fasta": num_chr,
}
# Run the prediction pipeline in batches of 64 and write the result to output_file_path
model.pipeline.predict_to_file(output_file_path, dataloader_kwargs, batch_size=64)

100%|██████████| 7/7 [00:16<00:00,  2.42s/it]


### Load results

In [18]:
# Read the tab-separated prediction file back into a dataframe and display it
df = pd.read_csv(
    output_file_path,
    sep="\t",
)
df

Unnamed: 0,metadata/ranges/chr,metadata/ranges/end,metadata/ranges/id,metadata/ranges/start,metadata/ranges/strand,preds/expression_pred
0,chr22,11070000,0,11059500,+,-0.097702
1,chr22,15531657,1,15521157,+,-1.052358
2,chr22,15693525,2,15683025,+,-0.806651
3,chr22,17088453,3,17077953,+,0.872204
4,chr22,17363448,4,17352948,+,-0.903286
...,...,...,...,...,...,...
434,chr22,50585465,434,50574965,-,0.178786
435,chr22,50589965,435,50579465,-,0.955409
436,chr22,50608455,436,50597955,-,-0.159009
437,chr22,50635173,437,50624673,-,0.754210


In [22]:
# Merge back with gene_ids
# Give the prediction metadata columns the names used in prot_genes
df = df.rename(columns={"metadata/ranges/chr":"Chromosome", "metadata/ranges/start":"region_start", "metadata/ranges/end":"region_end", "metadata/ranges/strand":"strand"})
# Join on the full identity of a region, including its strand. The windows are
# strand-specific, so a + and a - gene can share identical start/end coordinates;
# matching on coordinates alone would pair each of them with the other's prediction.
# left_on/right_on keeps both the "Strand" (annotation) and "strand" (prediction)
# columns in the result.
region_keys = ["Chromosome", "region_start", "region_end"]
merged = prot_genes.merge(
    df,
    left_on=region_keys + ["Strand"],
    right_on=region_keys + ["strand"],
)

In [24]:
merged

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,havana_transcript,protein_id,ccdsid,TSS,region_start,region_end,score,metadata/ranges/id,strand,preds/expression_pred
0,chr22,ENSEMBL,gene,11066500,11068089,.,+,.,ENSG00000279973.1,protein_coding,...,,,,11066500,11059500,11070000,.,0,+,-0.097702
1,chr22,HAVANA,gene,15528157,15529139,.,+,.,ENSG00000130538.4,protein_coding,...,,,,15528157,15521157,15531657,.,1,+,-1.052358
2,chr22,HAVANA,gene,15690025,15721631,.,+,.,ENSG00000198062.14,protein_coding,...,,,,15690025,15683025,15693525,.,2,+,-0.806651
3,chr22,HAVANA,gene,17084953,17115694,.,+,.,ENSG00000177663.13,protein_coding,...,,,,17084953,17077953,17088453,.,3,+,0.872204
4,chr22,HAVANA,gene,17359948,17558149,.,+,.,ENSG00000099954.18,protein_coding,...,,,,17359948,17352948,17363448,.,4,+,-0.903286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,chr22,HAVANA,gene,50568860,50578465,.,-,.,ENSG00000205560.12,protein_coding,...,,,,50578465,50574965,50585465,.,434,-,0.178786
435,chr22,HAVANA,gene,50568868,50582965,.,-,.,ENSG00000254413.8,protein_coding,...,,,,50582965,50579465,50589965,.,435,-,0.955409
436,chr22,HAVANA,gene,50578948,50601455,.,-,.,ENSG00000100288.19,protein_coding,...,,,,50601455,50597955,50608455,.,436,-,-0.159009
437,chr22,HAVANA,gene,50622753,50628173,.,-,.,ENSG00000100299.17,protein_coding,...,,,,50628173,50624673,50635173,.,437,-,0.754210
