In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
# Hide FutureWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pyranges as pr

# Root directory that holds the input data of this analysis
path = '/path/to/data'

# presumably the alignment directory name; not referenced in the cells shown here
alignment_path = 'alignment_v45'

# Reference genome FASTA, located below the data root
genome_file = 'GRCh38.p14.genome.fa'
genome_path = os.path.join(
    path,
    'gencode_human/version_45',
    genome_file,
)

# Number of bases by which each CAGE position is extended when windows are built
window_padding = 100

In [4]:
# Sample sheet; the distinct values of its 'group' column serve as tissue names
metadata_file = 'reads/metadata_tissue.tsv'
metadata = pd.read_csv(
    os.path.join(path, metadata_file),
    sep='\t',
)
tissues = metadata.loc[:, 'group'].unique()

In [None]:
CAGE_path = "/path/to/code/snakemake-pipeline/resources/CAGE/"
# keep only the tissues for which a '<tissue>.bed' file exists in CAGE_path
tissues = list(filter(
    lambda name: os.path.exists(os.path.join(CAGE_path, f'{name}.bed')),
    tissues,
))
tissues

['aorta', 'brain', 'colon', 'heart', 'lung', 'muscle']

In [None]:
# Tab-separated table listing the GTF files to evaluate; the cells below read
# its 'tool', 'tissue' and 'gtf' columns.
fofn_path = '/path/to/code/snakemake-pipeline/results/tissues_gtf.fofn'

tool_gtfs = pd.read_csv(fofn_path, sep='\t', header=0)
# check that there are no duplicates of tool + tissue
assert not tool_gtfs.duplicated(['tool', 'tissue']).any(), \
    "each tool/tissue pair must appear only once in the fofn"
# filter for tissues with CAGE data; .copy() yields an independent frame so
# that the column assignments in later cells do not write into a slice
tool_gtfs = tool_gtfs[tool_gtfs['tissue'].isin(tissues)].copy()
tool_gtfs

In [7]:
# Tool/tissue pair whose intermediate results are shown as an example below
display_tool, display_tissue = 'FLAIR', 'brain'

In [8]:
# One PyRanges of CAGE positions per tissue, read from '<tissue>.bed'
CAGE_data = {
    name: pr.read_bed(os.path.join(CAGE_path, f'{name}.bed'))
    for name in tissues
}

# Parse every listed GTF and keep the result next to its path in column 'pr'
tool_gtfs['pr'] = tool_gtfs['gtf'].apply(pr.read_gtf)

In [9]:
# Example: the parsed annotation of the tool/tissue pair chosen for display
tool_gtfs.loc[
    tool_gtfs['tool'].eq(display_tool) & tool_gtfs['tissue'].eq(display_tissue),
    'pr',
].iloc[0]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,exon_number
0,chr1,FLAIR,transcript,175387,175644,.,+,.,chr1:175000,m64109_220203_225821/159646893/ccs,
1,chr1,FLAIR,exon,175387,175644,.,+,.,chr1:175000,m64109_220203_225821/159646893/ccs,0
2,chr1,FLAIR,transcript,197700,199762,.,+,.,chr1:197000,m64109_220205_045735/17237446/ccs,
3,chr1,FLAIR,exon,197700,199762,.,+,.,chr1:197000,m64109_220205_045735/17237446/ccs,0
4,chr1,FLAIR,transcript,265404,267777,.,+,.,chr1:265000,m64109_220205_045735/39125266/ccs,
...,...,...,...,...,...,...,...,...,...,...,...
821086,chrY,FLAIR,exon,307731,307880,.,-,.,ENSG00000292358.1,m64109_220203_225821/137103082/ccs,2
821087,chrY,FLAIR,exon,311418,311627,.,-,.,ENSG00000292358.1,m64109_220203_225821/137103082/ccs,3
821088,chrY,FLAIR,exon,312765,315020,.,-,.,ENSG00000292358.1,m64109_220203_225821/137103082/ccs,4
821089,chrY,FLAIR,exon,316913,317051,.,-,.,ENSG00000292358.1,m64109_220203_225821/137103082/ccs,5


Find the transcription start site (TSS) of each transcript based on its strand

In [10]:
# tss_cat = ["5' fragment", "novel exonic TSS", "novel intronic TSS"]

def tss_from_transcriptome(transcriptome):
    """Collapse a transcript annotation into unique 1-bp TSS intervals.

    Only rows whose Feature is 'transcript' are used. For '+' strand
    transcripts the TSS interval is [Start, Start + 1); for '-' strand
    transcripts it is [End - 1, End). Duplicate positions are removed and the
    number of removed rows is printed.

    Parameters
    ----------
    transcriptome : pyranges.PyRanges
        Annotation with Feature, Strand, Start and End columns (in this
        notebook: the result of ``pr.read_gtf``).

    Returns
    -------
    pyranges.PyRanges
        The de-duplicated TSS intervals after ``drop()``; in the notebook
        output only Chromosome, Start, End and Strand remain.
    """
    transcripts = transcriptome[transcriptome.Feature == 'transcript']

    # forward strand: the TSS is the first base of the transcript
    starts = transcripts[transcripts.Strand == '+']
    starts.End = starts.Start + 1

    # reverse strand: the TSS is the last base of the transcript
    ends = transcripts[transcripts.Strand == '-']
    ends.Start = ends.End - 1

    # build the combined set once and reuse it for the duplicate count
    all_sites = pr.concat([starts, ends])
    n_before = len(all_sites)
    tss_sites = all_sites.drop_duplicate_positions()
    print(n_before - len(tss_sites), "duplicate positions dropped")
    return tss_sites.drop()

# Derive the unique TSS positions for every annotation in the table
tool_gtfs['tss_sites'] = tool_gtfs['pr'].apply(tss_from_transcriptome)

# Example: TSS positions of the tool/tissue pair chosen for display
tool_gtfs.loc[
    tool_gtfs['tool'].eq(display_tool) & tool_gtfs['tissue'].eq(display_tissue),
    'tss_sites',
].iloc[0]

19826 duplicate positions dropped
32504 duplicate positions dropped
21453 duplicate positions dropped
95768 duplicate positions dropped
50988 duplicate positions dropped
17123 duplicate positions dropped


Unnamed: 0,Chromosome,Start,End,Strand
0,chr1,175387,175388,+
1,chr1,197700,197701,+
2,chr1,265404,265405,+
3,chr1,266854,266855,+
4,chr1,271731,271732,+
...,...,...,...,...
111404,chrY,1392112,1392113,-
111405,chrY,1452928,1452929,-
111406,chrY,2500975,2500976,-
111407,chrY,2500649,2500650,-


In [11]:
# CAGE positions loaded for the example tissue
CAGE_data[display_tissue]

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,629179,629180,"chr1:629179..629180,+",1,+
1,chr1,629210,629211,"chr1:629210..629211,+",1,+
2,chr1,629212,629213,"chr1:629212..629213,+",1,+
3,chr1,629620,629621,"chr1:629620..629621,+",1,+
4,chr1,629625,629626,"chr1:629625..629626,+",1,+
...,...,...,...,...,...,...
3059674,chrY,20782681,20782682,"chrY:20782681..20782682,-",1,-
3059675,chrY,21303006,21303007,"chrY:21303006..21303007,-",1,-
3059676,chrY,25626516,25626517,"chrY:25626516..25626517,-",1,-
3059677,chrY,26401653,26401654,"chrY:26401653..26401654,-",1,-


In [12]:
# Extend every CAGE position by `window_padding` bp in each direction,
# keeping only the Score column as metadata
CAGE_windows = {
    name: CAGE_data[name][['Score']].extend(window_padding)
    for name in tissues
}
CAGE_windows[display_tissue]

Unnamed: 0,Chromosome,Start,End,Score,Strand
0,chr1,629079,629280,1,+
1,chr1,629110,629311,1,+
2,chr1,629112,629313,1,+
3,chr1,629520,629721,1,+
4,chr1,629525,629726,1,+
...,...,...,...,...,...
3059674,chrY,20782581,20782782,1,-
3059675,chrY,21302906,21303107,1,-
3059676,chrY,25626416,25626617,1,-
3059677,chrY,26401553,26401754,1,-


In [13]:
def add_CAGE_Support(tss_sites, CAGE_windows):
    """Attach the summed CAGE score to every TSS site.

    The TSS sites are left-joined with the same-strand CAGE windows they
    overlap, the joined rows are clustered with ``slack=-1`` and each cluster
    is collapsed to its boundaries while the Score values of its rows are
    summed.

    Parameters
    ----------
    tss_sites : PyRanges of TSS intervals.
    CAGE_windows : PyRanges of padded CAGE positions carrying a Score column.

    Returns
    -------
    PyRanges with one row per cluster, including the columns Score and
    Cluster. Sites without an overlapping window show up with Score -1 in
    the notebook output, so ``Score > 0`` marks CAGE-supported sites.
    """
    overlaps = tss_sites.join(CAGE_windows, how='left', strandedness='same')
    per_site = overlaps.cluster(slack=-1)
    return per_site.boundaries("Cluster", agg={"Score": "sum"})

# Row-wise: match each annotation's TSS sites with the CAGE windows of its tissue
tool_gtfs['CAGE_clusters'] = tool_gtfs.apply(
    lambda row: add_CAGE_Support(row['tss_sites'], CAGE_windows[row['tissue']]),
    axis=1,
)

+--------------+-----------+-----------+--------------+
| Chromosome   | Start     | End       | Strand       |
| (category)   | (int64)   | (int64)   | (category)   |
|--------------+-----------+-----------+--------------|
| chr1         | 19963     | 19964     | +            |
| chr1         | 195837    | 195838    | +            |
| chr1         | 296931    | 296932    | +            |
| chr1         | 493489    | 493490    | +            |
| ...          | ...       | ...       | ...          |
| chrY         | 1392112   | 1392113   | -            |
| chrY         | 1452928   | 1452929   | -            |
| chrY         | 2500975   | 2500976   | -            |
| chrY         | 2500649   | 2500650   | -            |
+--------------+-----------+-----------+--------------+
Stranded PyRanges object has 89,221 rows and 4 columns from 25 chromosomes.
For printing, the PyRanges was sorted on Chromosome and Strand.
+--------------+-----------+-----------+--------------+
| Chromosome   | Sta

In [14]:
# Example: CAGE-annotated TSS clusters of the tool/tissue pair chosen for display
tool_gtfs.loc[
    tool_gtfs['tool'].eq(display_tool) & tool_gtfs['tissue'].eq(display_tissue),
    'CAGE_clusters',
].iloc[0]

Unnamed: 0,Chromosome,Start,End,Strand,Score,Cluster
0,chr1,175387,175388,+,-1,1
1,chr1,197700,197701,+,-1,2
2,chr1,265404,265405,+,-1,3
3,chr1,266854,266855,+,-1,4
4,chr1,271731,271732,+,-1,5
...,...,...,...,...,...,...
111404,chrY,1452928,1452929,-,1,111405
111405,chrY,1596449,1596450,-,-1,111406
111406,chrY,2500649,2500650,-,-1,111407
111407,chrY,2500975,2500976,-,4,111408


In [21]:
# Count rows with len(): PyRanges.length is the summed width of all intervals
# in bp, which matches the row count only while every TSS interval is 1 bp wide.
# NOTE: 'num_transcripts' holds the number of unique TSS clusters of each
# annotation (duplicate positions were dropped), not the number of transcripts.
tool_gtfs['num_transcripts'] = tool_gtfs['CAGE_clusters'].apply(lambda x: len(x))
# a TSS cluster counts as CAGE-supported when its summed Score is positive
tool_gtfs['CAGE support absolute'] = tool_gtfs['CAGE_clusters'].apply(lambda x: len(x[x.Score > 0]))
tool_gtfs['CAGE support relative'] = tool_gtfs['CAGE support absolute'] / tool_gtfs['num_transcripts']

tool_gtfs.loc[:, ['tool', 'tissue', 'num_transcripts', 'CAGE support absolute', 'CAGE support relative']]

Unnamed: 0,tool,tissue,num_transcripts,CAGE support absolute,CAGE support relative
0,FLAIR,aorta,89221,27938,0.313133
1,FLAIR,brain,111409,35306,0.316904
2,FLAIR,colon,43343,24719,0.570311
3,FLAIR,heart,120390,37958,0.315292
4,FLAIR,lung,99081,36854,0.371958
5,FLAIR,muscle,27721,19427,0.700804


In [None]:

# for tissue in tissues:
#     print('\n\n', tissue, '\n')
#     # count numbers of each novelty combination
#     print(CAGE_clusters[tissue].df.groupby(tss_cat).size())
#     print('---')
#     print(CAGE_clusters[tissue][CAGE_clusters[tissue].Score > 0].df.groupby(tss_cat).size())

In [None]:
# for tissue in tissues:
#     print('\n\n', tissue, '\n')
#     # Percentages in each group
#     print(CAGE_clusters[tissue][CAGE_clusters[tissue].Score > 0].df.groupby(tss_cat).size() / CAGE_clusters[tissue].df.groupby(tss_cat).size())