In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/lankenau/isotools/src')

In [3]:
import os
import logging
from isotools import Transcriptome
from isotools import __version__ as isotools_version
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# set up logging
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
logger = logging.getLogger("isotools")
logger.info(f"This is isotools version {isotools_version}")

INFO:This is isotools version 0.3.5rc11


In [None]:
isotools_pkl = 'path/to/notebooks/results/flair_isotools.pkl'
sqanti_classification = 'path/to/snakemake-pipeline/results/sqanti/flair/qc/aorta/aorta_classification.txt'

In [5]:
sqanti_df = pd.read_csv(sqanti_classification, sep='\t')
# Rename and make boolean
sqanti_df['category'] = sqanti_df['structural_category']
sqanti_df['TSS ratio'] = sqanti_df['ratio_TSS'] > 1.5
sqanti_df['CAGE support'] = sqanti_df['within_CAGE_peak']
sqanti_df['polyA site'] = sqanti_df['within_polyA_site']
sqanti_df['polyA motif'] = sqanti_df['polyA_motif_found']
sqanti_df['start both'] = sqanti_df['TSS ratio'] & sqanti_df['CAGE support']
sqanti_df['end both'] = sqanti_df['polyA site'] & sqanti_df['polyA motif']

sqanti_df['category'] = sqanti_df['category'].replace({
    'intergenic': 'Intergenic',
    'antisense': 'Antisense',
    'genic': 'Genic',
    'genic_intron': 'Genic Intron',
    'fusion': 'Fusion',
    'full-splice_match': 'FSM',
    'incomplete-splice_match': 'ISM',
    'novel_in_catalog': 'NIC',
    'novel_not_in_catalog': 'NNC'
})

In [6]:
sqanti_df = sqanti_df.loc[:,['isoform', 'category', 'subcategory']]
sqanti_df

Unnamed: 0,isoform,category,subcategory
0,ENST00000037502.11,FSM,reference_match
1,ENST00000040877.2,FSM,reference_match
2,ENST00000054666.11,FSM,reference_match
3,ENST00000060969.6,FSM,alternative_3end5end
4,ENST00000072644.7,FSM,reference_match
...,...,...,...
109140,m54284U_210227_205004/70453536/ccs,NIC,combination_of_known_splicesites
109141,m54284U_210227_205004/722100/ccs,Intergenic,mono-exon
109142,m54284U_210227_205004/72286900/ccs,Genic Intron,mono-exon
109143,m54284U_210227_205004/83820795/ccs,Genic Intron,mono-exon


In [None]:
isoseq = Transcriptome.load(isotools_pkl)

In [8]:
isotools_df = isoseq.transcript_table(query="", extra_columns=["transcript_id"])\
# check if transcript_id is unique, otherwise
# create isoform column from gene_id + _ + transcript_nr
if len(isotools_df['transcript_id'].unique()) == isotools_df.shape[0]:
    isotools_df = isotools_df.assign(isoform=isotools_df['transcript_id'])
else:
    isotools_df = isotools_df.assign(isoform=isotools_df['gene_id'] + '_' + isotools_df['transcript_nr'].astype(str))
# rename novelty_class to isotools_category and novelty_subclasses to isotools_subcategory
isotools_df = isotools_df.rename(columns={'novelty_class': 'isotools_category', 'novelty_subclasses': 'isotools_subcategory'})
isotools_df = isotools_df.loc[:,['gene_id', 'isoform', 'isotools_category', 'isotools_subcategory', 'chr', 'transcript_start', 'transcript_end']]

In [9]:
isotools_df

Unnamed: 0,gene_id,isoform,isotools_category,isotools_subcategory,chr,transcript_start,transcript_end
0,ENSG00000121957.15,m54284U_210227_205004/37357274/ccs,ISM,mono-exon,chr1,108876984,108880418
1,ENSG00000121957.15,m54284U_210227_205004/61016242/ccs,ISM,mono-exon,chr1,108906669,108909981
2,ENSG00000121957.15,m54284U_210227_205004/84346647/ccs,ISM,mono-exon,chr1,108925726,108926898
3,ENSG00000121957.15,m54284U_210227_205004/103220783/ccs,ISM,mono-exon,chr1,108928073,108931970
4,ENSG00000121957.15,ENST00000264126.9,FSM,FSM,chr1,108876984,108930382
...,...,...,...,...,...,...,...
107486,IT_novel_53673,m54284U_200419_012632/115279072/ccs,NOVEL,antisense,chrY,1385100,1386656
107487,ENSG00000292335.1,m54284U_210227_205004/121636612/ccs,ISM,mono-exon,chrY,992801,994365
107488,IT_novel_53657,m54284U_210227_205004/20318378/ccs,NOVEL,intronic,chrY,1405441,1407766
107489,IT_novel_53664,m54284U_200419_012632/12716168/ccs,NOVEL,genic genomic,chrY,1415645,1418144


In [10]:
# join dfs
df = pd.merge(isotools_df, sqanti_df, on='isoform', how='inner')
df

Unnamed: 0,gene_id,isoform,isotools_category,isotools_subcategory,chr,transcript_start,transcript_end,category,subcategory
0,ENSG00000121957.15,m54284U_210227_205004/37357274/ccs,ISM,mono-exon,chr1,108876984,108880418,NIC,mono-exon_by_intron_retention
1,ENSG00000121957.15,m54284U_210227_205004/61016242/ccs,ISM,mono-exon,chr1,108906669,108909981,Genic,mono-exon
2,ENSG00000121957.15,m54284U_210227_205004/84346647/ccs,ISM,mono-exon,chr1,108925726,108926898,Genic,mono-exon
3,ENSG00000121957.15,m54284U_210227_205004/103220783/ccs,ISM,mono-exon,chr1,108928073,108931970,NIC,mono-exon_by_intron_retention
4,ENSG00000121957.15,ENST00000264126.9,FSM,FSM,chr1,108876984,108930382,FSM,alternative_3end
...,...,...,...,...,...,...,...,...,...
107486,IT_novel_53673,m54284U_200419_012632/115279072/ccs,NOVEL,antisense,chrY,1385100,1386656,Antisense,mono-exon
107487,ENSG00000292335.1,m54284U_210227_205004/121636612/ccs,ISM,mono-exon,chrY,992801,994365,Genic,mono-exon
107488,IT_novel_53657,m54284U_210227_205004/20318378/ccs,NOVEL,intronic,chrY,1405441,1407766,Genic Intron,mono-exon
107489,IT_novel_53664,m54284U_200419_012632/12716168/ccs,NOVEL,genic genomic,chrY,1415645,1418144,Genic,mono-exon


In [11]:
mismatches = df.loc[(df['category'] != df['isotools_category']) & (df['isotools_category'] != 'NOVEL')]
mismatches

Unnamed: 0,gene_id,isoform,isotools_category,isotools_subcategory,chr,transcript_start,transcript_end,category,subcategory
0,ENSG00000121957.15,m54284U_210227_205004/37357274/ccs,ISM,mono-exon,chr1,108876984,108880418,NIC,mono-exon_by_intron_retention
1,ENSG00000121957.15,m54284U_210227_205004/61016242/ccs,ISM,mono-exon,chr1,108906669,108909981,Genic,mono-exon
2,ENSG00000121957.15,m54284U_210227_205004/84346647/ccs,ISM,mono-exon,chr1,108925726,108926898,Genic,mono-exon
3,ENSG00000121957.15,m54284U_210227_205004/103220783/ccs,ISM,mono-exon,chr1,108928073,108931970,NIC,mono-exon_by_intron_retention
7,ENSG00000163431.14,m54284U_200419_012632/53545803/ccs,ISM,mono-exon,chr1,201896455,201898889,Genic,mono-exon
...,...,...,...,...,...,...,...,...,...
107424,ENSG00000157514.18,m54284U_200419_012632/25363560/ccs,ISM,mono-exon,chrX,107713223,107716269,NIC,mono-exon_by_intron_retention
107444,ENSG00000292341.1,m54284U_210227_205004/123078285/ccs,ISM,mono-exon,chrY,1398288,1399412,NIC,mono-exon_by_intron_retention
107455,ENSG00000292357.1,m54284U_210227_205004/33751947/ccs,NIC,intron retention,chrY,1285651,1307499,ISM,internal_fragment
107487,ENSG00000292335.1,m54284U_210227_205004/121636612/ccs,ISM,mono-exon,chrY,992801,994365,Genic,mono-exon


# Monoexons

In [12]:
monoexon_mm = mismatches.loc[mismatches['subcategory'] == 'mono-exon']
monoexon_mm

Unnamed: 0,gene_id,isoform,isotools_category,isotools_subcategory,chr,transcript_start,transcript_end,category,subcategory
1,ENSG00000121957.15,m54284U_210227_205004/61016242/ccs,ISM,mono-exon,chr1,108906669,108909981,Genic,mono-exon
2,ENSG00000121957.15,m54284U_210227_205004/84346647/ccs,ISM,mono-exon,chr1,108925726,108926898,Genic,mono-exon
7,ENSG00000163431.14,m54284U_200419_012632/53545803/ccs,ISM,mono-exon,chr1,201896455,201898889,Genic,mono-exon
34,ENSG00000116685.17,m54284U_210227_205004/122096460/ccs,ISM,mono-exon,chr1,11919590,11922106,Genic,mono-exon
39,ENSG00000251484.3,m54284U_200419_012632/112657704/ccs,ISM,mono-exon,chr1,109103706,109105540,Genic,mono-exon
...,...,...,...,...,...,...,...,...,...
107148,ENSG00000236017.8,m54284U_210227_205004/64290982/ccs,ISM,mono-exon,chrX,1412900,1414741,Genic,mono-exon
107272,ENSG00000233661.1,m54284U_210227_205004/4589284/ccs,ISM,mono-exon,chrX,63351095,63356012,Genic,mono-exon
107360,ENSG00000268738.3,m54284U_200419_012632/93651732/ccs,ISM,mono-exon,chrX,149592043,149593938,Genic,mono-exon
107487,ENSG00000292335.1,m54284U_210227_205004/121636612/ccs,ISM,mono-exon,chrY,992801,994365,Genic,mono-exon


In [13]:
# First transcript from the list
isoseq['ENSG00000121957.15'].transcripts[13]

IndexError: list index out of range