In [22]:
# Enable IPython's autoreload extension in mode 2, i.e. re-import changed modules
# automatically before a cell is executed.
# NOTE(review): the recorded output of the `isoseq.save(...)` cell further down shows
# an autoreload of isotools.splice_graph failing, followed by a PicklingError —
# presumably objects created before a reload cannot be pickled afterwards; confirm.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import sys

# Directory that is put on the module search path; presumably this is where the
# `isotools` package imported further down lives — confirm.
ISOTOOLS_SRC = '/home/lankenau/isotools/src'
# Guarded so that re-running this cell does not add the same entry again.
if ISOTOOLS_SRC not in sys.path:
    sys.path.append(ISOTOOLS_SRC)

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import math
import copy

# Root directory of the input data; the locations below are resolved against it.
path = '/path/to/data'
# Folder (below `path`) in which the aligned BAM files are looked up later on.
alignment_path = 'alignment_v45'

# Genome FASTA inside the GENCODE human version_45 folder.
genome_file = 'GRCh38.p14.genome.fa'
gencode_dir = os.path.join(path, 'gencode_human/version_45')
genome_path = os.path.join(gencode_dir, genome_file)

In [None]:
# Sample sheet (tab-separated) stored below `path`; read_table splits on tabs by default.
metadata_file = 'reads/metadata_tissue.tsv'
metadata = pd.read_table(os.path.join(path, metadata_file))
# Display the table as the cell output.
metadata

In [26]:
import logging
from isotools import Transcriptome
from isotools import __version__ as isotools_version
# Configure the root logger: show INFO and above, formatted as "LEVEL:message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')
# Logger named 'isotools'; the first record documents which library version is in use.
logger = logging.getLogger('isotools')
logger.info(f'This is isotools version {isotools_version}')

INFO:This is isotools version 0.3.5rc10


In [27]:
# Distinct values of the metadata 'group' column (used as tissue names below).
tissues = pd.unique(metadata['group'])
tissues

array(['aorta', 'brain', 'colon', 'heart', 'lung', 'muscle', 'ovary',
       'vessel'], dtype=object)

In [None]:
# Sorted, gzipped GFF3 annotation inside the GENCODE human version_45 folder.
annotation_file = os.path.join(
    path,
    'gencode_human/version_45',
    'gencode.v45.chr_patch_hapl_scaff.annotation_sorted.gff3.gz',
)
# Build a single IsoTools transcriptome object from the reference annotation.
isoseq: Transcriptome = Transcriptome.from_reference(annotation_file)

In [None]:
# Add every sample listed in the metadata table to the transcriptome.
# The loop deliberately has no `break`: stopping after the first sample would leave
# all other samples (and most groups) unloaded, while the per-tissue filter cell
# below looks up every tissue found in the metadata.
for _index, row in metadata.iterrows():
    sample_name = row['sample ID']
    # row['file'] is a full path that is wrong here; only the file name up to its
    # first '.' is reused to find '<name>_aligned.bam' in the alignment folder.
    sample_file = os.path.join(path, alignment_path, row['file'].split('/')[-1].split('.')[0] + '_aligned.bam')
    if not os.path.exists(sample_file):
        # Report and skip samples without an alignment instead of aborting the import.
        logger.error(f'File {sample_file} does not exist')
        continue
    group = row['group']
    isoseq.add_sample_from_bam(fn=sample_file, sample_name=sample_name, group=group)
# Display the table of samples that were added.
isoseq.sample_table

In [9]:
# Run the two genome-based annotation steps in order: QC metrics first, then ORF
# predictions. Both receive the genome FASTA path.
# In the recorded run the two progress bars took about 2h11m and 29m respectively.
for annotate in (isoseq.add_qc_metrics, isoseq.add_orf_prediction):
    annotate(genome_path)

100%|██████████| 541983/541983 [2:11:08<00:00, 68.88genes/s]  
100%|██████████| 541983/541983 [29:08<00:00, 310.01genes/s]


In [None]:
# For every group, collect the positions of its samples within isoseq.samples.
group_idx = {
    gn: [i for i, sa in enumerate(isoseq.samples) if sa in grp]
    for gn, grp in isoseq.groups().items()
}

# Register one transcript-level filter per tissue, tagged 'IN<TISSUE>'.
for tissue in tissues:
    filter_name = f'IN{tissue.upper()}'
    tissue_index = group_idx[tissue]
    # The sample positions are written literally into the filter expression.
    # TODO: Lower coverage threshold, high for now to reduce time
    expression = f'g.coverage[{tissue_index},trid].sum() >= 10'
    isoseq.add_filter(
        tag=filter_name,
        expression=expression,
        context='transcript',
        update=True,
    )
    print(f'Added filter {filter_name} for tissue {tissue}: {expression}')

INFO:replaced existing filter rule INAORTA in transcript context


Added filter INAORTA for tissue aorta: g.coverage[[0, 1],trid].sum() >= 10


INFO:replaced existing filter rule INBRAIN in transcript context


Added filter INBRAIN for tissue brain: g.coverage[[2, 3, 4, 5],trid].sum() >= 10


INFO:replaced existing filter rule INCOLON in transcript context


Added filter INCOLON for tissue colon: g.coverage[[6, 7],trid].sum() >= 10


INFO:replaced existing filter rule INHEART in transcript context


Added filter INHEART for tissue heart: g.coverage[[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],trid].sum() >= 10


INFO:replaced existing filter rule INLUNG in transcript context


Added filter INLUNG for tissue lung: g.coverage[[24, 25, 26, 27],trid].sum() >= 10


INFO:replaced existing filter rule INMUSCLE in transcript context


Added filter INMUSCLE for tissue muscle: g.coverage[[28, 29],trid].sum() >= 10


INFO:replaced existing filter rule INOVARY in transcript context


Added filter INOVARY for tissue ovary: g.coverage[[30, 31],trid].sum() >= 10


INFO:replaced existing filter rule INVESSEL in transcript context


Added filter INVESSEL for tissue vessel: g.coverage[[32, 33],trid].sum() >= 10


In [32]:
# Transcript-level filter rules as (tag, expression) pairs, registered in this order.
# HIGH_COVER comes before BALANCED, which refers to it. The other tags used inside
# the expressions (FSM, RTTS, INTERNAL_PRIMING, FRAGMENT, SUBSTANTIAL) are not
# defined in this notebook — presumably they are filters provided by isotools; confirm.
transcript_filters = (
    ('HIGH_COVER', 'g.coverage.sum(0)[trid] >= 7'),
    ('PERMISSIVE', 'FSM or not (RTTS or INTERNAL_PRIMING or FRAGMENT)'),
    ('BALANCED', 'FSM or (HIGH_COVER and not (RTTS or FRAGMENT or INTERNAL_PRIMING))'),
    ('STRICT', 'SUBSTANTIAL and (FSM or not (RTTS or FRAGMENT or INTERNAL_PRIMING))'),
)
for tag, rule in transcript_filters:
    # update=True: the recorded log shows existing rules with the same tag being replaced.
    isoseq.add_filter(tag=tag, expression=rule, context='transcript', update=True)

This can happen for correct filters when there are no or only a few transcripts loaded into the model.
INFO:replaced existing filter rule HIGH_COVER in transcript context
This can happen for correct filters when there are no or only a few transcripts loaded into the model.
INFO:replaced existing filter rule PERMISSIVE in transcript context
This can happen for correct filters when there are no or only a few transcripts loaded into the model.
INFO:replaced existing filter rule BALANCED in transcript context
This can happen for correct filters when there are no or only a few transcripts loaded into the model.
INFO:replaced existing filter rule STRICT in transcript context


In [32]:
# Persist the transcriptome object to a pickle file below 'results/'.
# NOTE(review): the recorded output of this cell is a PicklingError for
# isotools.splice_graph.SegGraphNode, right after a failed autoreload of
# isotools.splice_graph — presumably the save only succeeds in a kernel where that
# module was not reloaded after the objects were created; confirm before relying on the file.
isoseq.save('results/isoseq_v45.pkl')

[autoreload of isotools.splice_graph failed: Traceback (most recent call last):
  File "/home/lankenau/miniforge3/envs/isodev/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/home/lankenau/miniforge3/envs/isodev/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/home/lankenau/miniforge3/envs/isodev/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/home/lankenau/miniforge3/envs/isodev/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 365, in update_class
    update_instances(old, new)
  File "/home/lankenau/miniforge3/envs/isodev/lib/python3.9/site-packages/IPython/extensions/autoreload.py", line 323, in update_instances
    object.__setattr__(ref, "__class__", new)
TypeError: __class__ assignment: 'SegGraphNode' object layout differs f

PicklingError: Can't pickle <class 'isotools.splice_graph.SegGraphNode'>: it's not the same object as isotools.splice_graph.SegGraphNode

In [None]:
# Export to an uncompressed GTF file below 'results/'.
# NOTE(review): presumably min_coverage=5 drops transcripts with fewer reads and the
# empty query applies no additional filter — confirm against the isotools documentation.
isoseq.write_gtf(
    'results/isoseq_v45.gtf',
    min_coverage=5,
    gzip=False,
    query='',
)

INFO:writing gtf file to results/isoseq_v45.gtf


In [13]:
# Display the mapping from group name to the list of sample names in that group.
isoseq.groups()

{'aorta': ['ENCSR700EBI', 'ENCSR425HFS'],
 'brain': ['ENCSR463IDK', 'ENCSR205QMF', 'ENCSR169YNI', 'ENCSR094NFM'],
 'colon': ['ENCSR997RFW', 'ENCSR450GAR'],
 'heart': ['ENCSR984OAE',
  'ENCSR994YZY',
  'ENCSR728TXV',
  'ENCSR549ELD',
  'ENCSR782LGT',
  'ENCSR514YQN',
  'ENCSR700XDQ',
  'ENCSR424QFN',
  'ENCSR575LWI',
  'ENCSR329ZQG',
  'ENCSR553SVP',
  'ENCSR777CCI',
  'ENCSR435UUS',
  'ENCSR899GAP',
  'ENCSR591OZR',
  'ENCSR786FLO'],
 'lung': ['ENCSR261GOA', 'ENCSR096QUP', 'ENCSR986WKB', 'ENCSR323XND'],
 'muscle': ['ENCSR803QWH', 'ENCSR243DYK'],
 'ovary': ['ENCSR477HHG', 'ENCSR507WZC'],
 'vessel': ['ENCSR853YZN', 'ENCSR138TAS']}