In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Add a local source directory to the module search path.
# NOTE(review): this absolute path is machine-specific, and append() puts it at
# the END of sys.path, so an installed `isotools` package would presumably be
# found first -- confirm which copy is actually imported.
sys.path.append('/home/lankenau/isotools/src')

In [3]:
# Run-mode switch: when True, BAM files are imported with strictness=20
# (instead of math.inf) and output file names get a '_strict' suffix.
STRICT = False

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import math

# Root directory of the input data (placeholder; adjust to the local setup).
path = '/path/to/data'
# Sub-directory of `path` in which the aligned BAM files are looked up.
alignment_path = 'alignment_v45'
# Genome FASTA file name and its full location below `path`.
genome_file = 'GRCh38.p14.genome.fa'
genome_path = os.path.join(path, 'gencode_human/version_45', genome_file)

In [None]:
# Sample sheet (tab-separated), located relative to the data root `path`.
metadata_file = 'reads/metadata_tissue.tsv'
# read_table uses a tab separator by default.
metadata = pd.read_table(os.path.join(path, metadata_file))
metadata

In [6]:
import logging
from isotools import Transcriptome
from isotools import __version__ as isotools_version
# Configure logging at INFO level with a compact "LEVEL:message" layout,
# then report which isotools version is in use.
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')
logger = logging.getLogger('isotools')
logger.info('This is isotools version %s', isotools_version)

INFO:This is isotools version 0.3.5rc10


In [None]:
# Location of the reference annotation (GFF3, gzip-compressed) below the data root.
annotation_file = os.path.join(
    path,
    'gencode_human/version_45',
    'gencode.v45.chr_patch_hapl_scaff.annotation_sorted.gff3.gz',
)
# Build the IsoTools transcriptome object from that reference annotation.
isoseq = Transcriptome.from_reference(annotation_file)

In [None]:
# The strictness setting is identical for every sample, so resolve it once.
bam_strictness = 20 if STRICT else math.inf

for _, row in metadata.iterrows():
    sample_name = row['sample ID']
    # The 'file' column holds a full (wrong) path; only the bare file name up to
    # its first dot is reused to locate the aligned BAM file.
    file_stem = row['file'].rpartition('/')[2].partition('.')[0]
    sample_file = os.path.join(path, alignment_path, file_stem + '_aligned.bam')
    if os.path.exists(sample_file):
        group = row['group']
        isoseq.add_sample_from_bam(
            fn=sample_file,
            sample_name=sample_name,
            group=group,
            strictness=bam_strictness,
        )
    else:
        # Missing alignments are reported and skipped; the loop carries on.
        logger.error(f'File {sample_file} does not exist')
isoseq.sample_table

In [9]:
# compute qc metrics (uses the genome file at `genome_path`)
# NOTE: slow -- the recorded run below took about 2h10m for 541983 genes.
isoseq.add_qc_metrics(genome_path)
# add ORF predictions (about 29 minutes in the recorded run below)
isoseq.add_orf_prediction(genome_path)

100%|██████████| 541983/541983 [2:09:49<00:00, 69.58genes/s]  
100%|██████████| 541983/541983 [29:24<00:00, 307.23genes/s]


In [12]:
# Tag appended to output file names so that strict and non-strict runs
# do not overwrite each other's results.
if STRICT:
    suffix = '_strict'
else:
    suffix = ''

In [14]:
# Make sure the output directory exists before the first write, in case
# `save` does not create it itself; a missing directory would otherwise
# only surface after the long-running QC/ORF steps have finished.
os.makedirs('results', exist_ok=True)
# Store the transcriptome object as a pickle file; the name carries `suffix`.
isoseq.save(f'results/isoseq_v45{suffix}.pkl')

INFO:saving transcriptome to results/isoseq_v45.pkl


In [16]:
# Export the transcripts as an uncompressed GTF file (gzip=False).
# NOTE(review): min_coverage=5 presumably restricts the export to transcripts
# supported by at least 5 reads, and query="" presumably applies no further
# filter -- confirm against the isotools documentation.
isoseq.write_gtf(f'results/isoseq_v45{suffix}.gtf',
                 min_coverage=5, gzip=False, query="")

INFO:writing gtf file to results/isoseq_v45.gtf


In [17]:
# Build the transcript table with the same filter settings as the GTF export
# (min_coverage=5, empty query), requesting coverage and TPM for all groups.
transcript_tab = isoseq.transcript_table(
    groups=isoseq.groups(),
    tpm=True,
    coverage=True,
    min_coverage=5,
    progress_bar=True,
    query="",
)

# Write the table to disk. Note: the file is tab-separated despite its
# .csv extension.
transcript_tab.to_csv(
    f'results/demonstration_dataset_substantial_transcripts{suffix}.csv',
    sep='\t',
    index=False,
)

# Preview the first rows.
transcript_tab.head()

100%|██████████| 541983/541983 [00:20<00:00, 26557.64genes/s]


Unnamed: 0,chr,transcript_start,transcript_end,strand,gene_id,gene_name,transcript_nr,transcript_length,num_exons,exon_starts,...,ovary_sum_coverage,vessel_sum_coverage,aorta_sum_tpm,brain_sum_tpm,colon_sum_tpm,heart_sum_tpm,lung_sum_tpm,muscle_sum_tpm,ovary_sum_tpm,vessel_sum_tpm
0,GL000194.1,61662,62921,-,PB_novel_188538,PB_novel_188538,0,1259,1,61662,...,2,4,0.320567,0.58893,0.391436,0.77162,0.444486,0.0,0.58443,2.083491
1,GL000194.1,53591,55442,-,ENSG00000277400.1,ENSG00000277400,0,1851,1,53591,...,3,3,2.243966,0.19631,1.957179,1.043957,1.926105,0.40793,0.876645,1.562618
2,GL000194.1,53591,115065,-,ENSG00000277400.1,ENSG00000277400,1,2224,3,53591112791114985,...,13,3,3.526232,0.58893,1.957179,1.361683,2.370591,0.40793,3.798797,1.562618
3,GL000194.1,53591,55503,-,ENSG00000277400.1,ENSG00000277400,2,1127,2,5359154677,...,0,0,0.320567,0.19631,0.391436,0.226947,0.148162,0.40793,0.0,0.0
4,GL000194.1,53591,115065,-,ENSG00000277400.1,ENSG00000277400,3,1439,4,5359154677112791114985,...,3,1,0.641133,0.39262,0.391436,0.090779,0.148162,0.0,0.876645,0.520873
