In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Root of the ENCODE working directory; the other locations are resolved relative to it.
path = '/project/hfa_work/ENCODE'
# Sub-directory of `path` that is searched for the '*_aligned.bam' files.
alignment_path = 'alignment_v45'
# Genome FASTA file name and its full location inside the GENCODE v45 folder.
genome_file = 'GRCh38.p14.genome.fa'
reference_dir = os.path.join(path, 'gencode_human/version_45')
genome_path = os.path.join(reference_dir, genome_file)

INFO:This is isotools version 0.3.4


In [None]:
# Load the tab-separated sample sheet; the sample import loop reads its
# 'sample ID', 'file' and 'group' columns.
metadata_file = 'metadata_tissue.tsv'
metadata_tsv = os.path.join(path, metadata_file)
metadata = pd.read_csv(metadata_tsv, sep='\t')
metadata

'/project/hfa_work/yalan/golong/encode_tissue_data/ENCFF144KHH.fastq.gz'

In [None]:
import logging
from isotools import Transcriptome
from isotools import __version__ as isotools_version
# Configure logging at INFO level with a compact "LEVEL:message" format,
# then record which isotools version this notebook runs with.
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')
logger = logging.getLogger('isotools')
logger.info(f'This is isotools version {isotools_version}')

In [2]:
# Create the IsoTools transcriptome object from the GENCODE v45 reference
# annotation (sorted, gzipped GFF3).
annotation_dir = os.path.join(path, 'gencode_human/version_45')
annotation_file = os.path.join(annotation_dir, 'gencode.v45.chr_patch_hapl_scaff.annotation_sorted.gff3.gz')
isoseq = Transcriptome.from_reference(annotation_file)

INFO:importing reference from gff3 file /project/hfa_work/ENCODE/gencode_human/version_45/gencode.v45.chr_patch_hapl_scaff.annotation_sorted.gff3.gz
100%|█████████▉| 81.7M/81.7M [00:53<00:00, 1.61MB/s]
INFO:skipped the following categories: {'three_prime_UTR', 'five_prime_UTR', 'CDS', 'stop_codon_redefined_as_selenocysteine'}


In [4]:
# Import every sample listed in the metadata sheet into the transcriptome.
# Samples that are already present are skipped, so the cell can be re-run on a
# live kernel and a duplicated 'sample ID' row does not abort the loop.
# NOTE(review): relies on `Transcriptome.samples` listing the loaded sample
# names — confirm against the installed isotools version.
already_loaded = set(isoseq.samples)
for _, row in metadata.iterrows():
    sample_name = row['sample ID']
    if sample_name in already_loaded:
        logger.info(f'Sample {sample_name} is already loaded, skipping')
        continue
    # row['file'] is the full (wrong) path recorded in the metadata; only the
    # file name up to its first '.' is reused to locate the aligned BAM.
    file_stem = os.path.basename(row['file']).split('.')[0]
    sample_file = os.path.join(path, alignment_path, file_stem + '_aligned.bam')
    if not os.path.exists(sample_file):
        # Report the missing alignment and carry on with the remaining samples.
        logger.error(f'File {sample_file} does not exist')
        continue
    isoseq.add_sample_from_bam(fn=sample_file, sample_name=sample_name, group=row['group'])
    already_loaded.add(sample_name)
isoseq.sample_table

INFO:adding sample ENCSR700EBI from file /project/hfa_work/ENCODE/alignment_v45/ENCFF144KHH_aligned.bam
100%|██████████| 2.19M/2.19M [04:04<00:00, 8.97kreads/s, chr=KI270757.1]
INFO:skipped 2720 reads aligned fraction of less than 0.75.
INFO:skipped 950420 secondary alignments (0x100), alignment that failed quality check (0x200) or PCR duplicates (0x400)
INFO:ignoring 123726 chimeric alignments with less than 2 reads
INFO:imported 1106210 nonchimeric reads (including  1377 chained chimeric alignments) and 9024 chimeric reads with coverage of at least 2.
INFO:adding sample ENCSR425HFS from file /project/hfa_work/ENCODE/alignment_v45/ENCFF902BIU_aligned.bam
100%|██████████| 2.74M/2.74M [07:42<00:00, 5.93kreads/s, chr=KI270757.1]
INFO:skipped 9838 reads aligned fraction of less than 0.75.
INFO:skipped 667992 secondary alignments (0x100), alignment that failed quality check (0x200) or PCR duplicates (0x400)
INFO:ignoring 50134 chimeric alignments with less than 2 reads
INFO:imported 201326

Unnamed: 0,name,file,group,nonchimeric_reads,chimeric_reads
0,ENCSR700EBI,/project/hfa_work/ENCODE/alignment_v45/ENCFF14...,aorta,1106210,9024
0,ENCSR425HFS,/project/hfa_work/ENCODE/alignment_v45/ENCFF90...,aorta,2013267,2918
0,ENCSR463IDK,/project/hfa_work/ENCODE/alignment_v45/ENCFF83...,brain,2035784,18875
0,ENCSR205QMF,/project/hfa_work/ENCODE/alignment_v45/ENCFF20...,brain,963304,8488
0,ENCSR169YNI,/project/hfa_work/ENCODE/alignment_v45/ENCFF82...,brain,1593059,11852
0,ENCSR094NFM,/project/hfa_work/ENCODE/alignment_v45/ENCFF26...,brain,501838,5193
0,ENCSR997RFW,/project/hfa_work/ENCODE/alignment_v45/ENCFF38...,colon,1272092,16637
0,ENCSR450GAR,/project/hfa_work/ENCODE/alignment_v45/ENCFF24...,colon,1282606,3025
0,ENCSR984OAE,/project/hfa_work/ENCODE/alignment_v45/ENCFF42...,heart,1053851,9911
0,ENCSR994YZY,/project/hfa_work/ENCODE/alignment_v45/ENCFF60...,heart,992724,11205


In [5]:
# compute qc metrics using the genome FASTA at genome_path
# (slow step: the first progress bar in the saved output shows about 2h22m for 541983 genes)
isoseq.add_qc_metrics(genome_path)
# add ORF predictions, again reading the genome at genome_path
# (the second progress bar in the saved output shows about 27 minutes)
isoseq.add_orf_prediction(genome_path)

100%|██████████| 541983/541983 [2:22:09<00:00, 63.54genes/s]  
100%|██████████| 541983/541983 [27:18<00:00, 330.77genes/s]


In [6]:
# Save the transcriptome object so later sessions can skip the slow steps above.
# The file name has no directory component, so it is written relative to the
# kernel's current working directory (not under `path`).
isoseq.save('isoseq_v45.pkl')

INFO:saving transcriptome to isoseq_v45.pkl


In [7]:
# Build the transcript table for all sample groups, requesting coverage and TPM
# columns, with min_coverage=5 and an empty query string.
transcript_tab = isoseq.transcript_table(
    groups=isoseq.groups(),
    coverage=True,
    tpm=True,
    min_coverage=5,
    query="",
    progress_bar=True,
)
# Write the table without the index column. The separator is a tab even though
# the file name ends in .csv.
transcript_csv = f'{path}/demonstration_dataset_substantial_transcripts.csv'
transcript_tab.to_csv(transcript_csv, sep='\t', index=False)

transcript_tab.head()

100%|██████████| 541983/541983 [01:42<00:00, 5287.83genes/s]


Unnamed: 0,chr,transcript_start,transcript_end,strand,gene_id,gene_name,transcript_nr,transcript_length,num_exons,exon_starts,...,ovary_sum_coverage,vessel_sum_coverage,aorta_sum_tpm,brain_sum_tpm,colon_sum_tpm,heart_sum_tpm,lung_sum_tpm,muscle_sum_tpm,ovary_sum_tpm,vessel_sum_tpm
0,GL000194.1,61662,62921,-,PB_novel_188538,PB_novel_188538,0,1259,1,61662,...,2,4,0.320567,0.58893,0.391436,0.77162,0.444486,0.0,0.58443,2.083491
1,GL000194.1,53591,55442,-,ENSG00000277400.1,ENSG00000277400,0,1851,1,53591,...,3,3,2.243966,0.19631,1.957179,1.043957,1.926105,0.40793,0.876645,1.562618
2,GL000194.1,53591,115065,-,ENSG00000277400.1,ENSG00000277400,1,2224,3,53591112791114985,...,13,3,3.526232,0.58893,1.957179,1.361683,2.370591,0.40793,3.798797,1.562618
3,GL000194.1,53591,55503,-,ENSG00000277400.1,ENSG00000277400,2,1127,2,5359154677,...,0,0,0.320567,0.19631,0.391436,0.226947,0.148162,0.40793,0.0,0.0
4,GL000194.1,53591,115065,-,ENSG00000277400.1,ENSG00000277400,3,1439,4,5359154677112791114985,...,3,1,0.641133,0.39262,0.391436,0.090779,0.148162,0.0,0.876645,0.520873
