# Comparing HIT annotation to GENCODE using GFFCompare

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib import font_manager
import csv
import pyranges as pr
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

  import pkg_resources


In [2]:
# Column labels for the nine tab-separated GTF fields, in file order
gtf_column_names = [
    'chromosome', 'source', 'feature',
    'start', 'end', 'score',
    'strand', 'frame', 'attribute',
]

In [3]:
def split_attributes(df, *attribute_type):
    """Extract the value of one attribute from a GTF attribute string.

    Intended for use with ``Series.apply`` on the GTF ``attribute`` column.

    Parameters
    ----------
    df : str
        A single attribute string made of ``key "value"`` pairs separated by
        ``'; '``, e.g. ``'transcript_id "X"; gene_id "Y";'``. (Despite the
        name, this is one cell of the column, not a DataFrame.)
    *attribute_type : str
        One or more attribute keys to look for. The value of the first field
        whose key equals any of them is returned.

    Returns
    -------
    str or float
        The attribute value with surrounding double quotes removed, or
        ``np.nan`` when ``df`` is not a string or no requested key is present.
    """
    if not isinstance(df, str):
        # Missing cells (e.g. NaN) carry no attributes.
        return np.nan

    for field in df.strip().strip(';').split('; '):
        # Split only on the first space so quoted values containing spaces
        # are returned in full.
        key, sep, value = field.strip().partition(' ')
        # Compare the whole key: a prefix test would let "gene_id" match
        # a field such as "gene_id_version".
        if sep and key in attribute_type:
            return value.strip().strip('"')

    return np.nan

## Comparison with hg38

The comparison was generated with the following command: `gffcompare -r gencode.v47.annotation.gtf -o hg38 Human_Islet_Transcriptome_v2.2.HG38.gtf`

In [4]:
# Read the gffcompare-annotated GTF: tab-delimited, no header row,
# columns labelled with gtf_column_names
hg38_gffcompare_annotation = pd.read_csv(
    '../data_raw/hg38.annotated.gtf',
    names=gtf_column_names,
    header=None,
    sep='\t',
)
hg38_gffcompare_annotation.head()

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,attribute
0,chr1,ENSEMBL,transcript,629062,629433,.,+,.,"transcript_id ""ENST00000416931.1""; gene_id ""EN..."
1,chr1,ENSEMBL,exon,629062,629433,.,+,.,"transcript_id ""ENST00000416931.1""; gene_id ""EN..."
2,chr1,FANTOM_cat,transcript,629083,634376,.,+,.,"transcript_id ""FTMT20400000028.1""; gene_id ""EN..."
3,chr1,FANTOM_cat,exon,629083,634376,.,+,.,"transcript_id ""FTMT20400000028.1""; gene_id ""EN..."
4,chr1,ENSEMBL,transcript,629640,630683,.,+,.,"transcript_id ""ENST00000457540.1""; gene_id ""EN..."


In [5]:
# Keep only transcript-level records (this drops the exon rows)
hg38_gffcompare_annotation = hg38_gffcompare_annotation[
    hg38_gffcompare_annotation['feature'] == 'transcript'
]
print(len(hg38_gffcompare_annotation))

# Keep only the standard chromosomes: chr1-chr22 plus chrX and chrY.
# .copy() makes the filtered result an independent DataFrame, so the columns
# assigned to it in the following cells do not write to a slice of the
# original frame (which triggers pandas' SettingWithCopyWarning).
standard_chromosomes = {f'chr{i}' for i in range(1, 23)} | {'chrX', 'chrY'}
hg38_gffcompare_annotation = hg38_gffcompare_annotation[
    hg38_gffcompare_annotation['chromosome'].isin(standard_chromosomes)
].copy()
print(len(hg38_gffcompare_annotation))

376727
376587


In [6]:
# Pull each attribute of interest out of the raw attribute string into its own column.
# cmp_ref is the closest reference transcript.
for attribute_name in ("transcript_id", "gene_id", "class_code", "cmp_ref", "ref_gene_id", "ref_gene_name"):
    hg38_gffcompare_annotation.loc[:, attribute_name] = (
        hg38_gffcompare_annotation['attribute'].apply(split_attributes, args=(attribute_name,))
    )

In [7]:
# Remove the first '.' and everything after it from the reference gene IDs
hg38_gffcompare_annotation['ref_gene_id'] = (
    hg38_gffcompare_annotation['ref_gene_id']
    .str.replace(r'\..*', '', regex=True)
)
hg38_gffcompare_annotation.head()

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,attribute,transcript_id,gene_id,class_code,cmp_ref,ref_gene_id,ref_gene_name
0,chr1,ENSEMBL,transcript,629062,629433,.,+,.,"transcript_id ""ENST00000416931.1""; gene_id ""EN...",ENST00000416931.1,ENSG00000225972,k,ENST00000416931.1,ENSG00000225972,
2,chr1,FANTOM_cat,transcript,629083,634376,.,+,.,"transcript_id ""FTMT20400000028.1""; gene_id ""EN...",FTMT20400000028.1,ENSG00000237973,k,ENST00000414273.1,ENSG00000237973,MTCO1P12
4,chr1,ENSEMBL,transcript,629640,630683,.,+,.,"transcript_id ""ENST00000457540.1""; gene_id ""EN...",ENST00000457540.1,ENSG00000225630,k,ENST00000457540.1,ENSG00000225630,
6,chr1,ENSEMBL,transcript,631074,632616,.,+,.,"transcript_id ""ENST00000414273.1""; gene_id ""EN...",ENST00000414273.1,ENSG00000237973,k,ENST00000414273.1,ENSG00000237973,MTCO1P12
8,chr1,StringTie,transcript,631074,707804,.,+,.,"transcript_id ""STRT00160641""; gene_id ""ENSG000...",STRT00160641,ENSG00000237973,k,ENST00000414273.1,ENSG00000237973,MTCO1P12


## Compare number of transcripts in gffcompare annotation vs original transcriptome GTF

In [8]:
# Row count of the filtered gffcompare annotation (one row per transcript record)
print(hg38_gffcompare_annotation.shape[0])

376587


In [9]:
# Count of distinct transcript IDs in the gffcompare annotation
# (dropna=False so a missing ID would still be counted as one distinct value)
hg38_gffcompare_annotation['transcript_id'].nunique(dropna=False)

376465

In [10]:
# Surplus rows: every occurrence of a transcript ID after its first one
int(hg38_gffcompare_annotation['transcript_id'].duplicated().sum())

122

In [11]:
# Transcript IDs that occur more than once (each repeat after the first occurrence is flagged)
dup_ids = hg38_gffcompare_annotation.loc[
    hg38_gffcompare_annotation['transcript_id'].duplicated(), 'transcript_id'
]

# Every row carrying one of those IDs, ordered by ID, without the raw attribute string
duplicated_transcripts = (
    hg38_gffcompare_annotation[hg38_gffcompare_annotation['transcript_id'].isin(dup_ids)]
    .sort_values(by='transcript_id')
    .drop(columns=["attribute"])
)

# Write the table out as a tab-separated file, then display it
duplicated_transcripts.to_csv('../data_processed/duplicated_transcripts.tsv', sep='\t', index=False)
duplicated_transcripts

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,transcript_id,gene_id,class_code,cmp_ref,ref_gene_id,ref_gene_name
336729,chr1,FANTOM_cat,transcript,144930269,144995200,.,-,.,ENCT00000010979.1,ENSG00000196369,j,ENST00000619678.2,ENSG00000196369,
119343,chr1,FANTOM_cat,transcript,143972661,143975153,.,+,.,ENCT00000010979.1,ENSG00000196369,=,ENST00000685922.2,ENSG00000289318,ENSG00000289318
336256,chr1,FANTOM_cat,transcript,120176307,120176520,.,-,.,ENCT00000011102.1,ENSG00000223380,c,ENST00000578049.4,ENSG00000265808,SEC22B
340612,chr1,FANTOM_cat,transcript,148787530,148787807,.,-,.,ENCT00000011102.1,ENSG00000223380,u,,,
119178,chr1,FANTOM_cat,transcript,120851843,120851857,.,+,.,ENCT00000011617.1,FTMG001538,p,ENST00000834689.1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4049817,chr9,StringTie,transcript,67082692,67201219,.,+,.,STRT02161951,ENSG00000154529,j,ENST00000792558.1,ENSG00000303182,ENSG00000303182
4049404,chr9,StringTie,transcript,65675928,65720092,.,+,.,STRT02165508,ENSG00000147996,c,ENST00000382405.8,ENSG00000147996,ZNG1E
4144616,chr9,StringTie,transcript,65318522,65323015,.,-,.,STRT02165508,ENSG00000147996,c,ENST00000445695.1,ENSG00000204778,ZNG1DP
4049350,chr9,StringTie,transcript,65675928,65703014,.,+,.,STRT02167594,ENSG00000147996,c,ENST00000430059.6,ENSG00000147996,ZNG1E


### Why are there duplicates?

In [12]:
# Duplicated IDs that appear exactly twice, once on the '+' strand and once on the '-' strand
strand_conflicts = (
    duplicated_transcripts
    .groupby("transcript_id")
    .filter(lambda grp: len(grp) == 2 and set(grp["strand"]) == {'+', '-'})
)
strand_conflicts

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,transcript_id,gene_id,class_code,cmp_ref,ref_gene_id,ref_gene_name
336729,chr1,FANTOM_cat,transcript,144930269,144995200,.,-,.,ENCT00000010979.1,ENSG00000196369,j,ENST00000619678.2,ENSG00000196369,
119343,chr1,FANTOM_cat,transcript,143972661,143975153,.,+,.,ENCT00000010979.1,ENSG00000196369,=,ENST00000685922.2,ENSG00000289318,ENSG00000289318
2235922,chr2,FANTOM_cat,transcript,91578477,91580863,.,+,.,ENCT00000226871.1,ENSG00000261600,c,ENST00000797278.1,ENSG00000261600,ENSG00000261600
336725,chr1,FANTOM_cat,transcript,144809581,144809666,.,-,.,ENCT00000226871.1,ENSG00000261600,c,ENST00000783910.1,ENSG00000302082,ENSG00000302082
336723,chr1,FANTOM_cat,transcript,144809577,144809726,.,-,.,ENCT00000226875.1,ENSG00000261600,c,ENST00000783910.1,ENSG00000302082,ENSG00000302082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4049817,chr9,StringTie,transcript,67082692,67201219,.,+,.,STRT02161951,ENSG00000154529,j,ENST00000792558.1,ENSG00000303182,ENSG00000303182
4049404,chr9,StringTie,transcript,65675928,65720092,.,+,.,STRT02165508,ENSG00000147996,c,ENST00000382405.8,ENSG00000147996,ZNG1E
4144616,chr9,StringTie,transcript,65318522,65323015,.,-,.,STRT02165508,ENSG00000147996,c,ENST00000445695.1,ENSG00000204778,ZNG1DP
4049350,chr9,StringTie,transcript,65675928,65703014,.,+,.,STRT02167594,ENSG00000147996,c,ENST00000430059.6,ENSG00000147996,ZNG1E


In [13]:
# Duplicated IDs that appear exactly twice, on two different chromosomes
chrom_conflicts = (
    duplicated_transcripts
    .groupby("transcript_id")
    .filter(lambda grp: len(grp) == 2 and grp["chromosome"].nunique(dropna=False) > 1)
)
chrom_conflicts

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,transcript_id,gene_id,class_code,cmp_ref,ref_gene_id,ref_gene_name
2944049,chr3,FANTOM_cat,transcript,48211057,48211110,.,-,.,ENCT00000213115.1,ENSG00000231205,u,,,
2126392,chr19,FANTOM_cat,transcript,20408950,20424956,.,-,.,ENCT00000213115.1,ENSG00000231205,=,ENST00000731957.1,ENSG00000291130,ENSG00000291130
2235922,chr2,FANTOM_cat,transcript,91578477,91580863,.,+,.,ENCT00000226871.1,ENSG00000261600,c,ENST00000797278.1,ENSG00000261600,ENSG00000261600
336725,chr1,FANTOM_cat,transcript,144809581,144809666,.,-,.,ENCT00000226871.1,ENSG00000261600,c,ENST00000783910.1,ENSG00000302082,ENSG00000302082
336723,chr1,FANTOM_cat,transcript,144809577,144809726,.,-,.,ENCT00000226875.1,ENSG00000261600,c,ENST00000783910.1,ENSG00000302082,ENSG00000302082
2235916,chr2,FANTOM_cat,transcript,91578477,91580863,.,+,.,ENCT00000226875.1,ENSG00000261600,c,ENST00000797278.1,ENSG00000261600,ENSG00000261600
2944045,chr3,StringTie,transcript,48210879,48211110,.,-,.,STRT01039144,ENSG00000231205,u,,,
2126383,chr19,StringTie,transcript,20408950,20424931,.,-,.,STRT01039144,ENSG00000231205,c,ENST00000615684.4,ENSG00000291130,ENSG00000291130
2944047,chr3,StringTie,transcript,48210956,48211110,.,-,.,STRT01091226,ENSG00000231205,u,,,
2126376,chr19,StringTie,transcript,20408950,20424927,.,-,.,STRT01091226,ENSG00000231205,c,ENST00000615684.4,ENSG00000291130,ENSG00000291130


In [14]:
# Duplicated transcripts that fall into neither the strand-conflict nor the chromosome-conflict set
conflict_ids = set(strand_conflicts["transcript_id"]).union(chrom_conflicts["transcript_id"])
other_conflicts = duplicated_transcripts.loc[
    ~duplicated_transcripts["transcript_id"].isin(conflict_ids)
]
#other_conflicts.to_csv('other_conflicts.tsv', sep='\t', index=False)
other_conflicts

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,transcript_id,gene_id,class_code,cmp_ref,ref_gene_id,ref_gene_name
336256,chr1,FANTOM_cat,transcript,120176307,120176520,.,-,.,ENCT00000011102.1,ENSG00000223380,c,ENST00000578049.4,ENSG00000265808,SEC22B
340612,chr1,FANTOM_cat,transcript,148787530,148787807,.,-,.,ENCT00000011102.1,ENSG00000223380,u,,,
119178,chr1,FANTOM_cat,transcript,120851843,120851857,.,+,.,ENCT00000011617.1,FTMG001538,p,ENST00000834689.1,,
119311,chr1,FANTOM_cat,transcript,143729963,143730438,.,+,.,ENCT00000011617.1,FTMG001538,c,ENST00000783555.1,ENSG00000232721,ENSG00000232721
336278,chr1,FANTOM_cat,transcript,120850897,120850985,.,-,.,ENCT00000031689.1,ENSG00000212544,c,ENST00000664186.1,ENSG00000287979,ENSG00000287979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340542,chr1,StringTie,transcript,148769633,148784591,.,-,.,STRT00204587,ENSG00000223380,j,ENST00000764483.1,ENSG00000299543,ENSG00000299543
4143669,chr9,StringTie,transcript,41131678,41189273,.,-,.,STRT02160141,ENSG00000204790,j,ENST00000456520.5,ENSG00000215126,ZNG1F
4144572,chr9,StringTie,transcript,63974762,64002564,.,-,.,STRT02160141,ENSG00000204790,j,ENST00000485382.3,ENSG00000204790,ENSG00000204790
4143686,chr9,StringTie,transcript,41131678,41189307,.,-,.,STRT02161837,ENSG00000204790,j,ENST00000456520.5,ENSG00000215126,ZNG1F


In [15]:
# For each remaining duplicated ID: span from the smallest start to the largest end
# across all rows that share that ID
distances = [
    int(rows["end"].max()) - int(rows["start"].min())
    for _, rows in other_conflicts.groupby("transcript_id")
]

# Smallest and largest span observed
print(min(distances))
print(max(distances))

22870886
28656077


In [16]:
# Transcript IDs present in both the strand-conflict and the chromosome-conflict tables
conflict_overlap = set(strand_conflicts["transcript_id"]).intersection(
    chrom_conflicts["transcript_id"]
)

print("Transcript IDs in both strand and chromosome conflicts:")
for tid in sorted(conflict_overlap):
    print(tid)

Transcript IDs in both strand and chromosome conflicts:
ENCT00000226871.1
ENCT00000226875.1
STRT01132284
STRT01151362
STRT01193804
