# Comparing the HIT to GENCODE v47

In [1]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import pyranges as pr
import matplotlib as mpl
from collections import defaultdict
import numpy as np

  import pkg_resources


In [2]:
gencode_v47_df = pd.read_csv(
    "gencode.v47.tsv",
    sep="\t",
    dtype=str)
gencode_v47_df

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,transcript_id,...,exon_id,level,tag,transcript_support_level,havana_transcript,hgnc_id,ont,havana_gene,protein_id,ccdsid
0,chr1,HAVANA,exon,11121,11211,.,+,.,ENSG00000290825.2,ENST00000832824.1,...,ENSE00004248723.1,2,TAGENE,,,,,,,
1,chr1,HAVANA,exon,12010,12227,.,+,.,ENSG00000290825.2,ENST00000832824.1,...,ENSE00004248735.1,2,TAGENE,,,,,,,
2,chr1,HAVANA,exon,12613,12721,.,+,.,ENSG00000290825.2,ENST00000832824.1,...,ENSE00003582793.1,2,TAGENE,,,,,,,
3,chr1,HAVANA,exon,13453,14413,.,+,.,ENSG00000290825.2,ENST00000832824.1,...,ENSE00004248730.1,2,TAGENE,,,,,,,
4,chr1,HAVANA,exon,11125,11211,.,+,.,ENSG00000290825.2,ENST00000832825.1,...,ENSE00004248721.1,2,TAGENE,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155000,chrM,ENSEMBL,exon,14149,14673,.,-,.,ENSG00000198695.2,ENST00000361681.2,...,ENSE00001434974.2,3,appris_principal_1,,,HGNC:7462,,,ENSP00000354665.2,
2155001,chrM,ENSEMBL,exon,14674,14742,.,-,.,ENSG00000210194.1,ENST00000387459.1,...,ENSE00001544476.1,3,Ensembl_canonical,,,HGNC:7479,,,,
2155002,chrM,ENSEMBL,exon,14747,15887,.,+,.,ENSG00000198727.2,ENST00000361789.2,...,ENSE00001436074.2,3,appris_principal_1,,,HGNC:7427,,,ENSP00000354554.2,
2155003,chrM,ENSEMBL,exon,15888,15953,.,+,.,ENSG00000210195.2,ENST00000387460.2,...,ENSE00001544475.2,3,Ensembl_canonical,,,HGNC:7499,,,,


In [11]:
HIT_hg38_df = pd.read_csv(
    "HIT_hg38.tsv",
    sep="\t",
    dtype=str)
HIT_hg38_df

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,transcript_id,gene_name,gene_type,transcript_classification,protein_id_orf_summary,Intron,Exon,5prime,3prime,closest
0,chr1,PacBio,exon,14360,14829,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
1,chr1,PacBio,exon,14970,15038,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
2,chr1,PacBio,exon,15796,15947,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
3,chr1,PacBio,exon,16607,16765,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
4,chr1,PacBio,exon,16854,17055,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3967844,chrY,StringTie,exon,20582590,20582693,.,+,.,ENSG00000198692,STRT02294670,EIF1AY,protein_coding,Novel Protein coding: internal additional pept...,,,,,,
3967845,chrY,StringTie,exon,20584474,20584524,.,+,.,ENSG00000198692,STRT02294670,EIF1AY,protein_coding,Novel Protein coding: internal additional pept...,,,,,,
3967846,chrY,StringTie,exon,20588024,20588105,.,+,.,ENSG00000198692,STRT02294670,EIF1AY,protein_coding,Novel Protein coding: internal additional pept...,,,,,,
3967847,chrY,StringTie,exon,20589484,20589575,.,+,.,ENSG00000198692,STRT02294670,EIF1AY,protein_coding,Novel Protein coding: internal additional pept...,,,,,,


In [12]:
# remove duplicated tx
dup_tx = pd.read_csv("duplicated_transcripts.tsv", sep="\t", dtype=str)["transcript_id"].unique()
HIT_hg38_df = HIT_hg38_df[~HIT_hg38_df["transcript_id"].isin(dup_tx)]

In [13]:
# remove non-standard chr
standard_chromosomes = {f'chr{i}' for i in range(1, 23)}.union({'chrX', 'chrY'})
HIT_hg38_df = HIT_hg38_df[HIT_hg38_df['seqname'].isin(standard_chromosomes)]

In [14]:
print("Number unique transcripts")
print("GENCODE:", len(gencode_v47_df["transcript_id"].unique()))
print("HIT:", len(HIT_hg38_df["transcript_id"].unique()))
print("Number unique genes")
print("GENCODE:", len(gencode_v47_df["gene_id"].unique()))
print("HIT:", len(HIT_hg38_df["gene_id"].unique()))

Number unique transcripts
GENCODE: 385659
HIT: 376343
Number unique genes
GENCODE: 78724
HIT: 22119


In [15]:
# Build exon dictionaries for each chromosome
def build_transcript_dict(df):
    chr_dict = defaultdict(list)
    grouped = df.groupby("transcript_id")
    
    for tid, group in grouped:
        seqname = group["seqname"].iloc[0]
        exon_coords = sorted(list(zip(group["start"].astype(int), group["end"].astype(int))))
        chr_dict[seqname].append((tid, exon_coords))
    return chr_dict

HIT_hg38_df_exons = build_transcript_dict(HIT_hg38_df)
gencode_v47_df_exons = build_transcript_dict(gencode_v47_df)

In [16]:
# Find transcripts fully annotated in GENCODE
def exon_overlap(exons1, exons2, tolerance=100):
    for i, ((s1, e1), (s2, e2)) in enumerate(zip(exons1, exons2)):
        if i == 0:  # first exon
            if abs(s1 - s2) > tolerance or e1 != e2:
                return False
        elif i == len(exons1) - 1:  # last exon
            if abs(e1 - e2) > tolerance or s1 != s2:
                return False
        else:  # internal exon
            if s1 != s2 or e1 != e2:
                return False
    return True


# Find new splice variants with ALL exons annotated (GENCODE transcript may contain additional exons)
def subset_match(HIT_exons, gencode_exons, tolerance=100):
    g_idx = 0
    for i, (hit_start, hit_end) in enumerate(HIT_exons):
        found = False
        while g_idx < len(gencode_exons):
            gen_start, gen_end = gencode_exons[g_idx]
            if i == 0:
                # First HIT exon — allow tolerance only on start
                if abs(hit_start - gen_start) <= tolerance and hit_end == gen_end:
                    found = True
                    g_idx += 1
                    break
            elif i == len(HIT_exons) - 1:
                # Last HIT exon — allow tolerance only on end
                if abs(hit_end - gen_end) <= tolerance and hit_start == gen_start:
                    found = True
                    g_idx += 1
                    break
            else:
                # Internal exon — exact match
                if hit_start == gen_start and hit_end == gen_end:
                    found = True
                    g_idx += 1
                    break

            g_idx += 1
        if not found:
            return False

    return True


# Find HIT exons that are annotated in GENCODE
def found_exon(HIT_exon, gencode_exon_dict, position, nr_HIT_exons):
    hit_start, hit_end = HIT_exon
    if position == 0:
        key = (hit_start, hit_end, 'first')
    elif position == nr_HIT_exons - 1:
        key = (hit_start, hit_end, 'last')
    else:
        key = (hit_start, hit_end, 'internal')
    
    if key in gencode_exon_dict:
        return True, gencode_exon_dict[key]
    else:
        return False, set()

In [17]:
# Build a lookup set of GENCODE exons which includes a tolerance window for the start and end of a tx
def build_tolerant_gencode_set(transcripts, tolerance=100):
    exon_dict = defaultdict(set)
    for tid, exons in transcripts:
        for i, (start, end) in enumerate(exons):
            if i == 0:
                for delta in range(-tolerance, tolerance + 1):
                    exon_dict[(start + delta, end, 'first')].add(tid)
            elif i == len(exons) - 1:
                for delta in range(-tolerance, tolerance + 1):
                    exon_dict[(start, end + delta, 'last')].add(tid)
            else:
                exon_dict[(start, end, 'internal')].add(tid)
    return exon_dict

In [18]:
# Merge exons that are directly adjacent to each other
def merge_adjacent_exons(exons, tid):
    if not exons:
        return []
    
    # Sort exons by start coordinate 
    exons = sorted(exons, key=lambda x: x[0])
    merged = [exons[0]]
    merged_count = 0
    
    for start, end in exons[1:]:
        last_start, last_end = merged[-1]
        
        # If current exon starts directly after the last one ends
        if start == last_end + 1:
            # Merge them
            merged[-1] = (last_start, end)
            merged_count += 1
        else:
            merged.append((start, end))

    return merged

In [56]:
matching_transcripts = {}
subset_transcripts = {}
partially_unannotated_transcripts = {}
fully_unannotated_transcripts = {}
new_combo_annotated_exons = {}

for chr_name in HIT_hg38_df_exons:
    print(chr_name)
    if chr_name not in gencode_v47_df_exons:
        continue
    
    HIT_hg38_transcript_list = HIT_hg38_df_exons[chr_name]
    gencode_v47_transcript_list = gencode_v47_df_exons[chr_name]
    
    # Group GENCODE transcripts by exon count
    gencode_by_exon_count = defaultdict(list)
    for tid, exons in gencode_v47_transcript_list:
        gencode_by_exon_count[len(exons)].append((tid, exons))

    # Build fast exon lookup set (for finding individual annotated exons)
    gencode_exon_set = build_tolerant_gencode_set(gencode_v47_transcript_list)

    
    for HIT_tid, HIT_exons in tqdm(HIT_hg38_transcript_list, desc=f"Transcripts in {chr_name}", leave=False):
        HIT_exons = merge_adjacent_exons(HIT_exons, HIT_tid)
        matched = False
        len_hit = len(HIT_exons)
        
        # Try to match entire transcript exactly
        for gencode_tid, gencode_exons in gencode_by_exon_count[len_hit]:
            if exon_overlap(HIT_exons, gencode_exons):
                matching_transcripts[HIT_tid] = {
                    'chr': chr_name,
                    'exons': HIT_exons,
                    'matched_to': gencode_exons,
                    'matched_ref_id': gencode_tid
                }
                matched = True
                break

        if matched:
            continue
            
        # Try subset match (HIT tx contained in GENCODE tx)
        for longer_len in range(len_hit + 1, max(gencode_by_exon_count.keys()) + 1):
            for gencode_tid, gencode_exons in gencode_by_exon_count[longer_len]:
                # Skip if exon coordinates are out of possible range
                if HIT_exons[0][0] > gencode_exons[-1][1] or HIT_exons[-1][1] < gencode_exons[0][0]:
                    continue
                if subset_match(HIT_exons, gencode_exons):
                    subset_transcripts[HIT_tid] = {
                        'chr': chr_name,
                        'exons': HIT_exons,
                        'matched_to': gencode_exons,
                        'matched_ref_id': gencode_tid
                    }
                    matched = True
                    break
            if matched:
                break
            
        if not matched:
            # If entire tx can't be matched, try to match individual exons
            annotated_info = [
                found_exon(exon, gencode_exon_set, i, len_hit)
                for i, exon in enumerate(HIT_exons)
                ]
            annotated_flags = [flag for flag, _ in annotated_info]
            matched_tids = set().union(*[tids for flag, tids in annotated_info if flag])
     
            # No exons are annotated
            if all(not flag for flag in annotated_flags):
                fully_unannotated_transcripts[HIT_tid] = {
                    'chr': chr_name,
                    'exons': HIT_exons
                }
            # All exons are annotated (new splice variant)
            elif all(annotated_flags):
                ref_match_counts = defaultdict(int)
                for flag, tids in annotated_info:
                    for tid in tids:
                        ref_match_counts[tid] += 1

                # Find the reference ID with the most matches
                if ref_match_counts:
                    # Get the highest match count
                    max_count = max(ref_match_counts.values())
    
                    # Find all ref IDs with that count
                    tied_ref_ids = [tid for tid, count in ref_match_counts.items() if count == max_count]
    
                    if len(tied_ref_ids) == 1:
                        # No tie: take the sole best match
                        best_ref_id = tied_ref_ids[0]
                    else:
                        # Tie: resolve by comparing tx lengths
                        hit_exons = dict(HIT_hg38_transcript_list)[HIT_tid]
                        hit_length = hit_exons[-1][1] - hit_exons[0][0]
        
                        ref_lengths = {}
                        for tid in tied_ref_ids:
                            ref_exons = dict(gencode_v47_transcript_list)[tid]
                            ref_lengths[tid] = ref_exons[-1][1] - ref_exons[0][0]
        
                        # Choose the ref ID with length closest to HIT
                        best_ref_id = min(ref_lengths.items(), key=lambda x: abs(x[1] - hit_length))[0]
                else:
                    best_ref_id = None

                # Save result
                new_combo_annotated_exons[HIT_tid] = {
                    'chr': chr_name,
                    'exons': HIT_exons,
                    'matched_ref_id': list(matched_tids),
                    'best_match_ref_id': best_ref_id
                }
            # Some exons annotated
            elif any(not flag for flag in annotated_flags):
                unannotated_exons = [
                    exon for exon, is_annotated in zip(HIT_exons, annotated_flags) if not is_annotated
                ]
                # Count how many times each reference ID appears across annotated exons
                ref_match_counts = defaultdict(int)
                for flag, tids in annotated_info:
                    if flag:
                        for tid in tids:
                            ref_match_counts[tid] += 1

                # Find the reference ID with the most matches
                if ref_match_counts:
                    # Get the highest match count
                    max_count = max(ref_match_counts.values())
    
                    # Find all ref IDs with that count
                    tied_ref_ids = [tid for tid, count in ref_match_counts.items() if count == max_count]
    
                    if len(tied_ref_ids) == 1:
                        # No tie: take the sole best match
                        best_ref_id = tied_ref_ids[0]
                    else:
                        # Tie: resolve by comparing tx lengths
                        hit_exons = dict(HIT_hg38_transcript_list)[HIT_tid]
                        hit_length = hit_exons[-1][1] - hit_exons[0][0]
        
                        ref_lengths = {}
                        for tid in tied_ref_ids:
                            ref_exons = dict(gencode_v47_transcript_list)[tid]
                            ref_lengths[tid] = ref_exons[-1][1] - ref_exons[0][0]
        
                        # Choose the ref ID with length closest to HIT
                        best_ref_id = min(ref_lengths.items(), key=lambda x: abs(x[1] - hit_length))[0]
                else:
                    best_ref_id = None

                # Save result
                partially_unannotated_transcripts[HIT_tid] = {
                    'chr': chr_name,
                    'unannotated_exons': unannotated_exons,
                    'exons': HIT_exons,
                    'matched_ref_id': list(matched_tids),
                    'best_match_ref_id': best_ref_id
                }

chr1


                                                                           

chr10


                                                                            

chr11


                                                                            

chr12


                                                                            

chr13


                                                                          

chr14


                                                                            

chr15


                                                                            

chr16


                                                                            

chr17


                                                                            

chr18


                                                                           

chr19


                                                                            

chr2


                                                                           

chr20


                                                                           

chr21


                                                                           

chr22


                                                                           

chr3


                                                                           

chr4


                                                                           

chr5


                                                                           

chr6


                                                                           

chr7


                                                                           

chr8


                                                                           

chr9


                                                                           

chrX


                                                                           

chrY


                                                                        

In [57]:
len(matching_transcripts)

38891

In [58]:
len(subset_transcripts)

25891

In [59]:
len(partially_unannotated_transcripts)

262784

In [60]:
len(fully_unannotated_transcripts)

17755

In [61]:
len(new_combo_annotated_exons)

31022

In [62]:
len(matching_transcripts) + len(subset_transcripts) + len(partially_unannotated_transcripts) + len(fully_unannotated_transcripts) + len(new_combo_annotated_exons)

376343

In [63]:
sum(len(v) for v in HIT_hg38_df_exons.values())

376343

### Split up partially unannotated tx into overlapping & non-overlapping

In [64]:
# Convert dictionary of partially unannotated exons into PyRanges
rows = []
for tid, info in partially_unannotated_transcripts.items():
    for exon in info["unannotated_exons"]:
        start, end = exon
        rows.append({
            "Chromosome": info["chr"],
            "Start": start,
            "End": end,
            "transcript_id": tid
        })

partially_unannotated_df = pd.DataFrame(rows)
# Add strand info
strand_info = HIT_hg38_df[['transcript_id', 'strand']].drop_duplicates(subset='transcript_id')
strand_info = strand_info.rename(columns={'strand': 'Strand'})

# Merge 
partially_unannotated_df = partially_unannotated_df.merge(
    strand_info,
    on='transcript_id',
    how='left'
)

partially_unannotated_pr = pr.PyRanges(partially_unannotated_df)


gencode_pr = pr.PyRanges(
    gencode_v47_df.rename(columns={
        "seqname": "Chromosome",
        "start": "Start",
        "end": "End",
        "strand": "Strand"
    })[["Chromosome", "Start", "End", "Strand", "transcript_id"]]
)

In [65]:
# Join the unannotated exon df with gencode exons
overlapping_exon_pu = partially_unannotated_pr.join(gencode_pr)
overlapping_exon_pu = overlapping_exon_pu[overlapping_exon_pu.Strand == overlapping_exon_pu.Strand_b]
overlapping_exon_pu = overlapping_exon_pu.df
overlapping_exon_pu = overlapping_exon_pu.rename(columns={
    "transcript_id": "HIT_transcript_id",
    "transcript_id_b": "gencode_transcript_id"
})

In [66]:
# Assign each exon as overlapping or non-overlapping
overlapping_coords_pu = set(
    tuple(row) for row in overlapping_exon_pu[["Chromosome", "Start", "End"]].values
)

partially_unannotated_df["is_overlapping"] = partially_unannotated_df.apply(
    lambda row: (row["Chromosome"], row["Start"], row["End"]) in overlapping_coords_pu,
    axis=1
)

In [67]:
import pickle

with open("partially_unannotated_df.pkl", "wb") as f:
    pickle.dump(partially_unannotated_df, f)

In [68]:
# Split up partially annotated tx into those with exon overlap and those not overlapping gencode exon
non_overlapping_ids_pu = set(
    partially_unannotated_df.loc[partially_unannotated_df["is_overlapping"] == False, "transcript_id"]
)
overlapping_ids_pu = set(partially_unannotated_df["transcript_id"]) - non_overlapping_ids_pu

nonoverlapping_partially_unannotated = {}
overlapping_partially_unannotated = {}

nonoverlapping_partially_unannotated = {
    tid: val for tid, val in partially_unannotated_transcripts.items()
    if tid in non_overlapping_ids_pu
}

overlapping_partially_unannotated = {
    tid: val for tid, val in partially_unannotated_transcripts.items()
    if tid in overlapping_ids_pu
}

### Split up fully unannotated tx into overlapping & non-overlapping

In [69]:
# Now do the same for the fully unannotated tx
rows = []
for tid, info in fully_unannotated_transcripts.items():
    for exon in info["exons"]:
        start, end = exon
        rows.append({
            "Chromosome": info["chr"],
            "Start": start,
            "End": end,
            "transcript_id": tid
        })

fully_unannotated_df = pd.DataFrame(rows)

# Merge with strand info
fully_unannotated_df = fully_unannotated_df.merge(
    strand_info,
    on='transcript_id',
    how='left'
)
fully_unannotated_pr = pr.PyRanges(fully_unannotated_df)


In [70]:
# join the unannotated exon df with gencode exons
overlapping_exon_fu = fully_unannotated_pr.join(gencode_pr)
overlapping_exon_fu = overlapping_exon_fu[overlapping_exon_fu.Strand == overlapping_exon_fu.Strand_b]
overlapping_exon_fu = overlapping_exon_fu.df
overlapping_exon_fu = overlapping_exon_fu.rename(columns={
    "transcript_id": "HIT_transcript_id",
    "transcript_id_b": "gencode_transcript_id"
})

In [71]:
overlapping_exon_fu["overlap_size"] = (
    overlapping_exon_fu[["Start", "End", "Start_b", "End_b"]]
    .apply(lambda row: max(0, min(row["End"], row["End_b"]) - max(row["Start"], row["Start_b"])), axis=1)
)

In [72]:
# Assign each exon as overlapping or non-overlapping
overlapping_coords_fu = set(
    tuple(row) for row in overlapping_exon_fu[["Chromosome", "Start", "End"]].values
)

fully_unannotated_df["is_overlapping"] = fully_unannotated_df.apply(
    lambda row: (row["Chromosome"], row["Start"], row["End"]) in overlapping_coords_fu,
    axis=1
)

In [73]:
# Compute total overlap size for each HIT-gencode transcript pair
tx_overlap = (
    overlapping_exon_fu
    .groupby(["HIT_transcript_id", "gencode_transcript_id"])["overlap_size"]
    .sum()
    .reset_index()
)

# Select the gencode transcript with the highest total overlap per HIT transcript
best_tx_overlap = (
    tx_overlap
    .sort_values(by=["overlap_size", "gencode_transcript_id"], ascending=[False, True])
    .drop_duplicates(subset=["HIT_transcript_id"])
    .rename(columns={
        "HIT_transcript_id": "transcript_id",
        "gencode_transcript_id": "largest_overlap_gencode_id"
    })[["transcript_id", "largest_overlap_gencode_id"]]
)

# Merge best transcript-level overlap into unannotated_df
fully_unannotated_df = fully_unannotated_df.merge(
    best_tx_overlap,
    on="transcript_id",
    how="left"
)

In [74]:
# Split up fully unannotated tx into those with exon overlap and those not overlapping gencode exon
overlapping_ids_fu = set(
    fully_unannotated_df.loc[fully_unannotated_df["is_overlapping"] == True, "transcript_id"]
)
non_overlapping_ids_fu = set(fully_unannotated_df["transcript_id"]) - overlapping_ids_fu


nonoverlapping_fully_unannotated = {}
overlapping_fully_unannotated = {}

nonoverlapping_fully_unannotated = {
    tid: val for tid, val in fully_unannotated_transcripts.items()
    if tid in non_overlapping_ids_fu
}

overlapping_fully_unannotated = {
    tid: val for tid, val in fully_unannotated_transcripts.items()
    if tid in overlapping_ids_fu
}

for tid in overlapping_fully_unannotated:
    # Get all gencode IDs with the largest overlaps for this transcript
    gencode_ids = fully_unannotated_df.loc[
        fully_unannotated_df["transcript_id"] == tid, "largest_overlap_gencode_id"
    ].dropna().values[0]

    # Add as a new key to the transcript entry
    overlapping_fully_unannotated[tid]["largest_overlap_gencode_id"] = gencode_ids

In [75]:
def dict_to_df(d, label):
    df = pd.DataFrame.from_dict(d, orient='index')
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'transcript_id'}, inplace=True)
    df['category'] = label
    return df

In [76]:
# Convert dictionaries into df
df_nonoverlap_partial = dict_to_df(nonoverlapping_partially_unannotated, "Transcripts with annotated exons and non-overlapping unannotated exons")
df_overlap_partial = dict_to_df(overlapping_partially_unannotated, "Transcripts with annotated exons and only overlapping unannotated exons")
df_unannotated_overlap = dict_to_df(overlapping_fully_unannotated, "Fully unannotated transcripts with overlapping exons")
df_unannotated_nonoverlap = dict_to_df(nonoverlapping_fully_unannotated, "Fully unannotated transcripts without any overlapping exons")
df_combo = dict_to_df(new_combo_annotated_exons, "Unannotated transcripts composed entirely of annotated exons")
df_match = dict_to_df(matching_transcripts, "GENCODE-annotated transcripts")
df_subset = dict_to_df(subset_transcripts, "Unannotated transcripts fully contained within GENCODE transcripts")


combined_df = pd.concat([df_match, df_subset, df_nonoverlap_partial, df_overlap_partial, df_unannotated_overlap, df_unannotated_nonoverlap, df_combo], ignore_index=True)

In [77]:
# Add strand info
combined_df = combined_df.merge(
    strand_info[["transcript_id", "Strand"]],
    how='left',
    on='transcript_id',
)

In [78]:
combined_df["category"].value_counts()

category
Transcripts with annotated exons and only overlapping unannotated exons    199109
Transcripts with annotated exons and non-overlapping unannotated exons      63675
GENCODE-annotated transcripts                                               38891
Unannotated transcripts composed entirely of annotated exons                31022
Unannotated transcripts fully contained within GENCODE transcripts          25891
Fully unannotated transcripts with overlapping exons                        14464
Fully unannotated transcripts without any overlapping exons                  3291
Name: count, dtype: int64

In [79]:
def summarize_tx_category(df):
    tx_category = df['category']
    if tx_category == 'GENCODE-annotated transcripts':
        summarized_category = "GENCODE-annotated transcripts"
        ref_id = df['matched_ref_id']
        
    elif tx_category == "Unannotated transcripts fully contained within GENCODE transcripts":
        summarized_category = "Unannotated transcripts with all exons annotated"
        ref_id = df['matched_ref_id']
        
    elif tx_category == "Unannotated transcripts composed entirely of annotated exons":
        summarized_category = "Unannotated transcripts with all exons annotated"
        ref_id = df['best_match_ref_id']
        
    elif tx_category in ["Transcripts with annotated exons and only overlapping unannotated exons", "Transcripts with annotated exons and non-overlapping unannotated exons"]:
        summarized_category = "Unannotated transcripts with annotated and unannotated exons"
        ref_id = df['best_match_ref_id']
        
    elif tx_category == "Fully unannotated transcripts with overlapping exons":
        summarized_category = "Unannotated transcripts without any annotated exons"
        ref_id = df['largest_overlap_gencode_id']
        
    elif tx_category == "Fully unannotated transcripts without any overlapping exons":
        summarized_category = "Unannotated transcripts without any annotated exons"
        ref_id = 'None'
    
    else:
        summarized_category = 'None'
        ref_id = 'None'
        
    return summarized_category, ref_id

In [80]:
combined_df[['summarized_category', 'ref_id']] = combined_df.apply(summarize_tx_category, axis=1, result_type='expand')

## Add gene name and type info

In [81]:
gene_names = pd.read_csv('mart_export.txt',
                           sep='\t')
gene_names.head()

Unnamed: 0,Gene stable ID,Gene stable ID version,Transcript stable ID,Transcript stable ID version,Gene name,Gene type
0,ENSG00000210049,ENSG00000210049.1,ENST00000387314,ENST00000387314.1,MT-TF,Mt_tRNA
1,ENSG00000211459,ENSG00000211459.2,ENST00000389680,ENST00000389680.2,MT-RNR1,Mt_rRNA
2,ENSG00000210077,ENSG00000210077.1,ENST00000387342,ENST00000387342.1,MT-TV,Mt_tRNA
3,ENSG00000210082,ENSG00000210082.2,ENST00000387347,ENST00000387347.2,MT-RNR2,Mt_rRNA
4,ENSG00000209082,ENSG00000209082.1,ENST00000386347,ENST00000386347.1,MT-TL1,Mt_tRNA


In [82]:
combined_df.head()

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000699992.1
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000756371.1
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000676153.1
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000374619.2
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000771587.1


In [83]:
combined_df['stripped_ref_id'] = combined_df['ref_id'].str.replace(r'\.\d+$', '', regex=True)


combined_df = combined_df.merge(
    gene_names[["Transcript stable ID", "Gene stable ID", "Gene name", "Gene type"]],
    how='left',
    left_on='stripped_ref_id',
    right_on='Transcript stable ID'
).drop(columns="Transcript stable ID")


In [84]:
combined_df["Gene type"].value_counts()

Gene type
protein_coding                        330761
lncRNA                                 41009
transcribed_unprocessed_pseudogene       462
processed_pseudogene                     366
transcribed_unitary_pseudogene           133
TEC                                       60
transcribed_processed_pseudogene          50
unprocessed_pseudogene                    46
IG_V_gene                                 38
snRNA                                     29
misc_RNA                                  16
miRNA                                     15
snoRNA                                    14
TR_C_gene                                  5
scaRNA                                     5
unitary_pseudogene                         5
ribozyme                                   1
Name: count, dtype: int64

In [85]:
combined_df["Gene name"].isna().sum()

np.int64(20207)

In [86]:
combined_df["Gene type"].isna().sum()

np.int64(3328)

In [87]:
combined_df[combined_df['Gene name'].isna()]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000771587.1,ENST00000771587,ENSG00000300424,,lncRNA
121,ENST00000289779.3,chr1,"[(160997957, 160998906), (160999043, 160999091...","[(160997957, 160998906), (160999043, 160999091...",ENST00000289779.7,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000289779.7,ENST00000289779,ENSG00000270149,,protein_coding
516,ENST00000362058.2,chr1,"[(16629670, 16631109), (16631530, 16631613), (...","[(16629670, 16631109), (16631530, 16631613), (...",ENST00000362058.2,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000362058.2,ENST00000362058,ENSG00000291072,,lncRNA
1482,ENST00000412483.1,chr1,"[(234212606, 234214107), (234214868, 234215088)]","[(234212606, 234214107), (234214868, 234215088)]",ENST00000412483.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000412483.1,ENST00000412483,ENSG00000233332,,lncRNA
1504,ENST00000415019.1,chr1,"[(158197922, 158199835), (158203793, 158203877)]","[(158197922, 158199835), (158203793, 158203877)]",ENST00000415019.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000415019.1,ENST00000415019,ENSG00000176320,,lncRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376237,STRT02276681,chrX,"[(136909443, 136909780), (136922445, 136922776...",,"[ENST00000685550.1, ENST00000759417.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000759400.1,,+,Unannotated transcripts with all exons annotated,ENST00000759400.1,ENST00000759400,ENSG00000291054,,lncRNA
376270,STRT02285555,chrX,"[(102599494, 102599847), (102601464, 102601509...",,"[ENST00000602366.5, ENST00000332262.10, ENST00...",Unannotated transcripts composed entirely of a...,,ENST00000466616.6,,+,Unannotated transcripts with all exons annotated,ENST00000466616.6,ENST00000466616,ENSG00000271147,,lncRNA
376305,ENCT00000484963.1,chrY,"[(11161704, 11163137), (11212744, 11212802), (...",,"[ENST00000796802.1, ENST00000796805.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000796795.1,,-,Unannotated transcripts with all exons annotated,ENST00000796795.1,ENST00000796795,ENSG00000291032,,lncRNA
376330,STRT02292412,chrY,"[(12406182, 12406500), (12406796, 12406937), (...",,"[ENST00000357871.6, ENST00000689264.2, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000737339.1,,-,Unannotated transcripts with all exons annotated,ENST00000737339.1,ENST00000737339,ENSG00000291034,,lncRNA


In [91]:
combined_df["category"].unique()

array(['GENCODE-annotated transcripts',
       'Unannotated transcripts fully contained within GENCODE transcripts',
       'Transcripts with annotated exons and non-overlapping unannotated exons',
       'Transcripts with annotated exons and only overlapping unannotated exons',
       'Fully unannotated transcripts with overlapping exons',
       'Fully unannotated transcripts without any overlapping exons',
       'Unannotated transcripts composed entirely of annotated exons'],
      dtype=object)

In [92]:
combined_df[(combined_df["Gene type"].isna()) & (combined_df["category"] != "Fully unannotated transcripts without any overlapping exons")]["stripped_ref_id"].unique()

array(['ENST00000351839', 'ENST00000391947', 'ENST00000543133',
       'ENST00000636394', 'ENST00000334318', 'ENST00000418951',
       'ENST00000617484', 'ENST00000612355', 'ENST00000458258',
       'ENST00000756245', 'ENST00000797550'], dtype=object)

In [94]:
manual_gene_info = {
    "ENST00000351839": {"Gene stable ID": "ENSG00000165119", "Gene name": "HNRNPK", "Gene type": "protein_coding"},
    "ENST00000636394": {"Gene stable ID": "ENSG00000279170", "Gene name": "TSTD3", "Gene type": "protein_coding"},
    "ENST00000617484": {"Gene stable ID": "ENSG00000109819", "Gene name": "PPARGC1A", "Gene type": "protein_coding"},
    "ENST00000756245": {"Gene stable ID": "ENSG00000298529", "Gene name": "ENSG00000298529", "Gene type": "lncRNA"},

    "ENST00000391947": {"Gene stable ID": "ENSG00000198625", "Gene name": "MDM4", "Gene type": "protein_coding"},
    "ENST00000334318": {"Gene stable ID": "ENSG00000119314", "Gene name": "PTBP3", "Gene type": "protein_coding"},
    "ENST00000612355": {"Gene stable ID": "ENSG00000109819", "Gene name": "PPARGC1A", "Gene type": "protein_coding"},
    "ENST00000797550": {"Gene stable ID": "ENSG00000303858", "Gene name": "ENSG00000303858", "Gene type": "lncRNA"},

    "ENST00000543133": {"Gene stable ID": "ENSG00000099968", "Gene name": "BCL2L13", "Gene type": "protein_coding"},
    "ENST00000418951": {"Gene stable ID": "ENSG00000099968", "Gene name": "BCL2L13", "Gene type": "protein_coding"},
    "ENST00000458258": {"Gene stable ID": "ENSG00000119314", "Gene name": "PTBP3", "Gene type": "protein_coding"} 
}

for ref_id, gene_info in manual_gene_info.items():
    mask = combined_df["stripped_ref_id"] == ref_id
    for col, value in gene_info.items():
        combined_df.loc[mask, col] = value

In [95]:
combined_df[(combined_df["Gene type"].isna()) & (combined_df["category"] != "Fully unannotated transcripts with non-overlapping exons")]["stripped_ref_id"].unique()

array(['None'], dtype=object)

In [96]:
gene_type_map = {
    'protein_coding': 'protein_coding',
    'lncRNA': 'lncRNA'
}

# Apply the mapping to make a new category column
combined_df['gene_category'] = combined_df['Gene type'].map(gene_type_map)

# Fill the rest: NaN --> 'unmapped', others --> 'other_noncoding'
combined_df['gene_category'] = combined_df['gene_category'].fillna(
    combined_df['Gene type'].apply(lambda x: 'unmapped' if pd.isna(x) else 'other_noncoding')
)
combined_df.head()

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000699992.1,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000756371.1,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000676153.1,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000374619.2,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000771587.1,ENST00000771587,ENSG00000300424,,lncRNA,lncRNA


## Add transcript expression info

In [3]:
tpm_df = pd.read_csv( 'TSS_info_w_lift.tsv', sep='\t')
tpm_df.head()

Unnamed: 0,chromosome,tx_start,tx_end,strand,transcript_id,gene_id,gene_name,Gene class,gene_type,transcript_type,...,low_tpm,peak,"TSS expression high glucose (Salmon, TPM)",Relative TSS usage,TSS_type,distance_to_closest_TSS,Distance to closest Gencode TSS,name,new_1st_exon,new_name
0,chr1,14362,29370,-,ENST00000423562.1,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,...,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
1,chr1,14359,29350,-,PBT00037199,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,...,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
2,chr1,14359,29350,-,PBT00037200,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,...,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
3,chr1,14360,29364,-,PBT00037201,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,...,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1
4,chr1,14362,29346,-,PBT00037202,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,...,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1


In [7]:
pd.set_option('display.max_columns', None)

In [5]:
# Add back the TPMs per transcript 
tpm_df['mean_TPM_ref'] = tpm_df['relative_tx_exp'] * tpm_df['gene_exp_ref'] 

In [6]:
# Remove rows with no expression and log transform
tpm_df = tpm_df[tpm_df['mean_TPM_ref'] != 0].copy()
tpm_df['log2_tx_exp'] = np.log2(tpm_df['mean_TPM_ref'])
tpm_df.head()

Unnamed: 0,chromosome,tx_start,tx_end,strand,transcript_id,gene_id,gene_name,Gene class,gene_type,transcript_type,protein_id_orf_summary,transcript_classification,Transcript expression high glucose (TPM),gene_exp_ref,relative_tx_exp,CAGE_ID,TSS_start,TSS_end,high_tpm,low_tpm,peak,"TSS expression high glucose (Salmon, TPM)",Relative TSS usage,TSS_type,distance_to_closest_TSS,Distance to closest Gencode TSS,name,new_1st_exon,new_name,mean_TPM_ref,log2_tx_exp
0,chr1,14362,29370,-,ENST00000423562.1,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,UniProtKB=B3KS13,Known Protein coding,0.219906,27.348529,0.008041,chr1:29336-29359:-,29336,29359,17.58083,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1,0.219906,-2.185038
1,chr1,14359,29350,-,PBT00037199,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,,Novel Protein coding not overlapping annotated...,3.422413,27.348529,0.125141,chr1:29336-29359:-,29336,29359,17.58083,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1,3.422413,1.775014
2,chr1,14359,29350,-,PBT00037200,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,"Closest_UniProtKB=A9QQ17, percentage_UniProtKB...",Novel Protein coding not overlapping annotated...,0.172042,27.348529,0.006291,chr1:29336-29359:-,29336,29359,17.58083,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1,0.172042,-2.539164
3,chr1,14360,29364,-,PBT00037201,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,,Non coding,0.071035,27.348529,0.002597,chr1:29336-29359:-,29336,29359,17.58083,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1,0.071035,-3.815321
4,chr1,14362,29346,-,PBT00037202,ENSG00000227232,WASH7P,misc RNA,pseudogene,new_transcript,,Non coding,0.32226,27.348529,0.011783,chr1:29336-29359:-,29336,29359,17.58083,17.856425,Robust,27.348529,1.0,unique TSS,211,>100bp,chr1:29336-29359_Peak_1,yes,chr1:29336-29359_Peak_1,0.32226,-1.633703


In [8]:
combined_df = combined_df.drop(columns=["log2_tx_exp", "Relative TSS usage"])

In [20]:
combined_df.head()

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,peak_name,peak_type,og_gene_id,mean_TPM_ref,log2_tx_exp,gene_exp_ref,relative_tx_exp,TSS_TPM_per_gene,Relative TSS usage,TSS_type
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000699992.1,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding,chr1:61077221-61077380_Peak_3111,Permissive,ENSG00000162599,0.178215,-2.48831,4.82035,0.036971,0.213765,0.044346,minor TSS
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000756371.1,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA,chr1:160261726-160261763_Peak_5604,Permissive,ENSG00000228606,0.358414,-1.4803,0.743874,0.481821,0.743874,1.0,unique TSS
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000676153.1,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding,chr1:239386576-239386587_Peak_7900,Robust,ENSG00000133019,0.468766,-1.093061,3.12717,0.149901,1.42149,0.454561,main TSS
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000374619.2,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding,chr1:23217498-23217499_Peak_1212,Permissive,ENSG00000179546,0.064644,-3.951339,0.116743,0.553728,0.116743,1.0,unique TSS
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000771587.1,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,lncRNA,chr1:233327323-233327461_Peak_7654,Permissive,FTMG005195,0.616169,-0.698603,2.291033,0.268948,2.093885,0.913948,main TSS


In [10]:
# Merge tx tpm info with comparison results
combined_df = combined_df.merge(
    tpm_df[['transcript_id', 'mean_TPM_ref', 'log2_tx_exp']].rename(columns={'transcript_id': 'merge_key'}),
    how='left',
    left_on='transcript_id',
    right_on='merge_key'
).drop(columns='merge_key')

In [21]:
len(combined_df)

376342

In [13]:
# Add the gene expression (sum of all the transcripts from the same gene) 
f = lambda x: x.sum()
combined_df['gene_exp_ref'] = combined_df.groupby('Gene stable ID')['mean_TPM_ref'].transform(f)

In [14]:
# get the transcript relative expression 
combined_df['relative_tx_exp'] = combined_df['mean_TPM_ref'] / combined_df['gene_exp_ref']

In [15]:
# add the sum of TPM expression of all the transcripts from one gene starting from the same TSS
f = lambda x: x.sum()
combined_df['TSS_TPM_per_gene'] = combined_df.groupby(['Gene stable ID','peak_name'])['mean_TPM_ref'].transform(f)

In [16]:
combined_df['Relative TSS usage'] = combined_df['TSS_TPM_per_gene']/combined_df['gene_exp_ref']

In [17]:
# control that no gene has an total relative expression above 1
combined_df[(combined_df['Relative TSS usage'] > 1)]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,peak_name,peak_type,og_gene_id,mean_TPM_ref,log2_tx_exp,gene_exp_ref,relative_tx_exp,TSS_TPM_per_gene,Relative TSS usage


In [18]:
TSS_gene_max = combined_df.groupby('Gene stable ID')['Relative TSS usage'].max()

In [19]:
def get_TSS_type(df):
    if df['Relative TSS usage'] == 1:
        TSS_type = 'unique TSS'
    elif df['Relative TSS usage'] == TSS_gene_max.loc[df['Gene stable ID']]:
        TSS_type = 'main TSS'
    elif df['Relative TSS usage'] < 0.2:
        TSS_type = 'minor TSS'
    else:
        TSS_type = 'secondary TSS'
    return TSS_type


# Add if the TSS is unique for the gene or the main / secondary / minor(less than 20%)
combined_df['TSS_type'] = combined_df.apply(get_TSS_type, axis=1)

In [99]:
# Merge tpm info with comparison results
combined_df = combined_df.merge(
    tpm_df[['transcript_id', 'log2_tx_exp', 'Relative TSS usage', 'new_name', 'peak']].rename(columns={'transcript_id': 'merge_key'}),
    how='left',
    left_on='transcript_id',
    right_on='merge_key'
).drop(columns='merge_key')

combined_df.rename(columns={"new_name": "peak_name"}, inplace=True)
combined_df.rename(columns={"peak": "peak_type"}, inplace=True)
combined_df.head()

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,...,ENST00000699992.1,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding,-2.48831,0.044346,chr1:61077221-61077380_Peak_3111,Permissive
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,...,ENST00000756371.1,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA,-1.4803,1.0,chr1:160261726-160261763_Peak_5604,Permissive
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,...,ENST00000676153.1,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding,-1.093061,0.454561,chr1:239386576-239386587_Peak_7900,Robust
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,...,ENST00000374619.2,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding,-3.951339,1.0,chr1:23217498-23217499_Peak_1212,Permissive
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,...,ENST00000771587.1,ENST00000771587,ENSG00000300424,,lncRNA,lncRNA,-0.698603,1.0,chr1:233327323-233327461_Peak_7654,Permissive


In [100]:
perfect_matches = combined_df[combined_df['category'] == "GENCODE-annotated transcripts"]

In [101]:
# Find cases where multiple transcripts have same GENCODE perfect match (redundancy)
perfect_matches[(perfect_matches['ref_id'].duplicated(keep=False))].sort_values("ref_id")

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type
11512,ENST00000204566.2,chr15,"[(64963021, 64963736), (64965320, 64965460), (...","[(64963022, 64963736), (64965320, 64965460), (...",ENST00000204566.7,GENCODE-annotated transcripts,,,,-,...,ENST00000204566.7,ENST00000204566,ENSG00000090487,SPG21,protein_coding,protein_coding,4.442431,0.710032,chr15:64989775-64989928_Peak_26056,Robust
12724,MICT00000118180.1,chr15,"[(64963022, 64963736), (64965320, 64965460), (...","[(64963022, 64963736), (64965320, 64965460), (...",ENST00000204566.7,GENCODE-annotated transcripts,,,,-,...,ENST00000204566.7,ENST00000204566,ENSG00000090487,SPG21,protein_coding,protein_coding,-2.82994,0.710032,chr15:64989775-64989928_Peak_26056,Robust


In [102]:
# Drop the transcript with worse match to GENCODE ref
combined_df = combined_df[combined_df['transcript_id'] != 'ENST00000204566.2']

In [103]:
# Fill NA gene names with the gene ID
combined_df['Gene name'] = combined_df['Gene name'].fillna(combined_df['Gene stable ID'])

In [104]:
# Add back the orignal gene ID (Andrew's version)
hit_unique = HIT_hg38_df[['transcript_id', 'gene_id']].drop_duplicates(subset='transcript_id')
combined_df = combined_df.merge(hit_unique, on='transcript_id', how='left')
combined_df = combined_df.rename(columns={'gene_id': 'og_gene_id'})

In [105]:
# Assign new Gene ID/name to unmapped tx
mask = combined_df['Gene stable ID'].isna()

peak_ids = combined_df.loc[mask, 'peak_name'].str.extract(r'(Peak_\d+)', expand=False)
combined_df.loc[mask, 'Gene stable ID'] = peak_ids
combined_df.loc[mask, 'Gene name'] = peak_ids

In [106]:
combined_df[combined_df["gene_category"] == "unmapped"].sort_values("Gene name")

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
342378,PBT00056522,chr10,"[(87691030, 87699809)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_10119,Peak_10119,,unmapped,-1.353093,1.000000,chr10:87691007-87691017_Peak_10119,Permissive,PBG006147
342442,STRT00252480,chr10,"[(88254372, 88254547), (88259789, 88261003)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_10177,Peak_10177,,unmapped,-3.250237,1.000000,chr10:88254381-88254385_Peak_10177,Permissive,STRG028391
342450,STRT00260558,chr10,"[(88254385, 88254547), (88365733, 88365827)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_10177,Peak_10177,,unmapped,-6.209846,1.000000,chr10:88254381-88254385_Peak_10177,Permissive,STRG028391
342465,STRT00270734,chr10,"[(88254334, 88254547), (88278950, 88279099)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_10177,Peak_10177,,unmapped,-5.259763,0.042094,chr10:88254381-88254385_Peak_10177,Permissive,PBG006152
342512,STRT00312163,chr10,"[(88254421, 88254547), (88316020, 88316785)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_10177,Peak_10177,,unmapped,-4.843592,1.000000,chr10:88254381-88254385_Peak_10177,Permissive,STRG028391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342392,PBT00058606,chr10,"[(78431851, 78434440), (78437405, 78437611), (...",,,Fully unannotated transcripts without any over...,,,,-,...,,Peak_9894,Peak_9894,,unmapped,-4.491659,1.000000,chr10:78458139-78458287_Peak_9894,Permissive,PBG007271
342078,HBMT00000006198.1,chr1,"[(19485741, 19486944), (19490178, 19492914)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_996,Peak_996,,unmapped,-4.207605,0.196512,chr1:19485730-19485780_Peak_996,Permissive,FTMG000306
342032,ENCT00000002302.1,chr1,"[(19485741, 19489397)]",,,Fully unannotated transcripts without any over...,,,,+,...,,Peak_996,Peak_996,,unmapped,-3.132117,0.196512,chr1:19485730-19485780_Peak_996,Permissive,FTMG000306
342394,PBT00058610,chr10,"[(79977809, 79982054)]",,,Fully unannotated transcripts without any over...,,,,-,...,,Peak_9964,Peak_9964,,unmapped,-3.456994,1.000000,chr10:79982070-79982071_Peak_9964,Permissive,PBG007290


In [107]:
combined_df["Gene stable ID"].nunique()

22068

In [108]:
combined_df["og_gene_id"].nunique()

22119

In [109]:
#combined_df.to_csv('jessies_gffcompare.tsv', sep='\t', index=False)

In [110]:
len(combined_df)

376342

In [111]:
# not a perfect match but old transcript ID is from GENCODE
combined_df[(combined_df['category'] != "GENCODE-annotated transcripts") & (combined_df['transcript_id'].str.startswith("ENS"))]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
38927,ENST00000235329.5,chr1,"[(11980197, 11980484), (11981970, 11982114), (...","[(11980209, 11980484), (11981970, 11982114), (...",ENST00000675817.1,Unannotated transcripts fully contained within...,,,,+,...,ENST00000675817,ENSG00000116688,MFN2,protein_coding,protein_coding,1.842377,0.849730,chr1:11980204-11980451_Peak_666,Robust,ENSG00000116688
38928,ENST00000236914.3,chr1,"[(200027634, 200027911), (200043774, 200043892...","[(200027710, 200027911), (200039658, 200039795...",ENST00000367362.8,Unannotated transcripts fully contained within...,,,,+,...,ENST00000367362,ENSG00000116833,NR5A2,protein_coding,protein_coding,2.091592,0.573290,chr1:200027537-200027864_Peak_6480,Robust,ENSG00000116833
38929,ENST00000239457.5,chr1,"[(173824659, 173825356), (173826687, 173826786...","[(173824673, 173825356), (173826687, 173826786...",ENST00000649689.2,Unannotated transcripts fully contained within...,,,,+,...,ENST00000649689,ENSG00000117593,DARS2,protein_coding,protein_coding,-1.795171,0.930435,chr1:173824474-173824687_Peak_6035,Robust,ENSG00000117593
38930,ENST00000253251.8,chr1,"[(10032959, 10033694), (10072028, 10072214), (...","[(10032958, 10033694), (10072028, 10072214), (...",ENST00000343090.11,Unannotated transcripts fully contained within...,,,,+,...,ENST00000343090,ENSG00000130939,UBE4B,protein_coding,protein_coding,2.490205,0.997981,chr1:10032954-10033166_Peak_545,Robust,ENSG00000130939
38931,ENST00000261443.5,chr1,"[(114716920, 114718216), (114718613, 114718745...","[(114716913, 114718216), (114718613, 114718745...",ENST00000339438.10,Unannotated transcripts fully contained within...,,,,-,...,ENST00000339438,ENSG00000009307,CSDE1,protein_coding,protein_coding,1.739073,0.712407,chr1:114757960-114757994_Peak_4265,Robust,ENSG00000009307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375496,ENST00000543642.1,chrX,"[(37349330, 37349469), (37406209, 37406259), (...",,"[ENST00000850645.1, ENST00000465127.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000378628.9,,+,...,ENST00000378628,ENSG00000130962,PRRG1,protein_coding,protein_coding,-0.986525,1.000000,chrX:37349326-37349369_Peak_78599,Robust,ENSG00000130962
375497,ENST00000545566.1,chrX,"[(13653123, 13653188), (13655289, 13655373), (...",,"[ENST00000490617.1, ENST00000544987.3, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000696128.1,,+,...,ENST00000696128,ENSG00000176896,TCEANC,protein_coding,protein_coding,-3.353878,1.000000,chrX:13653136-13653193_Peak_78335,Robust,ENSG00000176896
375498,ENST00000545618.1,chrX,"[(64916370, 64917896), (64919042, 64919204), (...",,"[ENST00000337990.2, ENST00000492653.6, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000703136.1,,-,...,ENST00000703136,ENSG00000126970,ZC4H2,protein_coding,protein_coding,-3.809740,0.531402,chrX:64976443-64976455_Peak_79235,Robust,ENSG00000126970
375499,ENST00000602791.1,chrX,"[(77910631, 77910835), (77962623, 77964890)]",,"[ENST00000686515.1, ENST00000685264.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000602791.2,,+,...,ENST00000602791,ENSG00000293258,ENSG00000293258,lncRNA,lncRNA,-1.288001,1.000000,chrX:77910639-77910745_Peak_79619,Robust,ENSG00000248503


In [112]:
# perfect match but doesn't have GENCODE ID
combined_df[(combined_df['category'] == "GENCODE-annotated transcripts") & (~combined_df['transcript_id'].str.startswith("ENS"))]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,...,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding,-2.488310,0.044346,chr1:61077221-61077380_Peak_3111,Permissive,ENSG00000162599
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,...,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA,-1.480300,1.000000,chr1:160261726-160261763_Peak_5604,Permissive,ENSG00000228606
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,...,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding,-1.093061,0.454561,chr1:239386576-239386587_Peak_7900,Robust,ENSG00000133019
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,...,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding,-3.951339,1.000000,chr1:23217498-23217499_Peak_1212,Permissive,ENSG00000179546
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,...,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,lncRNA,-0.698603,1.000000,chr1:233327323-233327461_Peak_7654,Permissive,FTMG005195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38885,STRT02291998,chrY,"[(3002941, 3003217), (3058657, 3058784), (3079...","[(3002901, 3003217), (3058657, 3058784), (3079...",ENST00000803197.1,GENCODE-annotated transcripts,,,,+,...,ENST00000803197,ENSG00000231535,LINC00278,lncRNA,lncRNA,-3.564880,1.000000,chrY:3002898-3002997_Peak_80710,Permissive,ENSG00000231535
38886,STRT02292325,chrY,"[(19567359, 19567482), (19587210, 19587507), (...","[(19567372, 19567482), (19587210, 19587507), (...",ENST00000789733.1,GENCODE-annotated transcripts,,,,+,...,ENST00000789733,ENSG00000291033,TXLNGY,lncRNA,lncRNA,-2.102465,0.730209,chrY:19567328-19567382_Peak_80767,Robust,ENSG00000131002
38887,STRT02292507,chrY,"[(18932730, 18932841), (19041370, 19041586), (...","[(18932699, 18932841), (19041370, 19041586), (...",ENST00000850219.1,GENCODE-annotated transcripts,,,,-,...,ENST00000850219,ENSG00000176728,TTTY14,lncRNA,lncRNA,-5.536436,0.484843,chrY:19075997-19076094_Peak_80763,Permissive,ENSG00000176728
38888,STRT02292969,chrY,"[(19555472, 19555554), (19558024, 19558140), (...","[(19555390, 19555554), (19558024, 19558140), (...",ENST00000761024.1,GENCODE-annotated transcripts,,,,-,...,ENST00000761024,ENSG00000291031,BCORP1,lncRNA,lncRNA,-4.122080,1.000000,chrY:19567006-19567035_Peak_80766,Robust,FTMG057573


## Filter for de novo lncRNA (for differential expression analysis)

In [4]:
lncRNA = combined_df[(combined_df["gene_category"] == "lncRNA")]
lncRNA["de_novo_tx"] = lncRNA["summarized_category"] == "Unannotated transcripts without any annotated exons"
lncRNA["de_novo_gene"] = lncRNA.groupby("Gene stable ID")["de_novo_tx"].transform("all")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lncRNA["de_novo_tx"] = lncRNA["summarized_category"] == "Unannotated transcripts without any annotated exons"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lncRNA["de_novo_gene"] = lncRNA.groupby("Gene stable ID")["de_novo_tx"].transform("all")


In [5]:
lncRNA = lncRNA.drop(columns=["exons", "matched_to", "unannotated_exons", "best_match_ref_id", "largest_overlap_gencode_id",
                              "Gene type", "Relative TSS usage", "peak_name", "peak_type"])

In [6]:
lncRNA.head()

Unnamed: 0,transcript_id,chr,matched_ref_id,category,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,gene_category,log2_tx_exp,og_gene_id,de_novo_tx,de_novo_gene
1,ENCT00000013056.1,chr1,ENST00000756371.1,GENCODE-annotated transcripts,+,GENCODE-annotated transcripts,ENST00000756371.1,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,-1.4803,ENSG00000228606,False,False
4,ENCT00000039204.1,chr1,ENST00000771587.1,GENCODE-annotated transcripts,-,GENCODE-annotated transcripts,ENST00000771587.1,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,-0.698603,FTMG005195,False,False
6,ENCT00000039586.1,chr1,ENST00000649127.1,GENCODE-annotated transcripts,-,GENCODE-annotated transcripts,ENST00000649127.1,ENST00000649127,ENSG00000215808,LINC01139,lncRNA,-5.028075,ENSG00000215808,False,False
151,ENST00000295012.5,chr1,ENST00000295012.5,GENCODE-annotated transcripts,-,GENCODE-annotated transcripts,ENST00000295012.5,ENST00000295012,ENSG00000162913,OBSCN-AS1,lncRNA,-7.066893,ENSG00000162913,False,False
408,ENST00000356684.3,chr1,ENST00000356684.9,GENCODE-annotated transcripts,-,GENCODE-annotated transcripts,ENST00000356684.9,ENST00000356684,ENSG00000198468,FLVCR1-DT,lncRNA,-1.751037,ENSG00000198468,False,False


In [7]:
lncRNA["Gene stable ID"].nunique()

4823

In [8]:
lncRNA["Gene name"].isna().sum()

np.int64(0)

In [10]:
#lncRNA.to_csv('lncrna.tsv', sep='\t', index=False)

In [9]:
lncRNA.groupby("Gene stable ID")["de_novo_gene"].first().value_counts()

de_novo_gene
False    3681
True     1142
Name: count, dtype: int64

## Compare gene assignment now vs in HIT_hg38

In [121]:
combined_df[combined_df["Gene stable ID"] != combined_df["og_gene_id"]]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,...,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,lncRNA,-0.698603,1.000000,chr1:233327323-233327461_Peak_7654,Permissive,FTMG005195
183,ENST00000309092.7,chr1,"[(150293861, 150293898), (150294335, 150294449...","[(150293861, 150293898), (150294335, 150294449...",ENST00000614145.5,GENCODE-annotated transcripts,,,,+,...,ENST00000614145,ENSG00000266472,MRPS21,protein_coding,protein_coding,2.847926,1.000000,chr1:150293854-150293881_Peak_4722,Robust,ENSG00000187145
202,ENST00000314835.5,chr1,"[(155745829, 155745912), (155746005, 155746140...","[(155745829, 155745912), (155746005, 155746140...",ENST00000314835.5,GENCODE-annotated transcripts,,,,+,...,ENST00000314835,ENSG00000203761,MSTO2P,transcribed_unprocessed_pseudogene,other_noncoding,-2.947815,0.122135,chr1:155745816-155745886_Peak_5359,Permissive,ENSG00000125459
240,ENST00000323397.4,chr1,"[(145959442, 145961879), (145962421, 145962968...","[(145959441, 145961879), (145962421, 145962968...",ENST00000606888.3,GENCODE-annotated transcripts,,,,-,...,ENST00000606888,ENSG00000272031,ANKRD34A,protein_coding,protein_coding,-4.950279,0.278635,chr1:145964595-145964603_Peak_4522,Permissive,ENSG00000181039
259,ENST00000330165.8,chr1,"[(145921556, 145925927), (145926041, 145926177...","[(145921556, 145925927), (145926041, 145926177...",ENST00000583313.7,GENCODE-annotated transcripts,,,,-,...,ENST00000583313,ENSG00000265241,RBM8A,protein_coding,protein_coding,2.684561,0.874929,chr1:145927362-145927459_Peak_4533,Robust,ENSG00000131795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376329,STRT02292412,chrY,"[(12406182, 12406500), (12406796, 12406937), (...",,"[ENST00000357871.6, ENST00000689264.2, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000737339.1,,-,...,ENST00000737339,ENSG00000291034,ENSG00000291034,lncRNA,lncRNA,-4.580062,1.000000,chrY:12421569-12421599_Peak_80720,Robust,ENSG00000206159
376330,STRT02292484,chrY,"[(19567329, 19567570), (19587210, 19587507), (...",,"[ENST00000789723.1, ENST00000789733.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000589075.6,,+,...,ENST00000589075,ENSG00000291033,TXLNGY,lncRNA,lncRNA,-4.409059,0.730209,chrY:19567328-19567382_Peak_80767,Robust,ENSG00000131002
376331,STRT02293232,chrY,"[(11167047, 11167217), (11212744, 11212802), (...",,"[ENST00000796802.1, ENST00000796805.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000796810.1,,-,...,ENST00000796810,ENSG00000291032,ENSG00000291032,lncRNA,lncRNA,-4.380444,1.000000,chrY:11214982-11215027_Peak_80719,Permissive,FTMG057506
376333,STRT02294005,chrY,"[(19567414, 19567951), (19587210, 19587507), (...",,"[ENST00000789733.1, ENST00000789728.1, ENST000...",Unannotated transcripts composed entirely of a...,,ENST00000693214.2,,+,...,ENST00000693214,ENSG00000291033,TXLNGY,lncRNA,lncRNA,-5.310583,0.730209,chrY:19567328-19567382_Peak_80767,Robust,ENSG00000131002


## Renaming transcript IDs

In [8]:
pd.set_option('display.max_columns', None)
combined_df.head()

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000699992.1,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding,-2.48831,0.044346,chr1:61077221-61077380_Peak_3111,Permissive,ENSG00000162599
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000756371.1,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA,-1.4803,1.0,chr1:160261726-160261763_Peak_5604,Permissive,ENSG00000228606
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000676153.1,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding,-1.093061,0.454561,chr1:239386576-239386587_Peak_7900,Robust,ENSG00000133019
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000374619.2,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding,-3.951339,1.0,chr1:23217498-23217499_Peak_1212,Permissive,ENSG00000179546
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000771587.1,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,lncRNA,-0.698603,1.0,chr1:233327323-233327461_Peak_7654,Permissive,FTMG005195


In [5]:
HIT_hg38 = pd.read_csv( 'HIT_hg38.tsv', sep='\t')
HIT_hg38.head()

  HIT_hg38 = pd.read_csv( 'HIT_hg38.tsv', sep='\t')


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,transcript_id,gene_name,gene_type,transcript_classification,protein_id_orf_summary,Intron,Exon,5prime,3prime,closest
0,chr1,PacBio,exon,14360,14829,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
1,chr1,PacBio,exon,14970,15038,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
2,chr1,PacBio,exon,15796,15947,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
3,chr1,PacBio,exon,16607,16765,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,
4,chr1,PacBio,exon,16854,17055,.,-,.,ENSG00000227232,PBT00037199,WASH7P,pseudogene,Novel Protein coding not overlapping annotated...,,,,,,


In [6]:
tx_source_map = HIT_hg38[['source', 'transcript_id']].drop_duplicates()
combined_df = combined_df.merge(tx_source_map, on='transcript_id', how='left')
combined_df.head()

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id,source
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000699992.1,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding,-2.48831,0.044346,chr1:61077221-61077380_Peak_3111,Permissive,ENSG00000162599,FANTOM_cat
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000756371.1,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA,-1.4803,1.0,chr1:160261726-160261763_Peak_5604,Permissive,ENSG00000228606,FANTOM_cat
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,GENCODE-annotated transcripts,ENST00000676153.1,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding,-1.093061,0.454561,chr1:239386576-239386587_Peak_7900,Robust,ENSG00000133019,FANTOM_cat
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000374619.2,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding,-3.951339,1.0,chr1:23217498-23217499_Peak_1212,Permissive,ENSG00000179546,FANTOM_cat
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,GENCODE-annotated transcripts,ENST00000771587.1,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,lncRNA,-0.698603,1.0,chr1:233327323-233327461_Peak_7654,Permissive,FTMG005195,FANTOM_cat


In [13]:
prefix_map = {
    'FANTOM_cat': 'FTMT',
    'PacBio': 'PBT',
    'StringTie': 'STRT',
    'ENSEMBL': 'NCBIT',
}

combined_df["new_tx_id"] = None

# assign GENCODE-annotated transcripts
mask = combined_df["category"] == "GENCODE-annotated transcripts"
combined_df.loc[mask, "new_tx_id"] = combined_df.loc[mask, "matched_ref_id"]

# generate IDs based on source
remaining = combined_df.loc[~mask].copy()

for source_name, prefix in prefix_map.items():
    source_mask = remaining["source"] == source_name
    idx = remaining[source_mask].index
    for i, row_idx in enumerate(idx, start=1):
        combined_df.at[row_idx, "new_tx_id"] = f"{prefix}{str(i).zfill(7)}"

In [16]:
combined_df[combined_df["new_tx_id"].str.startswith("NCBIT")]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id,source,new_tx_id
38927,ENST00000235329.5,chr1,"[(11980197, 11980484), (11981970, 11982114), (...","[(11980209, 11980484), (11981970, 11982114), (...",ENST00000675817.1,Unannotated transcripts fully contained within...,,,,+,Unannotated transcripts with all exons annotated,ENST00000675817.1,ENST00000675817,ENSG00000116688,MFN2,protein_coding,protein_coding,1.842377,0.849730,chr1:11980204-11980451_Peak_666,Robust,ENSG00000116688,ENSEMBL,NCBIT0000001
38928,ENST00000236914.3,chr1,"[(200027634, 200027911), (200043774, 200043892...","[(200027710, 200027911), (200039658, 200039795...",ENST00000367362.8,Unannotated transcripts fully contained within...,,,,+,Unannotated transcripts with all exons annotated,ENST00000367362.8,ENST00000367362,ENSG00000116833,NR5A2,protein_coding,protein_coding,2.091592,0.573290,chr1:200027537-200027864_Peak_6480,Robust,ENSG00000116833,ENSEMBL,NCBIT0000002
38929,ENST00000239457.5,chr1,"[(173824659, 173825356), (173826687, 173826786...","[(173824673, 173825356), (173826687, 173826786...",ENST00000649689.2,Unannotated transcripts fully contained within...,,,,+,Unannotated transcripts with all exons annotated,ENST00000649689.2,ENST00000649689,ENSG00000117593,DARS2,protein_coding,protein_coding,-1.795171,0.930435,chr1:173824474-173824687_Peak_6035,Robust,ENSG00000117593,ENSEMBL,NCBIT0000003
38930,ENST00000253251.8,chr1,"[(10032959, 10033694), (10072028, 10072214), (...","[(10032958, 10033694), (10072028, 10072214), (...",ENST00000343090.11,Unannotated transcripts fully contained within...,,,,+,Unannotated transcripts with all exons annotated,ENST00000343090.11,ENST00000343090,ENSG00000130939,UBE4B,protein_coding,protein_coding,2.490205,0.997981,chr1:10032954-10033166_Peak_545,Robust,ENSG00000130939,ENSEMBL,NCBIT0000004
38931,ENST00000261443.5,chr1,"[(114716920, 114718216), (114718613, 114718745...","[(114716913, 114718216), (114718613, 114718745...",ENST00000339438.10,Unannotated transcripts fully contained within...,,,,-,Unannotated transcripts with all exons annotated,ENST00000339438.10,ENST00000339438,ENSG00000009307,CSDE1,protein_coding,protein_coding,1.739073,0.712407,chr1:114757960-114757994_Peak_4265,Robust,ENSG00000009307,ENSEMBL,NCBIT0000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375496,ENST00000543642.1,chrX,"[(37349330, 37349469), (37406209, 37406259), (...",,"['ENST00000850645.1', 'ENST00000465127.1', 'EN...",Unannotated transcripts composed entirely of a...,,ENST00000378628.9,,+,Unannotated transcripts with all exons annotated,ENST00000378628.9,ENST00000378628,ENSG00000130962,PRRG1,protein_coding,protein_coding,-0.986525,1.000000,chrX:37349326-37349369_Peak_78599,Robust,ENSG00000130962,ENSEMBL,NCBIT0014842
375497,ENST00000545566.1,chrX,"[(13653123, 13653188), (13655289, 13655373), (...",,"['ENST00000490617.1', 'ENST00000544987.3', 'EN...",Unannotated transcripts composed entirely of a...,,ENST00000696128.1,,+,Unannotated transcripts with all exons annotated,ENST00000696128.1,ENST00000696128,ENSG00000176896,TCEANC,protein_coding,protein_coding,-3.353878,1.000000,chrX:13653136-13653193_Peak_78335,Robust,ENSG00000176896,ENSEMBL,NCBIT0014843
375498,ENST00000545618.1,chrX,"[(64916370, 64917896), (64919042, 64919204), (...",,"['ENST00000337990.2', 'ENST00000492653.6', 'EN...",Unannotated transcripts composed entirely of a...,,ENST00000703136.1,,-,Unannotated transcripts with all exons annotated,ENST00000703136.1,ENST00000703136,ENSG00000126970,ZC4H2,protein_coding,protein_coding,-3.809740,0.531402,chrX:64976443-64976455_Peak_79235,Robust,ENSG00000126970,ENSEMBL,NCBIT0014844
375499,ENST00000602791.1,chrX,"[(77910631, 77910835), (77962623, 77964890)]",,"['ENST00000686515.1', 'ENST00000685264.1', 'EN...",Unannotated transcripts composed entirely of a...,,ENST00000602791.2,,+,Unannotated transcripts with all exons annotated,ENST00000602791.2,ENST00000602791,ENSG00000293258,ENSG00000293258,lncRNA,lncRNA,-1.288001,1.000000,chrX:77910639-77910745_Peak_79619,Robust,ENSG00000248503,ENSEMBL,NCBIT0014845


## Random filtering

In [2]:
combined_df = pd.read_csv( 'jessies_gffcompare.tsv', sep='\t')
combined_df.head()

  combined_df = pd.read_csv( 'jessies_gffcompare.tsv', sep='\t')


Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
0,ENCT00000006334.1,chr1,"[(61077274, 61077628), (61077916, 61079203)]","[(61077326, 61077628), (61077916, 61079231)]",ENST00000699992.1,GENCODE-annotated transcripts,,,,+,...,ENST00000699992,ENSG00000162599,NFIA,protein_coding,protein_coding,-2.48831,0.044346,chr1:61077221-61077380_Peak_3111,Permissive,ENSG00000162599
1,ENCT00000013056.1,chr1,"[(160261734, 160261922), (160281430, 160281892)]","[(160261682, 160261922), (160281430, 160281935)]",ENST00000756371.1,GENCODE-annotated transcripts,,,,+,...,ENST00000756371,ENSG00000228606,DCAF8-DT,lncRNA,lncRNA,-1.4803,1.0,chr1:160261726-160261763_Peak_5604,Permissive,ENSG00000228606
2,ENCT00000019620.1,chr1,"[(239386568, 239387227), (239492709, 239492807...","[(239386568, 239387227), (239492709, 239492807...",ENST00000676153.1,GENCODE-annotated transcripts,,,,+,...,ENST00000676153,ENSG00000133019,CHRM3,protein_coding,protein_coding,-1.093061,0.454561,chr1:239386576-239386587_Peak_7900,Robust,ENSG00000133019
3,ENCT00000022754.1,chr1,"[(23191893, 23195001), (23217291, 23217499)]","[(23191895, 23195001), (23217291, 23217502)]",ENST00000374619.2,GENCODE-annotated transcripts,,,,-,...,ENST00000374619,ENSG00000179546,HTR1D,protein_coding,protein_coding,-3.951339,1.0,chr1:23217498-23217499_Peak_1212,Permissive,ENSG00000179546
4,ENCT00000039204.1,chr1,"[(233319835, 233321155), (233327229, 233327357)]","[(233319834, 233321155), (233327229, 233327455)]",ENST00000771587.1,GENCODE-annotated transcripts,,,,-,...,ENST00000771587,ENSG00000300424,ENSG00000300424,lncRNA,lncRNA,-0.698603,1.0,chr1:233327323-233327461_Peak_7654,Permissive,FTMG005195


In [3]:
combined_df[combined_df["peak_name"] == "chr9:76906093-76906284_Peak_76147"]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
37057,ENST00000376713.3,chr9,"[(76788344, 76788534), (76823632, 76823726), (...","[(76788344, 76788534), (76823632, 76823726), (...",ENST00000376713.3,GENCODE-annotated transcripts,,,,-,...,ENST00000376713,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-3.13475,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
37058,ENST00000376718.3,chr9,"[(76611377, 76614600), (76619340, 76619387), (...","[(76611376, 76614600), (76619340, 76619387), (...",ENST00000376718.8,GENCODE-annotated transcripts,,,,-,...,ENST00000376718,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-2.32708,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
63701,STRT02184338,chr9,"[(76611481, 76614600), (76629192, 76629290), (...","[(76611481, 76614600), (76629192, 76629290), (...",ENST00000443509.6,Unannotated transcripts fully contained within...,,,,-,...,ENST00000443509,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-4.64013,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
124338,STRT02135347,chr9,"[(76772808, 76773009), (76823632, 76823726), (...",,"['ENST00000443509.6', 'ENST00000428286.5', 'EN...",Transcripts with annotated exons and non-overl...,"[(76772808, 76773009)]",ENST00000376713.3,,-,...,ENST00000376713,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-3.057171,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
124395,STRT02138947,chr9,"[(76813818, 76823726), (76826580, 76826732), (...",,"['ENST00000443509.6', 'ENST00000428286.5', 'EN...",Transcripts with annotated exons and non-overl...,"[(76813818, 76823726), (76886304, 76886395)]",ENST00000492157.5,,-,...,ENST00000492157,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-3.531564,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
124523,STRT02143313,chr9,"[(76766664, 76766741), (76823632, 76823726), (...",,"['ENST00000443509.6', 'ENST00000428286.5', 'EN...",Transcripts with annotated exons and non-overl...,"[(76766664, 76766741)]",ENST00000376713.3,,-,...,ENST00000376713,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-0.115937,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
124803,STRT02152013,chr9,"[(76769497, 76769690), (76823632, 76823726), (...",,"['ENST00000443509.6', 'ENST00000428286.5', 'EN...",Transcripts with annotated exons and non-overl...,"[(76769497, 76769690)]",ENST00000376713.3,,-,...,ENST00000376713,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-3.218237,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
124891,STRT02155452,chr9,"[(76769728, 76769779), (76823632, 76823726), (...",,"['ENST00000443509.6', 'ENST00000428286.5', 'EN...",Transcripts with annotated exons and non-overl...,"[(76769728, 76769779)]",ENST00000376713.3,,-,...,ENST00000376713,ENSG00000106772,PRUNE2,protein_coding,protein_coding,1.011718,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
124926,STRT02157377,chr9,"[(76805850, 76805901), (76823632, 76823726), (...",,"['ENST00000443509.6', 'ENST00000428286.5', 'EN...",Transcripts with annotated exons and non-overl...,"[(76805850, 76805901)]",ENST00000376713.3,,-,...,ENST00000376713,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-0.009579,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772
125401,STRT02171755,chr9,"[(76879152, 76879205), (76886304, 76886395), (...",,"['ENST00000376718.8', 'ENST00000443509.6', 'EN...",Transcripts with annotated exons and non-overl...,"[(76879152, 76879205), (76886304, 76886395)]",ENST00000492157.5,,-,...,ENST00000492157,ENSG00000106772,PRUNE2,protein_coding,protein_coding,-5.039257,0.361045,chr9:76906093-76906284_Peak_76147,Robust,ENSG00000106772


In [7]:
combined_df[combined_df["transcript_id"] == "PBT00031884"]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
328077,PBT00031884,chr1,"[(943393, 943453), (943698, 944574)]",,,Fully unannotated transcripts with overlapping...,,,ENST00000341065.8,+,...,ENST00000341065,ENSG00000187634,SAMD11,protein_coding,protein_coding,0.759239,0.296967,chr1:943392-943773_Peak_73,Robust,ENSG00000187634


In [7]:
combined_df[combined_df["Gene name"] == "SYT16"]

Unnamed: 0,transcript_id,chr,exons,matched_to,matched_ref_id,category,unannotated_exons,best_match_ref_id,largest_overlap_gencode_id,Strand,...,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,log2_tx_exp,Relative TSS usage,peak_name,peak_type,og_gene_id
81992,PBT00131765,chr14,"[(61884880, 61885249), (61897253, 61897355), (...",,['ENST00000554436.1'],Transcripts with annotated exons and non-overl...,"[(61884880, 61885249), (61903911, 61907364)]",ENST00000554436.1,,+,...,ENST00000554436,ENSG00000139973,SYT16,protein_coding,protein_coding,-2.314628,1.0,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000258882
81993,PBT00131766,chr14,"[(61884880, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61884880, 61885249), (62075135, 62075604)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,-1.21614,0.507257,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000139973
81994,PBT00131767,chr14,"[(61884880, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61884880, 61885249), (62084196, 62086479)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,-4.791865,0.507257,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000139973
81995,PBT00131769,chr14,"[(61884880, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61884880, 61885249)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,-4.093752,0.507257,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000139973
81996,PBT00131770,chr14,"[(61884903, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61884903, 61885249), (62075135, 62075605)]",ENST00000636133.1,,+,...,ENST00000636133,ENSG00000139973,SYT16,protein_coding,protein_coding,-1.410534,0.507257,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000139973
81997,PBT00131771,chr14,"[(61885175, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61885175, 61885249), (62084196, 62086705)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,-2.722954,0.224807,chr14:61885033-61885181_Peak_23083,Robust,ENSG00000139973
82138,STRT00610923,chr14,"[(61885175, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61885175, 61885249), (62100563, 62101078)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,-3.374329,0.224807,chr14:61885033-61885181_Peak_23083,Robust,ENSG00000139973
82228,STRT00616169,chr14,"[(61884814, 61884983), (61885077, 61885249), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61884814, 61884983), (61885077, 61885249)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,0.853407,0.507257,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000139973
82240,STRT00616988,chr14,"[(61884843, 61885249), (61897253, 61897355), (...",,['ENST00000554436.1'],Transcripts with annotated exons and non-overl...,"[(61884843, 61885249), (61907437, 61907735)]",ENST00000554436.1,,+,...,ENST00000554436,ENSG00000139973,SYT16,protein_coding,protein_coding,-2.257821,0.507257,chr14:61884816-61884916_Peak_23082,Robust,ENSG00000139973
82241,STRT00616989,chr14,"[(61885175, 61885249), (61970132, 61970311), (...",,"['ENST00000555409.1', 'ENST00000683842.1', 'EN...",Transcripts with annotated exons and non-overl...,"[(61885175, 61885249), (62100563, 62107102)]",ENST00000683842.1,,+,...,ENST00000683842,ENSG00000139973,SYT16,protein_coding,protein_coding,-2.967091,0.224807,chr14:61885033-61885181_Peak_23083,Robust,ENSG00000139973


In [4]:
combined_df[combined_df["Gene name"] == "PRUNE2"]["category"].value_counts()

category
Transcripts with annotated exons and only overlapping unannotated exons    51
Transcripts with annotated exons and non-overlapping unannotated exons     19
Unannotated transcripts composed entirely of annotated exons               18
Unannotated transcripts fully contained within GENCODE transcripts          8
GENCODE-annotated transcripts                                               3
Fully unannotated transcripts with overlapping exons                        3
Name: count, dtype: int64

In [8]:
alpha_peak_ids = [
    "ENST00000223609.6", "ENST00000488346.1", "HBMT00001485712.1", "HBMT00001485831.1",
    "PBT00426042", "PBT00426044", "PBT00426045", "PBT00426046", "PBT00426047", "PBT00426048",
    "PBT00426049", "PBT00426050", "PBT00426051", "PBT00426065",
    "STRT02140348", "STRT02141127", "STRT02154998", "STRT02171970",
    "STRT02184791", "STRT02189297", "STRT02194130"
]

beta_peak_ids = [
    "ENST00000376713.3", "ENST00000376718.3", "ENST00000428286.1", "ENST00000492157.1",
    "FTMT23300029397.1", "FTMT23300029975.1", "PBT00426069", "PBT00426071", "PBT00426074",
    "STRT02128601", "STRT02135347", "STRT02137332", "STRT02138946", "STRT02138947",
    "STRT02143313", "STRT02152013", "STRT02155452", "STRT02157377", "STRT02162417",
    "STRT02163302", "STRT02165955", "STRT02171755", "STRT02177848", "STRT02178142",
    "STRT02179724", "STRT02179725", "STRT02180042", "STRT02183942", "STRT02184337",
    "STRT02184338", "STRT02184999", "STRT02185383", "STRT02185739", "STRT02186418",
    "STRT02186750", "STRT02190622", "STRT02192005", "STRT02193969", "STRT02194971",
    "STRT02203853", "STRT02206296", "STRT02207040", "STRT02207483", "STRT02209417",
    "STRT02209624", "STRT02211017", "STRT02214009", "STRT02214010", "STRT02214012"
]


In [25]:
combined_df[combined_df["transcript_id"].isin(alpha_peak_ids)]["category"].value_counts()

category
Unannotated transcripts composed entirely of annotated exons               10
Transcripts with annotated exons and only overlapping unannotated exons     6
Unannotated transcripts fully contained within GENCODE transcripts          3
GENCODE-annotated transcripts                                               1
Fully unannotated transcripts with overlapping exons                        1
Name: count, dtype: int64

In [12]:
combined_df[combined_df["transcript_id"].isin(beta_peak_ids)]["category"].value_counts()

category
Transcripts with annotated exons and only overlapping unannotated exons    24
Transcripts with annotated exons and non-overlapping unannotated exons     13
Unannotated transcripts composed entirely of annotated exons                8
GENCODE-annotated transcripts                                               2
Unannotated transcripts fully contained within GENCODE transcripts          1
Fully unannotated transcripts with overlapping exons                        1
Name: count, dtype: int64