# Do novel exons contain T2D GWAS variants?

In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load CDS output of non-overlapping exons in predicted CDS
with open("../data_processed/ORFanage/orfanage_output_with_CDS_no_stop.pkl", "rb") as f:
    orfanage_output_with_CDS = pickle.load(f)

In [5]:
novel_exons = orfanage_output_with_CDS[orfanage_output_with_CDS["is_non_overlapping"] == True]

In [7]:
novel_exons['exon_key'] = novel_exons['Chromosome'].astype(str) + '_' + \
                        novel_exons['Start'].astype(str) + '_' + \
                        novel_exons['End'].astype(str) + '_' + \
                        novel_exons['Strand'].astype(str)
novel_exons.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  novel_exons['exon_key'] = novel_exons['Chromosome'].astype(str) + '_' + \


Unnamed: 0,Chromosome,source,feature,Start,End,score,Strand,frame,transcript_id,is_non_overlapping,is_unannotated,CDS_matches_exon,is_stop_in_cds,Gene name,exon_key
1,chrX,custom,exon,2932983,2933528,.,-,.,PBT00439837,True,True,False,False,ARSL,chrX_2932983_2933528_-
21,chrX,custom,exon,2933334,2933528,.,-,.,STRT02233988,True,True,False,False,ARSL,chrX_2933334_2933528_-
43,chrX,custom,exon,7918808,7918984,.,-,.,STRT02228666,True,True,False,False,PNPLA4,chrX_7918808_7918984_-
59,chrX,custom,exon,7918864,7918984,.,-,.,STRT02237178,True,True,False,False,PNPLA4,chrX_7918864_7918984_-
73,chrX,custom,exon,7918904,7918984,.,-,.,STRT02253319,True,True,False,False,PNPLA4,chrX_7918904_7918984_-


In [8]:
novel_exons["exon_key"].nunique()

10847

In [9]:
# same novel exon in multiple transcripts
duplicates_mask = novel_exons["exon_key"].duplicated(keep=False)
novel_exons[duplicates_mask]

Unnamed: 0,Chromosome,source,feature,Start,End,score,Strand,frame,transcript_id,is_non_overlapping,is_unannotated,CDS_matches_exon,is_stop_in_cds,Gene name,exon_key
265,chrX,custom,exon,13715720,13715794,.,+,.,STRT02287699,True,True,False,False,OFD1,chrX_13715720_13715794_+
313,chrX,custom,exon,13715720,13715794,.,+,.,STRT02244133,True,True,False,False,OFD1,chrX_13715720_13715794_+
519,chrX,custom,exon,15753960,15754004,.,+,.,STRT02274688,True,True,False,False,CA5B,chrX_15753960_15754004_+
543,chrX,custom,exon,15753960,15754004,.,+,.,STRT02271968,True,True,False,False,CA5B,chrX_15753960_15754004_+
701,chrX,custom,exon,18930763,18930847,.,-,.,STRT02275181,True,True,False,False,PHKA2,chrX_18930763_18930847_-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272264,chr22,custom,exon,50276134,50276188,.,-,.,PBT00285527,True,True,False,False,PLXNB2,chr22_50276134_50276188_-
272299,chr22,custom,exon,50295377,50295429,.,-,.,PBT00285527,True,True,False,False,PLXNB2,chr22_50295377_50295429_-
272332,chr22,custom,exon,50276134,50276188,.,-,.,PBT00285553,True,True,False,False,PLXNB2,chr22_50276134_50276188_-
272396,chr22,custom,exon,50276134,50276227,.,-,.,PBT00285567,True,True,False,False,PLXNB2,chr22_50276134_50276227_-


In [10]:
# one transcript with multiple novel exons
duplicates_mask = novel_exons["transcript_id"].duplicated(keep=False)
novel_exons[duplicates_mask]

Unnamed: 0,Chromosome,source,feature,Start,End,score,Strand,frame,transcript_id,is_non_overlapping,is_unannotated,CDS_matches_exon,is_stop_in_cds,Gene name,exon_key
169,chrX,custom,exon,9466385,9466457,.,+,.,STRT02262527,True,True,False,False,TBL1X,chrX_9466385_9466457_+
186,chrX,custom,exon,9719997,9721277,.,+,.,STRT02262527,True,True,False,False,TBL1X,chrX_9719997_9721277_+
572,chrX,custom,exon,16653206,16653389,.,-,.,STRT02255865,True,True,False,False,CTPS2,chrX_16653206_16653389_-
585,chrX,custom,exon,16707409,16707503,.,-,.,STRT02255865,True,True,False,False,CTPS2,chrX_16707409_16707503_-
1161,chrX,custom,exon,23832331,23832407,.,-,.,STRT02274094,True,True,False,False,APOO,chrX_23832331_23832407_-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272041,chr22,custom,exon,50204919,50205024,.,+,.,STRT01398653,True,True,False,False,SELENOO,chr22_50204919_50205024_+
272196,chr22,custom,exon,50276134,50276227,.,-,.,STRT01405431,True,True,False,False,PLXNB2,chr22_50276134_50276227_-
272231,chr22,custom,exon,50295377,50295429,.,-,.,STRT01405431,True,True,False,False,PLXNB2,chr22_50295377_50295429_-
272264,chr22,custom,exon,50276134,50276188,.,-,.,PBT00285527,True,True,False,False,PLXNB2,chr22_50276134_50276188_-


In [11]:
# Load variant positions
column_names = ['chr','pos']
variants = pd.read_csv("../data_raw/credset_liftover_positions.tsv", sep="\t", header=None, names=column_names)
variants.head()

Unnamed: 0,chr,pos
0,chr1,229537208
1,chr9,133273813
2,chr9,133274084
3,chr9,133270061
4,chr9,133271745


In [12]:
len(variants)

7902

In [13]:
merged = pd.merge(variants, novel_exons, left_on="chr", right_on="Chromosome", how="left")
merged["in_exon"] = (merged["pos"] >= merged["Start"]) & (merged["pos"] <= merged["End"])

In [14]:
hits = merged[merged["in_exon"]]
hits

Unnamed: 0,chr,pos,Chromosome,source,feature,Start,End,score,Strand,frame,transcript_id,is_non_overlapping,is_unannotated,CDS_matches_exon,is_stop_in_cds,Gene name,exon_key,in_exon
112454,chr10,97230382,chr10,custom,exon,97230320,97230387,.,-,.,STRT00277921,True,True,False,False,ARHGAP19,chr10_97230320_97230387_-,True
519840,chr17,67896391,chr17,custom,exon,67895489,67898418,.,+,.,STRT00929838,True,True,False,False,BPTF,chr17_67895489_67898418_+,True
521724,chr17,67896903,chr17,custom,exon,67895489,67898418,.,+,.,STRT00929838,True,True,False,False,BPTF,chr17_67895489_67898418_+,True
522195,chr17,67896636,chr17,custom,exon,67895489,67898418,.,+,.,STRT00929838,True,True,False,False,BPTF,chr17_67895489_67898418_+,True
525483,chr17,67951854,chr17,custom,exon,67951730,67951922,.,+,.,HBMT00000608352.1,True,True,False,False,BPTF,chr17_67951730_67951922_+,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5182746,chr8,115567092,chr8,custom,exon,115518120,115586630,.,-,.,STRT02087866,True,True,False,False,TRPS1,chr8_115518120_115586630_-,True
5183350,chr8,115566821,chr8,custom,exon,115518120,115586630,.,-,.,STRT02087866,True,True,False,False,TRPS1,chr8_115518120_115586630_-,True
5184558,chr8,115571890,chr8,custom,exon,115518120,115586630,.,-,.,STRT02087866,True,True,False,False,TRPS1,chr8_115518120_115586630_-,True
5185766,chr8,115571642,chr8,custom,exon,115518120,115586630,.,-,.,STRT02087866,True,True,False,False,TRPS1,chr8_115518120_115586630_-,True


In [15]:
hits["pos"].nunique()

67

In [16]:
hits["exon_key"].nunique()

11

In [17]:
hits["Gene name"].unique()

array(['ARHGAP19', 'BPTF', 'PCGF3', 'FAF1', 'DEUP1', 'SLC9B2', 'DPEP1',
       'TCF7L2', 'TRPS1', 'WFS1'], dtype=object)

In [18]:
hits["exon_key"].unique()

array(['chr10_97230320_97230387_-', 'chr17_67895489_67898418_+',
       'chr17_67951730_67951922_+', 'chr4_747902_749477_+',
       'chr1_50779947_50781044_-', 'chr11_93397259_93398926_+',
       'chr4_103035389_103036518_-', 'chr16_89640594_89641859_+',
       'chr10_112968330_112981419_+', 'chr8_115518120_115586630_-',
       'chr4_6295916_6296402_+'], dtype=object)

In [19]:
hits.groupby("exon_key")["pos"].nunique()

exon_key
chr10_112968330_112981419_+     5
chr10_97230320_97230387_-       1
chr11_93397259_93398926_+       1
chr16_89640594_89641859_+       1
chr17_67895489_67898418_+       5
chr17_67951730_67951922_+       1
chr1_50779947_50781044_-        1
chr4_103035389_103036518_-      1
chr4_6295916_6296402_+          1
chr4_747902_749477_+            2
chr8_115518120_115586630_-     48
Name: pos, dtype: int64

In [20]:
pd.set_option('display.max_rows', None)

In [21]:
# Are the variants actually in the predicted CDS regions?

results = []  

for gene in hits["Gene name"].unique():
    gene_hits = hits[hits["Gene name"] == gene]
    
    # find all tx for that gene 
    transcripts = orfanage_output_with_CDS.loc[
        orfanage_output_with_CDS["Gene name"] == gene, "transcript_id"
    ].unique()
    
    for tx in transcripts:
        # CDS regions that match novel exon
        cds_rows = orfanage_output_with_CDS[
            (orfanage_output_with_CDS["transcript_id"] == tx)
            & (orfanage_output_with_CDS["CDS_matches_exon"] == True)
        ]
        cds_ranges = list(zip(cds_rows["Start"], cds_rows["End"]))
        
        # mark which variants fall inside CDS
        gene_hits_tx = gene_hits.copy()
        gene_hits_tx["in_CDS"] = gene_hits_tx["pos"].apply(
            lambda p: any(start <= p <= end for start, end in cds_ranges)
        )
        
        # keep only variants inside CDS
        hits_in_cds = gene_hits_tx[gene_hits_tx["in_CDS"]]
        
        results.append({
            "Gene name": gene,
            "transcript_id": tx,
            "num_variants_in_CDS": len(hits_in_cds),
            "variant_positions_in_CDS": hits_in_cds["pos"].tolist()
        })

cds_hits = pd.DataFrame(results)
cds_hits


Unnamed: 0,Gene name,transcript_id,num_variants_in_CDS,variant_positions_in_CDS
0,ARHGAP19,STRT00277921,1,[97230382]
1,BPTF,HBMT00000608352.1,0,[]
2,BPTF,STRT00930465,0,[]
3,BPTF,STRT00931774,0,[]
4,BPTF,STRT00955719,0,[]
5,BPTF,STRT00955720,0,[]
6,BPTF,STRT00936211,0,[]
7,BPTF,STRT00941653,0,[]
8,BPTF,STRT00952645,0,[]
9,BPTF,STRT00955391,0,[]


In [22]:
novel_exons[novel_exons["Gene name"] == "ARHGAP19"]

Unnamed: 0,Chromosome,source,feature,Start,End,score,Strand,frame,transcript_id,is_non_overlapping,is_unannotated,CDS_matches_exon,is_stop_in_cds,Gene name,exon_key
170137,chr10,custom,exon,97230320,97230387,.,-,.,STRT00277921,True,True,False,False,ARHGAP19,chr10_97230320_97230387_-
170145,chr10,custom,exon,97265242,97265334,.,-,.,STRT00277921,True,True,False,False,ARHGAP19,chr10_97265242_97265334_-


In [23]:
novel_exons[novel_exons["Gene name"] == "WFS1"]

Unnamed: 0,Chromosome,source,feature,Start,End,score,Strand,frame,transcript_id,is_non_overlapping,is_unannotated,CDS_matches_exon,is_stop_in_cds,Gene name,exon_key
75806,chr4,custom,exon,6295916,6296402,.,+,.,STRT01577036,True,True,False,False,WFS1,chr4_6295916_6296402_+
