# Predicting coding potential with CPC2

In [1]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import pyranges as pr
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from collections import defaultdict
from matplotlib import font_manager
import numpy as np
import csv

  import pkg_resources


In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [3]:
# Load the curated HIT transcript table (tab-separated).
combined_df = pd.read_csv(
    filepath_or_buffer='../data_processed/curated_HIT.tsv',
    sep='\t',
)

  combined_df = pd.read_csv( '../data_processed/curated_HIT.tsv', sep='\t')


In [6]:
# The GTF file has no header row, so the nine column names are supplied explicitly.
gtf_column_names = [
    'chromosome', 'source', 'feature',
    'start', 'end', 'score',
    'strand', 'frame', 'attribute',
]
gtf_df = pd.read_csv(
    '../data_processed/curated_HIT.gtf',
    sep='\t',
    names=gtf_column_names,
    header=None,
)

# Number of distinct attribute strings in the GTF.
gtf_df["attribute"].nunique()

376342

In [7]:
# Preview the first rows of the parsed GTF table.
gtf_df.head()

Unnamed: 0,chromosome,source,feature,start,end,score,strand,frame,attribute
0,chr1,custom,exon,61077274,61077628,.,+,.,"gene_id ""ENSG00000162599""; transcript_id ""ENCT..."
1,chr1,custom,exon,61077916,61079203,.,+,.,"gene_id ""ENSG00000162599""; transcript_id ""ENCT..."
2,chr1,custom,exon,160261734,160261922,.,+,.,"gene_id ""ENSG00000228606""; transcript_id ""ENCT..."
3,chr1,custom,exon,160281430,160281892,.,+,.,"gene_id ""ENSG00000228606""; transcript_id ""ENCT..."
4,chr1,custom,exon,239386568,239387227,.,+,.,"gene_id ""ENSG00000133019""; transcript_id ""ENCT..."


In [8]:
def split_attributes(df, *attribute_type):
    """Extract the value of one attribute from a GTF attribute string.

    Despite its name, ``df`` is a single attribute string such as
    ``'gene_id "G1"; transcript_id "T1";'`` (one cell of the GTF ``attribute``
    column), which is how ``Series.apply`` calls this function.

    Parameters
    ----------
    df : str
        The attribute string of one GTF record. Non-string input (e.g. NaN)
        yields ``np.nan``.
    *attribute_type : str
        One or more attribute keys to look for, e.g. ``"transcript_id"``.

    Returns
    -------
    str or float
        The value of the first matching attribute with surrounding double
        quotes removed, or ``np.nan`` when nothing matches or the matching
        field has no value.
    """
    # Missing values (NaN/None) and other non-strings carry no attributes.
    if not isinstance(df, str):
        return np.nan

    # Fields are separated by '; '; the outer strip tolerates a trailing ';' or whitespace.
    fields = df.strip().strip(';').split('; ')

    try:
        # Prefer an exact key match so that e.g. "gene_id" is not satisfied by an
        # earlier "gene_id_alt" field that merely shares the prefix.
        matches = [field for field in fields if field.partition(' ')[0] in attribute_type]
        if not matches:
            # Fall back to prefix matching for callers that pass only the start of a key.
            matches = [field for field in fields if field.startswith(attribute_type)]
    except TypeError:
        # str.startswith rejects non-string prefixes; treat that as "no match".
        return np.nan

    if not matches:
        return np.nan

    # Split on the FIRST space only, so values that contain spaces stay intact.
    _key, separator, value = matches[0].partition(' ')
    if not separator:
        # Key present but no value.
        return np.nan
    return value.strip().strip('"')

## Coding potential of unmapped transcripts (unannotated & non-overlapping)

In [9]:
# Unique transcript IDs of all rows whose gene_category is "unmapped".
unmapped_tx = combined_df.loc[
    combined_df["gene_category"].eq("unmapped"),
    "transcript_id",
].unique()

In [10]:
# Number of unique unmapped transcript IDs.
len(unmapped_tx)

3291

In [11]:
# Build a GTF table restricted to the unmapped transcripts.
# Step 1: parse the transcript ID out of each attribute string into a helper column on gtf_df.
gtf_df.loc[:, 'transcript_id'] = gtf_df['attribute'].apply(
    lambda attribute: split_attributes(attribute, "transcript_id")
)
# Step 2: keep only the records belonging to an unmapped transcript (copy, so the
# subset can be modified independently of gtf_df).
unmapped_tx_gtf = gtf_df.loc[gtf_df['transcript_id'].isin(unmapped_tx)].copy()
# Number of distinct transcripts found in the GTF (compare with len(unmapped_tx)).
print(len(pd.unique(unmapped_tx_gtf['transcript_id'])))
# Step 3: remove the helper column so only the original GTF columns remain.
unmapped_tx_gtf = unmapped_tx_gtf.drop(columns='transcript_id')

3291


In [13]:
# Save the unmapped transcripts as a GTF file: tab-separated, without header or index.
# csv.QUOTE_NONE stops the writer from quoting fields, so the double quotes inside
# the attribute column are written as they are.
unmapped_tx_gtf.to_csv(
    '/no_backup/jferrer/jmidgley/CPAT/unmapped_tx.gtf',
    sep='\t',
    index=False,
    header=False,
    quoting=csv.QUOTE_NONE,
)

The GTF was converted to FASTA with gffread: `gffread unmapped_tx.gtf -g /no_backup/jferrer/mplanas/VASAseq/files/hg38.fa -w unmapped_tx.fa`

CPC2 was then run on the resulting FASTA; results page: http://cpc2.gao-lab.org/run_cpc2_result.php?userid=250715752970787

In [12]:
# Load the CPC2 result table (tab-separated) for the unmapped transcripts and display it.
cpc_unmapped_results = pd.read_csv(
    filepath_or_buffer='../data_processed/cpc2_unmapped_result.txt',
    sep='\t',
)
cpc_unmapped_results

Unnamed: 0,#ID,peptide_length,Fickett_score,pI,ORF_integrity,coding_probability,label
0,FTMT20200000058.1,28,0.36862,12.302673,1,0.012557,noncoding
1,ENCT00000000219.1,98,0.29846,11.379456,-1,0.138521,noncoding
2,HBMT00000000427.1,188,0.24028,11.163025,1,0.237013,noncoding
3,STRT00201664,88,0.33055,12.196350,1,0.060881,noncoding
4,ENCT00000000276.1,167,0.28825,11.949646,1,0.203065,noncoding
...,...,...,...,...,...,...,...
3286,STRT02254397,84,0.31852,8.883728,1,0.063876,noncoding
3287,STRT02250844,36,0.34599,9.624695,1,0.013191,noncoding
3288,STRT02276598,48,0.29112,9.381531,1,0.018462,noncoding
3289,ENCT00000473760.1,387,0.27251,11.006165,1,0.958774,coding


In [13]:
# Count the transcripts per CPC2 label.
print("Nr of predicted coding transcripts: ", cpc_unmapped_results["label"].eq("coding").sum())
print("Nr of predicted non-coding transcripts: ", cpc_unmapped_results["label"].eq("noncoding").sum())

Nr of predicted coding transcripts:  233
Nr of predicted non-coding transcripts:  3058


### Add the CPC2 predictions to the unmapped transcripts DataFrame

In [15]:
# Same setting as at the top of the notebook, re-applied here: show all columns.
pd.options.display.max_columns = None

In [16]:
# All rows of the curated table whose gene_category is "unmapped".
unmapped_tx_df = combined_df.loc[combined_df["gene_category"].eq("unmapped")]

In [17]:
# Add column with predicted coding potential (the CPC2 label of each transcript).
# The frame is rebuilt from combined_df rather than from unmapped_tx_df itself, so
# this cell is idempotent: merging into the previous unmapped_tx_df would add a
# second 'predicted_tx_potential' column every time the cell is re-run.
unmapped_tx_df = (
    combined_df[combined_df["gene_category"] == "unmapped"]
    # Left join keeps every unmapped transcript, including any without a CPC2 result.
    .merge(
        cpc_unmapped_results[['#ID', 'label']],
        left_on='transcript_id',
        right_on='#ID',
        how='left',
    )
    .rename(columns={'label': 'predicted_tx_potential'})
    # '#ID' duplicates transcript_id after the join.
    .drop(columns=['#ID'])
)
unmapped_tx_df.sort_values("peak_name")

Unnamed: 0,transcript_id,chr,exons,matched_to,category,unannotated_exons,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,peak_name,peak_type,mean_TPM_ref,log2_tx_exp,gene_exp_ref,relative_tx_exp,TSS_TPM_per_gene,Relative TSS usage,TSS_type,TSS_start,TSS_end,distance_to_closest_TSS,closest_TSS_ref_id,TSS distance category,TSS_name,predicted_tx_potential
353,PBT00056893,chr10,"[(101481561, 101481953), (101483491, 101486439)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_10623,Peak_10623,,unmapped,chr10:101481539-101481561_Peak_10623,Permissive,0.038927,-4.683070,0.038927,1.000000,0.038927,1.0,unique TSS,101481539.0,101481561.0,53823.0,ENST00000493877.1,>100bp,P10623_U,noncoding
338,HBMT00000152473.1,chr10,"[(102056374, 102056599)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_10661,Peak_10661,,unmapped,chr10:102056349-102056377_Peak_10661,Permissive,0.149276,-2.743945,0.149276,1.000000,0.149276,1.0,unique TSS,102056349.0,102056377.0,8972.0,ENST00000299238.7,>100bp,P10661_U,noncoding
479,STRT00309589,chr10,"[(103192661, 103192857), (103197445, 103197595)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_10781,Peak_10781,,unmapped,chr10:103192627-103192661_Peak_10781,Permissive,0.034002,-4.878225,0.531743,0.063945,0.531743,1.0,unique TSS,103192627.0,103192661.0,17073.0,ENST00000412473.1,>100bp,P10781_U,noncoding
343,MICT00000047960.1,chr10,"[(103192628, 103194616), (103194939, 103195545)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_10781,Peak_10781,,unmapped,chr10:103192627-103192661_Peak_10781,Permissive,0.497741,-1.006532,0.531743,0.936055,0.531743,1.0,unique TSS,103192627.0,103192661.0,17073.0,ENST00000412473.1,>100bp,P10781_U,noncoding
427,STRT00264588,chr10,"[(103556772, 103556925), (103557098, 103557253...",,Fully unannotated transcripts without any over...,,-,Unannotated transcripts without any annotated ...,,,Peak_10807,Peak_10807,,unmapped,chr10:103557493-103557494_Peak_10807,Permissive,0.027886,-5.164295,0.027886,1.000000,0.027886,1.0,unique TSS,103557493.0,103557494.0,26193.0,ENST00000663302.1,>100bp,P10807_U,noncoding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,FTMT20100020125.1,chr1,"[(148441797, 148445260)]",,Fully unannotated transcripts without any over...,,-,Unannotated transcripts without any annotated ...,,,Peak_4614,Peak_4614,,unmapped,conversion_failed_Peak_4614,Permissive,1.842866,0.881952,2.341060,0.787193,2.341060,1.0,unique TSS,,,,,>100bp,P4614_U,noncoding
200,STRT00097280,chr1,"[(148442358, 148444749)]",,Fully unannotated transcripts without any over...,,-,Unannotated transcripts without any annotated ...,,,Peak_4614,Peak_4614,,unmapped,conversion_failed_Peak_4614,Permissive,0.498194,-1.005221,2.341060,0.212807,2.341060,1.0,unique TSS,,,,,>100bp,P4614_U,noncoding
272,STRT00165158,chr1,"[(148526237, 148526509)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_4623,Peak_4623,,unmapped,conversion_failed_Peak_4623,Robust,0.612328,-0.707622,0.612328,1.000000,0.612328,1.0,unique TSS,,,,,>100bp,P4623_U,noncoding
3140,FTMT23500025511.1,chr9,"[(41597447, 41597817)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_75829,Peak_75829,,unmapped,conversion_failed_Peak_75829,Robust,0.331050,-1.594880,0.331050,1.000000,0.331050,1.0,unique TSS,,,,,>100bp,P75829_U,noncoding


In [18]:
# For each gene ("Gene name"), count transcripts per predicted label and derive the
# percentage of transcripts predicted as coding.
peak_summary = (
    unmapped_tx_df
    .groupby(['Gene name', 'predicted_tx_potential'])
    .size()
    .unstack(fill_value=0)
)
# The row totals are computed before pct_coding is added, so only the per-label
# count columns contribute to the denominator.
peak_summary["pct_coding"] = (
    peak_summary["coding"] / peak_summary.sum(axis=1) * 100
).round(1)
peak_summary

predicted_tx_potential,coding,noncoding,pct_coding
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Peak_10119,1,0,100.0
Peak_10177,0,5,0.0
Peak_10218,0,17,0.0
Peak_10257,0,1,0.0
Peak_10277,0,2,0.0
...,...,...,...
Peak_9870,1,0,100.0
Peak_9894,0,4,0.0
Peak_996,0,2,0.0
Peak_9964,0,1,0.0


In [19]:
# Number of genes for which at least 20% of the transcripts are predicted coding.
len(peak_summary.loc[peak_summary["pct_coding"].ge(20)])

132

In [20]:
# Gene-level call: a gene is labelled 'coding' when at least 20% of its transcripts
# are predicted coding, otherwise 'noncoding'. Genes missing from peak_summary map
# to NaN, are filled with 0 and therefore end up 'noncoding'.
unmapped_tx_df['predicted_gene_potential'] = np.where(
    unmapped_tx_df['Gene name'].map(peak_summary['pct_coding']).fillna(0) >= 20,
    'coding',
    'noncoding',
)
unmapped_tx_df

Unnamed: 0,transcript_id,chr,exons,matched_to,category,unannotated_exons,Strand,summarized_category,ref_id,stripped_ref_id,Gene stable ID,Gene name,Gene type,gene_category,peak_name,peak_type,mean_TPM_ref,log2_tx_exp,gene_exp_ref,relative_tx_exp,TSS_TPM_per_gene,Relative TSS usage,TSS_type,TSS_start,TSS_end,distance_to_closest_TSS,closest_TSS_ref_id,TSS distance category,TSS_name,predicted_tx_potential,predicted_gene_potential
0,ENCT00000000219.1,chr1,"[(1162877, 1163719), (1163993, 1164042)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_111,Peak_111,,unmapped,chr1:1162789-1162942_Peak_111,Robust,0.018685,-5.741955,0.339126,0.055098,0.339126,1.0,unique TSS,1162789.0,1162942.0,4162.0,ENST00000384997.3,>100bp,P111_U,noncoding,noncoding
1,ENCT00000000276.1,chr1,"[(1349559, 1355076), (1355263, 1355740), (1357...",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_145,Peak_145,,unmapped,chr1:1349517-1349586_Peak_145,Permissive,0.836916,-0.256846,0.867767,0.964448,0.867767,1.0,unique TSS,1349517.0,1349586.0,679.0,ENST00000806659.1,>100bp,P145_U,noncoding,noncoding
2,ENCT00000002194.1,chr1,"[(17439849, 17449625)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_910,Peak_910,,unmapped,chr1:17439844-17439937_Peak_910,Permissive,0.085408,-3.549480,0.085408,1.000000,0.085408,1.0,unique TSS,17439844.0,17439937.0,20004.0,ENST00000835437.1,>100bp,P910_U,noncoding,noncoding
3,ENCT00000002302.1,chr1,"[(19485741, 19489397)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_996,Peak_996,,unmapped,chr1:19485730-19485780_Peak_996,Permissive,0.114061,-3.132117,0.168185,0.678191,0.168185,1.0,unique TSS,19485730.0,19485780.0,1327.0,ENST00000648702.1,>100bp,P996_U,noncoding,noncoding
4,ENCT00000002410.1,chr1,"[(20508236, 20510041)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_1039,Peak_1039,,unmapped,chr1:20508218-20508233_Peak_1039,Permissive,0.059096,-4.080791,0.059096,1.000000,0.059096,1.0,unique TSS,20508218.0,20508233.0,17756.0,ENST00000443195.1,>100bp,P1039_U,noncoding,noncoding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3286,STRT02284050,chrX,"[(74034396, 74034874)]",,Fully unannotated transcripts without any over...,,-,Unannotated transcripts without any annotated ...,,,Peak_79446,Peak_79446,,unmapped,chrX:74034838-74034839_Peak_79446,Permissive,1.715846,0.778920,1.715846,1.000000,1.715846,1.0,unique TSS,74034838.0,74034839.0,21893.0,ENST00000638437.1,>100bp,P79446_U,noncoding,noncoding
3287,STRT02289282,chrX,"[(51805614, 51805945)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_79008,Peak_79008,,unmapped,chrX:51805624-51805625_Peak_79008,Permissive,0.318309,-1.651502,0.318309,1.000000,0.318309,1.0,unique TSS,51805624.0,51805625.0,2617.0,ENST00000375772.7,>100bp,P79008_U,noncoding,noncoding
3288,STRT02289408,chrX,"[(57617393, 57618659)]",,Fully unannotated transcripts without any over...,,+,Unannotated transcripts without any annotated ...,,,Peak_79202,Peak_79202,,unmapped,chrX:57617355-57617356_Peak_79202,Permissive,0.129846,-2.945122,0.129846,1.000000,0.129846,1.0,unique TSS,57617355.0,57617356.0,23432.0,ENST00000818758.1,>100bp,P79202_U,noncoding,noncoding
3289,STRT02290336,chrX,"[(19990302, 19990805)]",,Fully unannotated transcripts without any over...,,-,Unannotated transcripts without any annotated ...,,,Peak_78445,Peak_78445,,unmapped,chrX:19990802-19990828_Peak_78445,Permissive,0.101023,-3.307240,0.260752,0.387430,0.260752,1.0,unique TSS,19990802.0,19990828.0,233.0,ENST00000379682.9,>100bp,P78445_U,noncoding,noncoding


In [21]:
# Map each gene name to an updated name: the 'Peak_' prefix becomes 'HIT' and a
# suffix encodes the gene-level call ('-PC' for coding, '-NC' for noncoding).
unmapped_gene_name_map = (
    unmapped_tx_df[['Gene name', 'predicted_gene_potential']]
    .drop_duplicates()
    .assign(
        updated_gene_name=lambda genes: (
            genes['Gene name'].str.replace(r'Peak_ ?', 'HIT', regex=True)
            + genes['predicted_gene_potential'].map({'coding': '-PC', 'noncoding': '-NC'})
        )
    )
    .drop(columns=['predicted_gene_potential'])
)


In [22]:
# Updated gene names (for figures): original 'Gene name' next to its 'updated_gene_name'.
unmapped_gene_name_map

Unnamed: 0,Gene name,updated_gene_name
0,Peak_111,HIT111-NC
1,Peak_145,HIT145-NC
2,Peak_910,HIT910-NC
3,Peak_996,HIT996-NC
4,Peak_1039,HIT1039-NC
...,...,...
3285,Peak_80214,HIT80214-NC
3286,Peak_79446,HIT79446-NC
3287,Peak_79008,HIT79008-NC
3288,Peak_79202,HIT79202-NC


In [23]:
# Write the gene-name mapping to CSV without the DataFrame index.
unmapped_gene_name_map.to_csv(
    path_or_buf="../data_processed/unmapped_gene_name_map.csv",
    index=False,
)