# Create list of genes with ExAC pLI score

The ExAC file lists a pLI score for each transcript, so the transcript IDs need to be matched to their gene IDs.

## Analysis

In [6]:
import pandas as pd
import re

In [7]:
def removeTranscriptIDVersions(text):
    """Return the first Ensembl transcript ID in ``text`` without its version suffix.

    The ID is the literal prefix ``ENST`` followed by one or more digits;
    anything after those digits (for example ``.2``) is dropped.

    Parameters
    ----------
    text : str
        String containing a transcript ID, e.g. ``'ENST00000379370.2'``.

    Returns
    -------
    str
        The matched ID, e.g. ``'ENST00000379370'``.

    Raises
    ------
    IndexError
        If ``text`` contains no ``ENST<digits>`` substring. The exception
        type is kept from the earlier ``re.findall(...)[0]`` implementation
        so existing callers behave the same; only the message is clearer.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    match = re.search(r'ENST\d+', text)
    if match is None:
        raise IndexError('no Ensembl transcript ID (ENST<digits>) found in %r' % (text,))
    return match.group(0)

### Read in ExAC pLI file

In [10]:
# Load the tab-separated ExAC table; its 'transcript' column holds
# versioned transcript IDs.
ExAC = pd.read_csv(
    '../../datasets/ExAC/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt',
    sep="\t",
)

# Add a column with the version suffix stripped from each transcript ID.
ExAC['Ensembl Transcript ID'] = ExAC['transcript'].apply(
    removeTranscriptIDVersions
)

ExAC

Unnamed: 0,transcript,gene,chr,n_exons,cds_start,cds_end,bp,mu_syn,mu_mis,mu_lof,...,exp_syn,exp_mis,exp_lof,syn_z,mis_z,lof_z,pLI,pRec,pNull,Ensembl Transcript ID
0,ENST00000379370.2,AGRN,1,36,955552,990361,6138,5.462029e-05,0.000102,4.848279e-06,...,445.215539,837.314601,54.416958,-2.109089,0.140544,5.561613,1.733523e-01,8.266477e-01,1.592093e-09,ENST00000379370
1,ENST00000327044.6,NOC2L,1,19,880073,894620,2250,1.416177e-05,0.000029,2.065329e-06,...,154.668966,311.833216,28.883512,-3.605556,-2.275892,-0.021471,1.330382e-19,2.562842e-03,9.974372e-01,ENST00000327044
2,ENST00000379198.2,B3GALT6,1,1,1167658,1168648,990,1.350738e-05,0.000024,4.561960e-07,...,82.403301,149.127739,3.000516,2.349510,3.209371,0.572158,4.810447e-02,6.821018e-01,2.697937e-01,ENST00000379198
3,ENST00000421241.2,C1orf159,1,8,1018272,1026923,597,5.250226e-06,0.000010,5.878979e-07,...,25.770931,50.876442,4.392331,-0.882809,-0.419916,1.130742,9.087764e-02,7.656914e-01,1.434310e-01,ENST00000421241
4,ENST00000379389.4,ISG15,1,2,948953,949858,498,4.126498e-06,0.000007,2.358722e-07,...,47.592276,82.962476,3.510674,0.322813,0.212786,0.269984,9.847813e-03,6.002493e-01,3.899029e-01,ENST00000379389
5,ENST00000338591.3,KLHL17,1,12,896073,900571,1929,1.788056e-05,0.000033,1.699976e-06,...,159.550603,297.290098,18.832454,-3.457622,0.575586,1.331336,2.516763e-07,7.229136e-01,2.770861e-01,ENST00000338591
6,ENST00000379410.3,PLEKHN1,1,16,901911,909955,1836,1.328778e-05,0.000026,1.793241e-06,...,120.689904,231.456248,21.149710,-1.766845,-3.328942,1.324623,2.019748e-08,6.599948e-01,3.400052e-01,ENST00000379410
7,ENST00000342066.3,SAMD11,1,13,861321,879533,2046,1.643513e-05,0.000031,1.875129e-06,...,127.708248,243.266475,20.005031,-2.978340,-3.786196,0.665532,1.353812e-10,1.837000e-01,8.163000e-01,ENST00000342066
8,ENST00000335137.3,OR4F5,1,1,69090,70008,918,3.433040e-06,0.000008,2.193920e-07,...,30.016040,69.311169,2.217719,2.830684,3.602089,0.809997,1.763293e-01,6.440863e-01,1.795844e-01,ENST00000335137
9,ENST00000428771.2,HES4,1,3,934438,935353,744,9.134353e-06,0.000016,3.604458e-07,...,59.557864,110.748771,3.489528,1.089108,1.800964,-0.800975,1.479429e-04,2.459815e-01,7.538705e-01,ENST00000428771


### Read in Ensembl Gene ID and Transcript ID map

In [11]:
# Tab-separated lookup table pairing each Ensembl gene ID with a
# transcript ID.
EnsV75GeneIDTranscriptID = pd.read_csv(
    '../../datasets/geneLists/Ensembl/EnsV75GeneIDTranscriptID.txt',
    sep="\t",
)

EnsV75GeneIDTranscriptID

Unnamed: 0,Ensembl Gene ID,Ensembl Transcript ID
0,ENSG00000261657,ENST00000566782
1,ENSG00000261657,ENST00000562780
2,ENSG00000261657,ENST00000569579
3,ENSG00000261657,ENST00000568242
4,ENSG00000261657,ENST00000565530
5,ENSG00000223116,ENST00000411184
6,ENSG00000233440,ENST00000418454
7,ENSG00000207157,ENST00000384428
8,ENSG00000229483,ENST00000414345
9,ENSG00000252952,ENST00000517143


### Merge lists to add gene IDs to pLI scores

In [18]:
# Inner join on the unversioned transcript ID: only transcripts present in
# both the Ensembl map and the ExAC table are kept.
mergedList = pd.merge(
    left=EnsV75GeneIDTranscriptID,
    right=ExAC,
    how='inner',
    on='Ensembl Transcript ID',
)

mergedList

Unnamed: 0,Ensembl Gene ID,Ensembl Transcript ID,transcript,gene,chr,n_exons,cds_start,cds_end,bp,mu_syn,...,n_lof,exp_syn,exp_mis,exp_lof,syn_z,mis_z,lof_z,pLI,pRec,pNull
0,ENSG00000215405,ENST00000427390,ENST00000427390.2,GOLGA6L6,15,8,20739496,20747023,2253,7.372860e-06,...,1,1.118061,2.415408,0.698446,0.655514,0.130736,-0.357428,8.686430e-02,5.600548e-01,3.530809e-01
1,ENSG00000230031,ENST00000454856,ENST00000454856.4,POTEB2,15,11,21040719,21071610,1635,6.100685e-06,...,0,13.806530,31.165745,2.350088,2.303519,2.642967,1.518559,5.718928e-01,3.821580e-01,4.594920e-02
2,ENSG00000138593,ENST00000559471,ENST00000559471.1,SECISBP2L,15,18,49284440,49338496,3306,9.918054e-06,...,9,112.543308,292.457320,35.131820,-0.085126,0.442098,4.367260,1.017131e-01,8.982844e-01,2.500294e-06
3,ENSG00000166157,ENST00000361285,ENST00000361285.4,TPTE,21,21,10906904,10973733,1656,6.303252e-06,...,42,80.037334,187.016931,34.549246,-3.670059,-7.403054,-1.255656,5.114037e-31,2.419518e-06,9.999976e-01
4,ENSG00000166351,ENST00000299443,ENST00000299443.5,POTED,21,11,14982549,15013887,1755,6.326827e-06,...,0,26.339656,64.149610,4.142748,3.181666,3.612196,2.016202,7.366598e-01,2.517796e-01,1.156055e-02
5,ENSG00000168675,ENST00000361205,ENST00000361205.4,LDLRAD4,18,5,13387721,13645656,921,6.090747e-06,...,2,67.694283,134.466328,8.797946,0.052313,1.453799,2.270266,3.773064e-01,6.119331e-01,1.076050e-02
6,ENSG00000188992,ENST00000344577,ENST00000344577.2,LIPI,21,10,15481313,15579244,1446,4.348296e-06,...,13,51.599812,126.334604,13.934967,-0.897567,-1.900171,0.248103,8.825424e-09,1.582923e-01,8.417077e-01
7,ENSG00000185272,ENST00000400577,ENST00000400577.3,RBM11,21,5,15588507,15599614,846,3.163328e-06,...,8,31.853311,73.302573,10.231089,-0.345642,-0.725392,0.690949,2.910155e-05,5.473837e-01,4.525872e-01
8,ENSG00000183706,ENST00000328795,ENST00000328795.4,OR4N4,15,1,22382472,22383423,951,2.949985e-06,...,3,33.843040,96.705258,4.629465,-0.123292,-0.661259,0.750187,1.805060e-02,7.240400e-01,2.579094e-01
9,ENSG00000155304,ENST00000285667,ENST00000285667.3,HSPA13,21,5,15745937,15755440,1416,4.992004e-06,...,6,58.655929,144.349435,11.224720,0.053095,0.665596,1.544774,1.886370e-03,9.041260e-01,9.398763e-02


In [22]:
!mkdir ../../datasets/geneLists/ExAC

In [24]:
# Write only the gene ID and pLI columns as a tab-separated file,
# leaving out the DataFrame index.
mergedList.to_csv(
    '../../datasets/geneLists/ExAC/EnsemblGeneIDWithExACPLiScore.tsv',
    sep="\t",
    columns=['Ensembl Gene ID', 'pLI'],
    index=False,
)