In [1]:
import pandas as pd

In [2]:
# Load the variant table with pandas.
# header=0 makes the first row of the CSV supply the column names.
columns_of_interest = [
    "Gene(s)",
    "position",
    "initial_aa",
    "final_aa",
    "Transmembrane_Regions_new",
    "Pfam_code",
]
# Keep only the columns used further down, in the order listed above.
df = pd.read_csv("TM_gnomad_nonPatho_missense.csv", sep=",", header=0)[columns_of_interest]
df

Unnamed: 0,Gene(s),position,initial_aa,final_aa,Transmembrane_Regions_new,Pfam_code
0,TNFRSF4,229.0,Gly,Gly,"[['215', '235']]",PF00020
1,TNFRSF4,228.0,Leu,Leu,"[['215', '235']]",PF00020
2,TNFRSF4,226.0,Gly,Ala,"[['215', '235']]",PF00020
3,TNFRSF4,223.0,Leu,Pro,"[['215', '235']]",PF00020
4,TNFRSF4,223.0,Leu,Val,"[['215', '235']]",PF00020
...,...,...,...,...,...,...
45968,PLXNA3,1238.0,Val,Val,"[['1221', '1241']]",PF08337_PF01437_PF01403_PF01833_PF18020_PF17960
45969,PLXNA3,1241.0,Ala,Val,"[['1221', '1241']]",PF08337_PF01437_PF01403_PF01833_PF18020_PF17960
45970,PLXNA3,1241.0,Ala,Ala,"[['1221', '1241']]",PF08337_PF01437_PF01403_PF01833_PF18020_PF17960
45971,CLIC2,41.0,Trp,Ser,"[['32', '52']]",PF13409


In [3]:
# Function imported from: https://chem-workflows.com/articles/2019/10/29/retrieve-uniprot-data-using-python/
# pip install beautifulsoup4
import urllib
from bs4 import BeautifulSoup

def get_uniprot(query='', query_type='ACC'):
    """Send an ID-mapping request to UniProt and return the response as text lines.

    Parameters
    ----------
    query : str
        Identifier to look up; sent as the ``query`` form field.
    query_type : str
        Identifier namespace of ``query``; sent as the ``from`` form field
        (e.g. ``'ACC'``, ``'PDB_ID'``, or ``'GENENAME'`` as used in this notebook).

    Returns
    -------
    list of str
        The text of the response, split into lines.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` when the request fails.
    """
    # Import the submodules explicitly: a bare `import urllib` does not load
    # `urllib.parse` / `urllib.request`, so relying on it only works when some
    # other package happens to have imported them already.
    import urllib.parse
    import urllib.request

    # Web service used to retrieve the UniProt data.
    # NOTE(review): this looks like the legacy upload-lists endpoint - confirm it is still served.
    url = 'https://www.uniprot.org/uploadlists/'
    params = {
        'from': query_type,
        'to': 'ACC',
        'format': 'txt',
        'query': query,
        'taxon': '9606',  # presumably the NCBI taxonomy id for human - verify
    }

    # Passing a body to Request makes urllib issue a POST with form-encoded fields.
    data = urllib.parse.urlencode(params).encode('ascii')
    request = urllib.request.Request(url, data)
    with urllib.request.urlopen(request) as response:
        res = response.read()

    # Name the parser explicitly so the result does not depend on which optional
    # parsers are installed (and to avoid BeautifulSoup's guessed-parser warning).
    page = BeautifulSoup(res, 'html.parser').get_text()
    return page.splitlines()

In [4]:
# Every gene symbol in row order (one entry per variant row).
GN = list(df['Gene(s)'])

# Unique gene symbols in order of first appearance.
# dict keys preserve insertion order, so this de-duplicates in a single pass
# instead of rescanning the result list for every row.
res = list(dict.fromkeys(GN))
print(res)

['TNFRSF4', 'B3GALT6', 'MXRA8', 'ATAD3A', 'TMEM240', 'GPR153', 'SLC45A1', 'UBIAD1', 'MFN2', 'EPHA2', 'ATP13A2', 'UBR4', 'EMC1', 'PINK1', 'DDOST', 'ECE1', 'EPHB2', 'SLC30A2', 'PIGV', 'SLC9A1', 'TMEM222', 'PTPRU', 'ADGRB2', 'GJB4', 'GJB3', 'GJA4', 'CSF3R', 'ZMPSTE24', 'KCNQ4', 'CLDN19', 'SLC2A1', 'TIE1', 'MPL', 'ELOVL1', 'ST3GAL3', 'SLC6A9', 'DHCR24', 'BSND', 'TACSTD2', 'ALG6', 'ROR1', 'LEPR', 'SLC35D1', 'PIGK', 'HS2ST1', 'ABCA4', 'SLC35A3', 'SLC30A7', 'SLC25A24', 'CLCC1', 'CELSR2', 'SLC6A17', 'KCNA2', 'DRAM2', 'KCND3', 'SLC16A1', 'LRIG2', 'VANGL1', 'ATP1A1', 'HSD3B2', 'NOTCH2', 'GJA5', 'GJA8', 'IL6R', 'CHRNB2', 'KCNN3', 'DPM3', 'SEMA4A', 'NTRK1', 'KIRREL1', 'CADM3', 'PIGM', 'KCNJ10', 'ATP1A2', 'NCSTN', 'NECTIN4', 'MPZ', 'SDHC', 'ATF6', 'DDR2', 'ILDR2', 'CD247', 'GPR161', 'SLC19A2', 'FMO3', 'FASLG', 'ASTN1', 'TOR1AIP1', 'XPR1', 'MR1', 'CACNA1E', 'KCNT2', 'CRB1', 'CACNA1S', 'SYT2', 'NFASC', 'SLC41A1', 'CR2', 'CD46', 'PLXNA2', 'SYT14', 'HHAT', 'KCNH1', 'FLVCR1', 'USH2A', 'SLC30A10', 'DEGS1

In [5]:
# For each unique gene symbol, query UniProt by gene name and keep only the
# lines that begin with the 'ID   ' tag, prefixed with "<symbol>:" so the
# originating gene can be recovered later.
gene_name = [
    symbol + ':' + record_line
    for symbol in res
    for record_line in get_uniprot(query=symbol, query_type='GENENAME')
    if record_line.startswith('ID   ')
]

In [6]:
# One row per collected "<symbol>:<ID line>" string, in a single column named 'GN'.
df_gene_name = pd.DataFrame(data=gene_name, columns=['GN'])
df_gene_name

Unnamed: 0,GN
0,TNFRSF4:ID TNR4_HUMAN Reviewed;...
1,TNFRSF4:ID A0A024R0B2_HUMAN Unreviewe...
2,B3GALT6:ID B3GT6_HUMAN Reviewed;...
3,B3GALT6:ID Q499Z2_HUMAN Unreviewe...
4,MXRA8:ID MXRA8_HUMAN Reviewed; ...
...,...
7407,PLXNA3:ID PLXA3_HUMAN Reviewed; ...
7408,PLXNA3:ID L8ECH0_HUMAN Unreviewed...
7409,CLIC2:ID CLIC2_HUMAN Reviewed; ...
7410,CLIC2:ID A6PVS0_HUMAN Unreviewed;...


In [7]:
# Split each "<symbol>:<ID line>" string at the first ':' only, giving the gene
# symbol ('GN') and the remaining ID line ('ID').
# `n` is passed by keyword: passing it positionally is deprecated in pandas 1.x
# and raises TypeError in pandas 2.x, where only `pat` may be positional.
df_gene_name[['GN', 'ID']] = df_gene_name['GN'].str.split(':', n=1, expand=True)
df_gene_name

Unnamed: 0,GN,ID
0,TNFRSF4,ID TNR4_HUMAN Reviewed; ...
1,TNFRSF4,ID A0A024R0B2_HUMAN Unreviewed; ...
2,B3GALT6,ID B3GT6_HUMAN Reviewed; ...
3,B3GALT6,ID Q499Z2_HUMAN Unreviewed; ...
4,MXRA8,ID MXRA8_HUMAN Reviewed; ...
...,...,...
7407,PLXNA3,ID PLXA3_HUMAN Reviewed; ...
7408,PLXNA3,ID L8ECH0_HUMAN Unreviewed; ...
7409,CLIC2,ID CLIC2_HUMAN Reviewed; ...
7410,CLIC2,ID A6PVS0_HUMAN Unreviewed; ...


In [8]:
# Each kept line starts with 'ID' followed by three spaces, so splitting on a
# single space yields 'ID', '', '' and then the entry name at index 3.
id_tokens = df_gene_name['ID'].str.split(' ')
df_gene_name['ID'] = id_tokens.str.get(3)
df_gene_name

Unnamed: 0,GN,ID
0,TNFRSF4,TNR4_HUMAN
1,TNFRSF4,A0A024R0B2_HUMAN
2,B3GALT6,B3GT6_HUMAN
3,B3GALT6,Q499Z2_HUMAN
4,MXRA8,MXRA8_HUMAN
...,...,...
7407,PLXNA3,PLXA3_HUMAN
7408,PLXNA3,L8ECH0_HUMAN
7409,CLIC2,CLIC2_HUMAN
7410,CLIC2,A6PVS0_HUMAN


In [9]:
# Drop repeated (GN, ID) rows, keeping the first occurrence of each pair.
is_repeat = df_gene_name.duplicated(keep='first')
df_gene_name = df_gene_name.loc[~is_repeat]
df_gene_name

Unnamed: 0,GN,ID
0,TNFRSF4,TNR4_HUMAN
1,TNFRSF4,A0A024R0B2_HUMAN
2,B3GALT6,B3GT6_HUMAN
3,B3GALT6,Q499Z2_HUMAN
4,MXRA8,MXRA8_HUMAN
...,...,...
7407,PLXNA3,PLXA3_HUMAN
7408,PLXNA3,L8ECH0_HUMAN
7409,CLIC2,CLIC2_HUMAN
7410,CLIC2,A6PVS0_HUMAN


In [10]:
# Collapse to one row per gene: every row of a gene receives the space-joined
# list of all its IDs, after which the now-identical rows are dropped.
joined_ids = df_gene_name.groupby(['GN'])['ID'].transform(lambda ids: ' '.join(ids))
df_gene_name = df_gene_name.assign(ID=joined_ids).drop_duplicates()
df_gene_name

Unnamed: 0,GN,ID
0,TNFRSF4,TNR4_HUMAN A0A024R0B2_HUMAN
2,B3GALT6,B3GT6_HUMAN Q499Z2_HUMAN
4,MXRA8,MXRA8_HUMAN A0A024R0A0_HUMAN
6,ATAD3A,ATD3A_HUMAN H0Y2W2_HUMAN Q5SV16_HUMAN
9,TMEM240,TM240_HUMAN A0A096LNN7_HUMAN
...,...,...
7387,OPN1MW,OPSG_HUMAN B2RU31_HUMAN B7ZLG5_HUMAN H0Y642_HUMAN
7391,EMD,EMD_HUMAN A0A804HIG7_HUMAN A0A804HIV7_HUMAN A0...
7397,ATP6AP1,VAS1_HUMAN A0A0C4DGX8_HUMAN A0A7I2V2G3_HUMAN A...
7407,PLXNA3,PLXA3_HUMAN L8ECH0_HUMAN


In [11]:
# Save the gene -> UniProt entry-name table as a tab-separated file.
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell adds another header line and a duplicate set of rows to the same file,
# which then get read back as data.
df_gene_name.to_csv('TM_gnomad+GN_nonPatho_missense.txt', header=True, index=False, sep='\t', mode='w')

In [12]:
# Read the saved gene/ID table back and rename 'GN' to 'Gene(s)' so it shares
# its key column name with the variant table.
df_aux = (
    pd.read_csv("TM_gnomad+GN_nonPatho_missense.txt", sep="\t", header=0)
    .rename(columns={"GN": "Gene(s)"})
)
df_aux

Unnamed: 0,Gene(s),ID
0,TNFRSF4,TNR4_HUMAN A0A024R0B2_HUMAN
1,B3GALT6,B3GT6_HUMAN Q499Z2_HUMAN
2,MXRA8,MXRA8_HUMAN A0A024R0A0_HUMAN
3,ATAD3A,ATD3A_HUMAN H0Y2W2_HUMAN Q5SV16_HUMAN
4,TMEM240,TM240_HUMAN A0A096LNN7_HUMAN
...,...,...
978,OPN1MW,OPSG_HUMAN B2RU31_HUMAN B7ZLG5_HUMAN H0Y642_HUMAN
979,EMD,EMD_HUMAN A0A804HIG7_HUMAN A0A804HIV7_HUMAN A0...
980,ATP6AP1,VAS1_HUMAN A0A0C4DGX8_HUMAN A0A7I2V2G3_HUMAN A...
981,PLXNA3,PLXA3_HUMAN L8ECH0_HUMAN


In [13]:
# Attach the space-joined UniProt entry names to every variant row by matching
# on the gene symbol; the outer join keeps keys found in either table.
result = pd.merge(df, df_aux, how='outer', on='Gene(s)')
result

Unnamed: 0,Gene(s),position,initial_aa,final_aa,Transmembrane_Regions_new,Pfam_code,ID
0,TNFRSF4,229.0,Gly,Gly,"[['215', '235']]",PF00020,TNR4_HUMAN A0A024R0B2_HUMAN
1,TNFRSF4,228.0,Leu,Leu,"[['215', '235']]",PF00020,TNR4_HUMAN A0A024R0B2_HUMAN
2,TNFRSF4,226.0,Gly,Ala,"[['215', '235']]",PF00020,TNR4_HUMAN A0A024R0B2_HUMAN
3,TNFRSF4,223.0,Leu,Pro,"[['215', '235']]",PF00020,TNR4_HUMAN A0A024R0B2_HUMAN
4,TNFRSF4,223.0,Leu,Val,"[['215', '235']]",PF00020,TNR4_HUMAN A0A024R0B2_HUMAN
...,...,...,...,...,...,...,...
45968,PLXNA3,1238.0,Val,Val,"[['1221', '1241']]",PF08337_PF01437_PF01403_PF01833_PF18020_PF17960,PLXA3_HUMAN L8ECH0_HUMAN
45969,PLXNA3,1241.0,Ala,Val,"[['1221', '1241']]",PF08337_PF01437_PF01403_PF01833_PF18020_PF17960,PLXA3_HUMAN L8ECH0_HUMAN
45970,PLXNA3,1241.0,Ala,Ala,"[['1221', '1241']]",PF08337_PF01437_PF01403_PF01833_PF18020_PF17960,PLXA3_HUMAN L8ECH0_HUMAN
45971,CLIC2,41.0,Trp,Ser,"[['32', '52']]",PF13409,CLIC2_HUMAN A6PVS0_HUMAN Q86YM0_HUMAN


In [14]:
# Save the annotated variant table as a tab-separated file.
# Overwrite (mode='w') instead of appending: with mode='a' every re-run of this
# cell adds another header line and a full duplicate copy of the rows.
result.to_csv('TM_gnomad+GN(final)_nonPatho_missense.txt', header=True, index=False, sep='\t', mode='w')