## Matching STRING Protein IDs to Entrez Gene Symbols
#### ALL DATABASES ACCESSED 08/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import requests
import time
import numpy as np
import os

### Importing Protein-Drug Interaction Data (STRING Protein Identifiers & PubChem IDs)
#### Databases Accessed : http://stitch.embl.de
#### Input Files : 9606.protein_chemical.links.v5.0.tsv (http://stitch.embl.de/cgi/download.pl?UserId=RZH952LT3dmF&sessionId=V9HrDjiZWim3&species_text=Homo+sapiens)

In [2]:
# Load the STITCH protein-chemical links table (tab-separated) into a DataFrame.
df = pd.read_csv(
    'input/9606.protein_chemical.links.v5.0.tsv',
    sep='\t',
)

In [3]:
# Preview the first five rows of the links table.
df.head(n=5)

Unnamed: 0,chemical,protein,combined_score
0,CIDm91758680,9606.ENSP00000257254,279
1,CIDm91758680,9606.ENSP00000302120,154
2,CIDm91758408,9606.ENSP00000006777,225
3,CIDm91758408,9606.ENSP00000056217,178
4,CIDm91758408,9606.ENSP00000216085,225


In [4]:
# Collect the distinct STITCH protein identifiers from the 'protein' column as a set.
STITCH_protein_identifiers = set(df['protein'])

In [5]:
# Number of distinct protein identifiers that will be sent to the STITCH API.
len(STITCH_protein_identifiers)

19195

### Matching STRING Protein Identifiers to Entrez Gene Symbols using STITCH API

In [6]:
# Querying all STITCH protein identifiers through STITCH API and retrieving Entrez Gene Symbols #
#
# protein_dict : STITCH protein identifier -> "preferredName" of the first record
#                returned by the resolve endpoint.
# failed_list  : identifiers for which no name could be obtained, whatever the
#                reason (network error, timeout, HTTP error status, unparsable
#                body, empty result, or a record without "preferredName").
protein_dict = {}
failed_list = []

for protein in STITCH_protein_identifiers:
    url = ('http://stitch.embl.de/api/json/resolve?identifier='+ str(protein) +'&species=9606')
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Parse the body once; everything that can go wrong with this protein
        # happens inside this try, so a stale `response` from an earlier
        # iteration can never be attributed to the current identifier.
        records = response.json()
        gene_symbol = records[0]["preferredName"]
    except (requests.exceptions.RequestException, ValueError, LookupError, TypeError):
        # RequestException: connection/timeout/HTTP errors; ValueError: invalid JSON;
        # LookupError: empty result or missing key; TypeError: unexpected payload shape.
        failed_list.append(protein)
    else:
        protein_dict[protein] = gene_symbol
    # Pause between requests so the public API is not flooded.
    time.sleep(0.5)

print(len(failed_list))

0


In [32]:
# Turn the {protein identifier: gene symbol} mapping into a two-column table:
# the dict keys start out as the index and reset_index() moves them into a column.
df_protein = pd.DataFrame.from_dict(protein_dict, orient='index').reset_index()
df_protein.columns = ['protein', 'gene symbol']

In [33]:
# Row count of the protein -> gene symbol table.
df_protein.shape[0]

19195

In [34]:
# Preview the first five protein -> gene symbol rows.
df_protein.head(n=5)

Unnamed: 0,protein,gene symbol
0,9606.ENSP00000305107,GIMAP8
1,9606.ENSP00000376139,WDR45L
2,9606.ENSP00000334853,ZNF555
3,9606.ENSP00000264935,CEP72
4,9606.ENSP00000276440,DOCK5


### Making sure each gene symbol can be traced back to an approved symbol
#### Input file: Homo_sapiens.gene_info (ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia)

In [35]:
# Importing approved symbol table
df_lookup = pd.read_csv('../mapping_files/Homo_sapiens.gene_info', delimiter = '\t')
approved_symbols = df_lookup['Symbol'].tolist()
# Set copy of the approved symbols: membership tests in the loop below are then
# constant-time instead of scanning the whole list once per protein.
approved_symbol_set = set(approved_symbols)

# Gene synonym lookup
# NOTE(review): presumably maps a synonym to its approved symbol - confirm against the file.
with open('../mapping_files/gene_symbol_lookup.json', 'r') as f:
    synonym_lookup = json.load(f)

# Resolve every gene symbol:
#   1. already an approved symbol -> keep as is
#   2. found in the synonym table -> use the value stored there
#   3. otherwise                  -> record the symbol in failed_list and mark the row for removal
term_list = []
unmapped_rows = []
for index, gene in zip(df_protein.index, df_protein['gene symbol']):
    if gene in approved_symbol_set:
        term_list.append(gene)
    elif gene in synonym_lookup:
        term_list.append(synonym_lookup[gene])
    else:
        failed_list.append(gene)
        unmapped_rows.append(index)

# Remove all unresolved rows in a single call rather than dropping from the
# frame while it is being iterated; term_list then lines up one-to-one with
# the remaining rows, in order.
df_protein = df_protein.drop(index=unmapped_rows)
df_protein['approved symbol'] = term_list

In [36]:
# Row count after removing proteins without an approved symbol.
df_protein.shape[0]

18809

In [39]:
# Preview the first five rows, now including the 'approved symbol' column.
df_protein.head(n=5)

Unnamed: 0,protein,gene symbol,approved symbol
0,9606.ENSP00000305107,GIMAP8,GIMAP8
1,9606.ENSP00000376139,WDR45L,WDR45B
2,9606.ENSP00000334853,ZNF555,ZNF555
3,9606.ENSP00000264935,CEP72,CEP72
4,9606.ENSP00000276440,DOCK5,DOCK5


In [40]:
# Write the filtered STRING -> Entrez mapping to CSV, leaving out the DataFrame index.
df_protein.to_csv(path_or_buf='input/STRING_to_Entrez.csv', index=False)