In [1]:
#Import Libraries
import requests
import urllib.parse
import urllib.request
import json
import sys
import re
import pandas as pd

In [4]:
#Getting the STRING network interactions

string_api_url = "https://string-db.org/api"
output_format = "tsv-no-header"
method = "network"

## Construct URL

request_url = "/".join([string_api_url, output_format, method])

## Set parameters

my_genes = "G6PD"

params = {

    "identifiers" : my_genes, # your protein
    "add_nodes": 20,
    "species" : 9606 # species NCBI identifier

}

## Call STRING

response = requests.post(request_url, data=params)

query_name = []
partner_ensp = []
partner_name = []
combined_score = []
neighborhood_score = []
coexpression_score = []
textmining_score = []
database_score = []

for line in response.text.strip().split("\n"):

    l = line.strip().split("\t")
    p1, p2 = l[2], l[3]

    ## filter the interaction according to total score
    score = float(l[5])
    if score > 0.7:
        query_name.append(p1)
        partner_name.append(p2)
        combined_score.append(score)
        neighborhood_score.append(l[6])
        coexpression_score.append(l[9])
        textmining_score.append(l[12])
        database_score.append(l[11])
        
df = pd.DataFrame(list(zip(query_name, partner_name, database_score, textmining_score, coexpression_score, neighborhood_score, combined_score)))
df.rename(columns = {df.columns[0]:"node1",df.columns[1]:"node2",df.columns[2]:"database_score",df.columns[3]:"textmining_score",
                    df.columns[4]:"coexpression_score",df.columns[5]:"neighborhood_score",df.columns[6]:"combined_score"},inplace = True)

df.to_csv("string_interactions.csv",index = False)
df

Unnamed: 0,node1,node2,database_score,textmining_score,coexpression_score,neighborhood_score,combined_score
0,GSR,GAPDH,0,0.678,0.104,0.072,0.709
1,GSR,GAPDH,0,0.678,0.104,0.072,0.709
2,GSR,PGD,0,0.797,0.188,0.111,0.848
3,GSR,PGD,0,0.797,0.188,0.111,0.848
4,GSR,H6PD,0,0.882,0.152,0.211,0.914
...,...,...,...,...,...,...,...
235,HK1,H6PD,0.8,0.821,0.15,0,0.967
236,HK1,GPI,0.9,0.889,0.146,0,0.989
237,HK1,GPI,0.9,0.889,0.146,0,0.989
238,GPI,H6PD,0.9,0.862,0.174,0.344,0.991


In [4]:
#Get data from Human Protein Atlas

#www.proteinatlas.org/api/search_download.php?search=P53&format=json&columns=g,gs&compress=no
listOFgenes = ["G6PD","ACLY","CAT","CLTC","UBR","IGHA","MTHFD","PRPS","AK","PSMA","GSN","PLG",
         "PSMB","PFKM","KRT","PSME","PSMC","OLA","PSMD","ACTR","PSMD","PCMT","RTCA","KRT"]

genes = []
protein_id = []
diseases = []
for name in listOFgenes:
    urlapi = "https://www.proteinatlas.org/api/search_download.php?"
    search = f"search=\b{name}\b"
    form = "format=tsv"
    cols = "columns=g,up,di"
    compress = "compress=no"
    
    request_url = urlapi + "&".join([search,form,cols,compress])
    
    response = requests.get(request_url)
    
    response = response.text.strip().split("\n")
    if len(response) <2:
        pass
    else:
        tmp = response[1].strip().split("\t")
        if len(tmp) == 4:
            listOFdiseases = tmp[3].split(",")
            for disease in listOFdiseases:
                disease = re.sub("\"","",disease)
                genes.append(tmp[1])
                protein_id.append(tmp[2])
                diseases.append(disease)


df = pd.DataFrame(list(zip(genes,protein_id,diseases)))
df.rename(columns = {df.columns[0]:"genes",df.columns[1]:"UniProtID",df.columns[2]:"diseases"},inplace = True)

df.to_csv("HPA_interactions.csv",index = False)


In [5]:
#Get data from Ensembl
listOFgenes = ["G6PD","ACLY","CAT","CLTC","UBR","IGHA",
               "MTHFD","PRPS","AK","PSMA","GSN","PLG","PSMB",
               "PFKM","KRT","PSME","PSMC","OLA","PSMD","ACTR","PSMD","PCMT","RTCA","KRT"]

listOFjson = []
for name in listOFgenes:
    server = "https://rest.ensembl.org"
    parm = "phenotype/gene/homo_sapiens"
    gene = f"{name}"
    
    url = "/".join([server,parm,gene])
    
    response = requests.get(url, headers={ "Content-Type" : "application/json"})
    
    if 'error' not in str(response.json()):
        listOFjson.extend(response.json())



df = pd.json_normalize(listOFjson)
    
df.drop_duplicates(subset = 'attributes.external_id', inplace = True)
df.dropna(subset = ['Gene'],inplace = True)
df

JSONDecodeError: [Errno Expecting value] <html><body><h1>503 Service Unavailable</h1>
No server is available to handle this request.
</body></html>
: 0

In [98]:
#Modify data retrieved from Ensembl
final_df = df.copy()
final_df = final_df[["description","Gene","source"]]
final_df.rename({"description":"disease"},axis = "columns",inplace = True)
final_df.replace("MIM morbid","OMIM",inplace = True)
final_df.reset_index(inplace = True,drop = True)

names = final_df["Gene"].to_list()
names = str(names).strip("[|]")
names = re.sub("'","",names)

'ENSG00000160211, ENSG00000160211, ENSG00000160211, ENSG00000160211, ENSG00000121691, ENSG00000121691, ENSG00000141367, ENSG00000141367, ENSG00000141367, ENSG00000141367, ENSG00000141367, ENSG00000141367, ENSG00000141367, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000100714, ENSG00000263639, ENSG00000263639, ENSG00000156110, ENSG00000156110, ENSG00000148180, ENSG00000148180, ENSG00000122194, ENSG00000122194, ENSG00000122194, ENSG00000152556, ENSG00000152556'

In [128]:
#Match Ensembl IDs to Uniprot ACs
url = 'https://www.uniprot.org/uploadlists/'

params = {
'from': 'ENSEMBL_ID',
'to': 'SWISSPROT',
'format': 'tab',
'query': names
}

data = urllib.parse.urlencode(params)
data = data.encode('utf-8')
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as f:
    response = f.read()
response = response.decode('utf-8')

replacement = {}
response = response.strip().split("\n")
for line in response[1:]:
    tmp = line.strip().split("\t")
    replacement.update({tmp[0]:tmp[1]})

final_df["Gene"].replace(replacement, inplace=True)
final_df.rename({"Gene":"UniprotID"},axis = "columns", inplace = True)

final_df.to_csv("Ensembl_interactions.csv",index = False)

In [73]:
#Getting the STITCH network interactions

db_api = "http://stitch.embl.de/api"
form = "psi-mi-tab"
methods = "interactions?identifier=G6PD&species=9606&limit=50%&required_score=700"

request_url = "/".join([db_api,form,methods])

response = requests.post(request_url)

data = []
for line in response.text.strip().split("\n"):
    line = re.sub("string:|-\t|taxid:\S+\t|\w{0,1}score:","",line)
    line = re.sub("\|","\t",line)
    l = line.strip().split("\t")#headers:node1_id, node2_id, node1_name, node2_name,total_score,neighborhood_score, coexp_score,db_score, textmining_score
    data.append(l)

stitch_data = pd.DataFrame(data)
stitch_data.drop([9,10],axis="columns",inplace=True)
#replace Nan values with zeros
stitch_data.fillna(0.0,inplace=True)

#fixing names
stitch_data.replace(to_replace="nicotinamide a.|NADP\(H\)*",value="NADPH",regex=True,inplace=True)
stitch_data.replace(to_replace="alpha-D-gl.e 6.",value="alpha-D-Glucose",regex=True,inplace=True)

#fixing column names
stitch_data.rename({stitch_data.columns[0]:"node1_id",stitch_data.columns[1]:"node2_id",stitch_data.columns[2]:"node1",stitch_data.columns[3]:"node2",
                   stitch_data.columns[4]:"combined_score",stitch_data.columns[5]:"neighborhood_score",stitch_data.columns[6]:"coexpression_score",
                   stitch_data.columns[7]:"database_score",stitch_data.columns[8]:"textmining_score"},axis="columns",inplace=True)

stitch_data.to_csv("stitch_interactions.csv",index=False)
