In [79]:
from pyteomics import fasta
import pandas as pd
import numpy as np
import re
import time
from datetime import datetime
import json
import os

### Data directories

In [80]:
#Data path
# Root directory of all input datasets, resolved against the current working
# directory of the notebook kernel.
datadir = os.path.abspath("../../Data/")

#Datasets
# Paths are assembled with os.path.join so the separator matches the host OS
# instead of relying on hard-coded "/" concatenation.
OT_targets   = os.path.join(datadir, "OpenTargets", "targets") #OpenTargets
Uniprot_hsap = os.path.join(datadir, "Uniprotdb", "human_proteome", "UP000005640_9606.fasta") #Uniprot h. sapiens proteome
Genemap      = os.path.join(datadir, "Uniprotdb", "HUMAN_9606_idmapping.dat") #Uniprot gene ID mapping

### Functions - Data parsing

In [81]:
def parseUniprot(fastain):
    """Build a per-protein lookup from a UniProt-formatted FASTA file.

    Parameters
    ----------
    fastain : str
        Path to the FASTA file; handed to ``fasta.UniProt`` for parsing.

    Returns
    -------
    dict
        Maps the ``id`` field of each parsed header to
        ``{"Gene": <gene_id field>, "Name": <name field>, "Seq": <sequence>}``.
        When the same ``id`` appears more than once, the first record wins.
    """
    proteome = {}
    # Use the reader as a context manager so the underlying FASTA handle is
    # closed as soon as parsing finishes (or fails) rather than being left
    # open until garbage collection.
    with fasta.UniProt(fastain) as seq_file:
        # Each item is a (parsed header dict, sequence string) pair.
        for header, seq in seq_file:
            # setdefault keeps the first record seen for a repeated id.
            proteome.setdefault(
                header['id'],
                {"Gene": header['gene_id'], "Name": header['name'], "Seq": seq},
            )
    return proteome

def unitoGene(genein):
    """Collect gene names and synonyms per identifier from a tab-separated mapping file.

    Each line is expected to hold at least three tab-separated fields:
    identifier, mapping type, value. Only lines whose mapping type is
    ``Gene_Name`` or ``Gene_Synonym`` are used.

    Parameters
    ----------
    genein : str
        Path to the mapping file.

    Returns
    -------
    dict
        Maps each identifier (first field) to the list of values (third
        field) of its matching lines, in file order. Identifiers with no
        matching line are absent from the result.
    """
    wanted_types = ("Gene_Name", "Gene_Synonym")
    unigene = {}
    with open(genein) as f:
        # Stream the file line by line; the mapping file can be large and
        # readlines() would hold all of it in memory at once.
        for line in f:
            fields = line.rstrip("\r\n").split("\t")
            # Blank or truncated lines carry no usable mapping; skipping them
            # avoids an IndexError on fields[1] / fields[2].
            if len(fields) < 3:
                continue
            if fields[1] in wanted_types:
                unigene.setdefault(fields[0], []).append(fields[2])
    return unigene

def parseOTtarget(data_dir, df):
    """Annotate a DataFrame with fields taken from JSON-lines target records.

    Every regular file in ``data_dir`` is read as JSON-lines (one JSON object
    per line). For each record, the ``id`` of its first ``proteinIds`` item is
    looked up in ``df.index``; on a match, the selected record fields are
    appended to that row's list columns.

    Parameters
    ----------
    data_dir : str
        Directory containing the JSON-lines part files.
    df : pandas.DataFrame
        Frame indexed by the protein identifiers to match. It is modified in
        place: the columns ``description``, ``subcell_loc``, ``db_entries``,
        ``bio_path`` and ``tx_approaches`` are (re)created as per-row lists.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the list columns filled in.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # New DataFrame column -> key to copy from each target record.
    intelcol = {"description": "functionDescriptions",
                "subcell_loc": "subcellularLocations",
                "db_entries": "dbXrefs",
                "bio_path": "pathways",
                "tx_approaches": "tractability"}
    for col in intelcol:
        # One distinct list object per row, so appends never alias across rows.
        df[col] = [[] for _ in range(len(df.index))]

    # Sorted so the append order does not depend on the OS directory order.
    for fname in sorted(os.listdir(data_dir)):
        # "._<name>" sidecar files are not data. Stripping "._" from the name
        # (as was done before) re-opened the real file and appended every
        # annotation a second time, so skip them instead.
        if fname.startswith("._"):
            continue
        path = os.path.join(data_dir, fname)
        if not os.path.isfile(path):
            continue
        # Stream each file and annotate as we go rather than buffering the
        # whole dataset in memory first.
        with open(path, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                entry = json.loads(line)
                try:
                    entid = entry['proteinIds'][0]['id']
                except (KeyError, IndexError, TypeError):
                    # Record has no usable first protein id; nothing to match.
                    continue
                if entid not in df.index:
                    continue
                for col, key in intelcol.items():
                    if key in entry:
                        # The cell holds a list; append mutates it in place.
                        df.loc[entid, col].append(entry[key])

    return df

## Main code

In [124]:
#Define default proteome
proteome = parseUniprot(Uniprot_hsap)
#Update gene to likely recent id
genemap  = unitoGene(Genemap)
# Overwrite the FASTA-derived gene with the first name listed in the mapping
# file, when one exists. An explicit lookup replaces the former bare
# "except: pass", which would also have hidden unrelated errors.
for uniprot_id, record in proteome.items():
    gene_names = genemap.get(uniprot_id)
    if gene_names:
        record["Gene"] = gene_names[0]
#Generate dataframe from dic
# One row per protein id, with columns Gene / Name / Seq.
initdf = pd.DataFrame.from_dict(proteome, orient='index')
#Annotate with OpenTarget intel
initdf = parseOTtarget(OT_targets, initdf)

In [127]:
# Spot-check: show the annotated row whose index label is "P04637".
display(initdf.loc["P04637"])

Gene                                                          TP53
Name                                    Cellular tumor antigen p53
Seq              MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...
description      [[Acts as a tumor suppressor in many tumor typ...
subcell_loc      [[{'location': 'Cytoplasm', 'source': 'uniprot...
db_entries       [[{'id': '11998', 'source': 'HGNC'}, {'id': '1...
bio_path         [[{'pathwayId': 'R-HSA-3232118', 'pathway': 'S...
tx_approaches    [[{'modality': 'SM', 'id': 'Approved Drug', 'v...
Name: P04637, dtype: object