Take tmp_pdb_uniprot_db.csv and add the uniprot ids and infos when missing

1st step: merge row together (only 1 row for each (pdb,uniprot) tuple)

In [None]:
import pandas as pd
import numpy as np
import json
import pypdb
import urllib.request
import tqdm

In [None]:
COLUMNS=['uniprot', 'PDB_wild', 'sequence', 'length', 'molWeight', 'countByFeatureType', 'chain_start', 'chain_end']

In [13]:
def seq_to_pdb(seq):
    # get pdb id from protein sequence using the pypdb package to query the RCSB Protein Data Bank API
    q = pypdb.Query(seq, 
        query_type="sequence", 
        return_type="polymer_entity")
    
    for result in q.search()["result_set"]:
        [result_id, chain] = result["identifier"].split('_')
        if result["score"] == 1.0 and chain=="1":
            return result_id

    return ""

def uniprotid_to_infos(uniprotid, pdb_ids=False):
    try:
        with urllib.request.urlopen(f"https://rest.uniprot.org/uniprotkb/{uniprotid}.json") as url:
            data = json.load(url)
    except Exception as e:
        print(f"exception raised for {uniprotid}: {e}")
        return {}

    databases = data.get("uniProtKBCrossReferences", [])

    # if we only want the pdb_ids return it immediately
    if pdb_ids:
        pdb_ids = " ".join([x["id"] for x in databases if (x["database"]=="PDB")])
        return {"PDB_wild": pdb_ids}
    
    features = data.get("features", [])
    chain_location = next((x for x in features if x["type"]=="Chain"), {}).get("location", {})
    return {
        "sequence": data.get("sequence", {}).get("value"),
        "length": data.get("sequence", {}).get("length"),
        "molWeight": data.get("sequence", {}).get("molWeight"),
        # "countByFeatureType": data.get("extraAttributes", {}).get("countByFeatureType"), 
        "chain_start": chain_location.get("start", {}).get("value"),
        "chain_end": chain_location.get("end", {}).get("value"),
        "AlphaFoldDB": " ".join([x["id"] for x in databases if (x["database"]=="AlphaFoldDB")])
    }

In [None]:
df = pd.read_csv("./data/main_dataset/tmp_pdb_uniprot_db.csv")
df = df[["uniprot", "PDB_wild"]]
df["PDB_wild"] = df["PDB_wild"].apply(lambda x: str(x).upper())
df.head(2)

In [None]:
def agg_function(l):
    unique_l = set([str(x) for x in l if (x and str(x)!='nan')])
    return " ".join(unique_l)

df2 = df.groupby("PDB_wild", as_index=False).agg({"uniprot": agg_function})
print(len(df2))


#### MULTIPLE ids ####
def duplicate_multiple_ids_row(df, sep, multiple_col, unique_id_col):
    multiple_ids = df[multiple_col].str.contains(sep, case=False)
    multiple_ids_df = pd.DataFrame()
    for _, row in df.loc[multiple_ids].iterrows():
        ids = row[multiple_col].split(sep)
        additional_rows = pd.DataFrame({unique_id_col: [row[unique_id_col] if row[unique_id_col] != "NAN" else ""]*len(ids),
                                        multiple_col: ids
                                        }, columns=df.columns)
        multiple_ids_df = pd.concat([multiple_ids_df, additional_rows], ignore_index=True)

    df = df.loc[~multiple_ids]
    df = pd.concat([df, multiple_ids_df], ignore_index=True)
    return df

# (X,P00644 P00645) => (X, P00644), (X, P00645)
# add rows when there are multiple uniprot id for 1 pdb id
df2 = duplicate_multiple_ids_row(df2, " ", multiple_col="uniprot", unique_id_col="PDB_wild")
print(len(df2))

# (1SVX|3MBP,P0AEX9) => (1SVX,P0AEX9), (3MBP, P0AEX9)
df2 = duplicate_multiple_ids_row(df2, "|", multiple_col="PDB_wild", unique_id_col="uniprot")
print(len(df2))

#### Warning ####
# A weird bug makes it so 1E21 is converted to a number (1.00E+21) and therefor is different from the row w/ PDBwild = "1E21"
# Thus we remove this first line (it exist already on line 121)
df2 = df2.iloc[1:, :]

#### Remove occurences when there is a PDB with no uniprot 
# alltough another row contains the PDB & the uniprot 
# (and vice-versa) ####
no_uniprot = df2.uniprot.eq("")
no_uniprot_df = df2.loc[no_uniprot]
no_uniprot_df.reset_index(inplace=True)
df2 = df2.loc[~no_uniprot]

no_pdb = df2.PDB_wild.eq("")
no_pdb_df = df2.loc[no_pdb]
no_pdb_df.reset_index(inplace=True)
df2 = df2.loc[~no_pdb]

# df2 now consist of only rows with both pdb AND uniprot
# uniprot:
linked_row_found = [True]*len(no_uniprot_df)
for index, row in no_uniprot_df.iterrows():
    linked_row = df2.PDB_wild.eq(row.PDB_wild)
    if linked_row.any():
        linked_row_found[index] = False
# we remove the rows with no_uniprot for which we found another row with both uniprot and pdb
no_uniprot_df = no_uniprot_df[linked_row_found]
print(f"rm {len(linked_row_found)-len(no_uniprot_df)} rows from no_uniprot because at least 1 other row contained both information")
# pdb:
linked_row_found = [True]*len(no_pdb_df)
for index, row in no_pdb_df.iterrows():
    linked_row = df2.uniprot.eq(row.uniprot)
    if linked_row.any():
        linked_row_found[index] = False
# we remove the rows with no_pdb for which we found another row with both uniprot and pdb
no_pdb_df = no_pdb_df[linked_row_found]
print(f"rm {len(linked_row_found)-len(no_pdb_df)} rows from no_pdb because at least 1 other row contained both information")

# rows with no pdb have uniprot ids, we can use uniprot DB to get possible pdb ids
def add_pdb(row):
    row["PDB_wild"] = uniprotid_to_infos(row["uniprot"], pdb_ids=True).get("PDB_wild", "")
    return row

no_pdb_df = no_pdb_df.apply(add_pdb, axis=1)
no_pdb_df = duplicate_multiple_ids_row(no_pdb_df, " ", multiple_col="PDB_wild", unique_id_col="uniprot")


# we add back the curated rows with no_pdb and no_uniprot
df2 = pd.concat([df2, no_pdb_df, no_uniprot_df], ignore_index=True)

print(len(df2))
df2 = df2.drop_duplicates()
df2.drop(columns="index", inplace=True)
print(len(df2))

In [None]:
df2.to_csv("./data/main_dataset/pdb_uniprot_mapping.csv", index=False)

## 2nd step: for all unique uniprot id get all possible infos

In [14]:
# get all infos related to every uniprot id:
df2 = pd.read_csv("./data/main_dataset/pdb_uniprot_db.csv")
all_uniprot = set(df2.uniprot.to_list()) # this gives unique values of uniprot
uniprot_infos = []

for uniprot in tqdm.tqdm(all_uniprot):
        infos = uniprotid_to_infos(uniprot)
        infos["uniprot"] = uniprot
        uniprot_infos.append(infos)

  0%|          | 2/494 [00:00<01:19,  6.19it/s]

exception raised for nan: HTTP Error 400: Bad Request


 68%|██████▊   | 334/494 [01:05<00:25,  6.17it/s]

exception raised for GQ884175: HTTP Error 400: Bad Request


100%|██████████| 494/494 [01:47<00:00,  4.61it/s]


In [22]:
uniprot_infos_df = pd.DataFrame(uniprot_infos)
uniprot_infos_df = uniprot_infos_df[~uniprot_infos_df.uniprot.isna()]

# all numerical columns are float, because there are some NaNs
uniprot_infos_df.head()

Unnamed: 0,uniprot,sequence,length,molWeight,countByFeatureType,chain_start,chain_end,AlphaFoldDB
1,P07170,MSSSESIRMVLIGPPGAGKGTQAPNLQERFHAAHLATGDMLRSQIA...,222.0,24255.0,"{'Initiator methionine': 1, 'Propeptide': 1, '...",3.0,222.0,P07170
2,P30289,MRIPPRLVALAGAAAVAATLIAGPVAAAAPASHAVAASSAASASVK...,141.0,14820.0,"{'Signal': 1, 'Chain': 1, 'Active site': 2, 'D...",37.0,141.0,P30289
3,P0AA04,MFQQEVTITAPNGLHTRPAAQFVKEAKGFTSEITVTSNGKSASAKS...,85.0,9119.0,"{'Chain': 1, 'Domain': 1, 'Active site': 1, 'B...",1.0,85.0,P0AA04
4,P06312,MVLQTQVFISLLLWISGAYGDIVMTQSPDSLAVSLGERATINCKSS...,121.0,13380.0,"{'Signal': 1, 'Chain': 1, 'Domain': 1, 'Region...",21.0,121.0,P06312
5,P00094,MKISLTAATVAALVLAAPAFAGDAAKGEKEFNKCKTCHSIIAPDGT...,137.0,14279.0,"{'Signal': 1, 'Chain': 1, 'Binding site': 4, '...",22.0,137.0,P00094


In [24]:
uniprot_infos_df.to_csv("./data/main_dataset/uniprot_infos.csv", index=False)

check this: (from kaggle notebook)
```
xml_data = json.loads(json.dumps(xmltodict.parse(requests.get(f"https://www.ebi.ac.uk/proteins/api/proteins/pdb:{pdb_id}").content.decode('utf-8'))))
try:
    protein_data = xml_data["uniprot"]["entry"]["protein"]
except:
    return manual_map[pdb_id]

try:    
    return protein_data["recommendedName"]["fullName"]["#text"]
except:
    try:
        return protein_data['recommendedName']['fullName']
    except:
        try:
            return protein_data["submittedName"]["fullName"]["#text"]
        except:
            print("failed: \n", protein_data)
```