Take pdb_uniprot_db.csv and fill in the UniProt ids and related information where they are missing.

1st step: merge rows together (only one row for each PDB/UniProt pair)

In [2]:
import pandas as pd
import numpy as np
import json
import pypdb
import urllib.request

In [None]:
# Column names: the two identifier columns first, followed by the per-entry
# fields (the same keys as the dict returned by uniprotid_to_infos).
COLUMNS = [
    "uniprot",
    "PDB_wild",
    "sequence",
    "length",
    "molWeight",
    "countByFeatureType",
    "chain_start",
    "chain_end",
]

In [1]:
def seq_to_pdb(seq):
    """Look up the PDB id matching a protein sequence.

    Runs a ``pypdb.Query`` sequence search (``return_type="polymer_entity"``)
    against the RCSB Protein Data Bank API and scans the returned
    ``result_set``.

    Args:
        seq: protein sequence used as the search term.

    Returns:
        The part of the identifier before the underscore for the first hit
        whose score is exactly 1.0 and whose suffix after the underscore is
        ``"1"``. Returns ``""`` when there is no such hit or when the search
        returned nothing.
    """
    q = pypdb.Query(seq, query_type="sequence", return_type="polymer_entity")

    # search() can come back empty/None instead of a dict (e.g. when the
    # retrieval failed); treat that the same as "no match".
    response = q.search()
    if not response:
        return ""

    for result in response.get("result_set", []):
        # rpartition never raises, whatever the number of underscores is
        result_id, _, chain = result["identifier"].rpartition("_")
        if result["score"] == 1.0 and chain == "1":
            return result_id

    return ""

def uniprotid_to_infos(uniprotid, row):
    """Fetch sequence information for one UniProtKB entry.

    Downloads ``https://rest.uniprot.org/uniprotkb/<uniprotid>.json`` and
    extracts a few fields from the returned JSON document.

    Args:
        uniprotid: accession of the entry to fetch (e.g. ``"P00282"``).
        row: not used by this function; kept so existing callers keep working.

    Returns:
        A dict with the keys ``sequence``, ``length``, ``molWeight``,
        ``countByFeatureType``, ``chain_start`` and ``chain_end``. A value is
        ``None`` when the corresponding field is absent from the entry;
        ``chain_start``/``chain_end`` come from the first feature whose type
        is ``"Chain"``.

    Raises:
        Errors raised by ``urllib.request.urlopen`` or ``json.load`` are not
        caught here and propagate to the caller.
    """
    # Query the requested entry (the accession is part of the URL path).
    entry_url = f"https://rest.uniprot.org/uniprotkb/{uniprotid}.json"
    with urllib.request.urlopen(entry_url) as url:
        data = json.load(url)

    sequence = data.get("sequence", {})
    features = data.get("features", [])
    chain_location = next(
        (x for x in features if x.get("type") == "Chain"), {}
    ).get("location", {})
    return {
        "sequence": sequence.get("value"),
        "length": sequence.get("length"),
        "molWeight": sequence.get("molWeight"),
        "countByFeatureType": data.get("extraAttributes", {}).get("countByFeatureType"),
        # entries without a Chain feature yield None instead of raising
        "chain_start": chain_location.get("start", {}).get("value"),
        "chain_end": chain_location.get("end", {}).get("value"),
    }

In [46]:
# Load the raw mapping, keep only the two id columns and upper-case the PDB
# ids. Values go through str() first, so a missing PDB id becomes the
# literal string "NAN".
df = (
    pd.read_csv("./data/main_dataset/pdb_uniprot_db.csv")
    .loc[:, ["uniprot", "PDB_wild"]]
    .assign(PDB_wild=lambda frame: frame["PDB_wild"].map(lambda pdb_id: str(pdb_id).upper()))
)
df.head(2)

Unnamed: 0,uniprot,PDB_wild
0,,1AMQ
1,,1ARR


In [82]:
def agg_function(l):
    """Join the distinct, non-missing values of ``l`` into one string.

    Args:
        l: iterable of values (e.g. the ``uniprot`` values of one group).

    Returns:
        The string forms of the values, de-duplicated and separated by single
        spaces, in order of first appearance. Falsy values (``None``, ``""``)
        and values whose string form is ``"nan"`` are skipped; the result is
        ``""`` when nothing is left.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # joined string is the same on every run (set order is not).
    unique_l = dict.fromkeys(str(x) for x in l if (x and str(x) != 'nan'))
    return " ".join(unique_l)

# One row per PDB id: the uniprot ids of each group are merged by agg_function
# into a single space-separated string.
df2 = df.groupby("PDB_wild", as_index=False).agg({"uniprot": agg_function})
print(len(df2))


#### MULTIPLE uniprot ids ####
# (X,P00644 P00645) => (X, P00644), (X, P00645)
# add rows when there are multiple uniprot ids for 1 pdb id
multiple_uniprot = df2["uniprot"].str.contains(" ", regex=False)
multi_uniprot_rows = df2.loc[multiple_uniprot]
# Collect every expanded row first and build the frame once, rather than
# growing a DataFrame with pd.concat on each iteration.
expanded_uniprot_rows = [
    # "NAN" is the upper-cased str() of a missing PDB id: store "" instead
    ("" if pdb_id == "NAN" else pdb_id, uniprot_id)
    for pdb_id, uniprot_ids in zip(multi_uniprot_rows["PDB_wild"], multi_uniprot_rows["uniprot"])
    for uniprot_id in uniprot_ids.split(" ")
]
multiple_uniprot_df = pd.DataFrame(expanded_uniprot_rows, columns=["PDB_wild", "uniprot"])

df2 = df2.loc[~multiple_uniprot]
df2 = pd.concat([df2, multiple_uniprot_df], ignore_index=True)
print(len(df2))

#### MULTIPLE pdb ids ####
# (1SVX|3MBP,P0AEX9) => (1SVX,P0AEX9), (3MBP, P0AEX9)
# Every PDB_wild is split on a literal "|"; ids without "|" come out unchanged
# and the row order is kept. "|" must not be used as a str.contains() regex
# pattern: it is an empty alternation there and matches every row.
expanded_pdb_rows = [
    (pdb_id, uniprot_id)
    for pdb_ids, uniprot_id in zip(df2["PDB_wild"], df2["uniprot"])
    for pdb_id in pdb_ids.split("|")
]
df2 = pd.DataFrame(expanded_pdb_rows, columns=["PDB_wild", "uniprot"])
print(len(df2))

#### Warning ####
# A weird bug makes it so 1E21 is converted to a number (1.00E+21), which is
# therefore different from the row with PDB_wild = "1E21".
# Thus we remove this first row (it already exists on line 121).
# NOTE(review): this is positional and relies on the mangled row being the
# first row of df2 -- confirm whenever the input file changes.
df2 = df2.iloc[1:, :]

#### Remove occurrences where there is a PDB with no uniprot
# although another row contains the PDB & the uniprot
# (and vice-versa) ####
# NOTE(review): a PDB_wild of "NAN" that was not expanded above is not equal
# to "" and is therefore not treated as a missing PDB here -- confirm.
no_uniprot = df2.uniprot.eq("")
no_uniprot_df = df2.loc[no_uniprot]
df2 = df2.loc[~no_uniprot]

no_pdb = df2.PDB_wild.eq("")
no_pdb_df = df2.loc[no_pdb]
df2 = df2.loc[~no_pdb]

# df2 now consists of only rows with both pdb AND uniprot
# uniprot:
# we remove the rows with no uniprot for which another row has both uniprot and pdb
has_complete_row = no_uniprot_df.PDB_wild.isin(df2.PDB_wild)
no_uniprot_df = no_uniprot_df.loc[~has_complete_row]
print(f"rm {int(has_complete_row.sum())} rows from no_uniprot because at least 1 other row contained both information")
# pdb:
# we remove the rows with no pdb for which another row has both uniprot and pdb
has_complete_row = no_pdb_df.uniprot.isin(df2.uniprot)
no_pdb_df = no_pdb_df.loc[~has_complete_row]
print(f"rm {int(has_complete_row.sum())} rows from no_pdb because at least 1 other row contained both information")

# we add back the curated rows with no_pdb and no_uniprot
# (no reset_index on the partial frames: it would add a stray "index" column)
df2 = pd.concat([df2, no_pdb_df, no_uniprot_df], ignore_index=True)

# de-duplicate BEFORE saving so the written file has one row per pdb/uniprot pair
print(len(df2))
df2 = df2.drop_duplicates()
print(len(df2))
df2.to_csv("./data/main_dataset/filled_pdb_uniprot_db.csv", index=False)

# df2.head()

649
730
794
rm 13 rows from no_uniprot because at least 1 other row contained both information
rm 8 rows from no_pdb because at least 1 other row contained both information
772
712


In [44]:
# Split each uniprot string on spaces, lay the pieces out one column per
# position, then stack them into a single Series indexed by (row, position).
df3 = pd.DataFrame(
    df2["uniprot"].str.split(" ").tolist()
).stack()
df3.head()

0  0    P00918
1  0    P0A9D2
2  0    P06241
3  0    P0AEG4
   1    P00644
dtype: object