In [2]:
import os
import Bio
import pandas as pd

In [3]:
# Directory layout used by this notebook:
#   <wd>/input/<input_id>            -> in_dir
#   <wd>/output/<input_id>           -> out_dir
#   <wd>/output/<input_id>/results   -> results_dir
wd = "."
input_id = "faba_pandda_hits"
in_dir, out_dir = (os.path.join(wd, side, input_id) for side in ("input", "output"))
results_dir = os.path.join(out_dir, "results")

# Ligand table for this input set, read from <results_dir>/<input_id>_lig_data.csv.
lig_data_csv = os.path.join(results_dir, input_id + "_lig_data.csv")
lig_data = pd.read_csv(lig_data_csv)

In [52]:
# Input structures: every entry of in_dir ending in ".pdb" whose name does not contain "clean".
strucs = [
    fname
    for fname in os.listdir(in_dir)
    if fname.endswith(".pdb") and not ("clean" in fname)
]
n_strucs = len(strucs)

In [60]:
# Map each structure file name to the distinct ligand names (label_comp_id)
# that lig_data lists for it; structures with no rows map to an empty list.
struc2ligs = {}
for struc in strucs:
    in_this_struc = lig_data["struc_name"] == struc
    struc2ligs[struc] = lig_data.loc[in_this_struc, "label_comp_id"].unique().tolist()

In [63]:
def dump_pickle(data, f_out):
    """
    Serialises an object with pickle and writes it to a file.

    :param data: object to serialise
    :param f_out: path of the file to write (opened in binary mode and overwritten)
    :returns: None
    """
    with open(f_out, "wb") as handle:
        pickle.Pickler(handle).dump(data)

In [64]:
import pickle

In [68]:
# Display the structure -> ligand-names dictionary built above.
struc2ligs

{'FabA-x0192-pandda-model.pdb': ['LIG'],
 'FabA-x0126-pandda-model.pdb': ['LIG'],
 'FabA-x0233-pandda-model.pdb': ['LIG'],
 'FabA-x0386-pandda-model.pdb': ['LIG'],
 'FabA-x0356-pandda-model.pdb': ['LIG'],
 'FabA-x0536-pandda-model.pdb': ['LIG'],
 '4B0C.pdb': ['C9H'],
 '7BHJ.pdb': ['TQH'],
 'FabA-x0495-pandda-model.pdb': [],
 'FabA-x0571-pandda-model.pdb': [],
 '4B0J.pdb': ['3MQ'],
 'FabA-x0388-pandda-model.pdb': ['LIG'],
 'FabA-x0539-pandda-model.pdb': [],
 'FabA-x0177-pandda-model.pdb': [],
 'FabA-x0431-pandda-model.pdb': ['LIG'],
 'FabA-x0416-pandda-model.pdb': ['LIG'],
 '7BIS.pdb': ['TZQ'],
 'FabA-x0570-pandda-model.pdb': ['LIG'],
 'FabA-x0142-pandda-model.pdb': ['LIG'],
 'FabA-x0503-pandda-model.pdb': ['LIG'],
 '7BK9.pdb': ['U0W'],
 '4B8U.pdb': ['IBK'],
 'FabA-x0119-pandda-model.pdb': ['LIG'],
 'FabA-x0162-pandda-model.pdb': ['LIG'],
 'FabA-x0507-pandda-model.pdb': ['LIG'],
 'FabA-x0266-pandda-model.pdb': ['LIG'],
 '4cl6.pdb': ['7SB'],
 'FabA-x0554-pandda-model.pdb': ['LIG'],
 'Fab

In [51]:
# Inspect the lig_data rows recorded for a single structure (7BHJ.pdb).
lig_data.query('struc_name == "7BHJ.pdb"')

Unnamed: 0,struc_name,label_comp_id,label_asym_id,auth_seq_id
11,7BHJ.pdb,TQH,B,201
12,7BHJ.pdb,TQH,C,201
13,7BHJ.pdb,TQH,D,201
14,7BHJ.pdb,TQH,E,201


In [5]:
def get_swissprot(db=None):
    """
    Retrieves sequences and their data from Swiss-Prot

    :param db: path to a fasta file containing sequences; when omitted, the
        module-level ``swissprot`` path is used (the previous behaviour)
    :type db: str
    :returns: dictionary keyed by accession, each value holding the record
        id ("id"), description ("desc") and sequence ("seq")
    :rtype: dict
    :raises IndexError: if a record id contains no "|" separator, since the
        accession is taken as the second "|"-separated field of the id
        (e.g. "sp|O33877|FABA_PSEAE" -> "O33877")
    """
    if db is None:
        # Backward compatible: existing callers invoke get_swissprot() and
        # rely on the global path defined elsewhere in the notebook.
        db = swissprot
    proteins = {}
    for protein in Bio.SeqIO.parse(db, "fasta"):
        acc = protein.id.split("|")[1]
        # A later record with the same accession overwrites an earlier one.
        proteins[acc] = {
            "id": protein.id,
            "desc": protein.description,
            "seq": protein.seq,
        }
    return proteins

In [16]:
def _chain_alignment_to_map(uniprot_aln, pdb_aln, chain_id, chain_resnums):
    """
    Builds the mapping table for one aligned (UniProt, PDB chain) sequence pair.

    :param uniprot_aln: aligned UniProt sequence, gaps written as "-"
    :param pdb_aln: aligned PDB chain sequence (same length), gaps written as "-"
    :param chain_id: chain identifier stored in the PDB_ChainID column
    :param chain_resnums: residue numbers of the chain, one per non-gap
        character of ``pdb_aln`` and in the same order
    :returns: data frame with one row per alignment column
    :rtype: pandas.DataFrame
    """
    uniprot_res = pd.Series(list(uniprot_aln))
    is_uniprot_res = uniprot_res != "-"
    chain_map = pd.DataFrame({
        # Number UniProt residues by counting non-gap characters only; using
        # the alignment column index would shift every number that follows a
        # gap in the UniProt sequence. Gap columns get <NA>.
        "UniProt_ResNum": is_uniprot_res.cumsum().where(is_uniprot_res).astype("Int64"),
        "UniProt_ResName": uniprot_res,
        "PDB_ResName": list(pdb_aln),
    })
    pdb_index = chain_map.index[chain_map["PDB_ResName"] != "-"]
    # Only columns holding a PDB residue receive a residue number / chain id;
    # the remaining rows are left null by index alignment.
    return chain_map.assign(
        PDB_ResNum=pd.Series(chain_resnums, index=pdb_index),
        PDB_ChainID=pd.Series(chain_id, index=pdb_index),
    )


def retrieve_mapping_from_struc(struc, uniprot_id, struc_dir, sifts_dir, swissprot):
    """
    Maps the residues of every chain of a local PDB file onto a UniProt
    sequence by pairwise alignment, and saves the mapping as a CSV file.

    :param struc: file name of the structure inside ``struc_dir``
    :param uniprot_id: key of the target protein in ``swissprot``
    :param struc_dir: directory containing the structure file
    :param sifts_dir: directory where "prointvar_mapping_<struc>.csv" is
        written (created if missing)
    :param swissprot: dictionary whose ``[uniprot_id]["seq"]`` is the UniProt
        sequence (converted with ``str``)
    :returns: data frame with columns UniProt_ResNum, UniProt_ResName,
        PDB_ResName, PDB_ResNum, PDB_ChainID; one row per PDB residue.
        UniProt_ResNum is <NA> for PDB residues aligned to a gap in UniProt
    :rtype: pandas.DataFrame
    :raises ValueError: if the structure has no ATOM records
    """
    input_struct = os.path.join(struc_dir, struc)
    pdb_structure = PDBXreader(inputfile = input_struct).atoms(format_type = "pdb") # ProIntVar reads the local file

    seq_record = str(swissprot[uniprot_id]["seq"])
    residues = (
        pdb_structure[pdb_structure.group_PDB == "ATOM"]
        [['label_comp_id', 'label_asym_id', 'label_seq_id_full']]
        .drop_duplicates()  # one row per residue instead of one per atom
    )

    maps = []
    for chain_id, chain_residues in residues.groupby('label_asym_id'):
        chain_seq = SeqUtils.seq1(''.join(chain_residues['label_comp_id'].values))
        # Global alignment, gap open -5 / extend -1; only the first returned
        # alignment is used, and its first two items are the aligned sequences.
        best = pairwise2.align.globalxs(seq_record, chain_seq, -5, -1)[0]
        maps.append(
            _chain_alignment_to_map(
                best[0], best[1], chain_id, chain_residues['label_seq_id_full'].values
            )
        )
    if not maps:
        raise ValueError("No ATOM records found in {}".format(input_struct))

    prointvar_mapping = pd.concat(maps)
    prointvar_mapping = prointvar_mapping[['UniProt_ResNum','UniProt_ResName','PDB_ResName','PDB_ResNum','PDB_ChainID']]
    # Keep only alignment columns that hold a PDB residue; copy so the
    # assignments below act on an independent frame, not a view of a slice.
    prointvar_mapping = prointvar_mapping[prointvar_mapping["PDB_ResNum"].notnull()].copy()
    prointvar_mapping["PDB_ResNum"] = prointvar_mapping["PDB_ResNum"].astype(int)
    if prointvar_mapping["UniProt_ResNum"].notna().all():
        # No PDB residue sits on a UniProt gap: return plain integers.
        prointvar_mapping["UniProt_ResNum"] = prointvar_mapping["UniProt_ResNum"].astype("int64")

    os.makedirs(sifts_dir, exist_ok=True)
    prointvar_mapping_csv = os.path.join(sifts_dir, "prointvar_mapping_" + struc + ".csv")
    prointvar_mapping.to_csv(prointvar_mapping_csv, index = False)
    return prointvar_mapping

In [9]:
import Bio.SeqIO

In [10]:
# Path of the Swiss-Prot FASTA file; get_swissprot() called without arguments
# reads this module-level name.
# NOTE(review): absolute cluster path - this cell only runs where that file exists.
swissprot = "/cluster/gjb_lab/2394007/data/all_species/swissprot_rev_Nov21.fasta"

# Parse the whole FASTA into a dictionary keyed by accession.
SP = get_swissprot()

In [65]:
# Save the parsed Swiss-Prot dictionary as a pickle in the same directory as the FASTA file.
dump_pickle(SP, "/cluster/gjb_lab/2394007/data/all_species/swissprot_rev_Nov21.pkl")

In [67]:
# List the data directory to confirm the pickle file was written.
os.listdir("/cluster/gjb_lab/2394007/data/all_species/")

['seqs',
 'uniprot_anks_all_species.xml',
 'accs',
 'uniprot-keyword_ank+reviewed_yes.tab',
 'swissprot_rev_Nov21.fasta.bkp2',
 'uniprot_anks_all_species.tab',
 'swissprot_rev.fasta',
 'swissprot_rev_Nov21.pkl',
 'swissprot_rev_Nov21.fasta',
 'interpro_tsv_files_all_dbs',
 'uniprot_anks_all_species.txt',
 'swissprot_rev_Nov21.fasta.bkp1',
 'alns']

In [11]:
# Spot-check one parsed entry (accession O33877).
SP["O33877"]

{'id': 'sp|O33877|FABA_PSEAE',
 'desc': 'sp|O33877|FABA_PSEAE 3-hydroxydecanoyl-[acyl-carrier-protein] dehydratase OS=Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1) OX=208964 GN=fabA PE=1 SV=1',
 'seq': Seq('MTKQHAFTREDLLRCSRGELFGPGNAQLPAPNMLMIDRIVHISDVGGKYGKGEL...DSF', SingleLetterAlphabet())}

In [21]:
from prointvar.pdbx import PDBXreader
from Bio import SeqUtils
from Bio import pairwise2

In [22]:
# Map one structure onto UniProt O33877: the structure is read from
# <out_dir>/supp_pdbs and the mapping CSV is written to <out_dir>/sifts.
a = retrieve_mapping_from_struc(
    "FabA-x0569-pandda-model.pdb",
    "O33877",
    os.path.join(out_dir, "supp_pdbs"),
    os.path.join(out_dir, "sifts"),
    SP
)

2023-09-04 16:55:41,444 prointvar    INFO     Parsing PDB atoms from lines...
2023-09-04 16:55:42,026 prointvar    INFO     PDBx removed existing hydrogens...
2023-09-04 16:55:42,031 prointvar    INFO     PDBx reset atom numbers...


In [24]:
# Rows where the UniProt and PDB residue letters differ; an empty table means none differ.
a.query('UniProt_ResName != PDB_ResName')

Unnamed: 0,UniProt_ResNum,UniProt_ResName,PDB_ResName,PDB_ResNum,PDB_ChainID


In [26]:
# Re-read the same structure at notebook level so the mapping steps can be run one by one.
supp_pdbs_dir = os.path.join(out_dir, "supp_pdbs")
input_struct = os.path.join(supp_pdbs_dir, "FabA-x0569-pandda-model.pdb")
pdbx_reader = PDBXreader(inputfile=input_struct)
pdb_structure = pdbx_reader.atoms(format_type="pdb")  # ProIntVar reads the local file

2023-09-05 08:30:21,223 prointvar    INFO     Parsing PDB atoms from lines...
2023-09-05 08:30:21,800 prointvar    INFO     PDBx removed existing hydrogens...
2023-09-05 08:30:21,807 prointvar    INFO     PDBx reset atom numbers...


In [29]:
# UniProt accession used as the key into SP for the step-by-step cells below.
uniprot_id = "O33877"

In [31]:
# Step-by-step copy of the alignment stage of retrieve_mapping_from_struc.
seq_record = str(SP[uniprot_id]["seq"])

# One row per residue of the ATOM records, grouped by chain.
atom_residues = pdb_structure.loc[
    pdb_structure.group_PDB == "ATOM",
    ['label_comp_id', 'label_asym_id', 'label_seq_id_full'],
].drop_duplicates()
pps = atom_residues.groupby('label_asym_id')

# list of tuples like: [(chain_id, chain_seq, [chain resnums])]
pdb_chain_seqs = [
    (
        chain_id,
        SeqUtils.seq1(''.join(chain_rows['label_comp_id'].values)),
        chain_rows['label_seq_id_full'].values,
    )
    for chain_id, chain_rows in pps
]

# One list of pairwise alignments (SwissProt sequence vs. chain sequence) per chain.
alignments = [
    pairwise2.align.globalxs(seq_record, one_letter_seq, -5, -1)
    for _, one_letter_seq, _ in pdb_chain_seqs
]

In [39]:
# Step-by-step copy of the mapping loop: one table per chain, one row per
# column of the first alignment returned for that chain.
maps = []
for pdb_chain_seq, alignment in zip(pdb_chain_seqs, alignments):
    uniprot_aln = pd.Series(list(alignment[0][0]))  # aligned UniProt sequence
    is_uniprot_res = uniprot_aln != "-"
    PDB_UniProt_map = pd.DataFrame({
        # Count non-gap UniProt characters rather than alignment columns, so
        # numbers stay correct after a gap in the UniProt sequence; gap
        # columns get <NA>.
        'UniProt_ResNum': is_uniprot_res.cumsum().where(is_uniprot_res).astype("Int64"),
        'PDB_ResName': list(alignment[0][1]),  # aligned PDB chain sequence
        'UniProt_ResName': uniprot_aln,
    })
    PDB_index = PDB_UniProt_map.query('PDB_ResName != "-"').index
    PDB_UniProt_map = PDB_UniProt_map.assign(PDB_ResNum = pd.Series(pdb_chain_seq[2], index=PDB_index)) # adds PDB_ResNum column
    PDB_UniProt_map = PDB_UniProt_map.assign(PDB_ChainID = pd.Series(pdb_chain_seq[0], index=PDB_index)) # adds PDB_ChainId column
    maps.append(PDB_UniProt_map)

In [41]:
# Show the mapping table of the first chain.
maps[0]

Unnamed: 0,UniProt_ResNum,PDB_ResName,UniProt_ResName,PDB_ResNum,PDB_ChainID
0,1,-,M,,
1,2,T,T,2,A
2,3,K,K,3,A
3,4,Q,Q,4,A
4,5,H,H,5,A
5,6,A,A,6,A
6,7,F,F,7,A
7,8,T,T,8,A
8,9,R,R,9,A
9,10,E,E,10,A


In [None]:


# Step-by-step copy of the final stage of retrieve_mapping_from_struc, run on
# the `maps` list built above. The original cell ended in a top-level `return`
# (a SyntaxError outside a function), used an undefined `sifts_dir`, and named
# the CSV after `struc`, which at this point is a leftover loop variable.
mapped_struc = "FabA-x0569-pandda-model.pdb"  # the structure read into pdb_structure above
sifts_dir = os.path.join(out_dir, "sifts")  # same directory passed to retrieve_mapping_from_struc above

prointvar_mapping = pd.concat(maps)
prointvar_mapping = prointvar_mapping[['UniProt_ResNum','UniProt_ResName','PDB_ResName','PDB_ResNum','PDB_ChainID']]
# Keep only rows that hold a PDB residue; copy so the assignment below does
# not act on a view of the concatenated frame.
prointvar_mapping = prointvar_mapping[~prointvar_mapping.PDB_ResNum.isnull()].copy()
prointvar_mapping["PDB_ResNum"] = prointvar_mapping["PDB_ResNum"].astype(int)
prointvar_mapping_csv = os.path.join(sifts_dir, "prointvar_mapping_" + mapped_struc + ".csv")
prointvar_mapping.to_csv(prointvar_mapping_csv, index = False)
prointvar_mapping

In [47]:
# Number of distinct name stems (text before the first ".") among the entries of in_dir.
len({entry.split(".")[0] for entry in os.listdir(in_dir)})

46