In [None]:
import requests
import pandas as pd
from engens.core.CrystalUtils import *
from crystal_utils_queries import *
from crystal_utils_alignments import *

## PDB ID -> PDB_ENTITIES -> UNIPROT IDs

API resources: https://data.rcsb.org/index.html

## STEP 0 - define input PDB IDs

In [None]:
# Whitespace-separated PDB entry IDs to process. The backslash continuations
# join everything into one string, which a later cell splits and upper-cases.
pdb_ids =  "\
2rd0 3hhm \
3hiz 4jps \
4a55 4l1b \
4l23 4l2y \
4ovu 4ovv \
4waf 4ykn \
4zop 5fi4 \
5itd 5sw8 \
5swg 5swo \
5swp 5swr \
5swt 5sx8 \
5sx9 5sxa \
5sxb 5sxc \
5sxd 5sxe \
5sxf 5sxi \
5sxj 5sxk \
5uk8 5ukj \
5ul1 5xgh \
5xgi 5xgj \
6nct \
5dxh 2y3a \
5dxu 5m6u \
5t8f 5ubt \
5vlr 6g6w \
6pyr 6pyu"


In [None]:
# Turn the whitespace-separated ID string into a list of upper-case IDs.
# The isinstance guard keeps this cell idempotent: on a re-run `pdb_ids` is
# already a list, and calling .split() on it would raise AttributeError.
if isinstance(pdb_ids, str):
    pdb_ids = pdb_ids.split()
pdb_ids = [t.upper() for t in pdb_ids]

In [None]:
len(pdb_ids)

## STEP 1 - QUERY RCSB to get all details on the PDB IDs

In [None]:
results_df = rscb_entities_from_entries(pdb_ids)
results_df

In [None]:
entity_ids = list(results_df["entity_id"].unique())

In [None]:
entity_instance_connection_df = rscb_polymer_chains_info(entity_ids)

In [None]:
# An entity can list several chains in "asym_ids"; keep only the first one so
# that each entity is represented by a single chain.
entity_instance_connection_df["first_asym_id"] = entity_instance_connection_df["asym_ids"].apply(lambda x: x[0])
# Left merge on entity_id: attach the chain columns to every row of results_df.
results_df = results_df.merge(entity_instance_connection_df, how="left", on="entity_id")

In [None]:
results_df["instance_id"] = results_df["pdb_id"]+"."+results_df["first_asym_id"]
with pd.option_context('display.max_rows', None, 'display.max_columns', None, "expand_frame_repr", False, "display.float_format", '${:,.2f}'.format):
    display(results_df)

In [None]:
instance_ids = list(results_df["instance_id"].unique())

In [None]:
entity_instance_mapping_df = rscb_get_author_instance_info(instance_ids)
entity_instance_mapping_df

In [None]:
# Look up the entity_id of each instance through a left merge on instance_id.
# NOTE(review): the merged frame gets a fresh 0..n-1 index and the column
# assignment aligns on index, so this presumably relies on
# entity_instance_mapping_df having a default index and on instance_id being
# unique in results_df - confirm.
entity_instance_mapping_df["entity_id"] = entity_instance_mapping_df.merge(results_df, how="left", on="instance_id").entity_id
entity_instance_mapping_df

## STEP 2 - QUERY UNIPROT to get accession details

In [None]:
uniprot_ids = results_df["accession"].unique()
uniprot_ids

In [None]:
     
def uniprot_get_details(uniprot_ids):
    """Fetch basic UniProt details for each accession from the EBI Proteins API.

    Parameters
    ----------
    uniprot_ids : iterable of str
        UniProt accession IDs; each one is appended to the API base URL.

    Returns
    -------
    pandas.DataFrame or None
        One row per accession with the columns ``accession_id``, ``id``,
        ``full_name`` and ``seq``. ``full_name`` is ``None`` for a record that
        has no ``recommendedName``. ``None`` is returned (after printing the
        HTTP status) as soon as one request does not answer with status 200.
    """
    uniprot_details = {"accession_id": [],
                       "id": [],
                       "full_name": [],
                       "seq": []}
    uniprot_accession_url = "https://www.ebi.ac.uk/proteins/api/proteins/"

    for uni_id in uniprot_ids:
        accession_query = uniprot_accession_url + uni_id
        # A timeout keeps one stalled request from hanging the whole notebook.
        result_uniprot_details = requests.get(accession_query, timeout=30)
        if result_uniprot_details.status_code != 200:
            # status_code is an int: format it, concatenating it to a str
            # raises TypeError before the caller ever sees the None result.
            print("Uniprot query failed: response {}".format(result_uniprot_details.status_code))
            return None

        res_json = result_uniprot_details.json()
        # Records without a recommendedName get None instead of a KeyError.
        recommended_name = res_json.get("protein", {}).get("recommendedName")
        full_name = recommended_name["fullName"]["value"] if recommended_name else None

        uniprot_details["accession_id"].append(res_json["accession"])
        uniprot_details["id"].append(res_json["id"])
        uniprot_details["full_name"].append(full_name)
        uniprot_details["seq"].append(res_json["sequence"]["sequence"])

    return pd.DataFrame(uniprot_details)

In [None]:
uniprot_details = uniprot_get_details(uniprot_ids)

In [None]:
results_df_uniprot = results_df.merge(uniprot_details, how="left", left_on="accession", right_on="accession_id")

In [None]:
# Count how many rows of results_df_uniprot reference each UniProt accession.
results_access_grouped = results_df_uniprot.groupby("accession").count().reset_index()
# rename() returns a new frame, so adding a column below does not write into a
# slice of results_access_grouped (no SettingWithCopyWarning).
uniprot_counts = results_access_grouped[["accession", "pdb_id"]].rename(columns={"pdb_id": "count"})
# UniProt entry name taken from the first row of each accession, looked up
# through one table instead of re-filtering the frame once per accession.
first_id_by_accession = results_df_uniprot.drop_duplicates("accession").set_index("accession")["id"]
uniprot_counts["id"] = uniprot_counts["accession"].map(first_id_by_accession)
uniprot_counts

## STEP 3 - SINGLE OUT A FEW META UNIPROTs

In [None]:
# Accessions whose UniProt entry name contains the keyword of the first subunit.
keyword = "PK3C"
# na=False counts a missing entry name as "no match" instead of yielding a NaN
# mask; .copy() detaches the selection so that the later cell which adds a
# column to it does not write into a slice of uniprot_counts.
subunit1_uniprot_details = uniprot_counts[uniprot_counts["id"].str.contains(keyword, na=False)].copy()
subunit1_uniprot_details

In [None]:
# Accessions whose UniProt entry name contains the keyword of the second subunit.
keyword = "P85"
# na=False counts a missing entry name as "no match" instead of yielding a NaN
# mask; .copy() detaches the selection so that the later cell which adds a
# column to it does not write into a slice of uniprot_counts.
subunit2_uniprot_details = uniprot_counts[uniprot_counts["id"].str.contains(keyword, na=False)].copy()
subunit2_uniprot_details

In [None]:
meta_uniprots = ["P42336", "P27986"]

In [None]:
# Tag every accession of a subunit with the meta UniProt accession it is
# grouped under. assign() broadcasts the constant and returns a new frame, so
# nothing is written into a filtered selection (no SettingWithCopyWarning).
subunit1_uniprot_details = subunit1_uniprot_details.assign(meta_uniprot="P42336")
subunit2_uniprot_details = subunit2_uniprot_details.assign(meta_uniprot="P27986")

# Source accession -> meta accession, first subunit followed by the second.
accessions = list(subunit1_uniprot_details["accession"])
accessions.extend(subunit2_uniprot_details["accession"])
meta_accessions = list(subunit1_uniprot_details["meta_uniprot"])
meta_accessions.extend(subunit2_uniprot_details["meta_uniprot"])
meta_uniprot_mapping = dict(zip(accessions, meta_accessions))
subunit1_uniprot_details

In [None]:
meta_uniprot_mapping

In [None]:
# Two-column lookup table: one row per (source accession, meta accession) pair.
meta_uniprot_mapping_df = pd.DataFrame(
    list(meta_uniprot_mapping.items()),
    columns=["uniprot_src", "uniprot_meta"],
)
meta_uniprot_mapping_df

In [None]:
results_df = results_df.merge(meta_uniprot_mapping_df, left_on="accession", right_on="uniprot_src", how="left")

In [None]:
results_df

## Download structures and fix them

In [None]:
# Make the directory to store the files in.
# Change this to any name you'd like.
pdir = "./PI3Ks_pdbs/"
Path(pdir).mkdir(parents=True, exist_ok=True)

# -----------------STEP 1.1 - download PDBs------------------------ #
# Initialize CrystalUtils - it will automatically (if no file_names are provided):
# Download PDBs
# Fix them
crystal_utils = CrystalUtils(pdb_codes = pdb_ids, dst_folder = pdir)

In [None]:
results_df

In [None]:

# NOTE(review): this rebinds pdir from the relative "./PI3Ks_pdbs/" passed to
# CrystalUtils to a machine-specific absolute path - confirm that both refer to
# the same directory.
pdir = "/mnt/PI3K-experiments/crystal-structures/PI3Ks_pdbs/"
# Path of each entry's "<pdb_id>_fixed.pdb" file below pdir/structure_output/.
results_df["file_loc"] = results_df["pdb_id"].apply(lambda x:  pdir+"structure_output/"+x+"_fixed.pdb")

In [None]:
# prepare files for mTM-align
# For every meta UniProt group: write one PDB file per instance restricted to
# its chain, then list those files in a text file "mTM_input_<meta_uniprot>".

for meta_uniprot in results_df.uniprot_meta.unique():
    print(meta_uniprot)
    meta_uni_res = results_df[results_df.uniprot_meta == meta_uniprot]
    files_to_align = []
    # iterrows() yields (index, Series) pairs, hence the row[1] accesses below.
    for row in meta_uni_res.iterrows():
        print(row[1].instance_id)
        chain_id = row[1].first_asym_id
        file_name = row[1].file_loc
        new_file = pdir+"structure_output/"+row[1].instance_id+".pdb"
        # NOTE(review): pdb_keepcoord is given no input file while pdb_selchain
        # reads file_name itself, so the first pipeline stage looks like a
        # no-op - confirm the intended order of the two tools.
        cmd = "pdb_keepcoord | pdb_selchain -"+chain_id+" "+file_name+ " > " + new_file
        system(cmd)
        # add_missing_atoms.py gets the same path as input and output, i.e. the
        # chain file is rewritten in place.
        system("/miniconda3/bin/python ./add_missing_atoms.py -i {} -o {}".format(new_file, new_file))
        files_to_align.append(new_file)
    # One input path per line.
    with open("mTM_input_{}".format(meta_uniprot), "w") as file:
        file.write("\n".join(files_to_align))

In [None]:
# extract residue index maps
# For every "<instance_id>-<meta_uniprot>" key:
#   file_residue_maps      residue number in the file -> 0-based position in the chain
#   file_residue_maps_rev  0-based position in the chain -> residue number in the file
file_residue_maps = {}
file_residue_maps_rev = {}

# One parser instance is enough for all files.
parser = Bio.PDB.PDBParser()

for _, row in results_df.iterrows():
    meta_uniprot = row.uniprot_meta
    # Rows without a meta UniProt (NaN after the left merge) cannot form a
    # string key; skip them instead of failing on str + float.
    if pd.isna(meta_uniprot):
        continue

    file_loc = row.file_loc
    pdb_id = row.pdb_id
    chain_id = row.first_asym_id
    instance_id = row.instance_id
    map_key = instance_id+"-"+meta_uniprot
    struct_tmp = parser.get_structure(pdb_id, file_loc)

    for chain in struct_tmp.get_chains():
        if chain.id == chain_id:
            # res.id[1] is the middle field of Bio.PDB's residue id tuple.
            # NOTE(review): residues that differ only in another field of that
            # tuple would share a key in the forward map - confirm this cannot
            # occur in the fixed files.
            file_residues_ids = [res.id[1] for res in chain.get_residues()]
            zero_indexed_ids = list(range(len(file_residues_ids)))
            file_residue_maps[map_key] = dict(zip(file_residues_ids, zero_indexed_ids))
            file_residue_maps_rev[map_key] = dict(zip(zero_indexed_ids, file_residues_ids))
        

In [None]:
# align with mTM

for meta_uniprot in results_df.uniprot_meta.unique():
    output_dir = "mTM-res-{}".format(meta_uniprot)
    # Run the alignment only when this group has no cached result yet.
    if not os.path.exists(output_dir+"/result.fasta"):
        print(meta_uniprot)
        # The whole call is active again: with only its first line commented
        # out, the continuation lines were orphaned indented statements and
        # the cell failed with IndentationError.
        system("/mTM-align/src/mTM-align -i "+"mTM_input_{}".format(meta_uniprot)
               +" -outdir mTM-res-{}".format(meta_uniprot)
               +" > mTM-{}-progress.txt".format(meta_uniprot))

In [None]:
# visualize alignments

# Show the mTM-align result of every meta UniProt group; MCS starts out empty
# and is filled in by a later cell.
MCS = dict()
for meta_uniprot in results_df.uniprot_meta.unique():
    fasta_res = "mTM-res-{}/result.fasta".format(meta_uniprot)
    msa_viewer = CrystalUtils(pdb_codes = [], dst_folder = ".")
    msa_viewer.visualizeMSA(fasta_res)

In [None]:
# read in the alignments
# MSTA[meta_uniprot] maps every sequence name found in that group's
# result.fasta to its aligned sequence (gap characters included) as one string.
MSTA = {}
for meta_uniprot in results_df.uniprot_meta.unique():
    fasta_res = "mTM-res-{}".format(meta_uniprot)+"/result.fasta"

    seq_chunks = {}
    name = None
    with open(fasta_res, "r") as file:
        for line in file:
            if line.startswith(">"):
                header = line[1:].strip()
                # Headers are presumably "<instance_id>.pdb" (later cells look
                # the names up by instance_id). Remove that suffix explicitly
                # rather than slicing off a fixed number of characters, which
                # breaks on "\r\n" endings or a last line without newline.
                name = header[:-4] if header.endswith(".pdb") else header
                print(name)
                # Register the name right away so that a header without any
                # sequence line yields "" instead of a KeyError.
                seq_chunks.setdefault(name, [])
            elif name is not None:
                # Sequence line: collect it under the current header.
                seq_chunks[name].append(line.strip())

    seq_aligned = {seq_name: "".join(chunks) for seq_name, chunks in seq_chunks.items()}
    MSTA[meta_uniprot] = seq_aligned

In [None]:
# create MCS from above alignment

#converts the string with gaps into list of positions of aa-s
def alignment_string_to_numbers(alignment):
    """Number the letters of an aligned sequence, marking everything else as -1.

    Parameters
    ----------
    alignment : str
        Aligned sequence, e.g. ``"A-BC"``.

    Returns
    -------
    list of int
        One entry per character of ``alignment``: the 0-based running count of
        alphabetic characters seen so far for a letter, ``-1`` for any other
        character (such as a gap).
    """
    positions = []
    letters_seen = 0
    for symbol in alignment:
        if not symbol.isalpha():
            positions.append(-1)
            continue
        positions.append(letters_seen)
        letters_seen += 1
    return positions


# MCS[meta_uniprot]: intervals (as produced by intervals_extract) over the
# alignment columns that are not reported as gaps for any sequence of the group.
MCS = {}

# Per "<sequence name>-<meta_uniprot>" key:
#   maps          residue position in the sequence -> alignment column
#                 (all gap columns share the key -1, so the last one wins)
#   maps_reverse  alignment column -> residue position, or -1 for a gap
maps = {}
maps_reverse = {}
for meta_uniprot in results_df.uniprot_meta.unique():

    msta = MSTA[meta_uniprot]
    gaps = set()
    # Collected explicitly so that the MCS no longer depends on `all_num`
    # leaking out of the inner loop (a NameError, or a stale value from the
    # previous group, when a group has no sequences).
    alignment_columns = set()
    for instance_id, seq in msta.items():

        seq_num = alignment_string_to_numbers(seq)
        all_num = list(range(len(seq)))
        maps[instance_id+"-"+meta_uniprot] = dict(zip(seq_num, all_num))
        maps_reverse[instance_id+"-"+meta_uniprot] = dict(zip(all_num, seq_num))
        # find_gaps presumably returns the alignment columns holding a gap - confirm.
        gaps.update(find_gaps(seq))
        alignment_columns.update(all_num)

    MCS[meta_uniprot] = list(intervals_extract(alignment_columns-gaps))
    

In [None]:
# visualize alignment with MCS regions
# Same view as before, now also passing the MCS intervals of each group.
for meta_uniprot in results_df.uniprot_meta.unique():
    fasta_res = "mTM-res-{}/result.fasta".format(meta_uniprot)
    msa_viewer = CrystalUtils(pdb_codes = [], dst_folder = ".")
    msa_viewer.visualizeMSA(fasta_res, MCS[meta_uniprot])

In [None]:
results_df

In [None]:
def rename_region_chain_residues(pdb_id,
                                 file_name,
                                 old_chain,
                                 old_region,
                                 new_chain,
                                 new_region,
                                 out_file):
    """Rename one chain and renumber its residues onto a new region.

    The structure in ``file_name`` is parsed, chain ``old_chain`` is renamed to
    ``new_chain`` and its residue numbers ``old_region[0]..old_region[1]`` are
    mapped one-to-one, in order, onto ``new_region[0]..new_region[1]``. The
    result is saved to ``out_file``.

    Parameters
    ----------
    pdb_id : str
        Structure identifier handed to the PDB parser.
    file_name : str
        Path of the PDB file to read.
    old_chain : str
        ID of the chain to process.
    old_region : sequence of two ints
        First and last (inclusive) residue number currently used in the file.
    new_chain : str
        Chain ID to assign.
    new_region : sequence of two ints
        First and last (inclusive) residue number to assign.
    out_file : str
        Path the modified structure is written to.

    Returns
    -------
    None
        Also when nothing is written: if ``old_chain`` is not in the file, or
        one of its residues lies outside ``old_region``, a message is printed
        and the function returns before saving.
    """
    parser = Bio.PDB.PDBParser()
    struct_tmp = parser.get_structure(pdb_id, file_name)

    # Old residue number -> new residue number; zip() stops at the shorter range.
    old_residues = range(old_region[0], old_region[1]+1)
    new_residues = range(new_region[0], new_region[1]+1)
    residue_map = dict(zip(old_residues, new_residues))

    chain_ids = [chain.id for chain in struct_tmp.get_chains()]
    if old_chain not in chain_ids:
        # Report the requested chain. The comprehension variable above is not
        # visible here, so `chain.id` raised NameError (or printed a stale
        # global left over from another cell).
        print("Chain {} of file {} not given a mapping.".format(old_chain, file_name))
        return None

    for chain in struct_tmp.get_chains():
        if chain.id != old_chain:
            continue
        chain.id = new_chain
        # NOTE(review): residues are renumbered one by one in place; if the old
        # and new ranges overlap, a new number may still belong to a residue
        # that has not been renumbered yet - confirm Bio.PDB accepts that.
        for residue in chain.get_residues():
            if residue.id[1] not in residue_map:
                print("Residue {} of chain {} of file {} not given a mapping.".format(residue.id[1],
                                                                                      chain.id,
                                                                                      file_name))
                return None
            # Only the residue number changes; the first and last id fields are kept.
            residue.id = (residue.id[0],
                          residue_map[residue.id[1]],
                          residue.id[-1])

    io = Bio.PDB.PDBIO()
    io.set_structure(struct_tmp)
    io.save(out_file)
            
            
    
    

In [None]:
# One output chain letter per meta UniProt, handed out in order of first
# appearance in results_df.
chains_str = "ABCDEFGHIJKLMNOPQRSTUVWYXZ"
chain_map = {
    meta_uniprot: chains_str[position]
    for position, meta_uniprot in enumerate(results_df["uniprot_meta"].unique())
}
chain_map

In [None]:
results_df[results_df.pdb_id == "4YKN"]

In [None]:
 maps_reverse["4YKN.A"+"-"+"P27986"]

In [None]:
 file_residue_maps["4YKN.A"+"-"+"P42336"]

In [None]:
# extract MCS from each file
# For every entry: cut each MCS region out of the chain of each meta UniProt,
# rename and renumber it onto the shared alignment numbering, and concatenate
# the ATOM records of all pieces into "<pdb_id>_mcs.pdb".
file_name_prefix = "./PI3Ks_pdbs/structure_output/"

for pdb_id in results_df["pdb_id"].unique():
    file_name = file_name_prefix+pdb_id+"_fixed.pdb"
    pdb_id_res = results_df[results_df.pdb_id == pdb_id]
    out_file = file_name_prefix+pdb_id+"_mcs.pdb"
    tmp_files = []

    for meta_uniprot in results_df["uniprot_meta"].unique():
        subunit_rows = pdb_id_res[pdb_id_res.uniprot_meta == meta_uniprot]
        # An entry without this subunit (or a NaN meta UniProt, which never
        # compares equal) has no row here; report it instead of failing with
        # IndexError on .iloc[0].
        if subunit_rows.empty:
            print("No entity of {} maps to {} - skipping.".format(pdb_id, meta_uniprot))
            continue
        row = subunit_rows.iloc[0]
        instance_id = row["instance_id"]
        chain = row["first_asym_id"]
        new_chain = chain_map[meta_uniprot]
        seq_map_rev = maps_reverse[instance_id+"-"+meta_uniprot]
        resid_map_res = file_residue_maps_rev[instance_id+"-"+meta_uniprot]
        mcs = MCS[meta_uniprot]

        for region_idx, mcs_region in enumerate(mcs):
            # alignment columns -> 0-based positions in the chain -> residue numbers in the file
            file_region_index = [seq_map_rev[mcs_region[0]], seq_map_rev[mcs_region[1]]]
            file_region_resid = [resid_map_res[file_region_index[0]], resid_map_res[file_region_index[1]]]

            cont_regions_string = "{}:{}".format(file_region_resid[0], file_region_resid[1])
            tmp_file = file_name_prefix+pdb_id+"_mcs_"+instance_id+"-"+meta_uniprot+"region"+str(region_idx)+".pdb"
            # get chain, get residues
            # NOTE(review): pdb_keepcoord is given no input file while
            # pdb_selchain reads file_name itself, so the first stage looks
            # like a no-op - confirm the intended order.
            cmd = "pdb_keepcoord | pdb_selchain -"+chain+" "+file_name+ " | pdb_selres -"+cont_regions_string+" | head -n-2 > "\
                + tmp_file
            print(cmd)
            system(cmd)
            # The region file is rewritten in place with the new chain ID and numbering.
            rename_region_chain_residues(pdb_id = pdb_id,
                                         file_name = tmp_file,
                                         old_chain = chain,
                                         old_region = file_region_resid,
                                         new_chain = new_chain,
                                         new_region = mcs_region,
                                         out_file = tmp_file)
            tmp_files.append(tmp_file)

    # Without any region file, "cat" would wait on stdin and "rm" would fail.
    if not tmp_files:
        continue

    # The pattern is quoted so that the shell cannot glob-expand it.
    system("cat {} | grep '^ATOM' > {}".format(" ".join(tmp_files), out_file))
    system("rm {}".format(" ".join(tmp_files)))


In [None]:
# Write a backbone-only "<pdb_id>_bb.pdb" next to every "<pdb_id>_mcs.pdb".
output_traj = "bb_traj.xtc"
file_name_prefix = "./PI3Ks_pdbs/structure_output/"
# Entries whose MCS file is used as it is, without the add_missing_atoms.py pass.
skip_missing_atoms_fix = {"4YKN"}
trajs = []
tmp_top = None
for pdb_id in results_df["pdb_id"].unique():
    print(pdb_id)
    file_name = file_name_prefix+pdb_id+"_mcs.pdb"
    file_name2 = file_name_prefix+pdb_id+"_bb.pdb"
    if pdb_id not in skip_missing_atoms_fix:
        # Same path for input and output: the MCS file is rewritten in place.
        system("/miniconda3/bin/python ./add_missing_atoms.py -i {} -o {}".format(file_name, file_name))
    # Load once for the topology, then again restricted to the backbone atoms.
    tmp_top = mdtraj.load(file_name).top
    atom_sel = tmp_top.select("backbone")
    bb_loaded = mdtraj.load(file_name, atom_indices= atom_sel)
    bb_loaded.save(file_name2)

In [None]:
# Combine every *_bb.pdb file into the single trajectory file output_traj.
# NOTE(review): the files are passed in the order `ls` prints them, so the
# frame order presumably follows file names rather than the row order of
# results_df - confirm this is what downstream analysis expects.
system("mdconvert -f $(ls ./PI3Ks_pdbs/structure_output/*_bb.pdb) -o "+output_traj)

In [None]:
MCS["P42336"]

In [None]:
list(MCS['P27986'])

In [None]:
[x for x in MCS['P27986']]

In [None]:
#################################################################
#SUCCESS
#################################################################