In [22]:
#! pip install biopython
from Bio.PDB.PDBParser import PDBParser
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB.Polypeptide import three_to_one
from Bio.SVDSuperimposer import SVDSuperimposer
from Bio.PDB.Structure import Structure
import pandas as pd
import numpy as np
import os, sys
from urllib.error import HTTPError
import requests
import urllib.request
import matplotlib.pyplot as plt

In [23]:
# Read into df and export to excel file
DATA_PATH = "/Users/holgerchristiannyelandehlers/Desktop/master_thesis/data/results_pdb/output_P35557/"
df = pd.read_csv(DATA_PATH + 'P35557_mutations.csv')
df.to_excel(DATA_PATH + 'P35557_mutations.xlsx')
df.head()

Unnamed: 0,pdb_id,chain,sequence,mutation
0,3qic,A,-----------KKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,E339K\nS411X
1,4iwv,A,--------------QVEQILAEFQLQAADLKKVMRRMQKEMDRGLR...,K15Q\nE27A\nE28A\nE51A\nE52A
2,4ixc,A,--------------QVEQILAEFQLQAADLKKVMRRMQKEMDRGLR...,K15Q\nE27A\nE28A\nE51A\nE52A\nT342X
3,5v4w,A,---------------VEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,Q465X
4,5v4x,A,---------------VEQILAEFQLQEEDLKKVMRRMQKEMDRGLR...,Q465X


In [24]:
# Read into list
pdb_list = df.values.tolist()
print(pdb_list[0][3])

E339K
S411X


In [25]:
def download_pdb(pdbcode, datadir, downloadurl="https://files.rcsb.org/download/"):
    """
    Downloads a PDB file from the Internet and saves it in a data directory.
    :param pdbcode: The standard PDB ID e.g. '3ICB' or '3icb'
    :param datadir: The directory where the downloaded file will be saved
    :param downloadurl: The base PDB download URL, cf.
        `https://www.rcsb.org/pages/download/http#structures` for details
    :return: the full path to the downloaded PDB file or None if something went wrong
    """
    pdbfn = pdbcode + ".pdb"
    url = downloadurl + pdbfn
    outfnm = os.path.join(datadir, pdbfn)
    try:
        urllib.request.urlretrieve(url, outfnm)
        return outfnm
    except Exception as err:
        print(str(err), file=sys.stderr)
        return None

In [162]:
def align_strucs(wt_pdb, pdb_list):
    """
    Calculates the RMSD between two PDBs for shared residues.
    
    wt_pdb: str with pdb id of wild type structure used
    pdb_list: list with pdb_id, chain, seq and mutation (eg. V56L)
    """
    # Define backbone atoms
    atom_types = ["CA"]#, "N", "C", "O"]
    
    # Define res
    AA = ["ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU", "MET", "ASN", "PRO", "GLN",
      "ARG", "SER", "THR", "VAL", "TRP", "TYR"]
    
    # Get data from alignments for both wt and mutant
    pdb = pdb_list[0]
    chain = pdb_list[1]
    seq = pdb_list[2]
    wt_uniprot_seq = 'MLDDRARMEAAKKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLRLETHEEASVKMLPTYVRSTPEGSEVGDFLSLDLGGTNFRVMLVKVGEGEEGQWSVKTKHQMYSIPEDAMTGTAEMLFDYISECISDFLDKHQMKHKKLPLGFTFSFPVRHEDIDKGILLNWTKGFKASGAEGNNVVGLLRDAIKRRGDFEMDVVAMVNDTVATMISCYYEDHQCEVGMIVGTGCNACYMEEMQNVELVEGDEGRMCVNTEWGAFGDSGELDEFLLEYDRLVDESSANPGQQLYEKLIGGKYMGELVRLVLLRLVDENLLFHGEASEQLRTRGAFETRFVSQVESDTGDRKQIYNILSTLGLRPSTTDCDIVRRACESVSTRAAHMCSAGLAGVINRMRESRSEDVMRITVGVDGSVYKLHPSFKERFHASVRRLTPSCEITFIESEEGSGRGAALVSAVACKKACMLGQ'
    wt_seq = '-----------KKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLRLETHEEASVKMLPTYVRSTPEGSEVGDFLSLDLGGTNFRVMLVKVGEGEEGQWSVKTKHQMYSIPEDAMTGTAEMLFDYISECISDFLDKHQMKHKKLPLGFTFSFPVRHEDIDKGILLNWTKGFKASGAEGNNVVGLLRDAIKRRGDFEMDVVAMVNDTVATMISCYYEDHQCEVGMIVGTGCNACYMEEMQNVELVEGDEGRMCVNTEWGAFGDSGELDEFLLEYDRLVDESSANPGQQLYEKLIGGKYMGELVRLVLLRLVDENLLFHGEASEQLRTRGAFETRFVSQVESDTGDRKQIYNILSTLGLRPSTTDCDIVRRACESVSTRAAHMCSAGLAGVINRMRESRSEDVMRITVGVDGSVYKLHPSFKERFHASVRRLTPSCEITFIESEEGSGRGAALVSAVACxxxxxxxx'
    mut = pdb_list[3]
    print(pdb, mut)
    
    # Download structures
    download_pdb(wt_pdb, DATA_PATH + 'pdbs/')
    download_pdb(pdb, DATA_PATH + 'pdbs/')
    
    # Define the two structures to be compared
    p = PDBParser(QUIET=True)
    wild_type =  p.get_structure("wildtype", DATA_PATH + 'pdbs/' + wt_pdb + ".pdb")
    mutant = p.get_structure("mutant", DATA_PATH + 'pdbs/' + pdb + ".pdb")
    
    # Get seqs
    wild_type_seq = "".join([ three_to_one(r.resname) for r in wild_type[0][chain].get_residues() if r.resname in AA ])
    mutant_seq = "".join([ three_to_one(r.resname) for r in mutant[0][chain].get_residues() if r.resname in AA ])
    
    # Get atom coors
    wild_type_coords = [ a.coord for a in wild_type[0][chain].get_atoms() if a.parent.resname in AA and a.name in atom_types ]
    mutant_coords = [ a.coord for a in mutant[0][chain].get_atoms() if a.parent.resname in AA and a.name in atom_types ]
    
    # Get intersecting positions in wt and mutant
    wt_positions = list()
    mut_positions = list()
    if len(wild_type_coords) < len(wt_seq):
        for aa in range(len(wt_seq)):
            if wt_seq[aa] != 'X' and wt_seq[aa] != '-':
                wt_positions.append(aa)
                
    if len(mutant_coords) < len(seq):
        for aa in range(len(seq)):
            if seq[aa] != 'X' and seq[aa] != '-':
                mut_positions.append(aa)
    
    print(len(wt_positions), "\n", len(mut_positions))
    
    test_arr = np.intersect1d(wt_positions, mut_positions).tolist()
    # Plan: Get the sequences and compare to the sequences from pdb. Only get residues that are in wt and mutant at correct index.
    wt_seq = wt_seq[test_arr[0]:test_arr[-2]]
    start = 0
    stop = len(wild_type_seq)
    idx = wild_type_seq.find(wt_seq, start, stop)
    print('Here:', idx)
    print(wt_seq, '\n', wild_type_seq)
    #for aa in range(len(wt_seq)):
    #    if
    #seq = seq[test_arr]
    #print(wt_uniprot_seq, len(wt_uniprot_seq))#, '\n', wt_seq, "\n", seq)
            
    
    
    #print(len(wt_seq), len(seq))
    print(len(wild_type_coords), len(mutant_coords)) 
    #Run SVD and calculate RMSD - we exclude 5 first and 5 last residues
    si = SVDSuperimposer()
    si.set(np.array(wild_type_coords[5:-5]), np.array(mutant_coords[5:-5]))
    si.run() # Run the SVD alignment
    print(si.get_rms())
    return si.get_rms()
    


In [163]:
pdb_list
rmsd_list = list()
for i in range(len(pdb_list)):
    if pdb_list[i][1] == 'A':
        rmsd_list.append(align_strucs('3idh', pdb_list[i]))

3qic E339K
S411X
-----------KKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLRLETHEEASVKMLPTYVRSTPEGSEVGDFLSLDLGGTNFRVMLVKVGEGEEGQWSVKTKHQMYSIPEDAMTGTAEMLFDYISECISDFLDKHQMKHKKLPLGFTFSFPVRHEDIDKGILLNWTKGFKASGAEGNNVVGLLRDAIKRRGDFEMDVVAMVNDTVATMISCYYEDHQCEVGMIVGTGCNACYMEEMQNVELVEGDEGRMCVNTEWGAFGDSGELDEFLLEYDRLVDESSANPGQQLYEKLIGGKYMGELVRLVLLRLVDENLLFHGEASEQLRTRGAFETRFVSQVESDTGDRKQIYNILSTLGLRPSTTDCDIVRRACESVSTRAAHMCSAGLAGVINRMRESRSEDVMRITVGVDGSVYKLHPSFKERFHASVRRLTPSCEITFIESEEGSGRGAALVSAVACxxxxxxxx 
 
 -----------KKEKVEQILAEFQLQEEDLKKVMRRMQKEMDRGLRLETHEEASVKMLPTYVRSTPEGSEVGDFLSLDLGGTNFRVMLVKVGEGEEGQWSVKTKHQMYSIPEDAMTGTAEMLFDYISECISDFLDKHQMKHKKLPLGFTFSFPVRHEDIDKGILLNWTKGFKASGAEGNNVVGLLRDAIKRRGDFEMDVVAMVNDTVATMISCYYEDHQCEVGMIVGTGCNACYMEEMQNVELVEGDEGRMCVNTEWGAFGDSGELDEFLLEYDRLVDESSANPGQQLYEKLIGGKYMGELVRLVLLRLVDENLLFHGEASEQLRTRGAFETRFVSQVKSDTGDRKQIYNILSTLGLRPSTTDCDIVRRACESVSTRAAHMCSAGLAGVINRMRESRSEDVMRITVGVDGXVYKLHPSFKERFHASVRRLTPSCEITFIESEEGSGRGAALVSAVACKKXXXXXX
454 
 447
Here: 7
KKEKVEQILAEFQLQEEDLKKVMRRMQKE

Exception: Coordinate number/dimension mismatch.

In [None]:
output_variance_np = np.asarray(rmsd_list)
# Create histogram of means
hist,bin_edges = np.histogram(rmsd_list)
weights = np.ones_like(rmsd_list) / len(rmsd_list)
n, bins, patches = plt.hist(rmsd_list, 50, weights = weights, facecolor='b', edgecolor='k', alpha=0.75)
plt.grid(axis='y', alpha=0.75)
plt.xlabel(r'RMSD C$\alpha$ [Å])',fontsize=8)
plt.ylabel('Frequency',fontsize=8)
plt.xticks(fontsize=8,rotation='vertical')
plt.yticks(fontsize=8)
plt.ylabel('Frequency',fontsize=8)
plt.title('P09211: RMSD of wild type and mutants',fontsize=8)
plt.axvline(np.mean(rmsd_list), color='r', linestyle='dashed', linewidth=2)
plt.savefig('figs/rmsd_dis_P09211.png', dpi=1200)

In [80]:
x = 'degergfjdjfsjdjfds'
print(x[0:2])

de
