In [1]:
import pandas as pd
import numpy as np
import gemmi
import reciprocalspaceship as rs



In [2]:
lig_log = pd.read_pickle("./lig_heavy_atoms.pkl")

In [3]:
DK_bound = lig_log[lig_log['author'] == 'Keedy']
HG_bound = lig_log[lig_log['author'] == 'Ginn']

### Use 1009 as an example

In [5]:
pdbid = 1009
mtz_file = gemmi.read_mtz_file(f'../recons_refined_phases/PTP1B_y{pdbid}_recons_refined_phases.mtz')
st = gemmi.read_pdb(f'../PTP1B_DK/original_data/bound_state_models_reindexed/PTP1B_y{pdbid}_bound_state_reindexed.pdb')

In [6]:
real_grid = mtz_file.transform_f_phi_to_map('F-obs-diff', 'refine_PH2FOFCWT', sample_rate=3.0)

In [7]:
real_grid.normalize()

In [8]:
sel = gemmi.Selection('[CL,F,Br,S,I]')
sel_model = sel.copy_model_selection(st[0])
lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']

In [9]:
lig_heavy_atoms

[<gemmi.CRA B/LIG 1/CL12>, <gemmi.CRA B/LIG 1/CL14>]

In [10]:
print(real_grid.nu, real_grid.nv, real_grid.nw)
print(real_grid.unit_cell)

144 144 192
<gemmi.UnitCell(90.32, 90.32, 106.721, 90, 90, 120)>


In [11]:
dis_lists = []
peak_values = []
for cra in lig_heavy_atoms:
    
    # Get all equivalent sites
    eq_points = []
    ops = real_grid.spacegroup.operations()
    atom = cra.atom
    
    # check the highest peak
    a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
    tmp = real_grid.get_fractional(a,b,c)
    peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
    dis_list = []
    

    for op in ops:
        SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
        tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
        SG_mapped = gemmi.Fractional(*tmp)
        print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
        eq_points.append(SG_mapped)
        SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
        dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))
        
    # Get the nearest voxel value
    peak_value = []
    for pos in eq_points:
        a = round(pos.x * real_grid.nu)
        b = round(pos.y * real_grid.nv)
        c = round(pos.z * real_grid.nw)
        peak_value.append(real_grid.get_value(a, b, c))
        print(real_grid.get_value(a, b, c))

    dis_lists.append(dis_list)
    peak_values.append(peak_value)
    
log_peak = np.max(peak_values)
log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)

xyz: 0.592, 0.396, 0.952 
xyz: 0.604, 0.197, 0.285 
xyz: 0.803, 0.408, 0.618 
xyz: 0.396, 0.592, 0.0484 
xyz: 0.197, 0.604, 0.715 
xyz: 0.408, 0.803, 0.382 
4.664612770080566
4.664610862731934
4.664613246917725
4.664610385894775
4.664612293243408
4.664614200592041
xyz: 0.562, 0.367, 0.94 
xyz: 0.633, 0.194, 0.274 
xyz: 0.806, 0.438, 0.607 
xyz: 0.367, 0.562, 0.0595 
xyz: 0.194, 0.633, 0.726 
xyz: 0.438, 0.806, 0.393 
7.00383186340332
7.003829479217529
7.0038299560546875
7.003830909729004
7.003830909729004
7.003831386566162


In [12]:
log_peak

7.00383186340332

In [13]:
log_ismaxpeak

True

### Production run

In [14]:
DK_bound.loc[:, ('lig_heavy_peak(RMSD)')] = 0.0
DK_bound.loc[:, ('is_highest_peak(<5A)')] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [37]:
for pdbid in DK_bound['sample']:
    
    mtz_file = gemmi.read_mtz_file(f'./recons_refined_phases/PTP1B_y{pdbid}_recons_refined_phases.mtz')
    st = gemmi.read_pdb(f'./bound_state_models_reindexed/PTP1B_y{pdbid}_bound_state_reindexed.pdb')
    
    real_grid = mtz_file.transform_f_phi_to_map('F-obs-diff', 'refine_PH2FOFCWT', sample_rate=3.0)
    real_grid.normalize()
    
    sel = gemmi.Selection('[CL,F,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']
    
    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:

        # Get all equivalent sites
        eq_points = []
        ops = real_grid.spacegroup.operations()
        atom = cra.atom

        # check the highest peak
        a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
        tmp = real_grid.get_fractional(a,b,c)
        peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
        dis_list = []


        for op in ops:
            SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
            tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
            SG_mapped = gemmi.Fractional(*tmp)
            # print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
            eq_points.append(SG_mapped)
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))

        # Get the nearest voxel value
        peak_value = []
        for pos in eq_points:
            a = round(pos.x * real_grid.nu)
            b = round(pos.y * real_grid.nv)
            c = round(pos.z * real_grid.nw)
            peak_value.append(real_grid.get_value(a, b, c))
            # print(real_grid.get_value(a, b, c))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)
    
    DK_bound.loc[DK_bound['sample']==pdbid, 'lig_heavy_peak(RMSD)'] = log_peak
    DK_bound.loc[DK_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DK_bound.loc[DK_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak


In [38]:
DK_bound

Unnamed: 0,sample,smiles_lst,Cl,F,Br,S,I,lig_soaked,bound,author,lig_heavy_peak(RMSD),is_highest_peak(<5A)
3,1009,[OCCCCn1cnc2cc(Cl)c(Cl)cc12],1,0,0,0,0,1,1,Keedy,7.003832,True
5,1011,[COc1ccc(cc1)C2=NCCc3sccc23],0,0,0,1,0,1,1,Keedy,13.021107,True
21,1043,[CNC(=S)NC1CCCCC1],0,0,0,1,0,1,1,Keedy,6.047942,True
63,1125,[CC1=CC(=C(C#N)C(=O)N1)C(F)(F)F],0,1,0,0,0,1,1,Keedy,21.246874,True
68,1136,[CC(=O)Cc1ccc(Cl)c(Cl)c1],1,0,0,0,0,1,1,Keedy,-0.392314,True
133,1264,[Cl.CCN1CNC(=NC1)SCc2ccc(Cl)cc2],1,0,0,1,0,1,1,Keedy,9.965891,True
139,1271,[CCc1cc2C(=O)NC=Nc2s1],0,0,0,1,0,1,1,Keedy,2.711882,True
146,1288,[O=S(=O)(NCc1cccs1)c2ccccc2],0,0,0,1,0,1,1,Keedy,8.266554,True
149,1294,[O=C1CN2CC(N1)c3ccccc3S2(=O)=O],0,0,0,1,0,1,1,Keedy,10.981049,True
151,1304,[CC(=O)N1C[C@@H](CO)[C@@H]2Oc3c(F)cccc3[C@H]12],0,1,0,0,0,1,1,Keedy,1.231786,False


In [39]:
HG_bound.loc[:, ('lig_heavy_peak(RMSD)')] = 0.0
HG_bound.loc[:, ('is_highest_peak(<5A)')] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HG_bound.loc[:, ('lig_heavy_peak(RMSD)')] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HG_bound.loc[:, ('is_highest_peak(<5A)')] = 0.0


In [42]:
for pdbid in HG_bound['sample']:
    
    try:
        mtz_file = gemmi.read_mtz_file(f'./recons_refined_phases/PTP1B_y{pdbid}_recons_refined_phases.mtz')
        st = gemmi.read_structure(f"./cluster4x_models_reindex/y{pdbid}_cluster4x_model_reindexed.pdb")

        real_grid = mtz_file.transform_f_phi_to_map('F-obs-diff', 'refine_PH2FOFCWT', sample_rate=3.0)
        real_grid.normalize()

        sel = gemmi.Selection('[CL,F,Br,S,I]')
        sel_model = sel.copy_model_selection(st[0])
        lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']

        dis_lists = []
        peak_values = []
        for cra in lig_heavy_atoms:

            # Get all equivalent sites
            eq_points = []
            ops = real_grid.spacegroup.operations()
            atom = cra.atom

            # check the highest peak
            a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
            tmp = real_grid.get_fractional(a,b,c)
            peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
            dis_list = []


            for op in ops:
                SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
                tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
                SG_mapped = gemmi.Fractional(*tmp)
                # print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
                eq_points.append(SG_mapped)
                SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
                dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))

            # Get the nearest voxel value
            peak_value = []
            for pos in eq_points:
                a = round(pos.x * real_grid.nu)
                b = round(pos.y * real_grid.nv)
                c = round(pos.z * real_grid.nw)
                peak_value.append(real_grid.get_value(a, b, c))
                # print(real_grid.get_value(a, b, c))

            dis_lists.append(dis_list)
            peak_values.append(peak_value)

        log_peak = np.max(peak_values)
        log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)

        HG_bound.loc[HG_bound['sample']==pdbid, 'lig_heavy_peak(RMSD)'] = log_peak
        HG_bound.loc[HG_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak
    
    except:
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HG_bound.loc[HG_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak


In [43]:
HG_bound

Unnamed: 0,sample,smiles_lst,Cl,F,Br,S,I,lig_soaked,bound,author,lig_heavy_peak(RMSD),is_highest_peak(<5A)
34,1060,"[CC(=O)c1ccccc1Sc2ccccc2C(=O)O, Cl.Cc1ccc(CN)s1]",1,0,0,1,0,2,1,Ginn,4.805763,False
98,1182,[Cc1nc(sc1CN)c2ccncc2],0,0,0,1,0,1,1,Ginn,2.523158,False
122,1242,[CNC(=S)N(Cc1ccccc1)Cc2cccnc2],0,0,0,1,0,1,1,Ginn,10.170928,True
131,1261,[Fc1cccc(Cl)c1C2NC(=O)c3ccccc3N2],1,1,0,0,0,1,1,Ginn,1.020279,False
134,1265,[OC(=O)CSc1ccc(Cl)cc1],1,0,0,1,0,1,1,Ginn,3.028074,True
150,1302,[OCC1N([C@@H]2CC[C@@]1(O)CC2)C(=O)Cc3cccs3],0,0,0,1,0,1,1,Ginn,1.092413,False
155,1317,[OCCN1CC(O)c2ccccc2S1(=O)=O],0,0,0,1,0,1,1,Ginn,9.746892,True
156,1318,[CC[C@@H]1[C@@H](O)c2ccccc2S(=O)(=O)N1CC(=O)OC],0,0,0,1,0,1,1,Ginn,9.960307,True
165,1339,[NS(=O)(=O)c1ccc(Cl)s1],1,0,0,1,0,1,1,Ginn,2.558497,False
199,1402,[CC1CN=C(Nc2ccc(F)cc2)S1],0,1,0,1,0,1,1,Ginn,0.673231,False


In [44]:
valdo = pd.concat([DK_bound, HG_bound], axis=0)

In [45]:
valdo

Unnamed: 0,sample,smiles_lst,Cl,F,Br,S,I,lig_soaked,bound,author,lig_heavy_peak(RMSD),is_highest_peak(<5A)
3,1009,[OCCCCn1cnc2cc(Cl)c(Cl)cc12],1,0,0,0,0,1,1,Keedy,7.003832,True
5,1011,[COc1ccc(cc1)C2=NCCc3sccc23],0,0,0,1,0,1,1,Keedy,13.021107,True
21,1043,[CNC(=S)NC1CCCCC1],0,0,0,1,0,1,1,Keedy,6.047942,True
63,1125,[CC1=CC(=C(C#N)C(=O)N1)C(F)(F)F],0,1,0,0,0,1,1,Keedy,21.246874,True
68,1136,[CC(=O)Cc1ccc(Cl)c(Cl)c1],1,0,0,0,0,1,1,Keedy,-0.392314,True
133,1264,[Cl.CCN1CNC(=NC1)SCc2ccc(Cl)cc2],1,0,0,1,0,1,1,Keedy,9.965891,True
139,1271,[CCc1cc2C(=O)NC=Nc2s1],0,0,0,1,0,1,1,Keedy,2.711882,True
146,1288,[O=S(=O)(NCc1cccs1)c2ccccc2],0,0,0,1,0,1,1,Keedy,8.266554,True
149,1294,[O=C1CN2CC(N1)c3ccccc3S2(=O)=O],0,0,0,1,0,1,1,Keedy,10.981049,True
151,1304,[CC(=O)N1C[C@@H](CO)[C@@H]2Oc3c(F)cccc3[C@H]12],0,1,0,0,0,1,1,Keedy,1.231786,False


In [46]:
valdo.to_csv('./lig_heavy_atoms_peak_value_valdo.csv', index=True)

### Production run to Fo-Fc map

In [50]:
DK_bound.loc[:, ('lig_heavy_peak(RMSD)')] = 0.0
DK_bound.loc[:, ('is_highest_peak(<5A)')] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DK_bound.loc[:, ('lig_heavy_peak(RMSD)')] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DK_bound.loc[:, ('is_highest_peak(<5A)')] = 0.0


In [54]:
pdbid = '1009'

mtz_file = gemmi.read_mtz_file(f'./recons_refined_phases/PTP1B_y{pdbid}_recons_refined_phases.mtz')
st = gemmi.read_pdb(f'./bound_state_models_reindexed/PTP1B_y{pdbid}_bound_state_reindexed.pdb')

real_grid = mtz_file.transform_f_phi_to_map('FOFCWT', 'PHIFOFCWT', sample_rate=3.0)
real_grid.normalize()

sel = gemmi.Selection('[CL,F,Br,S,I]')
sel_model = sel.copy_model_selection(st[0])
lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']

dis_lists = []
peak_values = []

for cra in lig_heavy_atoms:

    # Get all equivalent sites
    eq_points = []
    ops = real_grid.spacegroup.operations()
    atom = cra.atom

    # check the highest peak
    a,b,c = np.unravel_index(real_grid.array.argmin(), real_grid.array.shape)
    tmp = real_grid.get_fractional(a,b,c)
    peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
    dis_list = []


    for op in ops:
        SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
        tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
        SG_mapped = gemmi.Fractional(*tmp)
        print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
        eq_points.append(SG_mapped)
        SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
        dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))

    # Get the nearest voxel value
    peak_value = []
    for pos in eq_points:
        a = round(pos.x * real_grid.nu)
        b = round(pos.y * real_grid.nv)
        c = round(pos.z * real_grid.nw)
        peak_value.append(real_grid.get_value(a, b, c))
        print(-real_grid.get_value(a, b, c))


xyz: 0.592, 0.396, 0.952 
xyz: 0.604, 0.197, 0.285 
xyz: 0.803, 0.408, 0.618 
xyz: 0.396, 0.592, 0.0484 
xyz: 0.197, 0.604, 0.715 
xyz: 0.408, 0.803, 0.382 
-1.0172756910324097
-1.0172791481018066
-1.0172779560089111
-1.0172758102416992
-1.017279863357544
-1.0172775983810425
xyz: 0.562, 0.367, 0.94 
xyz: 0.633, 0.194, 0.274 
xyz: 0.806, 0.438, 0.607 
xyz: 0.367, 0.562, 0.0595 
xyz: 0.194, 0.633, 0.726 
xyz: 0.438, 0.806, 0.393 
-1.008357286453247
-1.0083577632904053
-1.0083566904067993
-1.0083577632904053
-1.0083582401275635
-1.008357048034668


In [None]:
for pdbid in DK_bound['sample']:
    
    mtz_file = gemmi.read_mtz_file(f'./recons_refined_phases/PTP1B_y{pdbid}_recons_refined_phases.mtz')
    st = gemmi.read_pdb(f'./bound_state_models_reindexed/PTP1B_y{pdbid}_bound_state_reindexed.pdb')
    
    real_grid = mtz_file.transform_f_phi_to_map('FOFCWT', 'FOFCWT', sample_rate=3.0)
    real_grid.normalize()
    
    sel = gemmi.Selection('[CL,F,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']
    
    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:

        # Get all equivalent sites
        eq_points = []
        ops = real_grid.spacegroup.operations()
        atom = cra.atom

        # check the highest peak
        a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
        tmp = real_grid.get_fractional(a,b,c)
        peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
        dis_list = []


        for op in ops:
            SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
            tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
            SG_mapped = gemmi.Fractional(*tmp)
            # print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
            eq_points.append(SG_mapped)
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))

        # Get the nearest voxel value
        peak_value = []
        for pos in eq_points:
            a = round(pos.x * real_grid.nu)
            b = round(pos.y * real_grid.nv)
            c = round(pos.z * real_grid.nw)
            peak_value.append(real_grid.get_value(a, b, c))
            # print(real_grid.get_value(a, b, c))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)
    
    DK_bound.loc[DK_bound['sample']==pdbid, 'lig_heavy_peak(RMSD)'] = log_peak
    DK_bound.loc[DK_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak