In [9]:
import pandas as pd
import numpy as np
import gemmi
import reciprocalspaceship as rs
from tqdm import tqdm

In [10]:
# Start from the ligand log that came out of DK's results and keep only the
# rows whose 'author' is 'Keedy'. The .copy() gives an independent frame so
# that columns added later do not write into a view of lig_log.
# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles from a trusted source.
lig_log = pd.read_pickle("./lig_heavy_atoms.pkl")
DK_bound = lig_log.loc[lig_log['author'].eq('Keedy')].copy()

In [11]:
# Base directories used to build the input paths below.
phyllis_dir = "/n/holyscratch01/hekstra_lab/phyllis/"
my_dir = "/n/holyscratch01/hekstra_lab/dhekstra/valdo-tests/"

# Pipeline outputs live under basepath.
basepath = f'{my_dir}pipeline/'
vae_reconstructed_path = f'{basepath}vae/reconstructed/'
# Written out in full; an earlier version derived it as
#   basepath + 'vae/reconstructed_w_phases/'
vae_reconstructed_with_phases_path = '/n/holyscratch01/hekstra_lab/dhekstra/valdo-tests/pipeline/vae/reconstructed_w_phases/'

# Directory holding the bound-state models. Alternatives used previously:
#   basepath + 'data/bound_models_reindexed/'
#   '/n/holyscratch01/hekstra_lab/dhekstra/phyllis/PTP1B_DK/all_bound_models_reindexed_v2/short/'
bound_models_standardized_path = '/n/holyscratch01/hekstra_lab/dhekstra/phyllis/PTP1B_DK/all_bound_models_reindexed_v2/short_setting_0049/'

### Get a mean peak value as a VAE metric

In [12]:
# Create (or reset) the two per-sample result columns with 0.0 in every row;
# the scoring loop that follows overwrites them sample by sample.
for metric_column in ('lig_heavy_peak', 'is_highest_peak(<5A)'):
    DK_bound.loc[:, metric_column] = 0.0

In [13]:
%%time
diff_col="WDF" # %^*&^&*^#$^R%*#^(^R(*^(R*^(##(^##^
for pdbid in tqdm(DK_bound['sample']):
    try:
        # Change the following line to the mtzs files your model created
        # print(pdbid)
        try:
            mtz_file = gemmi.read_mtz_file(vae_reconstructed_with_phases_path + f'{pdbid}_0.mtz')
        except:
            try:
                mtz_file = gemmi.read_mtz_file(vae_reconstructed_with_phases_path + f'{pdbid}_1.mtz')
            except Exception as e:
                print(e)

        # mtz_file.reindex(gemmi.Op('-h,-k,l'))
        
        # don't change the pdb files
        st = gemmi.read_pdb(bound_models_standardized_path + f'{pdbid}.pdb')
        
        real_grid = mtz_file.transform_f_phi_to_map(diff_col, 'refine_PH2FOFCWT', sample_rate=3.0)
        real_grid.normalize()
        
        sel = gemmi.Selection('[CL,Br,S,I]')
        sel_model = sel.copy_model_selection(st[0])
        lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']
        
        dis_lists = []
        peak_values = []
        for cra in lig_heavy_atoms:
    
            # Get all equivalent sites
            eq_points = []
            ops = real_grid.spacegroup.operations()
            atom = cra.atom
    
            # check the highest peak
            a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
            tmp = real_grid.get_fractional(a,b,c)
            peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
            dis_list = []
    
    
            for op in ops:
                SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
                tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
                SG_mapped = gemmi.Fractional(*tmp)
                # print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
                eq_points.append(SG_mapped)
                SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
                dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))
    
            # Get the nearest voxel value
            peak_value = []
            for pos in eq_points:
                a = round(pos.x * real_grid.nu)
                b = round(pos.y * real_grid.nv)
                c = round(pos.z * real_grid.nw)
                peak_value.append(real_grid.get_value(a, b, c))
                #print(real_grid.get_value(a, b, c))
    
            dis_lists.append(dis_list)
            peak_values.append(peak_value)
            
    
        log_peak = np.max(peak_values)
        log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)
        
        DK_bound.loc[DK_bound['sample']==pdbid, 'lig_heavy_peak'] = log_peak
        DK_bound.loc[DK_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak
    except Exception as e:
        print(e)



100%|██████████████████████████████████████████████████| 24/24 [00:03<00:00,  7.77it/s]

CPU times: user 1.9 s, sys: 60.1 ms, total: 1.97 s
Wall time: 3.09 s





In [14]:
# Preview the first five scored samples (head() defaults to five rows).
DK_bound.head()

Unnamed: 0,sample,smiles_lst,Cl,Br,S,I,lig_soaked,bound,author,lig_heavy_peak,is_highest_peak(<5A)
3,1009,[OCCCCn1cnc2cc(Cl)c(Cl)cc12],1,0,0,0,1,1,Keedy,11.992481,True
5,1011,[COc1ccc(cc1)C2=NCCc3sccc23],0,0,1,0,1,1,Keedy,17.44322,True
21,1043,[CNC(=S)NC1CCCCC1],0,0,1,0,1,1,Keedy,8.586608,True
68,1136,[CC(=O)Cc1ccc(Cl)c(Cl)c1],1,0,0,0,1,1,Keedy,-1.642047,True
133,1264,[Cl.CCN1CNC(=NC1)SCc2ccc(Cl)cc2],1,0,1,0,1,1,Keedy,7.574421,True


In [15]:
# Metric: per-sample ligand peak values, followed by their mean.
print(DK_bound['lig_heavy_peak'], np.mean(DK_bound['lig_heavy_peak']), sep='\n')

3      11.992481
5      17.443220
21      8.586608
68     -1.642047
133     7.574421
139     4.129325
146    11.834853
149    12.111731
154    24.169409
163    26.941822
225    12.734344
241    15.168922
272    34.512695
299    33.488838
303    27.872213
311     6.811953
313     7.486728
335    12.612943
361    14.005568
371    20.530773
374    18.712086
389    18.964876
394     8.171685
397    14.070197
Name: lig_heavy_peak, dtype: float64
15.345235223571459


### Same metric for Fo-Fc maps

In [16]:
# Reset both result columns to 0.0 before scoring the Fo-Fc maps.
for metric_column in ('lig_heavy_peak', 'is_highest_peak(<5A)'):
    DK_bound.loc[:, metric_column] = 0.0

In [17]:
%%time
for pdbid in tqdm(DK_bound['sample']):
    # mtz_file = gemmi.read_mtz_file(phyllis_dir+f'pipeline/data/pandda_input_models_refined_waters/PTP1B_y{pdbid}_pandda_input_reindexed_refine_001.mtz')
    mtz_file = gemmi.read_mtz_file(phyllis_dir+f'PTP1B_DK/pandda_input_models_refined_waters/PTP1B_y{pdbid}_pandda_input_reindexed_refine_001.mtz')
    st = gemmi.read_pdb(phyllis_dir+f'PTP1B_DK/all_bound_models_reindexed/PTP1B_y{pdbid}_bound_state_reindexed.pdb')
    
    real_grid = mtz_file.transform_f_phi_to_map('FOFCWT', 'PHFOFCWT', sample_rate=3.0)
    real_grid.normalize()
    
    sel = gemmi.Selection('[CL,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']
    
    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:

        # Get all equivalent sites
        eq_points = []
        ops = real_grid.spacegroup.operations()
        atom = cra.atom

        # check the highest peak
        a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
        tmp = real_grid.get_fractional(a,b,c)
        peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))
        dis_list = []


        for op in ops:
            SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
            tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
            SG_mapped = gemmi.Fractional(*tmp)
            # print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
            eq_points.append(SG_mapped)
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))

        # Get the nearest voxel value
        peak_value = []
        for pos in eq_points:
            a = round(pos.x * real_grid.nu)
            b = round(pos.y * real_grid.nv)
            c = round(pos.z * real_grid.nw)
            peak_value.append(real_grid.get_value(a, b, c))
            #print(real_grid.get_value(a, b, c))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)
    
    DK_bound.loc[DK_bound['sample']==pdbid, 'lig_heavy_peak'] = log_peak
    DK_bound.loc[DK_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak

  0%|                                                           | 0/24 [00:00<?, ?it/s]


OSError: [Errno 2] Failed to open /n/holyscratch01/hekstra_lab/phyllis/PTP1B_DK/pandda_input_models_refined_waters/PTP1B_y1009_pandda_input_reindexed_refine_001.mtz: No such file or directory

In [18]:
# Metric: per-sample ligand peak values, followed by their mean.
print(DK_bound['lig_heavy_peak'], np.mean(DK_bound['lig_heavy_peak']), sep='\n')

3      0.0
5      0.0
21     0.0
68     0.0
133    0.0
139    0.0
146    0.0
149    0.0
154    0.0
163    0.0
225    0.0
241    0.0
272    0.0
299    0.0
303    0.0
311    0.0
313    0.0
335    0.0
361    0.0
371    0.0
374    0.0
389    0.0
394    0.0
397    0.0
Name: lig_heavy_peak, dtype: float64
0.0


### Zmap mean peak value as metric

In [19]:
# Reset both result columns to 0.0 before scoring the Z-maps.
for metric_column in ('lig_heavy_peak', 'is_highest_peak(<5A)'):
    DK_bound.loc[:, metric_column] = 0.0

In [20]:
%%time
for pdbid in tqdm(DK_bound['sample']):
    
    zmap = gemmi.read_ccp4_map(f'/n/hekstra_lab/people/minhuan/projects/drug/minhuan_backup/pipeline/data/z_maps/PTP1B-y{pdbid}-z_map.native.ccp4')
    st = gemmi.read_structure(f'/n/hekstra_lab/people/minhuan/projects/drug/minhuan_backup/pipeline/data/bound_models_DK/PTP1B-y{pdbid}_refmac_input.split.bound-state.pdb')
    
    real_grid = zmap.grid
    real_grid.normalize()
    
    sel = gemmi.Selection('[CL,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [i for i in list(sel_model.all()) if i.residue.name == 'LIG']
    
    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:

        # Get all equivalent sites
        eq_points = []
        ops = real_grid.spacegroup.operations()
        atom = cra.atom

        # check the highest peak
        a,b,c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
        tmp = real_grid.get_fractional(a,b,c)
        peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.z, tmp.y, tmp.x))
        dis_list = []


        for op in ops:
            SG_mapped=op.apply_to_xyz(st.cell.fractionalize(atom.pos).tolist())
            tmp = SG_mapped-np.floor(np.array(SG_mapped)) # Move into cell
            SG_mapped = gemmi.Fractional(*tmp)
            # print(f"xyz: {SG_mapped[0]:.3f}, {SG_mapped[1]:.3}, {SG_mapped[2]:.3} ") 
            eq_points.append(SG_mapped)
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.sqrt(np.sum(np.array((peak_pos - SG_mapped_orth).tolist())**2)))

        # Get the nearest voxel value
        peak_value = []
        for pos in eq_points:
            a = round(pos.z * real_grid.nu)
            b = round(pos.y * real_grid.nv)
            c = round(pos.x * real_grid.nw)
            peak_value.append(real_grid.get_value(a, b, c))
            # print(real_grid.get_value(a, b, c))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)
    
    DK_bound.loc[DK_bound['sample']==pdbid, 'lig_heavy_peak'] = log_peak
    DK_bound.loc[DK_bound['sample']==pdbid, 'is_highest_peak(<5A)'] = log_ismaxpeak

100%|██████████████████████████████████████████████████| 24/24 [00:27<00:00,  1.16s/it]

CPU times: user 1.84 s, sys: 374 ms, total: 2.21 s
Wall time: 27.7 s





In [22]:
# Metric: per-sample ligand peak values, followed by their mean.
print(DK_bound['lig_heavy_peak'], np.mean(DK_bound['lig_heavy_peak']), sep='\n')

3       5.925274
5      14.642082
21      7.497661
68      1.318191
133     6.371030
139     2.374749
146     5.483102
149     9.043446
154    11.662819
163    13.975651
225     5.086185
241     8.456112
272    23.948952
299    18.548353
303    21.206833
311     5.411858
313     4.801870
335     9.213498
361    12.359171
371     9.103870
374     8.571445
389    10.139035
394     7.352013
397     8.606597
Name: lig_heavy_peak, dtype: float64
9.629158134261766
