In [7]:
import pandas as pd
import numpy as np
import gemmi
import reciprocalspaceship as rs
from tqdm import tqdm

In [8]:
# Ligand heavy-atom log taken from DK's results (a pickled DataFrame).
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
lig_log = pd.read_pickle("./lig_heavy_atoms.pkl")
# Restrict to the rows whose author is Keedy; copy so that the columns added
# later are written to an independent frame rather than a view of lig_log.
DK_bound = lig_log[lig_log['author'].eq('Keedy')].copy()

In [9]:
# Root directories on the cluster scratch filesystem.
phyllis_dir = "/n/holyscratch01/hekstra_lab/phyllis/"
my_dir = "/n/holyscratch01/hekstra_lab/dhekstra/valdo-tests/"

# Everything produced by this pipeline run lives under <my_dir>/pipeline/.
basepath = f'{my_dir}pipeline/'

# VAE output: reconstructed data, without and with phases attached.
vae_reconstructed_path = f'{basepath}vae/reconstructed/'
vae_reconstructed_with_phases_path = f'{basepath}vae/reconstructed_w_phases/'

# Reindexed bound-state models used to locate the ligand atoms.
bound_models_standardized_path = f'{basepath}data/bound_models_reindexed/'

### Get a mean peak value as a VAE metric

In [10]:
# (Re)create the two per-dataset result columns before scoring.
# Plain column assignment replaces the column outright, so re-running this cell
# always yields a clean dtype; `.loc[:, col] = ...` would write into an existing
# column in place and keep whatever dtype an earlier run left behind.
DK_bound['lig_heavy_peak'] = 0.0          # map value at the ligand heavy atoms (float)
# The scoring loop stores booleans here, so start from a boolean column rather
# than a float one (avoids an incompatible-dtype upcast on assignment).
DK_bound['is_highest_peak(<5A)'] = False

In [11]:
%%time
# For every dataset: build the sigma-scaled difference map, look up the map value
# at each ligand heavy atom (Cl/Br/S/I in residues named LIG) and all of its
# symmetry mates, and record
#   * lig_heavy_peak       -- the largest of those map values
#   * is_highest_peak(<5A) -- whether the global map maximum lies within 5 A of
#                             any of those positions
for pdbid in tqdm(DK_bound['sample']):

    # Change the following line to the mtzs files your model created
    mtz_file = gemmi.read_mtz_file(vae_reconstructed_with_phases_path + f'{pdbid}.mtz')

    # don't change the pdb files
    st = gemmi.read_pdb(bound_models_standardized_path + f'{pdbid}.pdb')

    real_grid = mtz_file.transform_f_phi_to_map('diff', 'refine_PH2FOFCWT', sample_rate=3.0)
    real_grid.normalize()  # rescales the map in place before values are compared

    sel = gemmi.Selection('[CL,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [cra for cra in sel_model.all() if cra.residue.name == 'LIG']
    if not lig_heavy_atoms:
        # Without this guard np.max() below fails with an opaque
        # "zero-size array" ValueError that does not say which dataset is at fault.
        raise ValueError(f'{pdbid}: no Cl/Br/S/I atoms found in residues named LIG')

    # Both of these depend only on the map, not on the atom, so compute them once.
    ops = list(real_grid.spacegroup.operations())
    a, b, c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
    tmp = real_grid.get_fractional(a, b, c)
    peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))

    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:
        atom_frac = st.cell.fractionalize(cra.atom.pos).tolist()

        dis_list = []    # distance from the global maximum to each symmetry mate
        peak_value = []  # map value at the voxel nearest to each symmetry mate
        for op in ops:
            wrapped = np.asarray(op.apply_to_xyz(atom_frac), dtype=float)
            wrapped = wrapped - np.floor(wrapped)  # Move into cell
            SG_mapped = gemmi.Fractional(*wrapped)

            # NOTE(review): the distance is taken between in-cell copies only, so a
            # maximum sitting just across a cell face from the atom is not counted
            # as close -- confirm whether periodic images should be considered.
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.linalg.norm((peak_pos - SG_mapped_orth).tolist()))

            # Get the nearest voxel value
            u = round(SG_mapped.x * real_grid.nu)
            v = round(SG_mapped.y * real_grid.nv)
            w = round(SG_mapped.z * real_grid.nw)
            peak_value.append(real_grid.get_value(u, v, w))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)  # 5 A cut-off, as in the column name

    is_this_sample = DK_bound['sample'] == pdbid
    DK_bound.loc[is_this_sample, 'lig_heavy_peak'] = log_peak
    DK_bound.loc[is_this_sample, 'is_highest_peak(<5A)'] = log_ismaxpeak


100%|███████████████████████████████████████████| 24/24 [00:01<00:00, 13.69it/s]

CPU times: user 1.61 s, sys: 43 ms, total: 1.65 s
Wall time: 1.75 s





In [12]:
# Summary metric: average of the per-dataset ligand heavy-atom peak values.
print(DK_bound['lig_heavy_peak'].mean())

1.2429127916693687


### Same metric for Fo-Fc maps

In [10]:
# Reset the two per-dataset result columns before scoring the Fo-Fc maps.
# Plain column assignment replaces the column outright, so the reset also
# restores a clean dtype; `.loc[:, col] = ...` would write into the existing
# column in place and keep the dtype left behind by the previous scoring run.
DK_bound['lig_heavy_peak'] = 0.0          # map value at the ligand heavy atoms (float)
# The scoring loop stores booleans here, so start from a boolean column rather
# than a float one (avoids an incompatible-dtype upcast on assignment).
DK_bound['is_highest_peak(<5A)'] = False

In [11]:
%%time
# For every dataset: build the sigma-scaled Fo-Fc map from the refined MTZ, look
# up the map value at each ligand heavy atom (Cl/Br/S/I in residues named LIG)
# and all of its symmetry mates, and record
#   * lig_heavy_peak       -- the largest of those map values
#   * is_highest_peak(<5A) -- whether the global map maximum lies within 5 A of
#                             any of those positions
for pdbid in tqdm(DK_bound['sample']):

    mtz_file = gemmi.read_mtz_file(phyllis_dir+f'pipeline/data/pandda_input_models_refined_waters/PTP1B_y{pdbid}_pandda_input_reindexed_refine_001.mtz')
    st = gemmi.read_pdb(phyllis_dir+f'pipeline/data/bound_state_models_reindexed/PTP1B_y{pdbid}_bound_state_reindexed.pdb')

    real_grid = mtz_file.transform_f_phi_to_map('FOFCWT', 'PHFOFCWT', sample_rate=3.0)
    real_grid.normalize()  # rescales the map in place before values are compared

    sel = gemmi.Selection('[CL,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [cra for cra in sel_model.all() if cra.residue.name == 'LIG']
    if not lig_heavy_atoms:
        # Without this guard np.max() below fails with an opaque
        # "zero-size array" ValueError that does not say which dataset is at fault.
        raise ValueError(f'{pdbid}: no Cl/Br/S/I atoms found in residues named LIG')

    # Both of these depend only on the map, not on the atom, so compute them once.
    ops = list(real_grid.spacegroup.operations())
    a, b, c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
    tmp = real_grid.get_fractional(a, b, c)
    peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.x, tmp.y, tmp.z))

    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:
        atom_frac = st.cell.fractionalize(cra.atom.pos).tolist()

        dis_list = []    # distance from the global maximum to each symmetry mate
        peak_value = []  # map value at the voxel nearest to each symmetry mate
        for op in ops:
            wrapped = np.asarray(op.apply_to_xyz(atom_frac), dtype=float)
            wrapped = wrapped - np.floor(wrapped)  # Move into cell
            SG_mapped = gemmi.Fractional(*wrapped)

            # NOTE(review): the distance is taken between in-cell copies only, so a
            # maximum sitting just across a cell face from the atom is not counted
            # as close -- confirm whether periodic images should be considered.
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.linalg.norm((peak_pos - SG_mapped_orth).tolist()))

            # Get the nearest voxel value
            u = round(SG_mapped.x * real_grid.nu)
            v = round(SG_mapped.y * real_grid.nv)
            w = round(SG_mapped.z * real_grid.nw)
            peak_value.append(real_grid.get_value(u, v, w))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)  # 5 A cut-off, as in the column name

    is_this_sample = DK_bound['sample'] == pdbid
    DK_bound.loc[is_this_sample, 'lig_heavy_peak'] = log_peak
    DK_bound.loc[is_this_sample, 'is_highest_peak(<5A)'] = log_ismaxpeak

  0%|                                                                                                      | 0/24 [00:00<?, ?it/s]


OSError: [Errno 2] Failed to open /n/holyscratch01/hekstra_lab/phyllis/pipeline/data/pandda_input_models_refined_waters/PTP1B_y1009_pandda_input_reindexed_refine_001.mtz: No such file or directory

In [39]:
# Summary metric: average of the per-dataset ligand heavy-atom peak values.
print(DK_bound['lig_heavy_peak'].mean())

3.698671612028892


### Zmap mean peak value as metric

In [40]:
# Reset the two per-dataset result columns before scoring the Z-maps.
# Plain column assignment replaces the column outright, so the reset also
# restores a clean dtype; `.loc[:, col] = ...` would write into the existing
# column in place and keep the dtype left behind by the previous scoring run.
DK_bound['lig_heavy_peak'] = 0.0          # map value at the ligand heavy atoms (float)
# The scoring loop stores booleans here, so start from a boolean column rather
# than a float one (avoids an incompatible-dtype upcast on assignment).
DK_bound['is_highest_peak(<5A)'] = False

In [12]:
%%time
# For every dataset: read the Z-map, look up the map value at each ligand heavy
# atom (Cl/Br/S/I in residues named LIG) and all of its symmetry mates, and record
#   * lig_heavy_peak       -- the largest of those map values
#   * is_highest_peak(<5A) -- whether the global map maximum lies within 5 A of
#                             any of those positions
#
# NOTE(review): unlike the MTZ-based cells, x and z are swapped wherever a
# fractional coordinate is converted to or from a grid index. Presumably the
# CCP4 file stores its axes in Z,Y,X order and is used here as read -- confirm;
# gemmi's Ccp4Map.setup() can reorder the axes to X,Y,Z instead.
for pdbid in tqdm(DK_bound['sample']):

    zmap = gemmi.read_ccp4_map(phyllis_dir+f'pipeline/data/z_maps/PTP1B-y{pdbid}-z_map.native.ccp4')
    st = gemmi.read_structure(phyllis_dir+f'pipeline/data/bound_state_models/PTP1B-y{pdbid}_refmac_input.split.bound-state.pdb')

    real_grid = zmap.grid
    real_grid.normalize()  # rescales the map in place before values are compared

    sel = gemmi.Selection('[CL,Br,S,I]')
    sel_model = sel.copy_model_selection(st[0])
    lig_heavy_atoms = [cra for cra in sel_model.all() if cra.residue.name == 'LIG']
    if not lig_heavy_atoms:
        # Without this guard np.max() below fails with an opaque
        # "zero-size array" ValueError that does not say which dataset is at fault.
        raise ValueError(f'{pdbid}: no Cl/Br/S/I atoms found in residues named LIG')

    # Both of these depend only on the map, not on the atom, so compute them once.
    ops = list(real_grid.spacegroup.operations())
    a, b, c = np.unravel_index(real_grid.array.argmax(), real_grid.array.shape)
    tmp = real_grid.get_fractional(a, b, c)
    # x/z swapped on purpose (see the note at the top of this loop)
    peak_pos = st.cell.orthogonalize(gemmi.Fractional(tmp.z, tmp.y, tmp.x))

    dis_lists = []
    peak_values = []
    for cra in lig_heavy_atoms:
        atom_frac = st.cell.fractionalize(cra.atom.pos).tolist()

        dis_list = []    # distance from the global maximum to each symmetry mate
        peak_value = []  # map value at the voxel nearest to each symmetry mate
        for op in ops:
            wrapped = np.asarray(op.apply_to_xyz(atom_frac), dtype=float)
            wrapped = wrapped - np.floor(wrapped)  # Move into cell
            SG_mapped = gemmi.Fractional(*wrapped)

            # NOTE(review): the distance is taken between in-cell copies only, so a
            # maximum sitting just across a cell face from the atom is not counted
            # as close -- confirm whether periodic images should be considered.
            SG_mapped_orth = st.cell.orthogonalize(SG_mapped)
            dis_list.append(np.linalg.norm((peak_pos - SG_mapped_orth).tolist()))

            # Get the nearest voxel value (x/z swapped, matching peak_pos above)
            u = round(SG_mapped.z * real_grid.nu)
            v = round(SG_mapped.y * real_grid.nv)
            w = round(SG_mapped.x * real_grid.nw)
            peak_value.append(real_grid.get_value(u, v, w))

        dis_lists.append(dis_list)
        peak_values.append(peak_value)

    log_peak = np.max(peak_values)
    log_ismaxpeak = np.any(np.array(dis_lists) < 5.0)  # 5 A cut-off, as in the column name

    is_this_sample = DK_bound['sample'] == pdbid
    DK_bound.loc[is_this_sample, 'lig_heavy_peak'] = log_peak
    DK_bound.loc[is_this_sample, 'is_highest_peak(<5A)'] = log_ismaxpeak

  0%|                                                                                                      | 0/24 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] Failed to open /n/holyscratch01/hekstra_lab/phyllis/pipeline/data/z_maps/PTP1B-y1009-z_map.native.ccp4: No such file or directory

In [44]:
# Summary metric: average of the per-dataset ligand heavy-atom peak values.
print(DK_bound['lig_heavy_peak'].mean())

8.387225474302586
