## Imports / Helper Functions

### Imports

In [13]:
import pandas as pd
import reciprocalspaceship as rs
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import json
from scipy import stats
import random
import math
import gemmi
from sklearn.decomposition import PCA
import time
import sys
from scipy.stats import pearsonr
import pickle
# scaled - reconstructed

### Ligand Cif Processing

In [2]:
ligand_cif = []

with open('../PTP1B_DK/ligand_cif_to_dataset_mapping.txt') as f:
    lines = f.readlines()
    
for line in lines:
    dataset, ligands = line[:-1].split(":")
    ligands = list(filter(None,ligands.split(" ")))
    ligand_cif.append([dataset, ligands, 0 if len(ligands) > 0 else 1])
    
ligand_cif = pd.DataFrame(ligand_cif, columns=["Dataset", "Ligands", "Apo"])

apo_samples = ligand_cif[ligand_cif['Apo'] == 1]['Dataset'].tolist()
apo_samples = [sample.split('y')[1] for sample in apo_samples]

samples = ligand_cif['Dataset'].tolist()
samples = [sample.split('y')[1] for sample in samples]

### Moving to Cell

In [3]:
def move2cell(cartesian_coordinates, unit_cell, fractionalize=True):
    '''
    Move your points into a unitcell with translational vectors
    
    Parameters
    ----------
    cartesian_coordinates: array-like
        [N_points, 3], cartesian positions of points you want to move
        
    unit_cell, gemmi.UnitCell
        A gemmi unitcell instance
    
    fractionalize: boolean, default True
        If True, output coordinates will be fractional; Or will be cartesians
    
    Returns
    -------
    array-like, coordinates inside the unitcell
    '''
    o2f_matrix = np.array(unit_cell.fractionalization_matrix)
    frac_pos = np.dot(cartesian_coordinates, o2f_matrix.T) 
    frac_pos_incell = frac_pos % 1
    if fractionalize:
        return frac_pos_incell
    else:
        f2o_matrix = np.array(unit_cell.orthogonalization_matrix)
        return np.dot(frac_pos_incell, f2o_matrix.T)

## Finding Blobs

#### Verifying FFT

In [25]:
data = rs.read_mtz('../PTP1B_DK/recons_mtzs/PTP1B-y0049_mrflagsref_idxs_recons.mtz')[["F-obs-diff", "PHIFOFCWT"]].dropna()
print(data.spacegroup)
data2 = data.copy()
data2.spacegroup = gemmi.SpaceGroup('P 1')

sample_data = data.to_gemmi()
sample_data2 = data2.to_gemmi()

map1 = sample_data.transform_f_phi_to_map("F-obs-diff", "PHIFOFCWT", sample_rate=3)
map2 = sample_data2.transform_f_phi_to_map("F-obs-diff", "PHIFOFCWT", sample_rate=3)

# map1 == map2?

<gemmi.SpaceGroup("P 31 2 1")>


In [28]:
if (map1 == map2):
    print("hi")
else:
    print("Not same")

Not same


#### FFT + find_blobs_by_flood_fill

In [4]:
long_names = {
    "chain"   : "Chain",
    "seqid"   : "SeqID",
    "residue" : "Residue",
    "name"    : "Atom Name",
    "dist"    : "Dist (Ã…)",
    "peak"    : "Peak Value",
    "peakz"   : "Peak Value (Z-score)",
    "score"   : "Peak Score",
    "scorez"  : "Peak Score (Z-score)",
    "cenx"    : "Centroid (x)",
    "ceny"    : "Centroid (y)",
    "cenz"    : "Centroid (z)",
    "coordx"  : "Coord (x)",
    "coordy"  : "Coord (y)",
    "coordz"  : "Coord (z)",
}

In [48]:
sample=rs.read_mtz('../PTP1B_DK/recons_mtzs/PTP1B-y0049_mrflagsref_idxs_recons.mtz')[["F-obs-diff", "PHIFOFCWT"]].dropna()
structure = gemmi.read_pdb("../PTP1B_DK/bound_state_models/PTP1B-y0049_refmac_input.split.bound-state.pdb")
cell = structure.cell

sample_gemmi=sample.to_gemmi()
grid = sample_gemmi.transform_f_phi_to_map("F-obs-diff", "PHIFOFCWT", sample_rate=3)

# sigma_cutoff = 1.5
# mean,sigma = np.mean(grid),np.std(grid)
# cutoff = mean + sigma_cutoff * sigma

# minimum requirements
blobs = gemmi.find_blobs_by_flood_fill(grid, cutoff=0, min_volume=0, min_score=0, min_peak=0)

#### Sanity Check (ugly scratch work)

In [89]:
grid.set_unit_cell(cell)

In [90]:
grid.unit_cell

<gemmi.UnitCell(89.839, 89.839, 106.494, 90, 90, 120)>

In [92]:
i_, j_, k_ = 0, 0, 0
max_ = 0
for i in range(-90, 90):
    for j in range(-90, 90):
        for k in range(-144, 144):
            if grid.get_point(i, j, k).value > max_:
                max_ = grid.get_point(i, j, k).value
                i_, j_, k_ = i, j, k

In [93]:
i_, j_, k_

(-44, -26, -32)

In [71]:
grid.point_to_position(grid.get_point(-44, -26, -32))

<gemmi.Position(-30.9445, -22.4764, -23.6653)>

In [84]:
move2cell((-30.9445, -22.4764, -23.6653), cell, fractionalize=False)

array([13.975     , 55.32645625, 82.8287    ])

In [29]:
grid.point_to_position(grid.get_point(46, 64, 112))

(46, 64, 112)

In [95]:
grid.get_nearest_point(gemmi.Position(-30.9445, -22.4764, -23.6653))

<gemmi.FloatGridBase.Point (-44, -26, -32) -> 0.0122986>

In [57]:
grid.get_nearest_point(gemmi.Position(-30.9445, -22.4764, -23.6653))

<gemmi.FloatGridBase.Point (-44, -26, -32) -> 0.0122986>

In [10]:
cell

<gemmi.UnitCell(89.839, 89.839, 106.494, 90, 90, 120)>

In [11]:
grid.unit_cell

<gemmi.UnitCell(89.839, 89.839, 106.494, 90, 90, 120)>

#### Determining Blob Position

In [31]:
for blob in blobs:
    print("move2cell:", move2cell([blob.peak_pos[0], blob.peak_pos[1], blob.peak_pos[2]], cell, fractionalize=False))
    print("position as seen on coot:", blob.peak_pos)

move2cell: [13.10152083 60.51333264 80.9798125 ]
position as seen on coot: <gemmi.Position(-31.818, -17.2895, -25.5142)>
move2cell: [47.41502778 60.51333264  1.66396875]
position as seen on coot: <gemmi.Position(47.415, 60.5133, 1.66397)>
move2cell: [33.06574306 21.61190451  3.3279375 ]
position as seen on coot: <gemmi.Position(33.0657, 21.6119, 3.32794)>
move2cell: [-18.09257639  50.78797561   3.88259375]
position as seen on coot: <gemmi.Position(-18.0926, 50.788, 3.88259)>
move2cell: [64.88372222 28.09547587  7.21053125]
position as seen on coot: <gemmi.Position(64.8837, 28.0955, 7.21053)>
move2cell: [-3.11940972 20.53130929  8.8745    ]
position as seen on coot: <gemmi.Position(-3.11941, 20.5313, 8.8745)>
move2cell: [ 6.86270139 74.56107057 16.6396875 ]
position as seen on coot: <gemmi.Position(6.8627, 74.5611, 16.6397)>


#### Desired Ligand Positions

In [38]:
# LIGAND POSITIONS
for x in model[1][0]:
    # print(cell.fractionalize(x.pos))
    print("move2cell:", move2cell([x.pos[0], x.pos[1], x.pos[2]], cell, fractionalize=False))
    print("position as seen on coot:", x)

move2cell: [59.724 28.115 93.439]
position as seen on coot: <gemmi.Atom C01 at (-30.1, 28.1, -13.1)>
move2cell: [60.669 25.741 92.97 ]
position as seen on coot: <gemmi.Atom C03 at (-29.2, 25.7, -13.5)>
move2cell: [60.469 24.855 94.229]
position as seen on coot: <gemmi.Atom C04 at (-29.4, 24.9, -12.3)>
move2cell: [58.285 24.485 93.198]
position as seen on coot: <gemmi.Atom C06 at (-31.6, 24.5, -13.3)>
move2cell: [58.187 26.017 93.382]
position as seen on coot: <gemmi.Atom C07 at (-31.7, 26.0, -13.1)>
move2cell: [59.287 22.62  94.677]
position as seen on coot: <gemmi.Atom C08 at (-30.6, 22.6, -11.8)>
move2cell: [58.155 20.432 95.128]
position as seen on coot: <gemmi.Atom C11 at (-31.7, 20.4, -11.4)>
move2cell: [58.155 19.275 94.365]
position as seen on coot: <gemmi.Atom C12 at (-31.7, 19.3, -12.1)>
move2cell: [58.132 18.004 94.94 ]
position as seen on coot: <gemmi.Atom C13 at (-31.7, 18.0, -11.6)>
move2cell: [58.107 17.795 96.343]
position as seen on coot: <gemmi.Atom C14 at (-31.7, 17.8

#### Finds Atoms Near Blobs

In [36]:
model = structure[0]
min_volume = 4.
min_score = 0.
min_peak = 4.
distance_cutoff = 4.
use_long_names = False
negate=False
sort_by_key='peakz'

ns = gemmi.NeighborSearch(model, structure.cell, distance_cutoff).populate()

peaks = []
for blob in blobs:
    #This is a list of weird pointer objects. It is safest to convert them `gemmi.CRA` objects (see below)
    marks = ns.find_atoms(blob.centroid)
    if len(marks) == 0:
        continue

    cra = dist = None
    for mark in marks:
        image_idx = mark.image_idx
        _cra = mark.to_cra(model)
        _dist = cell.find_nearest_pbc_image(blob.centroid, _cra.atom.pos, mark.image_idx).dist()
        if cra is None:
            dist = _dist
            cra  = _cra
        elif _dist < dist:
            dist = _dist
            cra  = _cra

    record = {
        "chain"   :    cra.chain.name,
        "seqid"   :    cra.residue.seqid.num,
        "residue" :    cra.residue.name,
        "atom"    :    cra.atom.name,
        "element" :    cra.atom.element.name,
        "dist"    :    dist,
        "peakz"   :    (blob.peak_value-mean)/sigma,
        "scorez"  :    (blob.score-mean)/sigma,
        "peak"    :    blob.peak_value,
        "score"   :    blob.score,
        "cenx"    :    blob.centroid.x,
        "ceny"    :    blob.centroid.y,
        "cenz"    :    blob.centroid.x,
        "coordx"  :    cra.atom.pos.x,
        "coordy"  :    cra.atom.pos.y,
        "coordz"  :    cra.atom.pos.z,
    }
    if negate:
        negative_keys = ['peak', 'peakz', 'score', 'scorez']
        for k in negative_keys:
            record[k] = -record[k]
    peaks.append(record)

out = pd.DataFrame.from_records(peaks)

In [37]:
out

Unnamed: 0,chain,seqid,residue,atom,element,dist,peakz,scorez,peak,score,cenx,ceny,cenz,coordx,coordy,coordz
0,A,270,SER,HA,H,1.215047,0.369537,0.22139,0.000395,0.000237,33.191085,21.402973,33.191085,-57.251,20.35,2.948
1,A,82,ILE,HD12,H,1.219874,0.269704,0.07961,0.000288,8.5e-05,-17.948696,50.934912,-17.948696,-37.856,10.399,-3.666
2,S,61,HOH,O,O,1.75134,0.538177,0.170032,0.000576,0.000182,64.912721,27.874912,64.912721,-24.655,29.579,7.51
3,A,36,LYS,HB3,H,1.959714,0.524576,0.349088,0.000561,0.000373,6.555321,74.625005,6.555321,-39.231,-1.959,15.189
