### Imports

In [1]:
import pandas as pd
import reciprocalspaceship as rs
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import json
from scipy import stats
import random
import math
import gemmi
from sklearn.decomposition import PCA
import time
import sys
from scipy.stats import pearsonr
import pickle

### Ligand Cif Processing

In [2]:
ligand_cif = []

with open('PTP1B_DK/ligand_cif_to_dataset_mapping.txt') as f:
    lines = f.readlines()
    
for line in lines:
    dataset, ligands = line[:-1].split(":")
    ligands = list(filter(None,ligands.split(" ")))
    ligand_cif.append([dataset, ligands, 0 if len(ligands) > 0 else 1])
    
ligand_cif = pd.DataFrame(ligand_cif, columns=["Dataset", "Ligands", "Apo"])

apo_samples = ligand_cif[ligand_cif['Apo'] == 1]['Dataset'].tolist()
apo_samples = [sample.split('y')[1] for sample in apo_samples]

samples = ligand_cif['Dataset'].tolist()
samples = [sample.split('y')[1] for sample in samples]

In [3]:
long_names = {
    "chain"   : "Chain",
    "seqid"   : "SeqID",
    "residue" : "Residue",
    "name"    : "Atom Name",
    "dist"    : "Dist (Ã…)",
    "peak"    : "Peak Value",
    "peakz"   : "Peak Value (Z-score)",
    "score"   : "Peak Score",
    "scorez"  : "Peak Score (Z-score)",
    "cenx"    : "Centroid (x)",
    "ceny"    : "Centroid (y)",
    "cenz"    : "Centroid (z)",
    "coordx"  : "Coord (x)",
    "coordy"  : "Coord (y)",
    "coordz"  : "Coord (z)",
}

In [7]:
sample=rs.read_mtz('PTP1B_DK/recons_mtzs/PTP1B-y1288_mrflagsref_idxs_recons.mtz')[["F-obs-diff", "PHIFOFCWT"]].dropna()

sample_gemmi=sample.to_gemmi()
grid = sample_gemmi.transform_f_phi_to_map(
    "F-obs-diff", "PHIFOFCWT")

sigma_cutoff = 1.5
mean,sigma = np.mean(grid),np.std(grid)
cutoff = mean + sigma_cutoff * sigma

blobs = gemmi.find_blobs_by_flood_fill(grid, cutoff=0, min_volume=0, min_score=0, min_peak=0, negate=True)

In [14]:
structure = gemmi.read_pdb("PTP1B_DK/bound_state_models/PTP1B-y1288_refmac_input.split.bound-state.pdb")
cell = structure.cell
model = structure[0]
min_volume = 4.
min_score = 0.
min_peak = 4.
distance_cutoff = 4.
use_long_names = False
negate=False
sort_by_key='peakz'

ns = gemmi.NeighborSearch(model, structure.cell, distance_cutoff).populate()

peaks = []
for blob in blobs:
    #This is a list of weird pointer objects. It is safest to convert them `gemmi.CRA` objects (see below)
    marks = ns.find_atoms(blob.centroid)
    if len(marks) == 0:
        continue

    cra = dist = None
    for mark in marks:
        image_idx = mark.image_idx
        _cra = mark.to_cra(model)
        _dist = cell.find_nearest_pbc_image(blob.centroid, _cra.atom.pos, mark.image_idx).dist()
        if cra is None:
            dist = _dist
            cra  = _cra
        elif _dist < dist:
            dist = _dist
            cra  = _cra

    record = {
        "chain"   :    cra.chain.name,
        "seqid"   :    cra.residue.seqid.num,
        "residue" :    cra.residue.name,
        "atom"    :    cra.atom.name,
        "element" :    cra.atom.element.name,
        "dist"    :    dist,
        "peakz"   :    (blob.peak_value-mean)/sigma,
        "scorez"  :    (blob.score-mean)/sigma,
        "peak"    :    blob.peak_value,
        "score"   :    blob.score,
        "cenx"    :    blob.centroid.x,
        "ceny"    :    blob.centroid.y,
        "cenz"    :    blob.centroid.x,
        "coordx"  :    cra.atom.pos.x,
        "coordy"  :    cra.atom.pos.y,
        "coordz"  :    cra.atom.pos.z,
    }
    if negate:
        negative_keys = ['peak', 'peakz', 'score', 'scorez']
        for k in negative_keys:
            record[k] = -record[k]
    peaks.append(record)

out = pd.DataFrame.from_records(peaks)

In [15]:
out

Unnamed: 0,chain,seqid,residue,atom,element,dist,peakz,scorez,peak,score,cenx,ceny,cenz,coordx,coordy,coordz
0,A,96,TRP,HH2,H,1.43591,1.293533,2.617859,0.00122,0.002468,58.945969,21.929254,58.945969,-31.915,21.319,0.777
1,S,157,HOH,O,O,2.181812,0.513796,0.899343,0.000484,0.000848,-0.79884,71.282749,-0.79884,-47.299,-7.483,9.218
