In [1]:
import reciprocalspaceship as rs
import glob
import pandas as pd
import os
import numpy as np
import gemmi

This Jupyter notebook processes the MTZ files in the folder `20221007_unscaled_unmerged`, which are the outputs of `hkl2mtz.sh`. All datasets must share the same indexing orientation, so we reindex the datasets that are not well correlated with the references, `apo_edit.pdb` or `7kqo.pdb`. Additionally, some outliers are rejected. Finally, we one-hot encode each dataset.

There is an indexing ambiguity in space group P4$_3$, so some of the datasets need to be reindexed. We first flip the reference dataset so that it correlates well with the reference model.

In [2]:
def mtz_corr(a,d,key="IOBS"):
    """Pearson correlation of column `key` between two datasets.

    The two datasets are joined on the Miller indices (H, K, L) with the
    default merge mode, and the correlation is computed between the two
    resulting copies of `key`.

    Parameters
    ----------
    a : dataset
        Right-hand side of the merge; its column appears as f"{key}_y".
    d : dataset
        Left-hand side of the merge; its column appears as f"{key}_x".
    key : str
        Name of the column to compare. It must be present in both datasets
        so that the merge produces the `_x` / `_y` suffixed pair.

    Returns
    -------
    The correlation coefficient between the `_x` and `_y` columns.
    """
    merged = d.merge(a, on=["H", "K", "L"], check_isomorphous=False)
    left_col, right_col = f"{key}_x", f"{key}_y"
    # Correlate only the two columns that are read back. Building the full
    # matrix over every merged column is wasted work and fails as soon as
    # one of the other columns is not numeric.
    return merged[[left_col, right_col]].corr()[left_col][right_col]

def mtz_flip(mtz_path):
    """Write a re-indexed copy of an MTZ file next to the original.

    The file at `mtz_path` is read, the operator "y,x,-z" is applied, the
    result is mapped with `hkl_to_asu()`, and it is written to
    "<mtz_path without extension>_flip.mtz". Nothing is returned.
    """
    stem = os.path.splitext(mtz_path)[0]
    flipped = rs.read_mtz(mtz_path).apply_symop("y,x,-z").hkl_to_asu()
    # Left disabled, as in the original. The original comment says that
    # without it rs.diffmap will throw a non-isomorphous error.
    # flipped.spacegroup = gemmi.SpaceGroup("P 4")
    flipped.write_mtz(stem + "_flip.mtz")
    
def transfer_phases(ref, mtz):
    """Return `mtz` with the reference's `FreeR_flag` column attached.

    Despite the name, the only column copied from `ref` is "FreeR_flag".
    The join is a left join on the Miller indices (H, K, L), so every row
    of `mtz` is kept; rows with no match in `ref` get a missing value.

    Parameters
    ----------
    ref : dataset providing the "FreeR_flag" column.
    mtz : dataset that receives the column.

    Returns
    -------
    A new dataset; `mtz` itself is not modified.
    """
    # merge() returns a new object. The original discarded that result and
    # returned `mtz` unchanged, which made the function a no-op.
    return mtz.merge(ref["FreeR_flag"], on=["H", "K", "L"], how="left")

In [3]:
# Reference dataset: load it and keep only reflections with IOBS below 2e5.
a = rs.read_mtz("./20221007_unscaled_unmerged/reference/out.mtz")
a = a[a["IOBS"] < 2e5]

# Inflate the sigmas by sqrt(2): this is due to the reference mtz being used
# twice in the scaling and merging.
a["SIGMA"] = a["SIGMA"] * np.sqrt(2)

# Disabled in the original and kept disabled here:
#   a = a.rename(columns={"XD":"XCAL","YD":"YCAL","ZD":"ZCAL"})
#   a = a.apply_symop("y,x,-z")

# One-hot encoding of the reference dataset: every ds_<i> flag is 0 and
# ds_ref is 1, each stored as an MTZInt column.
for i in range(32):
    column = f"ds_{i}"
    a[column] = 0
    a[column] = a[column].astype("MTZInt")
a["ds_ref"] = 1
a["ds_ref"] = a["ds_ref"].astype("MTZInt")

a.write_mtz("./20221007_unscaled_unmerged/reference/out_corrected_root2_ohp.mtz")

We then remove outliers from the holo datasets by discarding reflections whose intensity is 2e5 or higher.

In [25]:
# Outlier removal for the holo datasets: for every UCSF-P*/out.mtz under
# ./20221007_unscaled_unmerged, keep only reflections with IOBS below 2e5 and
# write the filtered copy next to the input.
file_pattern = './20221007_unscaled_unmerged/UCSF-P*/out.mtz'
# Sorted so that `files` has a deterministic order no matter which cell
# defined it last; the one-hot cell derives dataset indices from this order.
files = sorted(glob.glob(file_pattern))

for file in files:
    name,_=os.path.splitext(file)
    t = rs.read_mtz(file)
    t = t[t["IOBS"]<2e5]
    # write_mtz is called for its side effect only. The original rebound `t`
    # to its return value, which threw away the filtered dataset.
    # NOTE(review): the output name is "<dir>/outcut.mtz" (no underscore),
    # unlike the "_cut_flip" / "_ohp" suffixes used by the later cells, and
    # no later cell reads it. Confirm whether "_cut.mtz" was intended.
    t.write_mtz(name+"cut.mtz")

We flip the fragment-hit datasets if their correlation with the unflipped reference is low. We then check that the Pearson correlation is high; indeed, it is greater than 0.9 for all the datasets after flipping.

In [None]:
# Reference dataset used by the correlation check in the next cell.
# NOTE(review): no cell in this notebook writes out_corrected.mtz (the
# reference cell writes out_corrected_root2_ohp.mtz). Confirm where this
# file comes from before running on a fresh checkout.
a = rs.read_mtz("./20221007_unscaled_unmerged/reference/out_corrected.mtz")

# Sorted list of the per-dataset input files; later cells rely on this order.
file_pattern = './20221007_unscaled_unmerged/UCSF-P*/out.mtz'
files = sorted(glob.glob(file_pattern))

In [3]:
# For each dataset: drop reflections with IOBS >= 2e5, pick the indexing
# choice that correlates with the reference `a`, and write the result to
# "<input without extension>_cut_flip.mtz".
for in_str in files:
    name = os.path.splitext(in_str)[0]
    t = rs.read_mtz(in_str)
    t = t[t["IOBS"] < 2e5]

    # The "y,x,-z" operator is applied first and the correlation measured.
    t = t.apply_symop("y,x,-z").hkl_to_asu()
    corr = mtz_corr(a, t)

    # If that choice correlates poorly (< 0.8), the same operator is applied
    # a second time and the correlation is measured again.
    if corr < 0.8:
        t = t.apply_symop("y,x,-z").hkl_to_asu()
        corr = mtz_corr(a, t)

    t.write_mtz(name + "_cut_flip.mtz")

    # NOTE(review): the printed value is sqrt(corr), not corr itself. A
    # disabled print in the original reported the raw coefficient. Confirm
    # which quantity the "greater than 0.9" statement in the text refers to.
    print(f"{np.sqrt(corr):0.3f},",end="")

We then add a one-hot encoding of the dataset index to each file.

In [16]:
# One-hot encode the first 32 datasets. Dataset number `ds_num` (its position
# in `files`) gets ds_<ds_num> = 1, every other ds_<i> = 0, and ds_ref = 0.
# All flags are stored as MTZInt columns, and the result is written to
# "<input without extension>_ohp.mtz".
for ds_num, in_str in enumerate(files[:32]):
    name = os.path.splitext(in_str)[0]
    t = rs.read_mtz(name + "_cut_flip.mtz")

    # These files are not the reference, so the reference flag is 0.
    t["ds_ref"] = 0
    t["ds_ref"] = t["ds_ref"].astype("MTZInt")

    for i in range(32):
        column = f"ds_{i}"
        t[column] = (i == ds_num)
        t[column] = t[column].astype("MTZInt")

    print(f"ds_{ds_num},",end="")
    t.write_mtz(name + "_ohp.mtz")

ds_0,ds_1,ds_2,ds_3,ds_4,ds_5,ds_6,ds_7,ds_8,ds_9,ds_10,ds_11,ds_12,ds_13,ds_14,ds_15,ds_16,ds_17,ds_18,ds_19,ds_20,ds_21,ds_22,ds_23,ds_24,ds_25,ds_26,ds_27,ds_28,ds_29,ds_30,ds_31,

In [None]:
# Spot check: show which input path sits at index 14 of `files`.
files[14]

'../20221007_unscaled_unmerged/UCSF-P0178/out.mtz'

In [None]:
# Spot check: read back the one-hot-encoded file for UCSF-P0178 and display
# its ds_14 column.
rs.read_mtz('./20221007_unscaled_unmerged/UCSF-P0178/out_ohp.mtz')["ds_14"]

H   K    L 
23  -78  13   1 
25  -78  11   1 
26  -78  11   1 
         10   1 
27  -78  11   1 
              ..
9   86   -2   1 
10  86   1    1 
         0    1 
         0    1 
         -1   1 
Name: ds_14, Length: 1023765, dtype: MTZInt

In [None]:
# Inspect the column names of whatever dataset `t` was last bound to by an
# earlier cell.
t.columns

In [5]:
# NOTE(review): this overwrites ds_0 on the in-memory `t` only; no later
# cell writes `t` back to disk. It looks like leftover scratch work, so
# confirm before relying on it.
t["ds_0"]=0

From here on, we use `careless` together with the `slurm-dw-array-grid.sh` script in `./careless_runs` to scale and merge the `out_ohp` datasets.