In [1]:
import numpy as np
import pandas as pd
import os
import networkx as nx

max_resid = 168
df_graph_scan = pd.read_pickle('raw/common_scan_edge_matrix.pkl')
df_graph_scan['type'] = ['scan'] * len(df_graph_scan)

df_graph_mask = pd.read_pickle('raw/common_mask_edge_matrix.pkl')
df_graph_mask['type'] = ['mask'] * len(df_graph_mask)

df_graph = pd.concat((df_graph_scan, df_graph_mask))
df_graph = df_graph[df_graph.resid <= max_resid]
df_graph

Unnamed: 0,variant,resid,ligand,edge_matrix,type
0,SCAN_R73K,73,R73K,"[[0.0, 0.0, 69.01336996337025, 0.0, 0.0, 0.0, ...",scan
1,SCAN_E168Q,168,E168Q,"[[0.0, 0.0, 48.29074965943725, 0.0, 0.0, 0.0, ...",scan
2,SCAN_T87P,87,T87P,"[[0.0, 0.0, 165.74442129602028, 0.0, 0.0, 0.0,...",scan
3,SCAN_V45T,45,V45T,"[[0.0, 0.0, 45.24072294072312, 0.0, 0.0, 0.0, ...",scan
4,SCAN_K42V,42,K42V,"[[0.0, 0.0, 26.37481407481406, 0.0, 0.0, 0.0, ...",scan
...,...,...,...,...,...
500,MASK_C118I,118,C118I,"[[0.0, 0.0, 26.34578754578753, 0.0, 0.0, 0.0, ...",mask
501,MASK_L159I,159,L159I,"[[0.0, 0.0, 31.68347763347762, 0.0, 0.0, 0.0, ...",mask
502,MASK_T148D,148,T148D,"[[0.0, 0.0, 42.53288540572662, 0.0, 0.0, 0.0, ...",mask
503,MASK_S136E,136,S136E,"[[0.0, 0.0, 44.84164719844607, 0.0, 0.0, 0.0, ...",mask


In [7]:
# df_data = pd.read_pickle('raw/scan_data_analysis.pkl')
# df_data.inactive_rmsd.max(), df_data.active_rmsd.max()

inactive_rmsd_95 = df_data['inactive_rmsd'].quantile(0.125), df_data['inactive_rmsd'].quantile(0.975)
active_rmsd_95 = df_data['active_rmsd'].quantile(0.125), df_data['active_rmsd'].quantile(0.975)
inactive_rmsd_95, active_rmsd_95

((np.float64(2.1011303222724376), np.float64(3.2063116587386595)),
 (np.float64(1.354749224129218), np.float64(3.224838351493347)))

In [2]:
df_data = pd.read_pickle('raw/scan_data_analysis.pkl')
df_data['variant'] = df_data.apply(lambda x: f'{x.type.upper()}_{x.prot}', axis=1)
df_data = df_data.groupby('variant')[['dimred_inactive_dist', 'dimred_active_dist']].mean()

df_graph[['inactive_dist', 'active_dist']] = df_graph.apply(lambda x: df_data.loc[x.variant], axis=1)
# del df_data
df_data

Unnamed: 0_level_0,dimred_inactive_dist,dimred_active_dist
variant,Unnamed: 1_level_1,Unnamed: 2_level_1
MASK_A11A,28.236616,77.426291
MASK_A11D,30.099492,65.036279
MASK_A11G,29.174706,63.628269
MASK_A11S,30.255691,63.418236
MASK_A130A,28.767458,75.430238
...,...,...
SCAN_Y71W,29.362805,75.648048
SCAN_Y96A,30.154165,74.431864
SCAN_Y96F,23.898685,75.554255
SCAN_Y96L,24.765763,76.176459


In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_graph[['inactive_dist', 'active_dist']] = scaler.fit_transform(df_graph[['inactive_dist', 'active_dist']])
df_graph

Unnamed: 0,variant,resid,ligand,edge_matrix,type,inactive_dist,active_dist
0,SCAN_R73K,73,R73K,"[[0.0, 0.0, 69.01336996337025, 0.0, 0.0, 0.0, ...",scan,0.566575,0.151092
1,SCAN_E168Q,168,E168Q,"[[0.0, 0.0, 48.29074965943725, 0.0, 0.0, 0.0, ...",scan,0.260282,0.339742
2,SCAN_T87P,87,T87P,"[[0.0, 0.0, 165.74442129602028, 0.0, 0.0, 0.0,...",scan,0.813908,0.155051
3,SCAN_V45T,45,V45T,"[[0.0, 0.0, 45.24072294072312, 0.0, 0.0, 0.0, ...",scan,-0.827519,1.137934
4,SCAN_K42V,42,K42V,"[[0.0, 0.0, 26.37481407481406, 0.0, 0.0, 0.0, ...",scan,-0.008781,0.471074
...,...,...,...,...,...,...,...
500,MASK_C118I,118,C118I,"[[0.0, 0.0, 26.34578754578753, 0.0, 0.0, 0.0, ...",mask,-0.020004,-0.786967
501,MASK_L159I,159,L159I,"[[0.0, 0.0, 31.68347763347762, 0.0, 0.0, 0.0, ...",mask,0.115819,-1.450660
502,MASK_T148D,148,T148D,"[[0.0, 0.0, 42.53288540572662, 0.0, 0.0, 0.0, ...",mask,0.813749,-0.326148
503,MASK_S136E,136,S136E,"[[0.0, 0.0, 44.84164719844607, 0.0, 0.0, 0.0, ...",mask,0.319163,-0.564167


In [4]:
aas = 'ACDEFGHIKLMNPQRSTVWY'
aa_dict = {aa: np.array([1 if i == j else 0 for i in range(len(aas))]) for j, aa in enumerate(aas)}

wt_seq = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEK"
wt_x = np.array([aa_dict[aa] for aa in wt_seq[:max_resid]])
wt_x.shape

(168, 20)

In [5]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df_graph, test_size=0.2)
val_df, test_df = train_test_split(temp_df, test_size=0.5)

train_df, val_df, test_df

(        variant  resid ligand  \
 243   MASK_D30E     30   D30E   
 53    SCAN_S39D     39   S39D   
 22    SCAN_E76Q     76   E76Q   
 114   MASK_Q43K     43   Q43K   
 248   MASK_T20I     20   T20I   
 ..          ...    ...    ...   
 343   SCAN_G75T     75   G75T   
 131  MASK_F156Y    156  F156Y   
 1     MASK_T35S     35   T35S   
 488  SCAN_G151S    151  G151S   
 31    SCAN_I55V     55   I55V   
 
                                            edge_matrix  type  inactive_dist  \
 243  [[0.0, 0.0, 15.699636474636456, 0.0, 0.0, 0.0,...  mask       0.065894   
 53   [[0.0, 0.0, 74.90745087269875, 0.0, 0.0, 0.0, ...  scan       0.599674   
 22   [[0.0, 0.0, 42.764538418570815, 0.0, 0.0, 0.0,...  scan       0.676427   
 114  [[0.0, 0.0, 37.828717616993565, 0.0, 0.0, 0.0,...  mask       0.145620   
 248  [[0.0, 0.0, 45.88351753113166, 0.0, 0.0, 0.0, ...  mask      -0.379817   
 ..                                                 ...   ...            ...   
 343  [[0.0, 0.0, 46.211997102

In [6]:
train_df.to_pickle('processed/train_df.pkl')
val_df.to_pickle('processed/val_df.pkl')
test_df.to_pickle('processed/test_df.pkl')