## This notebook preprocesses the data used to train random forest models that predict protein stability from biophysical features. The models are used to identify Rocklin designs that are unexpectedly unstable. These designs will be subjected to exhaustive single-site mutagenesis to assess whether their stability can be rescued, and to inform the engineering of new features and/or improvements to Rosetta's energy function.

In [1]:
import os
import json
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.neighbors import KDTree
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestRegressor

## Preprocess original dataframes and save aggregated dataframes for model training

1. **Select original Rocklin designs, and identify design secondary structure topologies**
2. **Combine stability scores and Rosetta structural/biophysical metrics**
3. **Remove designs with low confidence experimental stability scores**
4. **Standardize features and remove features with missing data for model training**

#### **Due to their large size, the original dataframes are not included in this repo, but they are available upon request.**

In [None]:
# Load the stability scores and metadata dataframes for all designs
metadata_file = 'data/metadata/protein_groupings_by_uw.v1.metadata.csv'
aggrdata_file = 'data/aggregated_data/all_libs_cleaned.v4.aggregated_data.csv'
stabdata_file = 'data/experimental_stability_scores/Rocklin.v6.experimental_stability_scores.csv'

combined_df = pd.read_csv(aggrdata_file, comment='#', low_memory=False)
# Keep a copy of the dataset label as read from the file, before relabeling.
combined_df['original_dataset'] = combined_df['dataset']

# Relabel rows of the 'topology_mining' dataset(s) according to their description.
# - na=False makes rows with a missing dataset/description a plain non-match,
#   so the mask handed to .loc is strictly boolean (no NaN entries).
# - regex=False matches the needles as literal substrings.
# The dataset mask is deliberately recomputed for the second assignment: rows
# already relabeled 'longxing_untested' no longer contain 'topology_mining',
# so the first rule takes precedence when a description matches both.
combined_df.loc[
    combined_df['dataset'].str.contains('topology_mining', na=False, regex=False)
    & combined_df['description'].str.contains('from_Longxing', na=False, regex=False),
    'dataset',
] = 'longxing_untested'
combined_df.loc[
    combined_df['dataset'].str.contains('topology_mining', na=False, regex=False)
    & combined_df['description'].str.contains('topology_mining', na=False, regex=False),
    'dataset',
] = 'topmining_untested'
combined_df.set_index(['dataset', 'topology'], inplace=True)

# Metadata table: its 'name' column is exposed as 'topology' so it can be
# joined on the same (dataset, topology) key as the aggregated data.
topology_group_df = pd.read_csv(metadata_file, comment='#', low_memory=False)
topology_group_df['topology'] = topology_group_df['name']
topology_group_df.drop(columns=['name'], inplace=True)
topology_group_df.set_index(['dataset', 'topology'], inplace=True)

# Left join: every aggregated-data row is kept, with or without metadata.
combined_df = combined_df.join(topology_group_df, how='left')
combined_df.reset_index(inplace=True)

# Keep only the identifier columns and 'stabilityscore_cnn_calibrated',
# indexed by (dataset, name) for the joins performed in later cells.
combined_df = combined_df[['dataset', 'name', 'stabilityscore_cnn_calibrated']]
combined_df.set_index(['dataset', 'name'], inplace=True)

In [None]:
structdata_file = 'data/structural_metrics/Rocklin.v5.structural_metrics.csv'

# Structural metrics for the designs. The topology label is taken to be the
# part of each design name that precedes the first underscore.
rocklin_df = pd.read_csv(structdata_file, comment='#', low_memory=False)
rocklin_df['topology'] = rocklin_df['name'].map(lambda design: design.partition('_')[0])
rocklin_df['original_dataset'] = rocklin_df['dataset']
rocklin_df = rocklin_df.set_index(['dataset', 'topology'])

# Metadata table keyed by (dataset, topology); its 'name' column is exposed
# under the name 'topology' so the keys line up with rocklin_df.
topology_group_df = (
    pd.read_csv(metadata_file, comment='#', low_memory=False)
    .assign(topology=lambda meta: meta['name'])
    .drop(columns=['name'])
    .set_index(['dataset', 'topology'])
)

# Attach the metadata (left join keeps every structural-metrics row), then
# re-key the table by (dataset, name).
rocklin_df = (
    rocklin_df.join(topology_group_df, how='left')
    .reset_index()
    .set_index(['dataset', 'name'])
)

# Bring in the columns of combined_df for matching (dataset, name) keys and
# return to a flat table with a default integer index.
combined_df = rocklin_df.join(combined_df, how='left').reset_index()

In [None]:
# Discard designs whose EC50 confidence interval is wider than 2 EC50 units
# in either of the two conditions (suffixes '_t' and '_c';
# NOTE(review): presumably trypsin / chymotrypsin -- confirm).
ci_df = pd.read_csv(
    stabdata_file,
    comment='#',
    usecols=[
        'dataset',
        'name',
        'ec50_95ci_lbound_t',
        'ec50_95ci_ubound_t',
        'ec50_95ci_lbound_c',
        'ec50_95ci_ubound_c',
    ],
)

# Interval width per condition (upper bound minus lower bound) ...
for suffix in ('t', 'c'):
    ci_df['ec50_ci_span_' + suffix] = (
        ci_df['ec50_95ci_ubound_' + suffix] - ci_df['ec50_95ci_lbound_' + suffix]
    )
del suffix  # do not leave the loop variable behind in the notebook namespace

# ... and the wider of the two widths for each design.
ci_df['ec50_ci_span'] = ci_df[['ec50_ci_span_t', 'ec50_ci_span_c']].max(axis=1)

# Drop the rows whose widest interval exceeds the threshold.
ci_df = ci_df.drop(index=ci_df.index[ci_df['ec50_ci_span'] > 2])

# Default (inner) merge: only designs still present in ci_df remain.
combined_df = combined_df.merge(ci_df, on=['dataset', 'name'])

In [None]:
# specify the feature columns to use in the random forest models
feature_cols = ['AlaCount', 'T1_absq', 'T1_netq', 'Tend_absq', 'Tend_netq', 'Tminus1_absq', 'Tminus1_netq', 'abego_res_profile', 'abego_res_profile_penalty', 'avg_all_frags', 'avg_best_frag', 'bb', 'buns_bb_heavy', 'buns_nonheavy', 'buns_sc_heavy', 'buried_minus_exposed', 'buried_np', 'buried_np_AFILMVWY', 'buried_np_AFILMVWY_per_res', 'buried_np_per_res', 'buried_over_exposed', 'chymo_cut_sites', 'chymo_with_LM_cut_sites', 'contact_all', 'contact_core_SASA', 'contact_core_SCN', 'contig_not_hp_avg', 'contig_not_hp_avg_norm', 'contig_not_hp_internal_max', 'contig_not_hp_max', 'degree', 'dslf_fa13', 'entropy', 'exposed_hydrophobics', 'exposed_np_AFILMVWY', 'exposed_polars', 'exposed_total', 'fa_atr', 'fa_atr_per_res', 'fa_dun_dev', 'fa_dun_rot', 'fa_dun_semi', 'fa_elec', 'fa_intra_atr_xover4', 'fa_intra_elec', 'fa_intra_rep_xover4', 'fa_intra_sol_xover4', 'fa_rep', 'fa_rep_per_res', 'fa_sol', 'frac_helix', 'frac_loop', 'frac_sheet', 'fxn_exposed_is_np', 'hbond_bb_sc', 'hbond_lr_bb', 'hbond_lr_bb_per_sheet', 'hbond_sc', 'hbond_sr_bb', 'hbond_sr_bb_per_helix', 'helix_sc', 'holes', 'hphob_sc_contacts', 'hphob_sc_degree', 'hxl_tors', 'hydrophobicity', 'largest_hphob_cluster', 'lk_ball', 'lk_ball_bridge', 'lk_ball_bridge_uncpl', 'lk_ball_iso', 'loop_sc', 'mismatch_probability', 'n_charged', 'n_hphob_clusters', 'n_hydrophobic', 'n_hydrophobic_noA', 'n_polar_core', 'n_res', 'nearest_chymo_cut_to_Cterm', 'nearest_chymo_cut_to_Nterm', 'nearest_chymo_cut_to_term', 'nearest_tryp_cut_to_Cterm', 'nearest_tryp_cut_to_Nterm', 'nearest_tryp_cut_to_term', 'net_atr_net_sol_per_res', 'net_atr_per_res', 'net_sol_per_res', 'netcharge', 'nres', 'nres_helix', 'nres_loop', 'nres_sheet', 'omega', 'one_core_each', 'p_aa_pp', 'pack', 'percent_core_SASA', 'percent_core_SCN', 'pro_close', 'rama_prepro', 'ref', 'res_count_core_SASA', 'res_count_core_SCN', 'score_per_res', 'ss_contributes_core', 'ss_sc', 'sum_best_frags', 'total_score', 'tryp_cut_sites', 'two_core_each', 'worst6frags', 
'worstfrag', 'freq_HM', 'freq_IG', 'n_GR_3d_contacts_5A', 'avg_all_frags_in_E', 'freq_LS', 'n_AI_3d_contacts_3A', 'freq_KF', 'freq_ME', 'freq_FE', 'n_CQ_3d_contacts_3A', 'surface_freq_R', 'freq_HP', 'freq_SS', 'freq_QR', 'n_DW_3d_contacts_3A', 'n_CF_3d_contacts_5A', 'freq_F', 'freq_FA', 'n_KL_3d_contacts_3A', 'n_DN_3d_contacts_5A', 'freq_RA', 'n_DS_3d_contacts_3A', 'freq_GD', 'freq_LN', 'buried_npsa_surface', 'n_TW_3d_contacts_3A', 'n_IQ_3d_contacts_3A', 'freq_AA', 'freq_LG', 'freq_LL', 'freq_QI', 'freq_PF', 'n_DG_3d_contacts_3A', 'percent_polar_core', 'n_EV_3d_contacts_5A', 'freq_TA', 'n_IT_3d_contacts_3A', 'n_DG_3d_contacts_5A', 'n_GM_3d_contacts_5A', 'freq_KT', 'n_NY_3d_contacts_5A', 'surface_freq_L', 'n_FY_3d_contacts_5A', 'percent_hydrophobic_AFILMVWY', 'freq_Q', 'freq_IW', 'n_AK_3d_contacts_5A', 'n_EQ_3d_contacts_3A', 'freq_VR', 'n_SS_3d_contacts_5A', 'n_GK_3d_contacts_5A', 'percent_hydrophobic', 'freq_RD', 'freq_HL', 'boundary_freq_R', 'freq_GW', 'helices_freq_I', 'n_EL_3d_contacts_3A', 'n_GW_3d_contacts_5A', 'n_AH_3d_contacts_3A', 'n_QQ_3d_contacts_5A', 'freq_PR', 'freq_MR', 'n_AF_3d_contacts_5A', 'n_DY_3d_contacts_3A', 'freq_TK', 'freq_HE', 'n_TY_3d_contacts_3A', 'freq_AG', 'freq_CY', 'n_EI_3d_contacts_5A', 'n_MQ_3d_contacts_3A', 'freq_HG', 'freq_FV', 'min_energy_for_5mers', 'strands_freq_H', 'boundary_freq_Q', 'n_LL_3d_contacts_5A', 'n_VY_3d_contacts_3A', 'freq_AS', 'n_GY_3d_contacts_3A', 'freq_VL', 'boundary_freq_Y', 'freq_SK', 'n_FP_3d_contacts_5A', 'freq_AD', 'n_EW_3d_contacts_3A', 'n_DF_3d_contacts_3A', 'freq_GQ', 'freq_QN', 'freq_KI', 'n_DP_3d_contacts_3A', 'n_KW_3d_contacts_3A', 'freq_LF', 'freq_AI', 'n_ES_3d_contacts_3A', 'freq_TS', 'freq_FD', 'freq_MY', 'boundary_freq_A', 'n_PY_3d_contacts_5A', 'n_IL_3d_contacts_5A', 'freq_HR', 'n_NQ_3d_contacts_3A', 'freq_CP', 'freq_EK', 'core_freq_M', 'n_HV_3d_contacts_3A', 'freq_ST', 'freq_ES', 'freq_S', 'freq_YY', 'n_FQ_3d_contacts_3A', 'freq_IQ', 'n_EK_3d_contacts_5A', 'boundary_freq_F', 'freq_DF', 'freq_YF', 
'n_CV_3d_contacts_5A', 'n_KV_3d_contacts_5A', 'freq_WF', 'freq_PY', 'n_HR_3d_contacts_3A', 'freq_DV', 'strands_freq_G', 'n_MT_3d_contacts_5A', 'helices_freq_N', 'n_FV_3d_contacts_3A', 'max_energy_of_3A_neighborhoods', 'freq_PN', 'n_CE_3d_contacts_3A', 'n_NW_3d_contacts_5A', 'n_RW_3d_contacts_3A', 'n_MY_3d_contacts_3A', 'n_AR_3d_contacts_3A', 'n_EP_3d_contacts_5A', 'n_KM_3d_contacts_3A', 'freq_CI', 'freq_AP', 'min_energy_for_4mers', 'freq_DD', 'n_KP_3d_contacts_5A', 'n_DE_3d_contacts_5A', 'n_WW_3d_contacts_3A', 'n_HL_3d_contacts_3A', 'n_NS_3d_contacts_3A', 'n_CM_3d_contacts_3A', 'n_CV_3d_contacts_3A', 'freq_IC', 'helices_freq_W', 'n_AV_3d_contacts_5A', 'freq_YE', 'n_VV_3d_contacts_3A', 'avg_energy_for_2mers', 'n_HI_3d_contacts_5A', 'n_IV_3d_contacts_3A', 'n_HW_3d_contacts_5A', 'n_EE_3d_contacts_3A', 'freq_MN', 'freq_HA', 'n_FW_3d_contacts_5A', 'n_AD_3d_contacts_3A', 'freq_EH', 'freq_AM', 'n_LR_3d_contacts_5A', 'n_KL_3d_contacts_5A', 'freq_LR', 'freq_MV', 'n_HT_3d_contacts_5A', 'freq_AT', 'n_KV_3d_contacts_3A', 'n_PS_3d_contacts_5A', 'freq_WG', 'strands_freq_M', 'freq_DT', 'freq_CK', 'freq_GC', 'surface_freq_Y', 'n_MV_3d_contacts_5A', 'n_MM_3d_contacts_5A', 'freq_GE', 'n_GK_3d_contacts_3A', 'n_PP_3d_contacts_5A', 'freq_AF', 'freq_GR', 'freq_VG', 'n_TV_3d_contacts_3A', 'buried_npsa_FAMILYVW', 'n_HM_3d_contacts_5A', 'freq_KN', 'n_ER_3d_contacts_3A', 'n_GG_3d_contacts_5A', 'freq_T', 'freq_TP', 'freq_I', 'freq_DK', 'n_LV_3d_contacts_5A', 'freq_GN', 'surface_freq_C', 'n_SV_3d_contacts_5A', 'n_RV_3d_contacts_5A', 'n_FT_3d_contacts_5A', 'n_HW_3d_contacts_3A', 'freq_SG', 'freq_R', 'n_AT_3d_contacts_5A', 'n_LT_3d_contacts_3A', 'n_VW_3d_contacts_3A', 'freq_MQ', 'freq_IR', 'freq_RL', 'n_GQ_3d_contacts_3A', 'freq_RC', 'freq_GS', 'n_CM_3d_contacts_5A', 'n_VW_3d_contacts_5A', 'freq_CS', 'n_IN_3d_contacts_5A', 'freq_VQ', 'freq_DP', 'freq_MC', 'n_QS_3d_contacts_5A', 'freq_RP', 'freq_YK', 'n_TY_3d_contacts_5A', 'freq_G', 'n_YY_3d_contacts_5A', 'n_AS_3d_contacts_3A', 
'buried_psa_core', 'freq_NE', 'freq_NI', 'n_FR_3d_contacts_5A', 'strands_freq_E', 'n_AE_3d_contacts_5A', 'n_CR_3d_contacts_5A', 'freq_CR', 'freq_EE', 'n_FF_3d_contacts_5A', 'freq_IV', 'freq_MF', 'buried_npsa_core', 'n_DP_3d_contacts_5A', 'n_EG_3d_contacts_5A', 'n_NV_3d_contacts_3A', 'freq_FQ', 'n_AR_3d_contacts_5A', 'freq_RQ', 'freq_CA', 'freq_YD', 'n_LP_3d_contacts_3A', 'freq_HW', 'n_EY_3d_contacts_3A', 'freq_Y', 'percent_aromatic_no_his', 'freq_SN', 'n_QT_3d_contacts_5A', 'freq_AH', 'n_FS_3d_contacts_3A', 'freq_TR', 'n_KM_3d_contacts_5A', 'strands_freq_A', 'freq_GY', 'surface_freq_G', 'surface_freq_M', 'freq_TT', 'min_energy_for_2mers', 'n_GY_3d_contacts_5A', 'n_FF_3d_contacts_3A', 'buried_npsa_boundary', 'freq_QV', 'n_FR_3d_contacts_3A', 'n_HY_3d_contacts_3A', 'n_DQ_3d_contacts_5A', 'n_KT_3d_contacts_5A', 'freq_NV', 'freq_DI', 'n_RY_3d_contacts_5A', 'n_TT_3d_contacts_5A', 'freq_WY', 'freq_HV', 'n_AY_3d_contacts_3A', 'n_KN_3d_contacts_5A', 'freq_QW', 'freq_KA', 'freq_MM', 'n_RT_3d_contacts_3A', 'freq_GT', 'freq_M', 'n_PV_3d_contacts_5A', 'n_MM_3d_contacts_3A', 'freq_CD', 'n_DI_3d_contacts_5A', 'n_NY_3d_contacts_3A', 'percent_hydrophobic_surface', 'strands_freq_R', 'freq_VA', 'freq_GG', 'n_EN_3d_contacts_3A', 'n_EW_3d_contacts_5A', 'n_FH_3d_contacts_3A', 'freq_YR', 'core_freq_T', 'core_freq_C', 'freq_FT', 'freq_RT', 'n_CC_3d_contacts_5A', 'freq_EI', 'freq_EA', 'n_QQ_3d_contacts_3A', 'freq_NP', 'n_LT_3d_contacts_5A', 'n_RR_3d_contacts_3A', 'n_HV_3d_contacts_5A', 'surface_freq_K', 'freq_LK', 'n_RW_3d_contacts_5A', 'freq_VI', 'n_HN_3d_contacts_3A', 'freq_TF', 'n_MR_3d_contacts_3A', 'n_GL_3d_contacts_3A', 'n_IW_3d_contacts_3A', 'freq_VS', 'freq_QH', 'freq_YT', 'freq_PC', 'n_DM_3d_contacts_3A', 'exposed_npsa', 'freq_C', 'freq_IY', 'freq_CW', 'n_MP_3d_contacts_3A', 'freq_PV', 'freq_VM', 'freq_NW', 'freq_YN', 'freq_SD', 'n_NT_3d_contacts_3A', 'freq_AY', 'strands_freq_F', 'n_GI_3d_contacts_5A', 'strands_freq_D', 'boundary_freq_K', 'freq_QT', 'n_NW_3d_contacts_3A', 
'freq_HY', 'freq_TW', 'n_DK_3d_contacts_5A', 'n_MS_3d_contacts_3A', 'freq_DH', 'n_IP_3d_contacts_3A', 'freq_IN', 'n_GS_3d_contacts_3A', 'freq_NS', 'freq_HK', 'n_NQ_3d_contacts_5A', 'n_SY_3d_contacts_3A', 'freq_GF', 'freq_TG', 'surface_freq_A', 'avg_energy_for_5mers', 'n_AV_3d_contacts_3A', 'n_AT_3d_contacts_3A', 'freq_TH', 'n_CP_3d_contacts_5A', 'n_KY_3d_contacts_3A', 'n_PY_3d_contacts_3A', 'n_LR_3d_contacts_3A', 'helices_freq_E', 'n_EY_3d_contacts_5A', 'n_NS_3d_contacts_5A', 'freq_H', 'freq_TE', 'n_DL_3d_contacts_5A', 'n_QV_3d_contacts_3A', 'freq_DG', 'freq_VN', 'n_SW_3d_contacts_3A', 'helices_freq_Y', 'n_RS_3d_contacts_5A', 'freq_QC', 'n_DT_3d_contacts_3A', 'freq_DY', 'freq_KD', 'n_EF_3d_contacts_3A', 'freq_PS', 'freq_TD', 'freq_NM', 'n_MP_3d_contacts_5A', 'n_AP_3d_contacts_5A', 'n_DR_3d_contacts_5A', 'n_MR_3d_contacts_5A', 'freq_EL', 'freq_MP', 'n_AI_3d_contacts_5A', 'strands_freq_S', 'freq_PI', 'helices_freq_L', 'n_FG_3d_contacts_5A', 'n_DS_3d_contacts_5A', 'n_CK_3d_contacts_3A', 'n_RY_3d_contacts_3A', 'n_GN_3d_contacts_3A', 'n_DV_3d_contacts_5A', 'boundary_freq_L', 'surface_freq_E', 'freq_P', 'freq_FH', 'freq_MG', 'n_DI_3d_contacts_3A', 'freq_KS', 'freq_TQ', 'n_AQ_3d_contacts_5A', 'n_HP_3d_contacts_5A', 'n_IK_3d_contacts_5A', 'n_MT_3d_contacts_3A', 'freq_KH', 'n_GP_3d_contacts_5A', 'freq_GI', 'n_DH_3d_contacts_5A', 'n_HP_3d_contacts_3A', 'n_FV_3d_contacts_5A', 'n_LL_3d_contacts_3A', 'n_EG_3d_contacts_3A', 'n_CR_3d_contacts_3A', 'min_energy_of_3A_neighborhoods', 'n_CN_3d_contacts_5A', 'freq_FL', 'max_energy_for_4mers', 'freq_DQ', 'freq_DM', 'freq_HT', 'n_FH_3d_contacts_5A', 'n_MW_3d_contacts_3A', 'n_EL_3d_contacts_5A', 'n_EQ_3d_contacts_5A', 'freq_FC', 'n_HI_3d_contacts_3A', 'n_TW_3d_contacts_5A', 'freq_NF', 'freq_QF', 'freq_LY', 'n_ER_3d_contacts_5A', 'freq_IH', 'freq_GP', 'freq_PW', 'buried_npsa_FAMILYVW_surface', 'n_HR_3d_contacts_5A', 'surface_freq_T', 'freq_PD', 'freq_DA', 'surface_freq_N', 'n_AM_3d_contacts_3A', 'percent_aromatic_no_his_surface', 
'freq_WA', 'n_MN_3d_contacts_5A', 'n_CG_3d_contacts_5A', 'n_II_3d_contacts_3A', 'freq_LT', 'helices_freq_C', 'freq_RS', 'n_CF_3d_contacts_3A', 'n_CW_3d_contacts_3A', 'n_LQ_3d_contacts_3A', 'freq_IF', 'freq_WN', 'freq_NC', 'n_AQ_3d_contacts_3A', 'freq_LH', 'n_AL_3d_contacts_3A', 'freq_TN', 'freq_TM', 'n_GP_3d_contacts_3A', 'percent_aromatic_no_his_core', 'n_GR_3d_contacts_3A', 'n_EM_3d_contacts_3A', 'freq_WE', 'freq_HQ', 'n_CT_3d_contacts_5A', 'surface_freq_V', 'n_IS_3d_contacts_3A', 'avg_energy_for_3mers', 'avg_energy_of_5A_neighborhoods', 'freq_PK', 'n_DY_3d_contacts_5A', 'n_EK_3d_contacts_3A', 'freq_RY', 'freq_ND', 'freq_VF', 'n_CL_3d_contacts_3A', 'n_LV_3d_contacts_3A', 'freq_GH', 'core_freq_K', 'std_energy_of_3A_neighborhoods', 'freq_KL', 'freq_YS', 'n_GL_3d_contacts_5A', 'freq_SV', 'freq_WI', 'n_AM_3d_contacts_5A', 'freq_IP', 'n_SS_3d_contacts_3A', 'n_MN_3d_contacts_3A', 'freq_EY', 'boundary_freq_G', 'helices_freq_R', 'freq_MA', 'n_LS_3d_contacts_3A', 'n_GI_3d_contacts_3A', 'freq_FP', 'freq_IA', 'freq_GM', 'n_CP_3d_contacts_3A', 'core_freq_P', 'freq_HH', 'freq_ID', 'strands_freq_K', 'boundary_freq_E', 'freq_GV', 'n_CC_3d_contacts_3A', 'freq_CV', 'freq_DW', 'buried_psa_boundary', 'freq_IS', 'n_DD_3d_contacts_5A', 'freq_YM', 'min_energy_for_3mers', 'core_freq_D', 'freq_TC', 'n_IW_3d_contacts_5A', 'core_freq_G', 'freq_EC', 'freq_FF', 'n_GW_3d_contacts_3A', 'n_DD_3d_contacts_3A', 'n_CH_3d_contacts_5A', 'freq_CN', 'n_CE_3d_contacts_5A', 'freq_KM', 'freq_EW', 'std_energy_for_2mers', 'freq_WT', 'strands_freq_V', 'n_GG_3d_contacts_3A', 'n_TV_3d_contacts_5A', 'n_FM_3d_contacts_3A', 'n_VY_3d_contacts_5A', 'n_PS_3d_contacts_3A', 'freq_VH', 'core_freq_V', 'n_HK_3d_contacts_3A', 'helices_freq_G', 'n_PT_3d_contacts_3A', 'core_freq_E', 'freq_QL', 'freq_W', 'freq_RI', 'n_CY_3d_contacts_5A', 'freq_AV', 'freq_NA', 'freq_CH', 'n_DH_3d_contacts_3A', 'helices_freq_T', 'n_PW_3d_contacts_5A', 'std_energy_for_5mers', 'strands_freq_I', 'freq_HS', 'freq_EN', 'n_GH_3d_contacts_5A', 
'freq_QQ', 'freq_EG', 'freq_SF', 'n_FL_3d_contacts_3A', 'freq_QG', 'n_VV_3d_contacts_5A', 'n_ST_3d_contacts_3A', 'n_DT_3d_contacts_5A', 'percent_hydrophobic_core', 'n_NR_3d_contacts_3A', 'n_IV_3d_contacts_5A', 'freq_LI', 'n_CI_3d_contacts_3A', 'freq_TL', 'freq_KE', 'n_DR_3d_contacts_3A', 'n_QT_3d_contacts_3A', 'n_CN_3d_contacts_3A', 'n_WY_3d_contacts_5A', 'n_FS_3d_contacts_5A', 'percent_hydrophobic_core_stringent', 'freq_LM', 'freq_SA', 'freq_SW', 'core_freq_F', 'freq_GA', 'n_LM_3d_contacts_5A', 'surface_freq_I', 'n_FN_3d_contacts_3A', 'freq_MD', 'n_HQ_3d_contacts_5A', 'n_MS_3d_contacts_5A', 'buried_psa', 'freq_LE', 'freq_VE', 'freq_CF', 'freq_V', 'n_AY_3d_contacts_5A', 'freq_CQ', 'n_LN_3d_contacts_5A', 'freq_TY', 'n_DL_3d_contacts_3A', 'n_MW_3d_contacts_5A', 'n_AC_3d_contacts_3A', 'percent_polar_core_stringent', 'n_FI_3d_contacts_5A', 'freq_YV', 'freq_YC', 'freq_SQ', 'n_KR_3d_contacts_5A', 'avg_all_frags_in_H', 'n_EI_3d_contacts_3A', 'freq_YQ', 'freq_DN', 'freq_SY', 'n_HN_3d_contacts_5A', 'n_GV_3d_contacts_3A', 'boundary_freq_P', 'n_QV_3d_contacts_5A', 'n_AK_3d_contacts_3A', 'freq_SM', 'n_PQ_3d_contacts_5A', 'n_FP_3d_contacts_3A', 'n_CY_3d_contacts_3A', 'freq_NN', 'freq_AN', 'freq_QP', 'freq_YL', 'freq_LD', 'n_RR_3d_contacts_5A', 'n_ST_3d_contacts_5A', 'freq_YW', 'freq_HI', 'n_IP_3d_contacts_5A', 'freq_WC', 'freq_RE', 'n_AG_3d_contacts_5A', 'boundary_freq_S', 'core_freq_Y', 'n_CH_3d_contacts_3A', 'freq_WV', 'freq_QS', 'n_GQ_3d_contacts_5A', 'max_energy_for_3mers', 'n_NT_3d_contacts_5A', 'freq_FS', 'n_MQ_3d_contacts_5A', 'freq_CE', 'freq_FY', 'n_WW_3d_contacts_5A', 'std_energy_for_3mers', 'freq_VC', 'n_AA_3d_contacts_5A', 'freq_KR', 'freq_TV', 'freq_KV', 'freq_YI', 'n_AL_3d_contacts_5A', 'n_DN_3d_contacts_3A', 'n_CT_3d_contacts_3A', 'freq_NT', 'n_GT_3d_contacts_5A', 'std_energy_for_4mers', 'n_AF_3d_contacts_3A', 'freq_PT', 'freq_AK', 'n_NR_3d_contacts_5A', 'freq_KP', 'n_IM_3d_contacts_5A', 'core_freq_L', 'n_HH_3d_contacts_3A', 'boundary_freq_V', 
'n_MY_3d_contacts_5A', 'n_GH_3d_contacts_3A', 'freq_PQ', 'freq_VW', 'core_freq_A', 'helices_freq_F', 'n_II_3d_contacts_5A', 'freq_WM', 'helices_freq_V', 'n_IM_3d_contacts_3A', 'boundary_freq_I', 'freq_IL', 'freq_RR', 'n_GV_3d_contacts_5A', 'n_IS_3d_contacts_5A', 'n_CG_3d_contacts_3A', 'freq_HD', 'percent_hydrophobic_AFILMVWY_boundary', 'freq_A', 'n_CK_3d_contacts_5A', 'percent_aromatic_no_his_boundary', 'freq_KW', 'freq_VV', 'freq_QK', 'n_NP_3d_contacts_3A', 'freq_YA', 'n_EH_3d_contacts_5A', 'freq_MT', 'freq_CG', 'n_FY_3d_contacts_3A', 'freq_LP', 'buried_npsa', 'freq_IE', 'surface_freq_P', 'freq_SL', 'freq_FW', 'n_KW_3d_contacts_5A', 'freq_AL', 'freq_EV', 'n_SY_3d_contacts_5A', 'freq_WQ', 'freq_CC', 'n_GM_3d_contacts_3A', 'freq_TI', 'freq_HF', 'freq_VY', 'n_AN_3d_contacts_3A', 'strands_freq_N', 'freq_MI', 'n_QW_3d_contacts_3A', 'n_CS_3d_contacts_3A', 'freq_LV', 'freq_EQ', 'n_GT_3d_contacts_3A', 'max_energy_for_2mers', 'avg_energy_of_3A_neighborhoods', 'n_IL_3d_contacts_3A', 'freq_FN', 'core_freq_R', 'helices_freq_S', 'n_IT_3d_contacts_5A', 'freq_WD', 'n_EV_3d_contacts_3A', 'n_IR_3d_contacts_3A', 'n_CW_3d_contacts_5A', 'max_energy_for_5mers', 'surface_freq_S', 'freq_PM', 'n_CQ_3d_contacts_5A', 'n_KQ_3d_contacts_5A', 'boundary_freq_D', 'n_LY_3d_contacts_3A', 'freq_MH', 'n_LN_3d_contacts_3A', 'freq_VT', 'freq_WL', 'n_EH_3d_contacts_3A', 'n_AS_3d_contacts_5A', 'strands_freq_Y', 'n_LQ_3d_contacts_5A', 'n_QW_3d_contacts_5A', 'n_IY_3d_contacts_5A', 'freq_NR', 'freq_CL', 'n_LS_3d_contacts_5A', 'n_KQ_3d_contacts_3A', 'freq_SI', 'n_CI_3d_contacts_5A', 'freq_FI', 'boundary_freq_T', 'n_ET_3d_contacts_5A', 'freq_MW', 'rama_prepro_and_p_aa_pp', 'n_NN_3d_contacts_3A', 'n_PR_3d_contacts_5A', 'freq_II', 'n_PW_3d_contacts_3A', 'n_AC_3d_contacts_5A', 'core_freq_W', 'freq_QA', 'freq_VK', 'freq_WR', 'n_AH_3d_contacts_5A', 'freq_KQ', 'freq_KK', 'freq_HN', 'freq_DC', 'n_FI_3d_contacts_3A', 'boundary_freq_M', 'n_HS_3d_contacts_5A', 'strands_freq_P', 'freq_IT', 'n_RS_3d_contacts_3A', 
'n_EM_3d_contacts_5A', 'freq_NQ', 'freq_QD', 'n_DV_3d_contacts_3A', 'n_KK_3d_contacts_5A', 'n_LP_3d_contacts_5A', 'freq_PA', 'freq_SC', 'freq_AR', 'freq_WW', 'core_freq_Q', 'n_FN_3d_contacts_5A', 'freq_MK', 'n_CL_3d_contacts_5A', 'helices_freq_Q', 'n_FQ_3d_contacts_5A', 'n_LW_3d_contacts_3A', 'n_QY_3d_contacts_3A', 'helices_freq_A', 'core_freq_S', 'strands_freq_C', 'boundary_freq_N', 'freq_SR', 'freq_LQ', 'freq_RN', 'n_CD_3d_contacts_5A', 'n_KN_3d_contacts_3A', 'n_KR_3d_contacts_3A', 'n_QS_3d_contacts_3A', 'freq_GK', 'surface_freq_D', 'helices_freq_H', 'n_AP_3d_contacts_3A', 'core_freq_I', 'n_HM_3d_contacts_3A', 'n_SW_3d_contacts_5A', 'freq_WP', 'n_AW_3d_contacts_3A', 'n_AG_3d_contacts_3A', 'n_PP_3d_contacts_3A', 'n_IY_3d_contacts_3A', 'freq_RG', 'n_KS_3d_contacts_3A', 'n_IR_3d_contacts_5A', 'strands_freq_T', 'strands_freq_Q', 'freq_RM', 'n_RV_3d_contacts_3A', 'buried_npsa_FAMILYVW_core', 'n_LY_3d_contacts_5A', 'std_energy_of_5A_neighborhoods', 'freq_MS', 'boundary_freq_W', 'freq_EP', 'freq_RW', 'freq_PE', 'freq_AW', 'percent_hydrophobic_AFILMVWY_surface', 'freq_VP', 'freq_FK', 'min_energy_of_5A_neighborhoods', 'freq_NK', 'n_AN_3d_contacts_5A', 'n_CS_3d_contacts_5A', 'n_LW_3d_contacts_5A', 'freq_DE', 'percent_hydrophobic_AFILMVWY_core', 'freq_LW', 'n_DW_3d_contacts_5A', 'n_EP_3d_contacts_3A', 'freq_NL', 'freq_N', 'n_KP_3d_contacts_3A', 'freq_VD', 'freq_SH', 'n_LM_3d_contacts_3A', 'n_EF_3d_contacts_5A', 'n_AD_3d_contacts_5A', 'freq_AQ', 'freq_GL', 'max_energy_of_5A_neighborhoods', 'freq_FM', 'freq_YP', 'n_PR_3d_contacts_3A', 'n_QR_3d_contacts_3A', 'freq_EM', 'freq_IK', 'n_HS_3d_contacts_3A', 'n_HL_3d_contacts_5A', 'freq_PG', 'freq_E', 'freq_YG', 'surface_freq_H', 'freq_EF', 'n_IK_3d_contacts_3A', 'freq_RV', 'freq_AC', 'freq_DL', 'freq_CM', 'freq_WH', 'freq_SP', 'freq_KG', 'n_WY_3d_contacts_3A', 'n_AW_3d_contacts_5A', 'n_DM_3d_contacts_5A', 'n_EN_3d_contacts_5A', 'freq_FR', 'freq_D', 'freq_PL', 'n_HY_3d_contacts_5A', 'freq_NY', 'n_AA_3d_contacts_3A', 
'n_QY_3d_contacts_5A', 'n_FG_3d_contacts_3A', 'helices_freq_M', 'freq_CT', 'core_freq_H', 'freq_RF', 'n_EE_3d_contacts_5A', 'freq_ET', 'freq_LA', 'n_buried_hbonds', 'avg_all_frags_in_L', 'n_FM_3d_contacts_5A', 'n_DE_3d_contacts_3A', 'freq_DS', 'n_FL_3d_contacts_5A', 'freq_WS', 'n_GS_3d_contacts_5A', 'n_HT_3d_contacts_3A', 'avg_energy_for_4mers', 'n_YY_3d_contacts_3A', 'surface_freq_F', 'freq_ML', 'freq_LC', 'n_NN_3d_contacts_5A', 'n_FK_3d_contacts_5A', 'n_FW_3d_contacts_3A', 'n_QR_3d_contacts_5A', 'freq_KC', 'n_NP_3d_contacts_5A', 'n_HK_3d_contacts_5A', 'freq_K', 'n_DF_3d_contacts_5A', 'n_CD_3d_contacts_3A', 'freq_WK', 'buried_npsa_FAMILYVW_boundary', 'boundary_freq_H', 'n_DK_3d_contacts_3A', 'n_RT_3d_contacts_5A', 'n_HH_3d_contacts_5A', 'freq_ED', 'n_GN_3d_contacts_5A', 'freq_QE', 'freq_RH', 'n_PV_3d_contacts_3A', 'helices_freq_D', 'strands_freq_L', 'freq_NH', 'n_KS_3d_contacts_5A', 'n_MV_3d_contacts_3A', 'n_NV_3d_contacts_5A', 'freq_NG', 'freq_FG', 'freq_RK', 'percent_hydrophobic_boundary', 'n_ES_3d_contacts_5A', 'freq_L', 'n_KK_3d_contacts_3A', 'n_TT_3d_contacts_3A', 'n_KT_3d_contacts_3A', 'n_FT_3d_contacts_3A', 'freq_PH', 'freq_YH', 'strands_freq_W', 'n_PQ_3d_contacts_3A', 'freq_DR', 'core_freq_N', 'freq_PP', 'n_IN_3d_contacts_3A', 'n_DQ_3d_contacts_3A', 'freq_SE', 'freq_IM', 'n_FK_3d_contacts_3A', 'n_AE_3d_contacts_3A', 'surface_freq_W', 'n_ET_3d_contacts_3A', 'helices_freq_P', 'n_HQ_3d_contacts_3A', 'helices_freq_K', 'buried_psa_surface', 'n_IQ_3d_contacts_5A', 'freq_HC', 'n_PT_3d_contacts_5A', 'freq_KY', 'freq_AE', 'n_KY_3d_contacts_5A', 'freq_ER', 'n_SV_3d_contacts_3A', 'boundary_freq_C', 'freq_QM', 'freq_QY', 'surface_freq_Q', 'std_hbond_sr_bb_for_5mers', 'std_hbond_sr_bb_for_2mers', 'min_rama_prepro_for_3mers', 'avg_hbond_bb_sc_for_3mers', 'N_B_counts_in_loops', 'avg_pro_close_for_2mers', 'abego_counts_in_loops_EEA', 'std_lk_ball_bridge_uncpl_for_4mers', 'L_O_counts_in_loops', 'abego_counts_in_loops_OBE', 'max_pro_close_for_4mers', 
'min_hbond_lr_bb_for_3mers', 'std_hbond_bb_sc_for_3mers', 'avg_p_aa_pp_for_3mers', 'max_fa_intra_rep_xover4_for_2mers', 'abego_counts_in_loops_AEB', 'abego_counts_in_loops_AAB', 'N_E_counts_in_loops', 'avg_hbond_sc_for_2mers', 'Koehl_Delarue_TdS_boundary', 'avg_fa_sol_for_2mers', 'max_charge_of_5A_neighborhoods', 'min_p_aa_pp_for_3mers', 'C_E_counts_in_loops', 'min_lk_ball_iso_for_3mers', 'Q_B_counts_in_loops', 'avg_fa_rep_for_2mers', 'min_omega_for_3mers', 'avg_omega_for_4mers', 'max_rama_prepro_for_5mers', 'avg_hbond_sr_bb_for_2mers', 'V_G_counts_in_loops', 'ProteinVolume_void_vol', 'abego_counts_in_loops_EAO', 'abego_counts_in_loops_GOO', 'abego_counts_in_loops_AOG', 'avg_hbond_sr_bb_for_3mers', 'C_O_counts_in_loops', 'avg_fa_rep_for_3mers', 'abego_counts_in_loops_OB', 'min_lk_ball_bridge_uncpl_for_3mers', 'abego_counts_in_loops_BAB', 'abego_counts_in_loops_GGE', 'std_dslf_fa13_for_5mers', 'std_charge_of_5A_neighborhoods', 'I_G_counts_in_loops', 'Abagyan_Totrov_TdS_core_per_res', 'max_lk_ball_iso_for_4mers', 'Picket_Sternberg_TdS_boundary', 'avg_hbond_bb_sc_for_4mers', 'R_O_counts_in_loops', 'K_O_counts_in_loops', 'avg_hbond_bb_sc_for_2mers', 'min_dslf_fa13_for_5mers', 'std_fa_dun_rot_for_4mers', 'max_ref_for_3mers', 'avg_fa_sol_for_4mers', 'abego_counts_in_loops_BBB', 'min_hbond_lr_bb_for_5mers', 'std_fa_intra_sol_xover4_for_3mers', 'G_B_counts_in_loops', 'min_fa_rep_for_4mers', 'Abagyan_Totrov_TdS_surface_per_res', 'abego_counts_in_loops_BE', 'std_hbond_lr_bb_for_5mers', 'Lee_TdS_boundary', 'min_lk_ball_bridge_uncpl_for_5mers', 'G_O_counts_in_loops', 'min_lk_ball_bridge_for_2mers', 'std_p_aa_pp_for_5mers', 'abego_counts_in_loops_AGO', 'min_hbond_lr_bb_for_4mers', 'min_hbond_sc_for_5mers', 'abego_counts_in_loops_OEO', 'max_hbond_sc_for_5mers', 'std_fa_rep_for_3mers', 'avg_hbond_lr_bb_for_2mers', 'std_hbond_lr_bb_for_4mers', 'min_fa_intra_rep_xover4_for_3mers', 'abego_counts_in_loops_EGB', 'abego_counts_in_loops_BGB', 'L_A_counts_in_loops', 
'std_fa_dun_semi_for_4mers', 'avg_lk_ball_iso_for_4mers', 'E_O_counts_in_loops', 'avg_fa_elec_for_3mers', 'std_fa_atr_for_3mers', 'max_fa_dun_rot_for_2mers', 'std_lk_ball_for_4mers', 'abego_counts_in_loops_OOB', 'abego_counts_in_loops_OAB', 'M_G_counts_in_loops', 'nres_core', 'G_G_counts_in_loops', 'std_fa_intra_atr_xover4_for_2mers', 'abego_counts_in_loops_BBE', 'M_A_counts_in_loops', 'avg_fa_intra_atr_xover4_for_2mers', 'max_fa_sol_for_3mers', 'min_lk_ball_for_4mers', 'min_ref_for_3mers', 'min_p_aa_pp_for_5mers', 'min_fa_elec_for_2mers', 'std_fa_dun_rot_for_3mers', 'std_omega_for_3mers', 'avg_fa_intra_rep_xover4_for_3mers', 'avg_omega_for_3mers', 'min_fa_sol_for_3mers', 'abego_counts_in_loops_GEE', 'min_fa_atr_for_3mers', 'abego_counts_in_loops_EB', 'std_fa_dun_dev_for_3mers', 'avg_dslf_fa13_for_4mers', 'std_dslf_fa13_for_2mers', 'min_fa_intra_atr_xover4_for_2mers', 'cavity_volume', 'max_fa_intra_sol_xover4_for_2mers', 'min_fa_dun_rot_for_4mers', 'min_omega_for_5mers', 'min_lk_ball_for_2mers', 'std_pro_close_for_4mers', 'F_E_counts_in_loops', 'max_lk_ball_bridge_uncpl_for_4mers', 'avg_fa_sol_for_3mers', 'avg_fa_intra_elec_for_4mers', 'abego_counts_in_loops_BEG', 'avg_lk_ball_bridge_uncpl_for_3mers', 'abego_counts_in_loops_OAE', 'avg_lk_ball_iso_for_3mers', 'std_fa_intra_sol_xover4_for_5mers', 'L_B_counts_in_loops', 'min_rama_prepro_for_4mers', 'max_hbond_sc_for_2mers', 'std_ref_for_4mers', 'E_E_counts_in_loops', 'avg_fa_rep_for_5mers', 'abego_counts_in_loops_OEG', 'std_lk_ball_iso_for_4mers', 'avg_hbond_sr_bb_for_4mers', 'D_G_counts_in_loops', 'min_hbond_sr_bb_for_4mers', 'max_p_aa_pp_for_2mers', 'avg_lk_ball_iso_for_5mers', 'avg_hxl_tors_for_5mers', 'avg_fa_dun_semi_for_3mers', 'avg_lk_ball_bridge_for_5mers', 'max_dslf_fa13_for_5mers', 'D_A_counts_in_loops', 'max_ref_for_4mers', 'I_E_counts_in_loops', 'abego_counts_in_loops_BB', 'abego_counts_in_loops_BAO', 'buried_over_exposed_np_AFILMVWY', 'abego_counts_in_loops_GG', 'abego_counts_in_loops_BEB', 
'std_p_aa_pp_for_2mers', 'min_hbond_bb_sc_for_5mers', 'E_B_counts_in_loops', 'avg_lk_ball_iso_for_2mers', 'min_fa_dun_dev_for_2mers', 'min_charge_of_3A_neighborhoods', 'std_pro_close_for_5mers', 'min_hbond_bb_sc_for_2mers', 'nres_surface', 'std_rama_prepro_for_3mers', 'avg_hbond_sc_for_3mers', 'P_G_counts_in_loops', 'abego_counts_in_loops_BAG', 'abego_counts_in_loops_BOE', 'max_hbond_lr_bb_for_5mers', 'std_fa_intra_elec_for_2mers', 'avg_lk_ball_bridge_for_2mers', 'C_G_counts_in_loops', 'abego_counts_in_loops_AOE', 'std_fa_atr_for_4mers', 'abego_counts_in_loops_BAA', 'min_p_aa_pp_for_4mers', 'abego_counts_in_loops_OBB', 'D_O_counts_in_loops', 'K_B_counts_in_loops', 'abego_counts_in_loops_EBO', 'avg_hbond_sr_bb_for_5mers', 'avg_omega_for_5mers', 'abego_counts_in_loops_AGB', 'abego_counts_in_loops_OGB', 'abego_counts_in_loops_G', 'abego_counts_in_loops_EAA', 'A_G_counts_in_loops', 'abego_counts_in_loops_EEE', 'abego_counts_in_loops_OO', 'C_B_counts_in_loops', 'Koehl_Delarue_TdS_core', 'abego_counts_in_loops_BOG', 'min_rama_prepro_for_2mers', 'std_hbond_bb_sc_for_2mers', 'std_fa_rep_for_2mers', 'abego_counts_in_loops_GEO', 'avg_hbond_lr_bb_for_5mers', 'avg_ref_for_2mers', 'avg_fa_dun_semi_for_2mers', 'S_A_counts_in_loops', 'std_fa_intra_atr_xover4_for_4mers', 'std_hbond_sr_bb_for_4mers', 'abego_counts_in_loops_OBO', 'abego_counts_in_loops_OG', 'M_B_counts_in_loops', 'abego_counts_in_loops_BBG', 'max_omega_for_5mers', 'max_fa_intra_atr_xover4_for_3mers', 'max_pro_close_for_2mers', 'min_fa_intra_sol_xover4_for_3mers', 'min_fa_intra_atr_xover4_for_3mers', 'max_fa_rep_for_5mers', 'abego_counts_in_loops_GEA', 'std_fa_sol_for_3mers', 'Koehl_Delarue_TdS_boundary_per_res', 'P_E_counts_in_loops', 'Koehl_Delarue_TdS_surface', 'max_lk_ball_bridge_uncpl_for_5mers', 'R_E_counts_in_loops', 'max_fa_intra_atr_xover4_for_5mers', 'T_O_counts_in_loops', 'avg_lk_ball_bridge_for_4mers', 'min_omega_for_2mers', 'max_fa_atr_for_4mers', 'N_O_counts_in_loops', 'abego_counts_in_loops_AEE', 
'Y_A_counts_in_loops', 'Y_G_counts_in_loops', 'abego_counts_in_loops_OBG', 'avg_fa_intra_atr_xover4_for_4mers', 'max_fa_intra_atr_xover4_for_2mers', 'W_A_counts_in_loops', 'Y_O_counts_in_loops', 'max_lk_ball_iso_for_2mers', 'min_pro_close_for_4mers', 'max_rama_prepro_for_2mers', 'std_lk_ball_for_2mers', 'abego_counts_in_loops_GE', 'abego_counts_in_loops_OGO', 'avg_fa_intra_elec_for_2mers', 'std_p_aa_pp_for_3mers', 'std_p_aa_pp_for_4mers', 'E_G_counts_in_loops', 'min_fa_intra_rep_xover4_for_4mers', 'max_lk_ball_for_4mers', 'D_E_counts_in_loops', 'max_lk_ball_bridge_for_2mers', 'abego_counts_in_loops_OAO', 'max_hbond_bb_sc_for_5mers', 'min_hbond_sc_for_3mers', 'std_lk_ball_for_3mers', 'min_lk_ball_for_5mers', 'min_fa_dun_dev_for_4mers', 'avg_p_aa_pp_for_2mers', 'std_lk_ball_bridge_for_5mers', 'abego_counts_in_loops_BOO', 'std_fa_elec_for_3mers', 'std_fa_dun_semi_for_3mers', 'abego_counts_in_loops_OA', 'min_p_aa_pp_for_2mers', 'max_pro_close_for_5mers', 'min_ref_for_5mers', 'min_lk_ball_bridge_for_3mers', 'max_fa_dun_rot_for_3mers', 'abego_counts_in_loops_BOB', 'std_fa_sol_for_2mers', 'abego_counts_in_loops_E', 'avg_rama_prepro_for_4mers', 'avg_fa_dun_dev_for_2mers', 'std_lk_ball_iso_for_2mers', 'abego_counts_in_loops_EGE', 'max_hbond_lr_bb_for_3mers', 'std_fa_atr_for_5mers', 'std_fa_atr_for_2mers', 'avg_fa_dun_rot_for_3mers', 'min_lk_ball_for_3mers', 'S_G_counts_in_loops', 'Abagyan_Totrov_TdS_boundary_per_res', 'abego_counts_in_loops_BBO', 'abego_counts_in_loops_EA', 'avg_lk_ball_bridge_for_3mers', 'max_dslf_fa13_for_4mers', 'std_rama_prepro_for_5mers', 'max_fa_dun_semi_for_2mers', 'max_hbond_lr_bb_for_4mers', 'min_fa_sol_for_2mers', 'Q_G_counts_in_loops', 'abego_counts_in_loops_EE', 'min_lk_ball_iso_for_2mers', 'min_fa_intra_sol_xover4_for_5mers', 'abego_counts_in_loops_GBB', 'avg_dslf_fa13_for_5mers', 'max_lk_ball_for_3mers', 'min_hbond_lr_bb_for_2mers', 'abego_counts_in_loops_EBB', 'max_fa_atr_for_3mers', 'avg_pro_close_for_4mers', 'exposed_np_AFILMVWY_per_res', 
'Koehl_Delarue_TdS_core_per_res', 'abego_counts_in_loops_GAA', 'avg_p_aa_pp_for_4mers', 'std_fa_intra_rep_xover4_for_4mers', 'abego_counts_in_loops_B', 'Picket_Sternberg_TdS_core', 'P_B_counts_in_loops', 'abego_counts_in_loops_BEO', 'min_fa_rep_for_5mers', 'max_fa_elec_for_2mers', 'avg_fa_dun_dev_for_3mers', 'max_fa_elec_for_5mers', 'avg_fa_elec_for_2mers', 'avg_hxl_tors_for_2mers', 'N_G_counts_in_loops', 'min_hbond_bb_sc_for_3mers', 'std_charge_of_3A_neighborhoods', 'std_fa_sol_for_5mers', 'avg_fa_intra_sol_xover4_for_2mers', 'std_fa_intra_rep_xover4_for_2mers', 'abego_counts_in_loops_OAA', 'abego_counts_in_loops_EOE', 'std_fa_intra_atr_xover4_for_3mers', 'avg_dslf_fa13_for_2mers', 'R_A_counts_in_loops', 'min_ref_for_4mers', 'P_O_counts_in_loops', 'min_fa_rep_for_3mers', 'K_A_counts_in_loops', 'min_fa_intra_elec_for_3mers', 'avg_fa_dun_rot_for_5mers', 'V_A_counts_in_loops', 'F_G_counts_in_loops', 'abego_counts_in_loops_GOG', 'Picket_Sternberg_TdS_surface_per_res', 'R_G_counts_in_loops', 'std_lk_ball_bridge_uncpl_for_3mers', 'avg_lk_ball_for_2mers', 'abego_counts_in_loops_O', 'max_hbond_sr_bb_for_5mers', 'std_omega_for_5mers', 'avg_fa_intra_atr_xover4_for_3mers', 'abego_counts_in_loops_BEA', 'max_fa_atr_for_2mers', 'max_omega_for_2mers', 'max_p_aa_pp_for_4mers', 'H_G_counts_in_loops', 'abego_counts_in_loops_AGE', 'Y_B_counts_in_loops', 'H_O_counts_in_loops', 'abego_counts_in_loops_OE', 'avg_fa_atr_for_3mers', 'abego_counts_in_loops_EEO', 'max_fa_dun_dev_for_4mers', 'std_ref_for_5mers', 'std_fa_dun_dev_for_2mers', 'avg_hxl_tors_for_3mers', 'std_hbond_sr_bb_for_3mers', 'avg_fa_intra_sol_xover4_for_4mers', 'avg_fa_dun_rot_for_4mers', 'min_hxl_tors_for_2mers', 'std_hbond_bb_sc_for_5mers', 'abego_counts_in_loops_AO', 'abego_counts_in_loops_GGG', 'std_fa_dun_rot_for_2mers', 'Y_E_counts_in_loops', 'Lee_TdS_core_per_res', 'max_fa_atr_for_5mers', 'max_hbond_bb_sc_for_3mers', 'abego_counts_in_loops_OGG', 'max_dslf_fa13_for_2mers', 'mean_charge_of_3A_neighborhoods', 
'std_fa_intra_rep_xover4_for_5mers', 'abego_counts_in_loops_GAB', 'min_lk_ball_bridge_for_5mers', 'abego_counts_in_loops_GGB', 'abego_counts_in_loops_EAG', 'std_rama_prepro_for_2mers', 'max_fa_dun_semi_for_3mers', 'min_omega_for_4mers', 'avg_ref_for_3mers', 'abego_counts_in_loops_AE', 'abego_counts_in_loops_GBO', 'G_A_counts_in_loops', 'abego_counts_in_loops_OOO', 'max_lk_ball_for_2mers', 'L_E_counts_in_loops', 'std_hxl_tors_for_3mers', 'abego_counts_in_loops_AAA', 'abego_counts_in_loops_EG', 'min_fa_dun_semi_for_4mers', 'abego_counts_in_loops_GGA', 'max_rama_prepro_for_3mers', 'avg_lk_ball_bridge_uncpl_for_5mers', 'max_lk_ball_bridge_for_4mers', 'avg_hxl_tors_for_4mers', 'std_fa_intra_elec_for_3mers', 'avg_fa_sol_for_5mers', 'L_G_counts_in_loops', 'abego_counts_in_loops_BGE', 'abego_counts_in_loops_OOE', 'K_G_counts_in_loops', 'abego_counts_in_loops_AOO', 'max_fa_sol_for_2mers', 'std_hxl_tors_for_5mers', 'W_G_counts_in_loops', 'avg_fa_dun_dev_for_4mers', 'abego_counts_in_loops_AGG', 'std_fa_intra_sol_xover4_for_4mers', 'std_fa_dun_semi_for_5mers', 'mean_charge_of_5A_neighborhoods', 'abego_counts_in_loops_BGG', 'avg_fa_dun_semi_for_5mers', 'avg_rama_prepro_for_2mers', 'abego_counts_in_loops_AB', 'abego_counts_in_loops_EBG', 'max_pro_close_for_3mers', 'max_fa_dun_rot_for_4mers', 'max_fa_intra_sol_xover4_for_5mers', 'max_fa_intra_sol_xover4_for_3mers', 'std_fa_rep_for_5mers', 'min_lk_ball_bridge_for_4mers', 'S_E_counts_in_loops', 'std_ref_for_3mers', 'max_fa_intra_elec_for_5mers', 'max_dslf_fa13_for_3mers', 'Abagyan_Totrov_TdS_surface', 'max_lk_ball_bridge_uncpl_for_2mers', 'abego_counts_in_loops_AAG', 'abego_counts_in_loops_ABA', 'V_B_counts_in_loops', 'abego_counts_in_loops_BO', 'ProteinVolume_vdw_vol', 'Koehl_Delarue_TdS_surface_per_res', 'avg_fa_intra_sol_xover4_for_3mers', 'min_pro_close_for_3mers', 'abego_counts_in_loops_GAO', 'avg_p_aa_pp_for_5mers', 'max_hbond_sr_bb_for_3mers', 'A_E_counts_in_loops', 'abego_counts_in_loops_EEB', 'nres_boundary', 
'abego_counts_in_loops_AEO', 'avg_fa_rep_for_4mers', 'W_O_counts_in_loops', 'avg_fa_intra_elec_for_5mers', 'Lee_TdS_surface_per_res', 'max_fa_intra_atr_xover4_for_4mers', 'std_hbond_lr_bb_for_3mers', 'std_hbond_sc_for_4mers', 'abego_counts_in_loops_EOG', 'avg_fa_elec_for_4mers', 'min_pro_close_for_5mers', 'abego_counts_in_loops_GOA', 'std_hxl_tors_for_2mers', 'std_lk_ball_bridge_uncpl_for_2mers', 'abego_counts_in_loops_ABB', 'std_hbond_sc_for_2mers', 'max_fa_dun_dev_for_2mers', 'max_fa_sol_for_4mers', 'abego_counts_in_loops_BG', 'Q_A_counts_in_loops', 'W_E_counts_in_loops', 'abego_counts_in_loops_EAB', 'avg_fa_atr_for_5mers', 'max_fa_intra_elec_for_4mers', 'I_A_counts_in_loops', 'abego_counts_in_loops_BGO', 'std_lk_ball_bridge_for_2mers', 'min_fa_sol_for_5mers', 'H_B_counts_in_loops', 'max_fa_dun_dev_for_5mers', 'max_fa_dun_rot_for_5mers', 'max_fa_rep_for_3mers', 'max_lk_ball_bridge_uncpl_for_3mers', 'max_rama_prepro_for_4mers', 'min_fa_dun_rot_for_2mers', 'max_lk_ball_bridge_for_3mers', 'max_fa_elec_for_3mers', 'min_dslf_fa13_for_4mers', 'std_hbond_sc_for_5mers', 'Picket_Sternberg_TdS_core_per_res', 'abego_counts_in_loops_EOB', 'P_A_counts_in_loops', 'max_lk_ball_bridge_for_5mers', 'min_fa_intra_rep_xover4_for_5mers', 'Lee_TdS_core', 'T_E_counts_in_loops', 'std_fa_intra_atr_xover4_for_5mers', 'min_fa_intra_elec_for_2mers', 'abego_counts_in_loops_OBA', 'max_lk_ball_iso_for_3mers', 'std_pro_close_for_3mers', 'abego_counts_in_loops_AG', 'avg_fa_intra_atr_xover4_for_5mers', 'min_hbond_sr_bb_for_5mers', 'min_fa_elec_for_4mers', 'max_hbond_sr_bb_for_4mers', 'min_hxl_tors_for_3mers', 'std_dslf_fa13_for_4mers', 'min_fa_dun_semi_for_2mers', 'ProteinVolume_packing_density', 'avg_fa_atr_for_4mers', 'abego_counts_in_loops_A', 'abego_counts_in_loops_GB', 'max_ref_for_2mers', 'min_charge_of_5A_neighborhoods', 'max_fa_intra_rep_xover4_for_4mers', 'avg_hbond_bb_sc_for_5mers', 'min_fa_intra_sol_xover4_for_4mers', 'avg_fa_intra_elec_for_3mers', 'std_fa_rep_for_4mers', 
'max_hbond_bb_sc_for_2mers', 'M_O_counts_in_loops', 'max_fa_intra_rep_xover4_for_5mers', 'min_fa_sol_for_4mers', 'min_lk_ball_iso_for_5mers', 'min_lk_ball_bridge_uncpl_for_2mers', 'Picket_Sternberg_TdS_boundary_per_res', 'avg_hbond_lr_bb_for_4mers', 'max_hbond_sr_bb_for_2mers', 'min_fa_intra_rep_xover4_for_2mers', 'A_O_counts_in_loops', 'abego_counts_in_loops_OGE', 'F_O_counts_in_loops', 'max_fa_intra_sol_xover4_for_4mers', 'S_B_counts_in_loops', 'avg_ref_for_4mers', 'min_fa_rep_for_2mers', 'std_dslf_fa13_for_3mers', 'min_fa_intra_atr_xover4_for_4mers', 'abego_counts_in_loops_GA', 'min_rama_prepro_for_5mers', 'avg_fa_dun_semi_for_4mers', 'max_fa_intra_elec_for_2mers', 'max_fa_dun_semi_for_4mers', 'min_lk_ball_iso_for_4mers', 'min_fa_dun_rot_for_5mers', 'abego_counts_in_loops_GBA', 'Q_E_counts_in_loops', 'abego_counts_in_loops_OEA', 'avg_pro_close_for_5mers', 'avg_dslf_fa13_for_3mers', 'avg_lk_ball_for_3mers', 'abego_counts_in_loops_BA', 'abego_counts_in_loops_BAE', 'std_fa_elec_for_5mers', 'abego_counts_in_loops_EGA', 'I_O_counts_in_loops', 'N_A_counts_in_loops', 'avg_fa_atr_for_2mers', 'max_fa_dun_dev_for_3mers', 'A_B_counts_in_loops', 'abego_counts_in_loops_OEB', 'abego_counts_in_loops_BGA', 'min_fa_elec_for_3mers', 'min_hbond_sr_bb_for_3mers', 'V_O_counts_in_loops', 'avg_fa_intra_sol_xover4_for_5mers', 'H_E_counts_in_loops', 'avg_lk_ball_bridge_uncpl_for_4mers', 'abego_counts_in_loops_BEE', 'max_fa_dun_semi_for_5mers', 'min_fa_dun_semi_for_5mers', 'min_hxl_tors_for_5mers', 'avg_hbond_sc_for_4mers', 'std_fa_dun_dev_for_5mers', 'std_fa_sol_for_4mers', 'std_fa_dun_dev_for_4mers', 'std_fa_elec_for_4mers', 'abego_counts_in_loops_AEA', 'avg_fa_intra_rep_xover4_for_5mers', 'abego_counts_in_loops_EBA', 'abego_counts_in_loops_OAG', 'abego_counts_in_loops_AAO', 'avg_fa_elec_for_5mers', 'Abagyan_Totrov_TdS_boundary', 'abego_counts_in_loops_GAG', 'min_fa_intra_atr_xover4_for_5mers', 'avg_rama_prepro_for_3mers', 'Q_O_counts_in_loops', 'max_omega_for_4mers', 
'buried_over_exposed_np_AFILMVWY_per_res', 'C_A_counts_in_loops', 'abego_counts_in_loops_OOG', 'max_hxl_tors_for_2mers', 'G_E_counts_in_loops', 'max_omega_for_3mers', 'std_fa_intra_sol_xover4_for_2mers', 'abego_counts_in_loops_BBA', 'max_hbond_sc_for_4mers', 'abego_counts_in_loops_EGO', 'avg_lk_ball_bridge_uncpl_for_2mers', 'std_rama_prepro_for_4mers', 'R_B_counts_in_loops', 'min_hbond_sc_for_2mers', 'max_fa_rep_for_4mers', 'abego_counts_in_loops_AOB', 'std_lk_ball_iso_for_5mers', 'avg_ref_for_5mers', 'abego_counts_in_loops_GEG', 'avg_fa_dun_dev_for_5mers', 'avg_lk_ball_for_4mers', 'Lee_TdS_surface', 'abego_counts_in_loops_GBG', 'max_hbond_sc_for_3mers', 'T_B_counts_in_loops', 'std_fa_intra_elec_for_5mers', 'max_fa_elec_for_4mers', 'min_fa_atr_for_5mers', 'min_fa_intra_sol_xover4_for_2mers', 'T_A_counts_in_loops', 'std_lk_ball_iso_for_3mers', 'abego_counts_in_loops_EOO', 'min_fa_dun_semi_for_3mers', 'min_hxl_tors_for_4mers', 'std_hbond_lr_bb_for_2mers', 'max_p_aa_pp_for_3mers', 'max_ref_for_5mers', 'std_lk_ball_for_5mers', 'abego_counts_in_loops_AAE', 'min_hbond_bb_sc_for_4mers', 'abego_counts_in_loops_EAE', 'V_E_counts_in_loops', 'abego_counts_in_loops_GO', 'max_fa_intra_rep_xover4_for_3mers', 'min_pro_close_for_2mers', 'abego_counts_in_loops_GBE', 'max_lk_ball_iso_for_5mers', 'min_dslf_fa13_for_3mers', 'abego_counts_in_loops_EEG', 'abego_counts_in_loops_GOE', 'max_lk_ball_for_5mers', 'std_fa_intra_elec_for_4mers', 'std_hbond_bb_sc_for_4mers', 'max_hbond_lr_bb_for_2mers', 'abego_counts_in_loops_BOA', 'std_hbond_sc_for_3mers', 'min_hbond_sr_bb_for_2mers', 'buried_minus_exposed_np_AFILMVWY_per_res', 'std_omega_for_2mers', 'buried_minus_exposed_np_AFILMVWY', 'max_fa_intra_elec_for_3mers', 'abego_counts_in_loops_GGO', 'min_fa_dun_rot_for_3mers', 'Lee_TdS_boundary_per_res', 'K_E_counts_in_loops', 'std_hxl_tors_for_4mers', 'abego_counts_in_loops_ABO', 'std_fa_dun_rot_for_5mers', 'min_fa_intra_elec_for_4mers', 'std_lk_ball_bridge_for_4mers', 
'std_lk_ball_bridge_for_3mers', 'avg_hbond_lr_bb_for_3mers', 'S_O_counts_in_loops', 'M_E_counts_in_loops', 'min_hbond_sc_for_4mers', 'abego_counts_in_loops_ABG', 'abego_counts_in_loops_GAE', 'std_pro_close_for_2mers', 'abego_counts_in_loops_GOB', 'F_A_counts_in_loops', 'E_A_counts_in_loops', 'H_A_counts_in_loops', 'I_B_counts_in_loops', 'ProteinVolume_total_vol', 'abego_counts_in_loops_ABE', 'avg_pro_close_for_3mers', 'std_lk_ball_bridge_uncpl_for_5mers', 'min_fa_elec_for_5mers', 'avg_hbond_sc_for_5mers', 'abego_counts_in_loops_AOA', 'D_B_counts_in_loops', 'max_hxl_tors_for_3mers', 'avg_rama_prepro_for_5mers', 'max_hxl_tors_for_4mers', 'A_A_counts_in_loops', 'W_B_counts_in_loops', 'std_fa_elec_for_2mers', 'std_ref_for_2mers', 'std_fa_intra_rep_xover4_for_3mers', 'min_fa_dun_dev_for_3mers', 'abego_counts_in_loops_AGA', 'T_G_counts_in_loops', 'avg_fa_intra_rep_xover4_for_2mers', 'abego_counts_in_loops_EBE', 'max_fa_rep_for_2mers', 'F_B_counts_in_loops', 'abego_counts_in_loops_AA', 'min_fa_intra_elec_for_5mers', 'abego_counts_in_loops_OOA', 'max_hbond_bb_sc_for_4mers', 'min_ref_for_2mers', 'abego_counts_in_loops_GEB', 'min_dslf_fa13_for_2mers', 'avg_lk_ball_for_5mers', 'min_fa_dun_dev_for_5mers', 'abego_counts_in_loops_OGA', 'abego_counts_in_loops_AEG', 'avg_fa_intra_rep_xover4_for_4mers', 'max_fa_sol_for_5mers', 'max_p_aa_pp_for_5mers', 'max_hxl_tors_for_5mers', 'std_fa_dun_semi_for_2mers', 'Abagyan_Totrov_TdS_core', 'min_fa_atr_for_4mers', 'abego_counts_in_loops_EOA', 'max_charge_of_3A_neighborhoods', 'min_fa_atr_for_2mers', 'min_lk_ball_bridge_uncpl_for_4mers', 'std_omega_for_4mers', 'abego_counts_in_loops_EGG', 'Picket_Sternberg_TdS_surface', 'abego_counts_in_loops_OEE', 'avg_omega_for_2mers', 'abego_counts_in_loops_EO', 'avg_fa_dun_rot_for_2mers']

# Label written into the saved CSV headers describing how features were scaled
normalization = 'mean-0; std-1 normalization'

# specify which topology groups to compute
topology_group_list = [1,2,3,4]  # Rocklin 2017, HHH, EHEE, EEHEE, HEEH designs

# For each topology group, z-score every feature column using that group's own
# mean and standard deviation, and keep only the features that are fully
# defined (contain no NaNs) after scaling.
#   group_df_dict:      group_index -> dataframe with scaled feature columns
#   group_feature_dict: group_index -> list of retained feature column names
group_df_dict = {}
group_feature_dict = {}
for group_index in topology_group_list:
    group_df = combined_df[combined_df.group_index == group_index].copy().reset_index(drop=True)
    kept_features = []
    for feature in feature_cols:
        column = group_df[feature]
        # A NaN here means either a missing raw value or a division by a
        # zero/undefined standard deviation (e.g. a constant column).
        scaled = (column - column.mean()) / column.std()

        if scaled.isna().any():
            # drop feature columns which contain any nans
            group_df.drop(columns=[feature], inplace=True)
        else:
            # store the scaled values and remember the feature for this group_index
            group_df[feature] = scaled
            kept_features.append(feature)

    group_df_dict[group_index] = group_df
    group_feature_dict[group_index] = kept_features

# Exclude specific EHEE designs (topology group 2) from the processed data.
# NOTE(review): the exclusion happens after scaling, so these designs still
# contributed to the group's mean/std -- confirm this is intended.
designs_to_drop = {'EHEE_rd2_0542', 'EHEE_rd2_0727', 'EHEE_rd2_1253'}
if 2 in group_df_dict:
    keep_mask = ~group_df_dict[2]['name'].isin(designs_to_drop)
    # drop=True discards the old positional index instead of inserting it as
    # a stray 'index' column that only this group's dataframe would carry.
    group_df_dict[2] = group_df_dict[2][keep_mask].reset_index(drop=True)

In [None]:
# Save preprocessed dataframes
# For every topology group this writes two files under data/processed_data:
#   * a CSV of the processed dataframe, preceded by '#'-prefixed header lines
#     describing the feature space (skip them when reading, e.g. with
#     pd.read_csv(..., comment='#'))
#   * a JSON list of the feature column names retained for that group
topology_group_list = [1,2,3,4]  # Rocklin 2017, HHH, EHEE, EEHEE, HEEH designs
index_to_name = {1: 'HHH', 2: 'EHEE', 3: 'EEHEE', 4:'HEEH'}

# Create the output directory up front so a fresh checkout does not fail
# with FileNotFoundError on the first open(..., "w").
output_dir = 'data/processed_data'
os.makedirs(output_dir, exist_ok=True)

for group_index in topology_group_list:
    topology_name = index_to_name[group_index]
    group_df = group_df_dict[group_index]
    group_features = group_feature_dict[group_index]

    group_df_file = f'{output_dir}/Rocklin.{topology_name}.ssm_processed_data.csv'
    group_feature_file = f'{output_dir}/Rocklin.{topology_name}.ssm_processed_data.structural_metrics.json'

    # Metadata header; each line starts with '#' so it can be treated as a comment
    header_lines = [
        "# Topology-specific Rosetta feature space and stability scores\n",
        "# Topology group index given in protein_groupings_by_uw.v1.metadata.csv: %d \n" % group_index,
        "# Feature space: %s\n" % '; '.join(group_features),
        "# Feature space dimension: %d\n" % len(group_features),
        "# Feature space normalization: %s\n" % normalization,
        "# Number of designs: %d\n" % len(group_df),
        "# Author: Francis Motta\n",
        f"# Index: Rocklin.{topology_name}.filtered.v1.index.csv\n",
    ]

    # newline='' leaves line-ending handling to the CSV writer
    with open(group_df_file, "w", newline='') as f:
        f.writelines(header_lines)
        group_df.to_csv(f, index=False)

    with open(group_feature_file, "w") as f:
        json.dump(group_features, f, indent=4)