In [1]:
import MLR_utils
from MLR_utils import plot_three_features, drop_corr_features, linear_reg_of_n_features, evaluate_model_of_n_features 
from MLR_utils import all_features, combination_any_2_features, combination_groupof2_with_1feat, comb2feats_MLR_evaby_crossval
from MLR_utils import comb3feats_MLR_evaby_crossval, fold_5_cross_validation, fold_5_cross_validation_for_stream
from MLR_utils import feats_evaby_crossval_5nfold, linear_and_plot_yield_vs_ypred, comb3feats_MLR_evaby_crossval_metric


In [2]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from rdkit.Chem import AllChem, MolFromSmiles
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import DataStructs
from rdkit import Chem
from rxnfp.tokenization import get_default_tokenizer, SmilesTokenizer
from rdkit.Chem import rdChemReactions
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

Data processing

In [3]:
# Data processing
df_origin = pd.read_excel('Afterscreen.xlsx', sheet_name='Result2')
df_origin = df_origin.sample(frac=1,random_state=42)
# Reindex the data
df_origin = df_origin.set_index('ID')


df_x = df_origin.drop(columns = ['Yield-avg','ligand','buch','Unnamed: 0','SMILES'])
df_y = df_origin[['Yield-avg']]

# standardize
scaler = StandardScaler()
df_x=pd.DataFrame(scaler.fit_transform(df_x),index=df_x.index, columns=df_x.columns)
# drop zero-variance features
df_std_origin = df_x
zero_std_cols = df_x.columns[df_x.std() == 0]
df_x=df_x[df_x.columns.difference(zero_std_cols)]
print (f"Dropping {len(zero_std_cols)} zero-variance features {zero_std_cols}")

print(df_x.shape)

Dropping 4 zero-variance features Index(['x119', 'x122', 'x144', 'x147'], dtype='object')
(35, 211)


Drop highly correlated features

In [4]:
# calculate pearson correlation
corr = df_x.corr(method='pearson')
# set threshold
threshold = 0.7

# find features with high correlation
high_corr = set()
for i in range(len(corr.columns)):
    for j in range(i):
        if abs(corr.iloc[i, j]) > threshold:
            colname = corr.columns[i]
            high_corr.add(colname)

# drop features with high correlation
df_x_reduced = df_x.drop(columns=high_corr)
# dropped features
dropped_features = df_x.columns.difference(df_x_reduced.columns)
print(f"Dropping {len(dropped_features)} correlated features {dropped_features}")

print(df_x_reduced.shape)
df_x = df_x_reduced

Dropping 186 correlated features Index(['x102', 'x104', 'x105', 'x106', 'x107', 'x109', 'x11', 'x110', 'x111',
       'x113',
       ...
       'x90', 'x91', 'x92', 'x93', 'x94', 'x95', 'x96', 'x97', 'x98', 'x99'],
      dtype='object', length=186)
(35, 25)


In [5]:
print('Shape of input:',df_x.shape,'\nShape of output:' ,df_y.shape)
df_x = df_x.astype(float)
df_y = df_y.astype(float)

Shape of input: (35, 25) 
Shape of output: (35, 1)


In [6]:
df_x.columns

Index(['x1', 'x10', 'x100', 'x101', 'x103', 'x108', 'x112', 'x118', 'x12',
       'x129', 'x13', 'x14', 'x183', 'x184', 'x20', 'x30', 'x46', 'x51', 'x52',
       'x56', 'x57', 'x60', 'x77', 'x78', 'x88'],
      dtype='object')

In [69]:
#df_x.to_excel('df_x_drop_corr_feats.xlsx')

In [7]:
# Assign the splitting
splitting = 0.8

Combine features

In [8]:
# Constitute a list of all features
all_feat = all_features(df_x)

In [9]:
# Combination of all features into groups of 3
from itertools import combinations
feats = list(combinations(all_feat, 3))

In [10]:
len(feats)

2300

In [11]:
test_feats = [('x1', 'x77', 'x12'),
 ('x1', 'x77', 'x12'),
 ('x1', 'x10', 'x103'),
 ('x1', 'x10', 'x108'),
 ('x1', 'x10', 'x112')]

# Feature selection over different metrics methods

In [12]:
def combnfeats_MLR_evaby_crossval_metric(feats, df_x, df_y, n_fold, model, metric, score):
    print('----------Linear regression of n features evaby crossval starts----------')
    """_Screen and select Well behalved features from of a list of 3 features_
    Evaluated by sum of 5-fold cross validation.
    This is because just taking consideration of one result from one MLR can cause lost of important features due to randomness.
    If the displaied R^2 is also high, there's a higher chance that these features are of high predictive value.
    If the displaied R^2 is 0/-/unexplainable, it can be due to accident, and further cross-validation can verify their importance.
    """
    # Create a list to store features with high, medium, or low score
    List_cvd_3 = [] 
    List_cvd_2 = [] 
    List_cvd_0 = [] 
    # for r2 and explained variance the closer to 1 the better
    if metric == 'r2' or metric == 'explained_variance':
        sum_score = [score*1,score*0.8,score*0.5]
    # for nmae, nrmse and max error, the closer to 0 the better, therefore we need to readjust the rank
    elif metric == 'neg_mean_absolute_error' or metric == 'neg_root_mean_squared_error' or metric == 'max_error':
        sum_score = [score/1,score/0.8,score/0.5]
    
    # Three different ranks for high, medium and low, based on the score we imputed
    rank_score = sorted(sum_score,reverse=True)
    score_1 = rank_score[0]
    score_2 = rank_score[1]
    score_3 = rank_score[2]
    
    for m in feats:
        fts = feats.index(m)
        df_repeated_cv_x = df_x
        df_repeated_cv_y = df_y
        sum_cvd = 0
        for k in range(0,n_fold):
            # Shuffle the data
            df_repeated_cv_x = df_repeated_cv_x.sample(frac=1,random_state=k)
            df_nf = df_repeated_cv_x[list(feats[fts])]
            df_repeated_cv_y = df_repeated_cv_y.sample(frac=1,random_state=k)
            cvd_scores = cross_val_score(model, df_nf, df_repeated_cv_y, cv=5,scoring= metric)
            sum_cvd += sum(cvd_scores)
        # Calculate average R^2
        avg_cvd = sum_cvd/n_fold/5
        if avg_cvd >= score_1: 
            print('Significant!')
            print(f'For features {feats[fts]} with {metric} = {avg_cvd}')
            List_cvd_3.append(feats[fts]) 
        elif avg_cvd >=score_2:
            List_cvd_2.append(feats[fts])
        elif avg_cvd >=score_3:
            List_cvd_0.append(feats[fts])
        #print(f'For features {feats[fts]} with {metric} = {avg_cvd}'+'cvd_scores = ',cvd_scores,sum_cvd,avg_cvd)   
    return List_cvd_3,List_cvd_2,List_cvd_0

In [76]:
model = LinearRegression()
# test how the n number affects the results
for m in test_feats:
    fts = test_feats.index(m)
    print(m)
    df_repeated_cv_x = df_x
    df_repeated_cv_y = df_y
    n = 100
    sum_cvd = 0
    for k in range(0,n):
        # Shuffle the data
        df_repeated_cv_x = df_repeated_cv_x.sample(frac=1,random_state=k)
        df_nf = df_repeated_cv_x[list(feats[fts])]
        df_repeated_cv_y = df_repeated_cv_y.sample(frac=1,random_state=k)
        cvd_scores = cross_val_score(model, df_nf, df_repeated_cv_y, cv=5,scoring= 'r2')
        sum_cvd += sum(cvd_scores)
    print(cvd_scores,sum_cvd/5/n)

('x1', 'x77', 'x12')


[ 0.37672047  0.1795819   0.32898063  0.81126451 -0.01897227] 0.06027523322922592
('x1', 'x77', 'x12')
[ 0.37672047  0.1795819   0.32898063  0.81126451 -0.01897227] 0.06027523322922592
('x1', 'x10', 'x103')
[ 0.14544046  0.27365277 -0.04491271  0.51314434 -0.37324119] -0.4920955379885389
('x1', 'x10', 'x108')
[ 0.03867144 -1.53158667 -0.13243161  0.70193811 -0.0648196 ] -1.8096191778829505
('x1', 'x10', 'x112')
[ 0.10362398  0.04333835 -0.06576234  0.68851247  0.00763016] -0.500317672565598


In [96]:
#Combine one preset feature with all combinations of 2 features from df_x
feats = list(combinations(all_feat, 3))


model = LinearRegression()
#Screening of features, report significant when score of 5-fold crossval is greater than 0
List_cvd_3_r2,List_cvd_2_r2,List_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'r2',0.1)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x100') with r2 = 0.24023950664429172
Significant!
For features ('x1', 'x10', 'x12') with r2 = 0.1374732194685682
Significant!
For features ('x1', 'x10', 'x184') with r2 = 0.2975684994825149
Significant!
For features ('x1', 'x10', 'x46') with r2 = 0.17175730220282753
Significant!
For features ('x1', 'x10', 'x77') with r2 = 0.10442335347041432
Significant!
For features ('x1', 'x10', 'x78') with r2 = 0.10370298366083446
Significant!
For features ('x1', 'x100', 'x101') with r2 = 0.10400457788192981
Significant!
For features ('x1', 'x100', 'x12') with r2 = 0.19364178472571403
Significant!
For features ('x1', 'x100', 'x129') with r2 = 0.10350847797838218
Significant!
For features ('x1', 'x100', 'x183') with r2 = 0.10953867626045688
Significant!
For features ('x1', 'x100', 'x184') with r2 = 0.31060995971813726
Significant!
For features ('x1', 'x100', 'x46') with r2 = 0.1677015758

In [97]:
len(List_cvd_3_r2)

56

In [99]:
List2_cvd_3_r2,List2_cvd_2_r2,List2_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(List_cvd_3_r2,df_x,df_y,100,model,'r2',0.1)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x184') with r2 = 0.1534682522648888
Significant!
For features ('x1', 'x100', 'x12') with r2 = 0.12900408476919195
Significant!
For features ('x1', 'x100', 'x183') with r2 = 0.10572903021458538
Significant!
For features ('x1', 'x100', 'x184') with r2 = 0.26425617714564864
Significant!
For features ('x1', 'x100', 'x46') with r2 = 0.18783122167394523
Significant!
For features ('x1', 'x100', 'x77') with r2 = 0.14776700745094223
Significant!
For features ('x1', 'x100', 'x78') with r2 = 0.10581025935245983
Significant!
For features ('x1', 'x12', 'x77') with r2 = 0.15702889110847634
Significant!
For features ('x1', 'x184', 'x20') with r2 = 0.14817377038935667
Significant!
For features ('x10', 'x100', 'x46') with r2 = 0.10855072428123905
Significant!
For features ('x10', 'x129', 'x46') with r2 = 0.1025775328446751
Significant!
For features ('x10', 'x30', 'x46') with r2 = 0.1334957

In [100]:
len(List2_cvd_3_r2)

13

In [101]:
List3_cvd_3_r2,List3_cvd_2_r2,List3_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(List_cvd_3_r2,df_x,df_y,500,model,'r2',0.15)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x184') with r2 = 0.15162164660688493
Significant!
For features ('x1', 'x100', 'x184') with r2 = 0.25973162962652446
Significant!
For features ('x1', 'x100', 'x46') with r2 = 0.1645657425351602
Significant!
For features ('x1', 'x12', 'x77') with r2 = 0.16341958007248453
Significant!
For features ('x1', 'x184', 'x20') with r2 = 0.15306308929353002


x1 x100 x184

Negative MAE

In [114]:
# Scoring gave you the middle bar - the high bar is scoring *1.5, the low bar is scoring *0.5. A ranking is embedded in the function to 
# ensure that negative metric, like -RMSE, can also be ranked normally, not reversely.
List2_2cvd_3,List2_2cvd_2,List2_2cvd_0 = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,10,model,'neg_mean_absolute_error',-19)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x100') with neg_mean_absolute_error = -17.513646554009423
Significant!
For features ('x1', 'x10', 'x184') with neg_mean_absolute_error = -17.519493121616556
Significant!
For features ('x1', 'x10', 'x46') with neg_mean_absolute_error = -18.73470114129008
Significant!
For features ('x1', 'x100', 'x101') with neg_mean_absolute_error = -18.782264236184027
Significant!
For features ('x1', 'x100', 'x12') with neg_mean_absolute_error = -18.139711392044667
Significant!
For features ('x1', 'x100', 'x13') with neg_mean_absolute_error = -18.931999293937785
Significant!
For features ('x1', 'x100', 'x183') with neg_mean_absolute_error = -18.872164331890865
Significant!
For features ('x1', 'x100', 'x184') with neg_mean_absolute_error = -15.847842655045188
Significant!
For features ('x1', 'x100', 'x46') with neg_mean_absolute_error = -17.828641879154937
Significant!
For features ('x1', '

In [115]:
len(List2_2cvd_3)

43

In [120]:
List3_2cvd_3,List3_2cvd_2,List3_2cvd_0 = combnfeats_MLR_evaby_crossval_metric(List2_2cvd_3,df_x,df_y,100,model,'neg_mean_absolute_error',-18)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x100', 'x12') with neg_mean_absolute_error = -17.897222376293975
Significant!
For features ('x1', 'x100', 'x184') with neg_mean_absolute_error = -16.117457350632193
Significant!
For features ('x1', 'x100', 'x46') with neg_mean_absolute_error = -17.533539009601718
Significant!
For features ('x1', 'x100', 'x77') with neg_mean_absolute_error = -17.35863125729577
Significant!
For features ('x1', 'x100', 'x78') with neg_mean_absolute_error = -17.70922707831412
Significant!
For features ('x10', 'x100', 'x46') with neg_mean_absolute_error = -17.354399983036252
Significant!
For features ('x10', 'x129', 'x46') with neg_mean_absolute_error = -17.73819864440344
Significant!
For features ('x10', 'x184', 'x46') with neg_mean_absolute_error = -17.897870387765135
Significant!
For features ('x10', 'x30', 'x46') with neg_mean_absolute_error = -17.056166470488527
Significant!
For features ('x10', 

In [121]:
len(List3_2cvd_3)

12

In [123]:
List4_2cvd_3,List4_2cvd_2,List4_2cvd_0 = combnfeats_MLR_evaby_crossval_metric(List3_2cvd_3,df_x,df_y,500,model,'neg_mean_absolute_error',-17)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x100', 'x184') with neg_mean_absolute_error = -16.093064055856427
Significant!
For features ('x10', 'x30', 'x46') with neg_mean_absolute_error = -16.966419394810057


'x1', 'x100', 'x184'

Explained variance

In [130]:
List_2cvd_3_expv,List_2cvd_2_expv,List_2cvd_0_expv = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'explained_variance',0.3)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x100') with explained_variance = 0.40129864248628905
Significant!
For features ('x1', 'x10', 'x12') with explained_variance = 0.3153237281906415
Significant!
For features ('x1', 'x10', 'x184') with explained_variance = 0.37995167151069015
Significant!
For features ('x1', 'x10', 'x46') with explained_variance = 0.3061978312218899
Significant!
For features ('x1', 'x100', 'x101') with explained_variance = 0.3410990503638993
Significant!
For features ('x1', 'x100', 'x103') with explained_variance = 0.3336089994006713
Significant!
For features ('x1', 'x100', 'x12') with explained_variance = 0.42483780927943793
Significant!
For features ('x1', 'x100', 'x129') with explained_variance = 0.3346057384187958
Significant!
For features ('x1', 'x100', 'x13') with explained_variance = 0.366065675789731
Significant!
For features ('x1', 'x100', 'x183') with explained_variance = 0.354070779

In [131]:
len(List_2cvd_3_expv)

56

In [139]:
List2_2cvd_3_expv,List2_2cvd_2_expv,List2_2cvd_0_expv = combnfeats_MLR_evaby_crossval_metric(List_2cvd_3_expv,df_x,df_y,100,model,'explained_variance',0.35)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x100', 'x184') with explained_variance = 0.402920654222648


'x1', 'x100', 'x184'

-RMSE

In [138]:
List_2cvd_3_nrmse,List_2cvd_2_nrmse,List_2cvd_0_nrmse = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'neg_root_mean_squared_error',-22)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x100') with neg_root_mean_squared_error = -21.540867276216265
Significant!
For features ('x1', 'x10', 'x184') with neg_root_mean_squared_error = -21.128352521037232
Significant!
For features ('x1', 'x100', 'x184') with neg_root_mean_squared_error = -20.495670514834465
Significant!
For features ('x1', 'x12', 'x77') with neg_root_mean_squared_error = -21.64663115339782
Significant!
For features ('x1', 'x14', 'x184') with neg_root_mean_squared_error = -21.557024851949297
Significant!
For features ('x1', 'x184', 'x20') with neg_root_mean_squared_error = -21.728777991593425
Significant!
For features ('x14', 'x46', 'x56') with neg_root_mean_squared_error = -21.279259380750197


'x1', 'x100', 'x184'

max error

In [140]:
List_2cvd_3_me,List_2cvd_2_me,List_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'max_error',-20)


----------Linear regression of n features evaby crossval starts----------


In [142]:
len(List_2cvd_0_me)

8

In [143]:
List2_2cvd_3_me,List2_2cvd_2_me,List2_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(List_2cvd_0_me,df_x,df_y,500,model,'max_error',-100)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x1', 'x10', 'x184') with max_error = -39.16218436141052
Significant!
For features ('x1', 'x101', 'x12') with max_error = -39.52311298677442
Significant!
For features ('x1', 'x12', 'x184') with max_error = -40.581262875074295
Significant!
For features ('x1', 'x12', 'x46') with max_error = -41.19863816245781
Significant!
For features ('x1', 'x12', 'x77') with max_error = -39.83134995217872
Significant!
For features ('x1', 'x14', 'x184') with max_error = -40.13287577068026
Significant!
For features ('x1', 'x184', 'x20') with max_error = -40.75043658089858
Significant!
For features ('x14', 'x46', 'x56') with max_error = -37.444086614708326


'x14', 'x46', 'x56'

# Overall

In [195]:
df_x = df_origin.drop(columns = ['ligand','buch','Unnamed: 0','SMILES'])
df_y = df_origin[['Yield-avg']]

In [192]:
feats = [['x1', 'x12', 'x77'],['x14', 'x46', 'x56'],['x1', 'x100', 'x46'],['x10', 'x30', 'x46'],['x33','x83','x23','x102']]
a,b,c = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,1500,model,'r2',0.1)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ['x1', 'x12', 'x77'] with r2 = 0.16777310116380767
Significant!
For features ['x14', 'x46', 'x56'] with r2 = 0.13434945074268195
Significant!
For features ['x1', 'x100', 'x46'] with r2 = 0.16259172358371282
Significant!
For features ['x10', 'x30', 'x46'] with r2 = 0.11993775506398177
Significant!
For features ['x33', 'x83', 'x23', 'x102'] with r2 = 0.48185557492350284


In [157]:
feats = [['x1', 'x12', 'x77'],['x14', 'x46', 'x56'],['x1', 'x100', 'x46'],['x10', 'x30', 'x46']]
a,b,c = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,1500,model,'neg_mean_absolute_error',-50)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ['x1', 'x12', 'x77'] with neg_mean_absolute_error = -18.102720165418276
Significant!
For features ['x14', 'x46', 'x56'] with neg_mean_absolute_error = -18.145547434971377
Significant!
For features ['x1', 'x100', 'x46'] with neg_mean_absolute_error = -17.58069639437839
Significant!
For features ['x10', 'x30', 'x46'] with neg_mean_absolute_error = -16.99619032018832


In [211]:
feats = [['x1', 'x12', 'x77'],['x14', 'x46', 'x56'],['x1', 'x100', 'x46'],['x10', 'x30', 'x46'],['x33','x83','x23','x102']]
a,b,c = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,500,model,'explained_variance',0.01)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ['x1', 'x12', 'x77'] with explained_variance = 0.3318895209516007
Significant!
For features ['x14', 'x46', 'x56'] with explained_variance = 0.3144896755912715
Significant!
For features ['x1', 'x100', 'x46'] with explained_variance = 0.3342944257547296
Significant!
For features ['x10', 'x30', 'x46'] with explained_variance = 0.29674739622703644
Significant!
For features ['x33', 'x83', 'x23', 'x102'] with explained_variance = 0.5896278378940002


In [193]:
feats = [['x1', 'x12', 'x77'],['x14', 'x46', 'x56'],['x1', 'x100', 'x46'],['x10', 'x30', 'x46'],['x33','x83','x23','x102']]
a,b,c = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,500,model,'neg_root_mean_squared_error',-50)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ['x1', 'x12', 'x77'] with neg_root_mean_squared_error = -21.63558033405203
Significant!
For features ['x14', 'x46', 'x56'] with neg_root_mean_squared_error = -21.751830627464535
Significant!
For features ['x1', 'x100', 'x46'] with neg_root_mean_squared_error = -22.152714285780977
Significant!
For features ['x10', 'x30', 'x46'] with neg_root_mean_squared_error = -22.310551039057273
Significant!
For features ['x33', 'x83', 'x23', 'x102'] with neg_root_mean_squared_error = -17.154156217141843


In [210]:
feats = [['x1', 'x12', 'x77'],['x14', 'x46', 'x56'],['x1', 'x100', 'x46'],['x10', 'x30', 'x46'],['x33','x83','x23','x102']]
a,b,c = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,500,model,'max_error',-100)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ['x1', 'x12', 'x77'] with max_error = -39.83134995217872
Significant!
For features ['x14', 'x46', 'x56'] with max_error = -37.44408661470833
Significant!
For features ['x1', 'x100', 'x46'] with max_error = -41.94735469448683
Significant!
For features ['x10', 'x30', 'x46'] with max_error = -44.00136059699591
Significant!
For features ['x33', 'x83', 'x23', 'x102'] with max_error = -32.969628511694154


# 解释 Interpretation
x1 自然是vmin
x12是spin density，x32是nbo bd，这两个都是nbo的feature，在刚刚的MLR metric screening中也是可以互相替换的关系，符合逻辑。这也证明另一个e feature可以在预测中发挥作用。
x88是vburmin，x212是vburmax，是符合逻辑的，因为我们都知道vbur这个feature可以一定程度上描述ligand的动态体积。
x77是pyrimidization，是许多预测中唯一一个steric feature，完全不知道为什么如此重要，可以看看correlation
x20我也不知道是干么事的，而且是另一个e feature，我们已经有了太多的e feature，一个解释不了的steric feature。这到底是怎么回事呢？需要看1、到底是怎么算的，2、如何解释e feature在反应中的用途，具体到机理上怎么作用于SMC的每个中间体。

x32：Minimum electron occupancy of the three P−X bonding orbital
x20：Electric field gradient interaction tensor XX component