In [3]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from rdkit.Chem import AllChem, MolFromSmiles
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import DataStructs
from rdkit import Chem
from rxnfp.tokenization import get_default_tokenizer, SmilesTokenizer
from rdkit.Chem import rdChemReactions
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [4]:
def combnfeats_MLR_evaby_crossval_metric(feats, df_x, df_y, n_fold, model, metric, score):
    print('----------Linear regression of n features evaby crossval starts----------')
    """_Screen and select Well behalved features from of a list of 3 features_
    Evaluated by sum of 5-fold cross validation.
    This is because just taking consideration of one result from one MLR can cause lost of important features due to randomness.
    If the displaied R^2 is also high, there's a higher chance that these features are of high predictive value.
    If the displaied R^2 is 0/-/unexplainable, it can be due to accident, and further cross-validation can verify their importance.
    """
    # Create a list to store features with high, medium, or low score
    List_cvd_3 = [] 
    List_cvd_2 = [] 
    List_cvd_0 = [] 
    # for r2 and explained variance the closer to 1 the better
    if metric == 'r2' or metric == 'explained_variance':
        sum_score = [score*1,score*0.8,score*0.5]
    # for nmae, nrmse and max error, the closer to 0 the better, therefore we need to readjust the rank
    elif metric == 'neg_mean_absolute_error' or metric == 'neg_root_mean_squared_error' or metric == 'max_error':
        sum_score = [score/1,score/0.8,score/0.5]
    
    # Three different ranks for high, medium and low, based on the score we imputed
    rank_score = sorted(sum_score,reverse=True)
    score_1 = rank_score[0]
    score_2 = rank_score[1]
    score_3 = rank_score[2]
    
    for m in feats:
        fts = feats.index(m)
        df_repeated_cv_x = df_x
        df_repeated_cv_y = df_y
        sum_cvd = 0
        for k in range(0,n_fold):
            # Shuffle the data
            df_repeated_cv_x = df_repeated_cv_x.sample(frac=1,random_state=k)
            df_nf = df_repeated_cv_x[list(feats[fts])]
            df_repeated_cv_y = df_repeated_cv_y.sample(frac=1,random_state=k)
            cvd_scores = cross_val_score(model, df_nf, df_repeated_cv_y, cv=5,scoring= metric)
            sum_cvd += sum(cvd_scores)
        # Calculate average R^2
        avg_cvd = sum_cvd/n_fold/5
        if avg_cvd >= score_1: 
            print('Significant!')
            print(f'For features {feats[fts]} with {metric} = {avg_cvd}')
            List_cvd_3.append(feats[fts]) 
        elif avg_cvd >=score_2:
            List_cvd_2.append(feats[fts])
        elif avg_cvd >=score_3:
            List_cvd_0.append(feats[fts])
        #print(f'For features {feats[fts]} with {metric} = {avg_cvd}'+'cvd_scores = ',cvd_scores,sum_cvd,avg_cvd)   
    return List_cvd_3,List_cvd_2,List_cvd_0

In [5]:
# Data processing
df_origin = pd.read_excel('Afterscreen.xlsx', sheet_name='Result2')
df_origin = df_origin.sample(frac=1,random_state=42)
# Reindex the data
df_origin = df_origin.set_index('ID')


df_x = df_origin.drop(columns = ['Yield-avg','ligand','buch','Unnamed: 0','SMILES'])
df_y = df_origin[['Yield-avg']]
# standardize
scaler = StandardScaler()
df_x=pd.DataFrame(scaler.fit_transform(df_x),index=df_x.index, columns=df_x.columns)
df_std_origin = df_x
zero_std_cols = df_x.columns[df_x.std() == 0]
df_x=df_x[df_x.columns.difference(zero_std_cols)]
print (f"Dropping {len(zero_std_cols)} zero-variance features {zero_std_cols}")

Dropping 4 zero-variance features Index(['x119', 'x122', 'x144', 'x147'], dtype='object')


In [6]:
high_corr_feats = []
selected_feats = ['x1', 'x14', 'x46', 'x56','x100', 'x184']
# For i in selected_feats, find the features with high correlation coefficient (>0.7) 
# from high_corr (corr matrix from earlier), and add them to the list
corr = df_origin.corr(method='pearson')
"""
# select 5 top features corr to Yield-avg
corr = corr.sort_values('Yield-avg',ascending=False)
corr_yield = corr['Yield-avg']
corr1 = corr_yield[:5]
corr2 = corr_yield[-5:]
list_corr_yield1 = corr1.index.tolist()
list_corr_yield2 = corr2.index.tolist()
# combine lists
list_corr_yield =   list_corr_yield1 + list_corr_yield2
print(list_corr_yield)
"""

corr = df_origin.corr(method='pearson')
# Iterate over selected features
for selected_feat in selected_feats:
    # Check if the selected feature is in the DataFrame
    if selected_feat in corr.columns:
        # Get all features with correlation > 0.7 with the selected feature
        correlated_feats = corr.index[abs(corr[selected_feat]) > 0.7].tolist()
        # Remove the selected feature from the list
        correlated_feats.remove(selected_feat)
        # Extend the high_corr_feats list
        high_corr_feats.extend(correlated_feats)

# Remove duplicates
high_corr_feats = list(set(high_corr_feats))

print(len(high_corr_feats))

45


In [9]:
# export to txt
with open('high_corr_feats.txt', 'w') as f:
    for item in high_corr_feats:
        f.write("%s\n" % item)

In [11]:
combined_feats = selected_feats + high_corr_feats
df_x = df_x[combined_feats]

# Combination of 3 features - after adding back correlated feats

In [6]:
print('Shape of input:',df_x.shape,'\nShape of output:' ,df_y.shape)
df_x = df_x.astype(float)
df_y = df_y.astype(float)

Shape of input: (35, 51) 
Shape of output: (35, 1)


In [7]:
from itertools import combinations
feats = list(combinations(high_corr_feats, 3))
len(feats)

14190

In [8]:
model = LinearRegression()
#Screening of features, report significant when score of 5-fold crossval is greater than 0
List_cvd_3_r2,List_cvd_2_r2,List_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'r2',0.1)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x49', 'x27') with r2 = 0.14367466522539252
Significant!
For features ('x186', 'x49', 'x23') with r2 = 0.23145111964257464
Significant!
For features ('x186', 'x32', 'x23') with r2 = 0.3349289341776286
Significant!
For features ('x186', 'x8', 'x27') with r2 = 0.15062580358028313
Significant!
For features ('x186', 'x8', 'x23') with r2 = 0.28708936585492706
Significant!
For features ('x186', 'x2', 'x23') with r2 = 0.1989431688803517
Significant!
For features ('x186', 'x2', 'x33') with r2 = 0.24650188993020605
Significant!
For features ('x186', 'x2', 'x102') with r2 = 0.14142741498596006
Significant!
For features ('x186', 'x2', 'x159') with r2 = 0.101802269653949
Significant!
For features ('x186', 'x4', 'x23') with r2 = 0.2320386972402802
Significant!
For features ('x186', 'x27', 'x99') with r2 = 0.17074327166445993
Significant!
For features ('x186', 'x27', 'x50') with r2 = 0.165136

In [9]:
len(List_cvd_3_r2)

1289

In [13]:
List3_cvd_3_r2,List3_cvd_2_r2,List3_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(List_cvd_3_r2,df_x,df_y,20,model,'r2',0.3)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x23', 'x33') with r2 = 0.3499597523736553
Significant!
For features ('x186', 'x23', 'x102') with r2 = 0.3550960865791761
Significant!
For features ('x186', 'x23', 'x98') with r2 = 0.30982139399688974
Significant!
For features ('x49', 'x23', 'x33') with r2 = 0.3285730221719026
Significant!
For features ('x32', 'x99', 'x23') with r2 = 0.474844246198988
Significant!
For features ('x32', 'x50', 'x23') with r2 = 0.3385126020578181
Significant!
For features ('x32', 'x23', 'x175') with r2 = 0.3043744778021484
Significant!
For features ('x32', 'x23', 'x33') with r2 = 0.3611215848285063
Significant!
For features ('x32', 'x23', 'x102') with r2 = 0.4852123994481189
Significant!
For features ('x32', 'x23', 'x98') with r2 = 0.45497834067816656
Significant!
For features ('x32', 'x23', 'x162') with r2 = 0.36979143626638494
Significant!
For features ('x32', 'x23', 'x159') with r2 = 0.456234944

In [14]:
len(List3_cvd_3_r2)

76

In [15]:
List4_cvd_3_r2,List4_cvd_2_r2,List4_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(List3_cvd_3_r2,df_x,df_y,20,model,'r2',0.4)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with r2 = 0.474844246198988
Significant!
For features ('x32', 'x23', 'x102') with r2 = 0.4852123994481189
Significant!
For features ('x32', 'x23', 'x98') with r2 = 0.45497834067816656
Significant!
For features ('x32', 'x23', 'x159') with r2 = 0.4562349443241981
Significant!
For features ('x32', 'x23', 'x158') with r2 = 0.44791945115319026
Significant!
For features ('x99', 'x23', 'x33') with r2 = 0.4389334994595693
Significant!
For features ('x23', 'x33', 'x102') with r2 = 0.44255618843632993
Significant!
For features ('x23', 'x33', 'x98') with r2 = 0.42524257985483815
Significant!
For features ('x23', 'x33', 'x159') with r2 = 0.4132786191469739
Significant!
For features ('x23', 'x33', 'x158') with r2 = 0.42321070006129347


In [16]:
List5_cvd_3_r2,List5_cvd_2_r2,List5_cvd_0_r2 = combnfeats_MLR_evaby_crossval_metric(List4_cvd_3_r2,df_x,df_y,500,model,'r2',0.4)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with r2 = 0.4668700151847758
Significant!
For features ('x32', 'x23', 'x102') with r2 = 0.48501430269752543
Significant!
For features ('x32', 'x23', 'x98') with r2 = 0.456111542422716
Significant!
For features ('x32', 'x23', 'x159') with r2 = 0.45159303804133105
Significant!
For features ('x32', 'x23', 'x158') with r2 = 0.44968125241633017
Significant!
For features ('x99', 'x23', 'x33') with r2 = 0.4420672881832912
Significant!
For features ('x23', 'x33', 'x102') with r2 = 0.45778706856396373
Significant!
For features ('x23', 'x33', 'x98') with r2 = 0.4395539940948253
Significant!
For features ('x23', 'x33', 'x159') with r2 = 0.421074102645426
Significant!
For features ('x23', 'x33', 'x158') with r2 = 0.4409134217578748


'x32', 'x23', 'x102'

Negative MAE

In [17]:
# Scoring gave you the middle bar - the high bar is scoring *1.5, the low bar is scoring *0.5. A ranking is embedded in the function to 
# ensure that negative metric, like -RMSE, can also be ranked normally, not reversely.
List2_2cvd_3_nmae,List2_2cvd_2_nmae,List2_2cvd_0_nmae = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'neg_mean_absolute_error',-19)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x49', 'x27') with neg_mean_absolute_error = -18.72571533404028
Significant!
For features ('x186', 'x49', 'x23') with neg_mean_absolute_error = -17.751141956222806
Significant!
For features ('x186', 'x32', 'x23') with neg_mean_absolute_error = -16.429187344974576
Significant!
For features ('x186', 'x8', 'x23') with neg_mean_absolute_error = -17.59163535893577
Significant!
For features ('x186', 'x2', 'x99') with neg_mean_absolute_error = -18.32687168623706
Significant!
For features ('x186', 'x2', 'x33') with neg_mean_absolute_error = -17.77220056760386
Significant!
For features ('x186', 'x2', 'x102') with neg_mean_absolute_error = -17.338450349684642
Significant!
For features ('x186', 'x2', 'x98') with neg_mean_absolute_error = -17.917779634968422
Significant!
For features ('x186', 'x2', 'x162') with neg_mean_absolute_error = -18.43405983052849
Significant!
For features ('x186', 

In [18]:
len(List2_2cvd_3_nmae)

1085

In [20]:
List3_2cvd_3_nmae,List3_2cvd_2_nmae,List3_2cvd_0_nmae = combnfeats_MLR_evaby_crossval_metric(List2_2cvd_3_nmae,df_x,df_y,5,model,'neg_mean_absolute_error',-17)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x32', 'x23') with neg_mean_absolute_error = -16.429187344974576
Significant!
For features ('x186', 'x27', 'x102') with neg_mean_absolute_error = -16.636961315502397
Significant!
For features ('x186', 'x99', 'x23') with neg_mean_absolute_error = -15.861610728292339
Significant!
For features ('x186', 'x23', 'x33') with neg_mean_absolute_error = -16.194786669548293
Significant!
For features ('x186', 'x23', 'x102') with neg_mean_absolute_error = -14.275031233242993
Significant!
For features ('x186', 'x23', 'x98') with neg_mean_absolute_error = -15.097794482659799
Significant!
For features ('x186', 'x23', 'x162') with neg_mean_absolute_error = -15.768968531106363
Significant!
For features ('x186', 'x23', 'x159') with neg_mean_absolute_error = -15.596059579376282
Significant!
For features ('x186', 'x23', 'x173') with neg_mean_absolute_error = -16.933568928374218
Significant!
For feat

In [21]:
len(List3_2cvd_3_nmae)

260

In [23]:
List4_2cvd_3_nmae,List4_2cvd_2_nmae,List4_2cvd_0_nmae = combnfeats_MLR_evaby_crossval_metric(List3_2cvd_3_nmae,df_x,df_y,10,model,'neg_mean_absolute_error',-15)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x23', 'x102') with neg_mean_absolute_error = -14.370126499928272
Significant!
For features ('x32', 'x99', 'x23') with neg_mean_absolute_error = -14.13049066474737
Significant!
For features ('x32', 'x23', 'x102') with neg_mean_absolute_error = -13.762234389745874
Significant!
For features ('x32', 'x23', 'x98') with neg_mean_absolute_error = -14.529415963902412
Significant!
For features ('x32', 'x23', 'x159') with neg_mean_absolute_error = -14.4068165909628
Significant!
For features ('x32', 'x23', 'x158') with neg_mean_absolute_error = -14.567527483529862
Significant!
For features ('x99', 'x23', 'x33') with neg_mean_absolute_error = -14.849436999416998
Significant!
For features ('x188', 'x33', 'x48') with neg_mean_absolute_error = -14.721021521372387
Significant!
For features ('x23', 'x6', 'x102') with neg_mean_absolute_error = -14.329580630474634
Significant!
For features ('x23'

In [24]:
a,b,c = combnfeats_MLR_evaby_crossval_metric(List4_2cvd_3_nmae,df_x,df_y,500,model,'neg_mean_absolute_error',-15)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x23', 'x102') with neg_mean_absolute_error = -14.144566636932714
Significant!
For features ('x32', 'x99', 'x23') with neg_mean_absolute_error = -14.135436225264135
Significant!
For features ('x32', 'x23', 'x102') with neg_mean_absolute_error = -13.699966990979012
Significant!
For features ('x32', 'x23', 'x98') with neg_mean_absolute_error = -14.503600690493798
Significant!
For features ('x32', 'x23', 'x159') with neg_mean_absolute_error = -14.169729080277424
Significant!
For features ('x32', 'x23', 'x158') with neg_mean_absolute_error = -14.608850729040185
Significant!
For features ('x99', 'x23', 'x33') with neg_mean_absolute_error = -14.69601302303316
Significant!
For features ('x188', 'x33', 'x48') with neg_mean_absolute_error = -14.370194418594874
Significant!
For features ('x23', 'x6', 'x102') with neg_mean_absolute_error = -13.878555361801821
Significant!
For features ('x2

'x32', 'x23', 'x102'

Explained variance

In [25]:
List_2cvd_3_expv,List_2cvd_2_expv,List_2cvd_0_expv = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'explained_variance',0.3)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x49', 'x27') with explained_variance = 0.3517260247290701
Significant!
For features ('x186', 'x49', 'x23') with explained_variance = 0.3868605117371822
Significant!
For features ('x186', 'x32', 'x23') with explained_variance = 0.4334291667090694
Significant!
For features ('x186', 'x8', 'x27') with explained_variance = 0.3561022361565488
Significant!
For features ('x186', 'x8', 'x23') with explained_variance = 0.40631121813232196
Significant!
For features ('x186', 'x2', 'x23') with explained_variance = 0.337155950794995
Significant!
For features ('x186', 'x2', 'x33') with explained_variance = 0.4067669012430139
Significant!
For features ('x186', 'x2', 'x102') with explained_variance = 0.3240137742465925
Significant!
For features ('x186', 'x2', 'x159') with explained_variance = 0.31823713781040197
Significant!
For features ('x186', 'x4', 'x23') with explained_variance = 0.3411250

In [26]:
len(List_2cvd_3_expv)

1506

In [28]:
List2_2cvd_3_expv,List2_2cvd_2_expv,List2_2cvd_0_expv = combnfeats_MLR_evaby_crossval_metric(List_2cvd_3_expv,df_x,df_y,5,model,'explained_variance',0.4)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x32', 'x23') with explained_variance = 0.4334291667090694
Significant!
For features ('x186', 'x8', 'x23') with explained_variance = 0.40631121813232196
Significant!
For features ('x186', 'x2', 'x33') with explained_variance = 0.4067669012430139
Significant!
For features ('x186', 'x27', 'x102') with explained_variance = 0.43318326473405044
Significant!
For features ('x186', 'x27', 'x98') with explained_variance = 0.40072029041423934
Significant!
For features ('x186', 'x99', 'x23') with explained_variance = 0.512399412860655
Significant!
For features ('x186', 'x50', 'x23') with explained_variance = 0.43945569660387945
Significant!
For features ('x186', 'x45', 'x23') with explained_variance = 0.41796816272707193
Significant!
For features ('x186', 'x23', 'x33') with explained_variance = 0.5090477312300923
Significant!
For features ('x186', 'x23', 'x102') with explained_variance = 0

In [29]:
len(List2_2cvd_3_expv)

377

In [30]:
List3_2cvd_3_expv,List3_2cvd_2_expv,List3_2cvd_0_expv = combnfeats_MLR_evaby_crossval_metric(List2_2cvd_3_expv,df_x,df_y,10,model,'explained_variance',0.5)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x99', 'x23') with explained_variance = 0.5222792458513423
Significant!
For features ('x186', 'x23', 'x102') with explained_variance = 0.5503288558183445
Significant!
For features ('x186', 'x23', 'x98') with explained_variance = 0.5298456038849868
Significant!
For features ('x186', 'x23', 'x159') with explained_variance = 0.5253211408615328
Significant!
For features ('x49', 'x23', 'x159') with explained_variance = 0.5214017172192204
Significant!
For features ('x32', 'x99', 'x23') with explained_variance = 0.5937428793778471
Significant!
For features ('x32', 'x23', 'x102') with explained_variance = 0.5949406713422736
Significant!
For features ('x32', 'x23', 'x98') with explained_variance = 0.5774066315322011
Significant!
For features ('x32', 'x23', 'x162') with explained_variance = 0.5017406618950422
Significant!
For features ('x32', 'x23', 'x159') with explained_variance = 0.580

In [31]:
len(List3_2cvd_3_expv)

75

In [35]:
List4_2cvd_3_expv,List4_2cvd_2_expv,List4_2cvd_0_expv = combnfeats_MLR_evaby_crossval_metric(List3_2cvd_3_expv,df_x,df_y,20,model,'explained_variance',0.55)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with explained_variance = 0.5816623466395823
Significant!
For features ('x32', 'x23', 'x102') with explained_variance = 0.5912772698620405
Significant!
For features ('x32', 'x23', 'x98') with explained_variance = 0.5663428728700451
Significant!
For features ('x32', 'x23', 'x159') with explained_variance = 0.5619162846003578
Significant!
For features ('x32', 'x23', 'x158') with explained_variance = 0.5534181162514866
Significant!
For features ('x99', 'x23', 'x33') with explained_variance = 0.5560352519743311
Significant!
For features ('x23', 'x33', 'x102') with explained_variance = 0.5582265193081213


In [36]:
a,b,c = combnfeats_MLR_evaby_crossval_metric(List4_2cvd_3_expv,df_x,df_y,500,model,'explained_variance',0.55)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with explained_variance = 0.5678484407946495
Significant!
For features ('x32', 'x23', 'x102') with explained_variance = 0.584136948279881
Significant!
For features ('x32', 'x23', 'x98') with explained_variance = 0.5587627726194275
Significant!
For features ('x32', 'x23', 'x159') with explained_variance = 0.55668725228789
Significant!
For features ('x32', 'x23', 'x158') with explained_variance = 0.5514310933122439
Significant!
For features ('x23', 'x33', 'x102') with explained_variance = 0.5647944611966497


'x32', 'x23', 'x102'

Negative MSE

In [37]:
List_2cvd_3_nrmse,List_2cvd_2_nrmse,List_2cvd_0_nrmse = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'neg_root_mean_squared_error',-22)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x49', 'x23') with neg_root_mean_squared_error = -21.49641037860875
Significant!
For features ('x186', 'x32', 'x23') with neg_root_mean_squared_error = -20.083431386804826
Significant!
For features ('x186', 'x8', 'x23') with neg_root_mean_squared_error = -20.77001158854085
Significant!
For features ('x186', 'x2', 'x33') with neg_root_mean_squared_error = -21.101694181763914
Significant!
For features ('x186', 'x4', 'x23') with neg_root_mean_squared_error = -21.706545414298297
Significant!
For features ('x186', 'x99', 'x23') with neg_root_mean_squared_error = -19.494786643558733
Significant!
For features ('x186', 'x50', 'x23') with neg_root_mean_squared_error = -20.746343074166752
Significant!
For features ('x186', 'x188', 'x23') with neg_root_mean_squared_error = -21.693860709245875
Significant!
For features ('x186', 'x45', 'x23') with neg_root_mean_squared_error = -20.9230118069

In [38]:
len(List_2cvd_3_nrmse)

496

In [39]:
List2_2cvd_3_nrmse,List2_2cvd_2_nrmse,List2_2cvd_0_nrmse = combnfeats_MLR_evaby_crossval_metric(List_2cvd_3_nrmse,df_x,df_y,5,model,'neg_root_mean_squared_error',-20)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x99', 'x23') with neg_root_mean_squared_error = -19.494786643558733
Significant!
For features ('x186', 'x23', 'x33') with neg_root_mean_squared_error = -18.909058909898953
Significant!
For features ('x186', 'x23', 'x102') with neg_root_mean_squared_error = -18.578672790121608
Significant!
For features ('x186', 'x23', 'x98') with neg_root_mean_squared_error = -19.027677323630776
Significant!
For features ('x186', 'x23', 'x159') with neg_root_mean_squared_error = -19.47934607033745
Significant!
For features ('x49', 'x23', 'x33') with neg_root_mean_squared_error = -18.980699294901708
Significant!
For features ('x49', 'x23', 'x159') with neg_root_mean_squared_error = -19.68368509425745
Significant!
For features ('x32', 'x99', 'x23') with neg_root_mean_squared_error = -16.98404900822988
Significant!
For features ('x32', 'x50', 'x23') with neg_root_mean_squared_error = -19.2224607479

In [40]:
len(List2_2cvd_3_nrmse)

114

In [41]:
List3_2cvd_3_nrmse,List3_2cvd_2_nrmse,List3_2cvd_0_nrmse = combnfeats_MLR_evaby_crossval_metric(List2_2cvd_3_nrmse,df_x,df_y,10,model,'neg_root_mean_squared_error',-18)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with neg_root_mean_squared_error = -16.86261104842348
Significant!
For features ('x32', 'x23', 'x102') with neg_root_mean_squared_error = -16.981337084268457
Significant!
For features ('x32', 'x23', 'x98') with neg_root_mean_squared_error = -17.186978944364938
Significant!
For features ('x32', 'x23', 'x159') with neg_root_mean_squared_error = -17.289543565684955
Significant!
For features ('x32', 'x23', 'x158') with neg_root_mean_squared_error = -17.15711083825868
Significant!
For features ('x99', 'x23', 'x33') with neg_root_mean_squared_error = -17.747758296968858
Significant!
For features ('x23', 'x33', 'x102') with neg_root_mean_squared_error = -17.826557352864402
Significant!
For features ('x23', 'x33', 'x98') with neg_root_mean_squared_error = -17.89928133950422
Significant!
For features ('x23', 'x33', 'x158') with neg_root_mean_squared_error = -17.8600558387802

In [42]:
a,b,c= combnfeats_MLR_evaby_crossval_metric(List3_2cvd_3_nrmse,df_x,df_y,500,model,'neg_root_mean_squared_error',-18)

----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with neg_root_mean_squared_error = -16.830660157434497
Significant!
For features ('x32', 'x23', 'x102') with neg_root_mean_squared_error = -16.793635014345263
Significant!
For features ('x32', 'x23', 'x98') with neg_root_mean_squared_error = -17.06301570096567
Significant!
For features ('x32', 'x23', 'x159') with neg_root_mean_squared_error = -17.03779748224993
Significant!
For features ('x32', 'x23', 'x158') with neg_root_mean_squared_error = -17.172289222048907
Significant!
For features ('x99', 'x23', 'x33') with neg_root_mean_squared_error = -17.560090246670335
Significant!
For features ('x23', 'x33', 'x102') with neg_root_mean_squared_error = -17.54899133870348
Significant!
For features ('x23', 'x33', 'x98') with neg_root_mean_squared_error = -17.642704885256933
Significant!
For features ('x23', 'x33', 'x158') with neg_root_mean_squared_error = -17.6959972821707

'x32', 'x23', 'x102'

max error

In [43]:
List_2cvd_3_me,List_2cvd_2_me,List_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(feats,df_x,df_y,5,model,'max_error',-20)


----------Linear regression of n features evaby crossval starts----------


In [46]:
len(List_2cvd_0_me)

668

In [51]:
List2_2cvd_3_me,List2_2cvd_2_me,List2_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(List_2cvd_0_me,df_x,df_y,5,model,'max_error',-37)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x32', 'x23') with max_error = -35.371683154204824
Significant!
For features ('x186', 'x8', 'x23') with max_error = -36.74492717947379
Significant!
For features ('x186', 'x2', 'x33') with max_error = -36.180444682488485
Significant!
For features ('x186', 'x99', 'x23') with max_error = -35.66572964117487
Significant!
For features ('x186', 'x23', 'x39') with max_error = -36.92126953156729
Significant!
For features ('x186', 'x23', 'x33') with max_error = -33.19930753155171
Significant!
For features ('x186', 'x23', 'x98') with max_error = -36.62495378781877
Significant!
For features ('x186', 'x23', 'x159') with max_error = -35.747285778603946
Significant!
For features ('x49', 'x2', 'x33') with max_error = -36.2416480201177
Significant!
For features ('x49', 'x23', 'x33') with max_error = -33.344874185315376
Significant!
For features ('x49', 'x23', 'x159') with max_error = -35.4761993

In [52]:
len(List2_2cvd_3_me)

201

In [53]:
List3_2cvd_3_me,List3_2cvd_2_me,List3_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(List2_2cvd_3_me,df_x,df_y,10,model,'max_error',-36)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x32', 'x23') with max_error = -35.141455176083944
Significant!
For features ('x186', 'x23', 'x33') with max_error = -33.817030256876436
Significant!
For features ('x49', 'x23', 'x33') with max_error = -34.17644041871508
Significant!
For features ('x32', 'x8', 'x23') with max_error = -35.24328258577718
Significant!
For features ('x32', 'x2', 'x23') with max_error = -34.539313262758
Significant!
For features ('x32', 'x4', 'x23') with max_error = -35.04866418320002
Significant!
For features ('x32', 'x27', 'x23') with max_error = -35.701679513308214
Significant!
For features ('x32', 'x27', 'x41') with max_error = -35.85780553299511
Significant!
For features ('x32', 'x99', 'x23') with max_error = -30.50745226782563
Significant!
For features ('x32', 'x50', 'x23') with max_error = -35.81328959724969
Significant!
For features ('x32', 'x71', 'x23') with max_error = -35.38742348350519
Si

In [54]:
len(List3_2cvd_3_me)

101

In [56]:
List4_2cvd_3_me,List4_2cvd_2_me,List4_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(List3_2cvd_3_me,df_x,df_y,20,model,'max_error',-34)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x186', 'x23', 'x33') with max_error = -33.67384744102054
Significant!
For features ('x32', 'x2', 'x23') with max_error = -33.85163111918706
Significant!
For features ('x32', 'x99', 'x23') with max_error = -30.50871811819374
Significant!
For features ('x32', 'x23', 'x33') with max_error = -33.235722233067015
Significant!
For features ('x32', 'x23', 'x102') with max_error = -32.40546464667619
Significant!
For features ('x32', 'x23', 'x98') with max_error = -30.77212589906859
Significant!
For features ('x32', 'x23', 'x162') with max_error = -33.6604574523349
Significant!
For features ('x32', 'x23', 'x159') with max_error = -31.097548484318317
Significant!
For features ('x32', 'x23', 'x173') with max_error = -33.159160934502125
Significant!
For features ('x32', 'x23', 'x158') with max_error = -30.91490089740741
Significant!
For features ('x32', 'x23', 'x160') with max_error = -31.613820830

In [58]:
len(List4_2cvd_3_me)

35

In [59]:
List5_2cvd_3_me,List5_2cvd_2_me,List5_2cvd_0_me = combnfeats_MLR_evaby_crossval_metric(List4_2cvd_3_me,df_x,df_y,30,model,'max_error',-32)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with max_error = -29.738438709850566
Significant!
For features ('x32', 'x23', 'x102') with max_error = -31.94912585338481
Significant!
For features ('x32', 'x23', 'x98') with max_error = -29.987769915458365
Significant!
For features ('x32', 'x23', 'x159') with max_error = -30.167704677535205
Significant!
For features ('x32', 'x23', 'x158') with max_error = -30.156735394637998
Significant!
For features ('x32', 'x23', 'x160') with max_error = -31.158948382484436
Significant!
For features ('x99', 'x23', 'x33') with max_error = -31.315703083209325
Significant!
For features ('x23', 'x67', 'x33') with max_error = -31.95803811032669
Significant!
For features ('x23', 'x33', 'x98') with max_error = -31.773985246882933
Significant!
For features ('x23', 'x33', 'x159') with max_error = -31.511611898535772
Significant!
For features ('x23', 'x33', 'x158') with max_error = -31.952

In [60]:
a,b,c = combnfeats_MLR_evaby_crossval_metric(List5_2cvd_3_me,df_x,df_y,500,model,'max_error',-32)


----------Linear regression of n features evaby crossval starts----------
Significant!
For features ('x32', 'x99', 'x23') with max_error = -29.972558580173747
Significant!
For features ('x32', 'x23', 'x102') with max_error = -31.92578618016166
Significant!
For features ('x32', 'x23', 'x98') with max_error = -30.151293511739464
Significant!
For features ('x32', 'x23', 'x159') with max_error = -30.27243660650324
Significant!
For features ('x32', 'x23', 'x158') with max_error = -30.374302951355084
Significant!
For features ('x32', 'x23', 'x160') with max_error = -31.489471878703586
Significant!
For features ('x99', 'x23', 'x33') with max_error = -31.413102187662805
Significant!
For features ('x23', 'x67', 'x33') with max_error = -31.82143260278473
Significant!
For features ('x23', 'x33', 'x98') with max_error = -31.93491727222382
Significant!
For features ('x23', 'x33', 'x159') with max_error = -31.668267331721534


In [62]:
highest_ranked_combination_feats = List5_2cvd_3_me + List3_2cvd_3_nrmse + List4_2cvd_3_expv + List4_2cvd_3_nmae + List4_cvd_3_r2

In [64]:
# get rid of duplicates
highest_ranked_combination_feats = list(set(highest_ranked_combination_feats))
print(len(highest_ranked_combination_feats))

17


In [66]:
highest_ranked_combination_feats

[('x99', 'x23', 'x33'),
 ('x23', 'x33', 'x102'),
 ('x23', 'x33', 'x158'),
 ('x23', 'x33', 'x98'),
 ('x186', 'x23', 'x102'),
 ('x32', 'x23', 'x98'),
 ('x32', 'x23', 'x160'),
 ('x32', 'x99', 'x23'),
 ('x23', 'x6', 'x98'),
 ('x23', 'x33', 'x159'),
 ('x188', 'x33', 'x48'),
 ('x32', 'x23', 'x102'),
 ('x32', 'x23', 'x158'),
 ('x23', 'x175', 'x102'),
 ('x23', 'x6', 'x102'),
 ('x32', 'x23', 'x159'),
 ('x23', 'x67', 'x33')]