# Feature Selection / Dimensionality Reduction

In [2]:
import numpy as np
import pandas as pd
import pymatgen as mg

from sklearn import metrics, linear_model, ensemble, preprocessing, model_selection
from scipy import stats

import matplotlib.pyplot as plt
from matplotlib import rcParams
plt.style.use('seaborn')
rcParams.update({'figure.autolayout': True,
                 'xtick.top': True,
                 'xtick.direction': 'in',
                 'ytick.right': True,
                 'ytick.direction': 'in',
                 'font.sans-serif': 'Arial',
                 'font.size': 14,
                 'savefig.dpi': 300,
                 'figure.dpi': 96
                })
%matplotlib notebook

  from numpy.core.umath_tests import inner1d


## Reload the data from where we left off

In [3]:
df = pd.read_hdf('data/data.h5', key='df_full', mode='r')
df_test = pd.read_hdf('data/data.h5', key='df_test', mode='r')
X = pd.read_hdf('data/data.h5', key='X', mode='r')
X_test = pd.read_hdf('data/data.h5', key='X_test', mode='r')

y = np.array(df['Heat of Formation (kJ/mol H2)'], dtype=float)

In [4]:
df

Unnamed: 0,Material Class,Composition Formula,Hydrogen Weight Percent,Heat of Formation (kJ/mol H2),Temperature (˚C),Pressure (Atmospheres Absolute),Author Year,Reference Number,Comment1,Comment2,Comment3,composition
3,A2B,Th1.5Ce.5Al,0.4,133,650,0.0003,"Van Vucht, 1963",492,,,,"(Th, Ce, Al)"
4,A2B,Th2Al,0.8,130,500,0.001,"Van Vucht, 1963",492,,,,"(Th, Al)"
17,A2B,Ti2Cu,2.2,130,500,0.12,"Kadel, 1978",523,,,,"(Ti, Cu)"
29,A2B,Ti2Pd,1.4,90,,,"Kadel, 1978",523,,,,"(Ti, Pd)"
37,A2B,Zr2Cu,1.3,144,600,0.003,"Pebler, 1966",12,,,,"(Zr, Cu)"
38,A2B,Zr2Cu,1.3,142,700,0.02,"Kadel, 1979",429,,,,"(Zr, Cu)"
43,A2B,Zr2Ni,1.3,183,604,0.003,"Pebler, 1966",12,,,,"(Zr, Ni)"
69,A2B,Hf2Fe,1.1,70,277,0.38,"Aubertin, 1989",425,,,,"(Hf, Fe)"
87,A2B,Mg1.5Ni,2.6,29.1,300,3.7,"Kuji, 2002",1492,,Ball milled,,"(Mg, Ni)"
88,A2B,Mg1.75Ni,2.8,26.6,300,4.5,"Kuji, 2002",1492,,Ball milled,,"(Mg, Ni)"


In [5]:
X.shape

(542, 132)

In [6]:
print('Main frame size: {}\n'.format(df.shape),
      'test frame size: {}\n'.format(df_test.shape),
      'Feature Array size: {}\n'.format(X.shape),
      'Test Features size: {}'.format(X_test.shape))

Main frame size: (542, 12)
 test frame size: (50, 8)
 Feature Array size: (542, 132)
 Test Features size: (50, 132)


# Perform Recursive Feature Elimination on some models from the previous selection process

Namely, the random forest, gradient boosting, and bagging regressors

In [7]:
def parity_plot(y_true, y_pred, newfig=True, figsize=(5,5), lim=450, title=''):
    if newfig:
        plt.figure(figsize=figsize,
                   tight_layout=False)
        
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.plot((0,lim), (0,lim), linestyle='--', color='k')
    plt.xlim([-10,lim]); plt.ylim([-10,lim])
    plt.gca().set_aspect('equal','datalim')

    plt.xlabel(r'$\Delta H$')
    plt.ylabel(r'predicted $\Delta H$')
    plt.title(title + 'MAE: {} kJ/mol'.format(np.round(metrics.mean_absolute_error(y_true, y_pred),
                                               decimals=2)))

## Retrieve the ensemble models again

In [8]:
ens = pd.Series([s for s in vars(ensemble)['__all__'] if 'Classifier' not in s])
ens

0                  BaseEnsemble
1         RandomForestRegressor
2          RandomTreesEmbedding
3           ExtraTreesRegressor
4              BaggingRegressor
5               IsolationForest
6     GradientBoostingRegressor
7             AdaBoostRegressor
8                       bagging
9                        forest
10            gradient_boosting
11           partial_dependence
12              weight_boosting
dtype: object

In [9]:
df_ens = pd.DataFrame()
df_ens['name'] = ens.loc[[1,4,6]]
df_ens

Unnamed: 0,name
1,RandomForestRegressor
4,BaggingRegressor
6,GradientBoostingRegressor


In [143]:
from sklearn.base import clone

X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.2)

reg1 = ensemble.RandomForestRegressor(n_estimators=100)
plt.figure(figsize=(10,5), tight_layout=False)
plt.subplot(1,2,1)
parity_plot(y_train, reg1.fit(X_train,y_train).predict(X_train), newfig=False, title='Training ', lim=200)
plt.subplot(1,2,2)
parity_plot(y_val, reg1.fit(X_train,y_train).predict(X_val), newfig=False, title='Validation ', lim=200)
plt.gcf().suptitle('RF Regression with {} features'.format(X.shape[1]))

<IPython.core.display.Javascript object>

Text(0.5,0.98,'RF Regression with 132 features')

## Define an RFECV object

eval_no_fit is a custom method I wrote to return the raw data on the fitting for all feature sets, instead of trying to pick the best number of features

sklearn allows parallelization for this task

In [113]:
from sklearn.feature_selection import RFECV

estimator0 = ensemble.RandomForestRegressor(n_estimators=100, max_depth=5)
scorer = metrics.make_scorer(metrics.mean_absolute_error, greater_is_better=False)

rfe = RFECV(estimator0,
            cv=model_selection.KFold(5, shuffle=True),
            scoring=scorer,
            verbose=1,
            n_jobs=5)
rfe.eval_no_fit(X,y)

Fitting estimator with 132 features.
Fitting estimator with 132 features.
Fitting estimator with 132 features.
Fitting estimator with 132 features.
Fitting estimator with 132 features.
Fitting estimator with 131 features.
Fitting estimator with 131 features.
Fitting estimator with 131 features.
Fitting estimator with 131 features.
Fitting estimator with 131 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 130 features.
Fitting estimator with 129 features.
Fitting estimator with 129 features.
Fitting estimator with 129 features.
Fitting estimator with 129 features.
Fitting estimator with 129 features.
Fitting estimator with 128 features.
Fitting estimator with 128 features.
Fitting estimator with 128 features.
Fitting estimator with 128 features.
Fitting estimator with 128 features.
Fitting estimator with 127 features.
Fitting estimator with 127 features.
F

Fitting estimator with 88 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 87 features.
Fitting estimator with 87 features.
Fitting estimator with 87 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 86 features.
Fitting estimator with 86 features.
Fitting estimator with 86 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 85 features.
Fitting estimator with 85 features.
Fitting estimator with 85 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator with 84 features.
Fitting estimator with 84 features.
Fitting estimator with 84 features.
Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator with 83 features.
Fitting estimator with 83 features.
Fitting estimator with 83 features.
Fitting estimator with 83 features.
Fitting estimator with 82 fe

Fitting estimator with 41 features.
Fitting estimator with 42 features.
Fitting estimator with 43 features.
Fitting estimator with 41 features.
Fitting estimator with 42 features.
Fitting estimator with 40 features.
Fitting estimator with 41 features.
Fitting estimator with 42 features.
Fitting estimator with 40 features.
Fitting estimator with 41 features.
Fitting estimator with 39 features.
Fitting estimator with 40 features.
Fitting estimator with 41 features.
Fitting estimator with 39 features.
Fitting estimator with 40 features.
Fitting estimator with 38 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 37 features.
Fitting estimator with 37 features.
Fitting estimator with 38 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 36 features.
Fitting estimator with 36 features.
Fitting estimator with 37 fe

RFECV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
   estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
   n_jobs=5,
   scoring=make_scorer(mean_absolute_error, greater_is_better=False),
   step=1, verbose=1)

## Get feature ranks from each of the five CV splits

Then average the ranks

In [114]:
df_rfe = pd.DataFrame([{'ranks': np.array(r.ranking_),
                        'scores': np.array(r.scores_)} for r in rfe.rfe_objs])
df_rfe

Unnamed: 0,ranks,scores
0,"[77, 58, 39, 44, 11, 25, 59, 51, 46, 13, 30, 2...","[-8.831099758625367, -8.85925324002213, -9.001..."
1,"[55, 74, 41, 49, 54, 71, 75, 39, 24, 17, 43, 1...","[-8.447942149280417, -8.915507678918482, -8.55..."
2,"[70, 71, 63, 56, 20, 39, 54, 11, 29, 9, 31, 50...","[-9.454178417542753, -9.57031540502959, -9.573..."
3,"[42, 56, 59, 40, 19, 70, 66, 17, 28, 14, 44, 3...","[-9.229331783454162, -8.844418123428541, -9.27..."
4,"[65, 53, 36, 24, 8, 57, 54, 38, 30, 15, 23, 17...","[-11.970187086372073, -11.818234129353883, -11..."


In [115]:
scores_mean = np.flip(np.mean(np.vstack(df_rfe.scores),axis=0))
scores_std = np.flip(np.std(np.vstack(df_rfe.scores),axis=0))

In [116]:
plt.figure()
plt.plot(np.arange(scores_mean.shape[0])+1,
         scores_mean)

plt.fill_between(np.arange(scores_mean.shape[0])+1,
                 scores_mean-scores_std,
                 scores_mean+scores_std,
                 alpha=0.4)

plt.xlabel('Number of Features')
plt.ylabel('Negative MAE (5-fold CV)')
plt.title('Recursive Feature Elimination on RF Regression')

<IPython.core.display.Javascript object>

Text(0.5,1,'Recursive Feature Elimination on RF Regression')

12 features is all you need!

In [133]:
ranks_mean = np.flip(np.mean(np.vstack(df_rfe.ranks),axis=0))
ranks_std = np.flip(np.std(np.vstack(df_rfe.ranks),axis=0))

df_ranks = pd.DataFrame({'avg_rank':ranks_mean,
                         'std_rank':ranks_std,
                         'feat_imp':estimator0.fit(X,y).feature_importances_},
                        index=X.columns)
df_ranks.sort_values('avg_rank', ascending=False)[:12]

Unnamed: 0,avg_rank,std_rank,feat_imp
minimum Row,127.8,6.079474,0.000199
mode NdValence,126.4,3.929377,0.003923
mode Row,126.0,4.049691,0.000918
avg_dev NsUnfilled,125.2,3.709447,0.00041
minimum Electronegativity,125.2,7.304793,0.003206
mode NValence,123.8,5.491812,0.000509
mode Electronegativity,122.8,5.268776,0.009151
minimum AtomicWeight,122.0,4.816638,0.000705
minimum NValence,121.4,6.770524,0.001645
mode AtomicWeight,121.2,7.194442,0.001146


In [118]:
# Generic Model Evaluation Function...

def eval_model(estimator, X, y, hypes=None, n_splits=5, print_name=False):
    
    if print_name:
        print(estimator)
    
    cv = model_selection.KFold(n_splits=n_splits, shuffle=True)

    score = pd.DataFrame(columns=pd.MultiIndex.from_product([['train', 'val'], ['mae', 'pearson', 'spearman']]))
    
    for idx, (train, val) in enumerate(cv.split(X)):
        
        model_i = estimator(**hypes) # Would eventually like to be able to pass hyperparameters here
        model_i.fit(X.iloc[train], y[train])

        # MAE
        score.loc[idx, ('train','mae')] = metrics.mean_absolute_error(y[train], model_i.predict(X.iloc[train]))
        score.loc[idx, ('val','mae')] = metrics.mean_absolute_error(y[val], model_i.predict(X.iloc[val]))

        # pearson -- print just the coefficient; throw away the p-value
        score.loc[idx, ('train','pearson')] = stats.pearsonr(y[train], model_i.predict(X.iloc[train]))[0]
        score.loc[idx, ('val','pearson')] = stats.pearsonr(y[val], model_i.predict(X.iloc[val]))[0]

        # spearman -- print just the coefficient; throw away the p-value
        score.loc[idx, ('train','spearman')] = stats.spearmanr(y[train], model_i.predict(X.iloc[train]))[0]
        score.loc[idx, ('val','spearman')] = stats.spearmanr(y[val], model_i.predict(X.iloc[val]))[0]
        
    model_i.fit(X, y)
    
    return score, model_i

## Evaluate sets of 12 features of progressively lower "rank"

In [120]:
mae_sets = []
mae_sets_std = []
num_feats = 12

for i in range(X.shape[1]-num_feats):
    print(i)
    X_reduced = X[df_ranks.sort_values('avg_rank', ascending=False)[i:i+num_feats].index.tolist()]
    ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                        X_reduced,
                        y,
                        hypes={'n_estimators':100,
                               'max_depth':5})

    mae_sets.append(ss[('val','mae')].mean())
    mae_sets_std.append(ss[('val','mae')].std())

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119


In [121]:
X_reduced.head()

Unnamed: 0,minimum MeltingT,mean NpUnfilled,mode NdUnfilled,maximum MeltingT,range GSmagmom,maximum NpValence,maximum CovalentRadius,range NUnfilled,range NpUnfilled,maximum NUnfilled,range CovalentRadius,range MeltingT
3,933.47,1.666667,8.0,2023.0,0.0,1.0,206.0,17.0,5.0,22.0,85.0,1089.53
4,933.47,1.666667,8.0,2023.0,0.0,1.0,206.0,3.0,5.0,8.0,85.0,1089.53
17,1357.77,0.0,8.0,1941.0,2.3e-05,0.0,160.0,7.0,0.0,8.0,28.0,583.23
29,1828.05,0.0,8.0,1941.0,2.3e-05,0.0,160.0,8.0,0.0,8.0,21.0,112.95
37,1357.77,0.0,8.0,2128.0,0.0,0.0,175.0,7.0,0.0,8.0,43.0,770.23


In [122]:
df_ranks.sort_values('avg_rank', ascending=True)[:12]

Unnamed: 0,avg_rank,std_rank
range NdUnfilled,1.4,0.8
range MeltingT,2.2,0.4
range CovalentRadius,3.2,2.039608
maximum NUnfilled,5.0,2.097618
range NpUnfilled,6.8,0.748331
range NUnfilled,8.6,4.176123
maximum CovalentRadius,8.8,3.059412
maximum NpValence,9.8,5.344156
range GSmagmom,13.6,2.6533
maximum MeltingT,13.8,3.310589


In [123]:
mae_sets = np.array(mae_sets); mae_sets_std = np.array(mae_sets_std)

plt.figure()
plt.plot(np.arange(len(mae_sets))+1,
         mae_sets)

plt.fill_between(np.arange(len(mae_sets))+1,
                 mae_sets-mae_sets_std,
                 mae_sets+mae_sets_std,
                 alpha=0.4)

plt.xlabel('Fit using features [x : x+12]'.format(num_feats))
plt.ylabel('MAE (5-fold CV)')
plt.title('Regression Performance on Progressively Lower-ranked Feature Sets')

<IPython.core.display.Javascript object>

Text(0.5,1,'Regression Performance on Progressively Lower-ranked Feature Sets')

## Check out that model at X=32:44

In [129]:
X_reduced = X[df_ranks.sort_values('avg_rank', ascending=False)[32:44].index.tolist()] #[best_feat_set]
ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                    X_reduced,
                    y,
                    hypes={'n_estimators':100,
                           'max_depth':5})

mm

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [135]:
from sklearn.base import clone

X_train, X_val, y_train, y_val = model_selection.train_test_split(X_reduced, y, test_size=0.2)

reg1 = clone(mm)
plt.figure(figsize=(10,5), tight_layout=False)
plt.subplot(1,2,1)
parity_plot(y_train, reg1.fit(X_train,y_train).predict(X_train), newfig=False, title='Training ', lim=200)
plt.subplot(1,2,2)
parity_plot(y_val, reg1.fit(X_train,y_train).predict(X_val), newfig=False, title='Validation ', lim=200)
plt.gcf().suptitle('RF Regression with {} features'.format(X_reduced.shape[1]))

<IPython.core.display.Javascript object>

Text(0.5,0.98,'RF Regression with 12 features')

## Evaluate sets of 6 features of progressively lower "rank"

In [125]:
mae_sets = []
mae_sets_std = []
num_feats = 6

for i in range(X.shape[1]-num_feats):
    print(i)
    X_reduced = X[df_ranks.sort_values('avg_rank', ascending=False)[i:i+num_feats].index.tolist()]
    ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                        X_reduced,
                        y,
                        hypes={'n_estimators':100,
                               'max_depth':5})

    mae_sets.append(ss[('val','mae')].mean())
    mae_sets_std.append(ss[('val','mae')].std())

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125


In [126]:
mae_sets = np.array(mae_sets); mae_sets_std = np.array(mae_sets_std)

plt.figure()
plt.plot(np.arange(len(mae_sets))+1,
         mae_sets)

plt.fill_between(np.arange(len(mae_sets))+1,
                 mae_sets-mae_sets_std,
                 mae_sets+mae_sets_std,
                 alpha=0.4)

plt.xlabel('Fit using features [x : x+{}]'.format(num_feats))
plt.ylabel('MAE (5-fold CV)')
plt.title('Regression Performance on Progressively Lower-ranked Feature Sets')

<IPython.core.display.Javascript object>

Text(0.5,1,'Regression Performance on Progressively Lower-ranked Feature Sets')

## Evaluate sets of 4 features of progressively lower "rank"

In [23]:
mae_sets = []
mae_sets_std = []
num_feats = 4

for i in range(X.shape[1]-num_feats):
    print(i)
    X_reduced = X[df_ranks.sort_values('avg_rank', ascending=False)[i:i+num_feats].index.tolist()]
    ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                        X_reduced,
                        y,
                        hypes={'n_estimators':100,
                               'max_depth':5})

    mae_sets.append(ss[('val','mae')].mean())
    mae_sets_std.append(ss[('val','mae')].std())

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


In [24]:
mae_sets = np.array(mae_sets); mae_sets_std = np.array(mae_sets_std)

plt.figure()
plt.plot(np.arange(len(mae_sets))+1,
         mae_sets)

plt.fill_between(np.arange(len(mae_sets))+1,
                 mae_sets-mae_sets_std,
                 mae_sets+mae_sets_std,
                 alpha=0.4)

plt.xlabel('Fit using features [x : x+{}]'.format(num_feats))
plt.ylabel('MAE (5-fold CV)')
plt.title('Regression Performance on Progressively Lower-ranked Feature Sets')

<IPython.core.display.Javascript object>

Text(0.5,1,'Regression Performance on Progressively Lower-ranked Feature Sets')

In [31]:
df_feat_sets = pd.DataFrame({'mae_4':mae_sets})
df_feat_sets.sort_values('mae_4', ascending=True).head()

Unnamed: 0,mae_4
15,9.577333
35,9.764157
78,9.794167
16,9.971935
34,9.977659


In [32]:
i=15; num_feats=4;
df_ranks.sort_values('avg_rank', ascending=False)[i:i+num_feats]

Unnamed: 0,avg_rank,std_rank
avg_dev NValence,115.8,1.720465
range AtomicWeight,115.4,5.535341
avg_dev Electronegativity,114.2,3.867816
mean Electronegativity,113.0,3.847077


## Evalute every combination of 4 features around the minimum at feature set 15

In [34]:
from itertools import combinations

In [127]:
mae_sets = []
mae_sets_std = []
feat_start = 30; feat_stop = 38;
feat_list_reduced = df_ranks.sort_values('avg_rank', ascending=False)[feat_start:feat_stop].index.tolist()
num_feats = 4

df_combi_res = pd.DataFrame()

for feat_set in combinations(feat_list_reduced, num_feats):
    print(feat_set)
    X_reduced = X[list(feat_set)]
    ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                        X_reduced,
                        y,
                        hypes={'n_estimators':100})

    df_combi_res = \
        df_combi_res.append(pd.DataFrame({'feat_set': [feat_set],
                                          'mae': (ss[('val','mae')].mean()),
                                          'mae_std': ss[('val','mae')].std()}),
                            ignore_index=True)

('mode NsUnfilled', 'mode NUnfilled', 'mode NfUnfilled', 'minimum Number')
('mode NsUnfilled', 'mode NUnfilled', 'mode NfUnfilled', 'avg_dev NfUnfilled')
('mode NsUnfilled', 'mode NUnfilled', 'mode NfUnfilled', 'mode Number')
('mode NsUnfilled', 'mode NUnfilled', 'mode NfUnfilled', 'mean NsValence')
('mode NsUnfilled', 'mode NUnfilled', 'mode NfUnfilled', 'mean NsUnfilled')
('mode NsUnfilled', 'mode NUnfilled', 'minimum Number', 'avg_dev NfUnfilled')
('mode NsUnfilled', 'mode NUnfilled', 'minimum Number', 'mode Number')
('mode NsUnfilled', 'mode NUnfilled', 'minimum Number', 'mean NsValence')
('mode NsUnfilled', 'mode NUnfilled', 'minimum Number', 'mean NsUnfilled')
('mode NsUnfilled', 'mode NUnfilled', 'avg_dev NfUnfilled', 'mode Number')
('mode NsUnfilled', 'mode NUnfilled', 'avg_dev NfUnfilled', 'mean NsValence')
('mode NsUnfilled', 'mode NUnfilled', 'avg_dev NfUnfilled', 'mean NsUnfilled')
('mode NsUnfilled', 'mode NUnfilled', 'mode Number', 'mean NsValence')
('mode NsUnfilled', 'm

In [128]:
df_combi_res.sort_values('mae').head()

Unnamed: 0,feat_set,mae,mae_std
36,"(mode NUnfilled, mode NfUnfilled, minimum Numb...",11.554611,1.729804
28,"(mode NsUnfilled, minimum Number, mode Number,...",11.631038,1.363831
6,"(mode NsUnfilled, mode NUnfilled, minimum Numb...",11.707789,1.573606
45,"(mode NUnfilled, minimum Number, avg_dev NfUnf...",11.733471,1.294464
49,"(mode NUnfilled, minimum Number, mode Number, ...",11.7719,0.978801


## Test the best feature set for correlations with itself and target

In [71]:
import seaborn as sns
best_feat_set = list(df_combi_res.sort_values('mae')['feat_set'].iloc[0])
df_feat_cor = pd.concat([X[best_feat_set],
                         pd.DataFrame({'y':y}, index=X.index)],
                        axis=1)
plt.figure(figsize=(6,5))
sns.heatmap(df_feat_cor.corr().abs(),
            annot=True, fmt='.1f', vmin=0, vmax=1, annot_kws={"size": 12})

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x12c4bf438>

## Finally, let's dig into the best model with only 4 features

In [129]:
X_reduced = X[df_ranks.sort_values('avg_rank', ascending=False)[32:44].index.tolist()] #[best_feat_set]
ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                    X_reduced,
                    y,
                    hypes={'n_estimators':100,
                           'max_depth':5})

mm

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [131]:
from sklearn.base import clone

X_train, X_val, y_train, y_val = model_selection.train_test_split(X_reduced, y, test_size=0.2)

reg1 = clone(mm)
plt.figure(figsize=(10,5), tight_layout=False)
plt.subplot(1,2,1)
parity_plot(y_train, reg1.fit(X_train,y_train).predict(X_train), newfig=False, title='Training ')
plt.subplot(1,2,2)
parity_plot(y_val, reg1.fit(X_train,y_train).predict(X_val), newfig=False, title='Validation ')
plt.gcf().suptitle('RF Regression with 4 features')

<IPython.core.display.Javascript object>

Text(0.5,0.98,'RF Regression with 4 features')

In [108]:
from sklearn.externals.six import StringIO  
from PIL import Image  
from sklearn.tree import export_graphviz
import pydotplus

tree_num = 0
dot_data = StringIO()
export_graphviz(reg1.estimators_[tree_num], out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,
                feature_names=X_train.columns)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph_file = 'images/tree_{}.png'.format(tree_num)
graph.write_png(graph_file)
Image.open(graph_file)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [None]:
X_reduced = X[df_ranks.sort_values('avg_rank', ascending=False)[1:13].index.tolist()]
ss, mm = eval_model(getattr(ensemble, df_ens.loc[1,'name']),
                    X_reduced,
                    y,
                    hypes={'n_estimators':100})

ss

In [None]:
X.to_csv('X.csv', index=False)

In [None]:
pd.Series(y, name='y').to_csv('y.csv')