In [35]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.metrics import Recall

from _all_saxes import x_train, x_test, y_train, y_test, \
    NUM_COMPONENTS, RANDOM_STATE as RANDOM_SEED

In [26]:
clf = RandomForestClassifier(n_estimators=400)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.825


In [27]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)


Unnamed: 0,0,1
0,316,139
1,57,608


In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.69      0.76       455
           1       0.81      0.91      0.86       665

    accuracy                           0.82      1120
   macro avg       0.83      0.80      0.81      1120
weighted avg       0.83      0.82      0.82      1120



In [33]:
clf.get_params()  # deep=True has no effect

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Parameters
p = {'n_estimators': [100, 150, 200, 250, 300, 350, 400],
     'max_features': [NUM_COMPONENTS - 2, NUM_COMPONENTS - 3, 
                      NUM_COMPONENTS - 4, NUM_COMPONENTS - 5,
                      NUM_COMPONENTS - 6, NUM_COMPONENTS - 7],
     'max_depth': [50, 150, 200, 250, 300, 350, 400]}

# RandomForestClassifier(n_estimators=100, *, criterion='gini', 
# max_depth=None, min_samples_split=2, min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, max_features='auto', 
# max_leaf_nodes=None, min_impurity_decrease=0.0, 
# min_impurity_split=None, bootstrap=True, oob_score=False, 
# n_jobs=None, random_state=None, verbose=0, warm_start=False, 
# class_weight=None, ccp_alpha=0.0, max_samples=None
estimator = RandomForestClassifier()
# GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, 
#              iid='deprecated', refit=True, cv=None, verbose=0, 
#              pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
clf = GridSearchCV(estimator, p, n_jobs=-1)
search = clf.fit(x_train, y_train)
search.best_params_

{'max_depth': 400, 'max_features': 29, 'n_estimators': 300}

In [37]:
# RandomForestClassifier(n_estimators=100, *, criterion='gini', 
# max_depth=None, min_samples_split=2, min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, max_features='auto', 
# max_leaf_nodes=None, min_impurity_decrease=0.0, 
# min_impurity_split=None, bootstrap=True, oob_score=False, 
# n_jobs=None, random_state=None, verbose=0, warm_start=False, 
# class_weight=None, ccp_alpha=0.0, max_samples=None
clf = RandomForestClassifier(n_estimators=search.best_params_['n_estimators'],
                             max_features=search.best_params_['max_features'],
                             max_depth=search.best_params_['max_depth'])
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8214285714285714


In [38]:
clf.feature_importances_

array([0.05817672, 0.12287235, 0.06668483, 0.05994778, 0.01982447,
       0.04156552, 0.04780975, 0.03893956, 0.09972629, 0.03176279,
       0.04992773, 0.02219599, 0.02223648, 0.01748192, 0.0152686 ,
       0.02440251, 0.02072069, 0.01748246, 0.01425047, 0.01667815,
       0.01755437, 0.01686146, 0.01375813, 0.01655989, 0.01739165,
       0.01693799, 0.01474808, 0.0161832 , 0.01775151, 0.01365215,
       0.01530535, 0.01534118])

In [39]:
clf.decision_path(x_test)

(<1120x219486 sparse matrix of type '<class 'numpy.longlong'>'
 	with 4536702 stored elements in Compressed Sparse Row format>,
 array([     0,    745,   1452,   2177,   2918,   3639,   4392,   5085,
          5780,   6517,   7252,   7973,   8728,   9471,  10184,  10973,
         11738,  12441,  13212,  13955,  14648,  15355,  16104,  16809,
         17508,  18237,  18988,  19765,  20556,  21275,  22014,  22739,
         23496,  24263,  25004,  25697,  26388,  27107,  27818,  28553,
         29290,  30055,  30784,  31507,  32248,  32995,  33728,  34441,
         35170,  35899,  36630,  37357,  38076,  38791,  39518,  40243,
         40960,  41663,  42408,  43113,  43814,  44533,  45250,  45993,
         46728,  47405,  48132,  48845,  49590,  50363,  51086,  51823,
         52550,  53299,  54036,  54769,  55498,  56273,  57006,  57739,
         58506,  59255,  59990,  60733,  61458,  62243,  62998,  63711,
         64468,  65221,  65968,  66717,  67418,  68121,  68848,  69535,
        

In [40]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,329,126
1,74,591


In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.72      0.77       455
           1       0.82      0.89      0.86       665

    accuracy                           0.82      1120
   macro avg       0.82      0.81      0.81      1120
weighted avg       0.82      0.82      0.82      1120

