In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv('new_data.csv')

In [3]:
X = dataset.drop(axis=1, columns=['Y'])
y = dataset['Y']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10147, 67), (2537, 67), (10147,), (2537,))

### Bayesian Optimization

In [5]:
#Space set close to values estimated by TPOT classifier
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'bootstrap': hp.choice('bootstrap', [True, False]),
        'max_features': hp.choice('max_features', [0.3, 0.4, 0.5, 0.6]),
        #'min_samples_leaf': hp.choice('min_samples_leaf', [0.4, 0.6, 0.8, 1.0]), #hp.uniform('min_samples_leaf', 0.5, 1),
        'min_samples_split' : hp.choice ('min_samples_split', [7, 8, 9, 10, 11]),
        'n_estimators' : hp.choice('n_estimators', [150, 200, 250, 300, 350])
    }
space

{'criterion': <hyperopt.pyll.base.Apply at 0x2d831538340>,
 'bootstrap': <hyperopt.pyll.base.Apply at 0x2d831538460>,
 'max_features': <hyperopt.pyll.base.Apply at 0x2d8315385b0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x2d8315387f0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x2d831538a00>}

In [6]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], bootstrap = space['bootstrap'],
                                 max_features = space['max_features'],
                                 #min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [7]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|█████████████████████████████████████████████| 80/80 [1:40:19<00:00, 75.24s/trial, best loss: -0.7442610230475835]


{'bootstrap': 1,
 'criterion': 0,
 'max_features': 0,
 'min_samples_split': 4,
 'n_estimators': 2}

In [8]:
crit = {0: 'entropy', 1: 'gini'}
bootstrap = {0: True, 1: False}
feat = {0: 0.15, 1: 0.20, 2: 0.25, 3: 0.3, 4: 0.35, 5: 0.4, 6: 0.45, 7: 0.5}
split = {0: 5, 1: 6, 2: 7, 3: 8, 4: 9}
est = {0: 10, 1: 50, 2: 300, 3: 750}
#leaf = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}


print("criterion:",crit[best['criterion']])
print("bootstrap:",bootstrap[best['bootstrap']])
print("max_features:",feat[best['max_features']])
print("n_estimators:",est[best['n_estimators']])
print("min_samples_split:",split[best['min_samples_split']])

criterion: entropy
bootstrap: False
max_features: 0.15
n_estimators: 300
min_samples_split: 9


In [9]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], bootstrap = bootstrap[best['bootstrap']], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = 1, 
                                       min_samples_split = split[best['min_samples_split']], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)

In [10]:
predictionforest = trainedforest.predict(X_test)

print("Accuracy:",accuracy_score(y_test,predictionforest)) 
print("F1:",f1_score(y_test,predictionforest))

Accuracy: 0.7272368939692551
F1: 0.7693333333333333


In [11]:
print(classification_report(y_test,predictionforest))

              precision    recall  f1-score   support

         0.0       0.71      0.63      0.67      1101
         1.0       0.74      0.80      0.77      1436

    accuracy                           0.73      2537
   macro avg       0.72      0.72      0.72      2537
weighted avg       0.73      0.73      0.72      2537

