In [61]:
import sys
sys.path.append('./modules/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd


from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [62]:
from random_forest import RandomForest

In [63]:
data = pd.read_csv('hotel_data_review.csv')
X = data.drop(['is_canceled'],axis=1).values
y = data['is_canceled'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# Implemetned Random Forest

In [64]:
def cross_val_score(model, X,y):
    acc_scores = []
    prec_scores = []
    roc_scores = []
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test,y_pred)
        prec_score = precision_score(y_test,y_pred)
        roc_score = roc_auc_score(y_test,y_pred)
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score
    
def check_retrain(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на Train --- Test выборках')
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')
    
def print_error_validation(accuracy,precission,roc_auc):
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    print(f'precision = {precision}')
    print(f'roc auc = {roc_auc}')

def search_parametrs(X,y,n_estimators_list,max_depth_list, min_size_list):
    best_max_depth = None
    best_min_size = None
    best_n_estimators = None
    best_roc = -10
    for n_est in n_estimators_list:
        for max_depth in max_depth_list:
            for min_size in min_size_list:
                model = RandomForest(n_estimators=n_est,max_depth=max_depth,min_size=min_size)
                accuracy,precision,roc_auc = cross_val_score(model,X,y)
                if roc_auc > best_roc:
                    best_n_estimators, best_max_depth, best_min_size = n_est, max_depth,min_size
    return best_n_estimators, best_max_depth, best_min_size

### Настройка параметров

In [65]:
n_estimators, max_depth,min_size = search_parametrs(X,y,n_estimators_list=np.arange(10,40,10),max_depth_list=np.arange(7,10,1), min_size_list=np.arange(6,9,1))

5

In [66]:
print(f'{n_estimators},{max_depth},{min_size}')

30,9,8


### Обучение 

In [67]:
clf =  RandomForest(n_estimators=n_estimators,max_depth=max_depth,min_size=min_size)


In [68]:
accuracy,precision,roc_auc = cross_val_score(clf,X,y)

5

In [70]:
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
test_accuracy = 0.621920452312341
test_precision_macro = 0.6029737105847312
test_roc_auc = 0.6539481925759321


In [71]:
check_retrain(clf,X_train,X_test, y_train, y_test)


Ошибки на Train --- Test выборках
accuracy = 0.7078924157329591 --- 0.6827543285746312
precision = 0.5912341746290322 --- 0.5702831820385739
roc auc = 0.7223746321823471 --- 0.7194832930575837


# Sklearn Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

### Посик параметров

In [52]:
clf = RandomForestClassifier()
parameters = {'n_estimators':np.arange(10,40,10),'max_depth':np.arange(5,10,1), 'min_samples_leaf':np.arange(5,10,1)}
clf_cv = GridSearchCV(clf, parameters)


In [53]:
clf_cv.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [54]:
clf_cv.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=6, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [55]:
n_estimators,max_depth, min_sample_leaf = 20,9,7

### Обучение

In [56]:
clf = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth, min_samples_leaf=min_sample_leaf)

In [57]:
scores = cross_validate(clf,X,y,cv=5,scoring=scoring)

In [58]:
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.6564142996993695
test_precision_macro = 0.6272471830054627
test_roc_auc = 0.6884146537635737


In [59]:
check_retrain(clf,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.7670645079868749 --- 0.7649455541934088
precision = 0.5664436697132448 --- 0.5604797151691183
roc auc = 0.7660907539864598 --- 0.7652482056655239
