In [6]:
import sys
sys.path.append('./modules/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd


from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [7]:
from random_forest import RandomForest

In [3]:
# Load the dataset and separate the `is_canceled` column (used as the target)
# from the remaining columns (used as features).
data = pd.read_csv('hotel_data_review.csv')
y = data['is_canceled'].values
X = data.drop(columns=['is_canceled']).values

# Hold-out split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Implemented Random Forest

In [5]:
def cross_val_score(model, X,y):
    """Evaluate ``model`` with 5-fold cross-validation.

    For every fold produced by ``KFold(n_splits=5)`` the model is refitted on
    the training part, and ``model.predict_classes`` on the held-out part is
    scored with accuracy, precision and ROC AUC.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict_classes(X)``.
    X, y : containers that accept integer index arrays (``X[idx]``, ``y[idx]``).

    Returns
    -------
    tuple
        ``(mean accuracy, mean precision, mean ROC AUC)`` across the folds.
    """
    metrics = (accuracy_score, precision_score, roc_auc_score)
    per_fold = []

    for fit_idx, holdout_idx in KFold(n_splits=5).split(X):
        model.fit(X[fit_idx], y[fit_idx])
        y_true = y[holdout_idx]
        y_hat = model.predict_classes(X[holdout_idx])
        # One row per fold: [accuracy, precision, roc_auc].
        per_fold.append([metric(y_true, y_hat) for metric in metrics])

    # Transpose rows-per-fold into one column per metric before averaging.
    acc_col, prec_col, roc_col = zip(*per_fold)
    return np.mean(acc_col), np.mean(prec_col), np.mean(roc_col)
    
def check_retrain(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the train split and print train/test metrics side by side.

    Comparing the two columns shows how far the model's quality on the data it
    was fitted on differs from its quality on held-out data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict_classes(X)``.
    X_train, y_train : data the model is fitted on and scored against first.
    X_test, y_test : held-out data used for the second score of each pair.

    Returns
    -------
    None
        The three metric pairs are printed, not returned.
    """
    model.fit(X_train,y_train)
    y_pred_test = model.predict_classes(X_test)
    y_pred_train = model.predict_classes(X_train)
    # Ground truth goes first: the metric functions are called as
    # (y_true, y_pred). With the arguments swapped, precision becomes recall
    # and ROC AUC treats the predictions as if they were the labels.
    acc = accuracy_score(y_train,y_pred_train), accuracy_score(y_test,y_pred_test)
    prec = precision_score(y_train,y_pred_train), precision_score(y_test,y_pred_test)
    roc = roc_auc_score(y_train,y_pred_train), roc_auc_score(y_test,y_pred_test)
    print('Ошибки на Train --- Test выборках')
    # Second value must be the test score (index 1), not the train score again.
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')
    
def print_error_validation(accuracy,precission,roc_auc):
    """Print the three validation metrics, one per line.

    Parameters
    ----------
    accuracy : value printed on the ``accuracy`` line.
    precission : value printed on the ``precision`` line. The misspelled
        parameter name is kept so existing keyword callers keep working.
    roc_auc : value printed on the ``roc auc`` line.
    """
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    # Read the parameter itself; the name `precision` is not defined in this
    # function, so using it raised NameError or picked up an unrelated global.
    print(f'precision = {precission}')
    print(f'roc auc = {roc_auc}')

def search_parametrs(X,y,n_estimators_list,max_depth_list, min_size_list):
    """Grid-search ``RandomForest`` hyper-parameters by cross-validated ROC AUC.

    Every combination of the three candidate lists is scored with
    ``cross_val_score``; the combination with the highest mean ROC AUC wins.
    On ties the combination encountered first is kept.

    Parameters
    ----------
    X, y : data forwarded unchanged to ``cross_val_score``.
    n_estimators_list : iterable of candidate ``n_estimators`` values.
    max_depth_list : iterable of candidate ``max_depth`` values.
    min_size_list : iterable of candidate ``min_size`` values.

    Returns
    -------
    tuple
        ``(best_n_estimators, best_max_depth, best_min_size)``. All three are
        ``None`` when any candidate list is empty or no score compares
        greater than the starting value (e.g. every score is NaN).
    """
    best_max_depth = None
    best_min_size = None
    best_n_estimators = None
    best_roc = float('-inf')
    for n_est in n_estimators_list:
        for max_depth in max_depth_list:
            for min_size in min_size_list:
                model = RandomForest(n_estimators=n_est,max_depth=max_depth,min_size=min_size)
                _, _, roc_auc = cross_val_score(model,X,y)
                if roc_auc > best_roc:
                    # Remember the score as well as the parameters; without
                    # this every later combination would also "win".
                    best_roc = roc_auc
                    best_n_estimators, best_max_depth, best_min_size = n_est, max_depth,min_size
    return best_n_estimators, best_max_depth, best_min_size

### Настройка параметров

In [None]:
# NOTE(review): search_parametrs declares max_depth_list and min_size_list as
# required parameters, but this call passes neither, so it raises TypeError as
# written. Supply both candidate lists before running this cell.
n_estimators, max_depth,min_size = search_parametrs(X,y,n_estimators_list=np.arange(10,40,10))


In [None]:
# Show the selected hyper-parameters as a comma-separated triple.
print(n_estimators, max_depth, min_size, sep=',')

### Обучение 

In [None]:
# Build the forest from the hyper-parameters chosen by the search.
clf = RandomForest(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_size=min_size,
)