In [2]:
import sys
sys.path.append('./modules/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from decision_tree import DecisionTree

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [3]:
data = pd.read_csv('./data/hotel_data_review.csv')
X = data.drop(['is_canceled'],axis=1).values
y = data['is_canceled'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# Implemetned Decision Tree

In [19]:
def cross_val_score(model, X,y):
    acc_scores = []
    prec_scores = []
    roc_scores = []
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test,y_pred)
        prec_score = precision_score(y_test,y_pred)
        roc_score = roc_auc_score(y_test,y_pred)
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score
    

def check_retrain(model,X_train,X_test,y_train,y_test):

    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на Train --- Test выборках')
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')
    
def print_error_validation(accuracy,precission,roc_auc):
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    print(f'precision = {precision}')
    print(f'roc auc = {roc_auc}')
def search_parametrs(X,y,max_depth_list, min_size_list):
    best_max_depth = None
    best_min_size = None
    best_roc = -10
    
    for max_depth in max_depth_list:
        for min_size in min_size_list:
            model = DecisionTree(max_depth=max_depth,min_size=min_size)
            accuracy,precision,roc_auc = cross_val_score(model,X,y)
            if roc_auc > best_roc:
                best_max_depth, best_min_size = max_depth,min_size
    return best_max_depth, best_min_size

### Настройка параметров

In [20]:
max_depth, min_size = search_parametrs(X,y,max_depth_list=np.arange(7,10,1), min_size_list=np.arange(1,3,1))

In [21]:
print(f'{max_depth}, {min_size}')

9, 2


### Обучение 

In [22]:
dt = DecisionTree(max_depth=max_depth,min_size=min_size)

In [23]:
accuracy,precision,roc_auc = cross_val_score(dt,X,y)

In [24]:
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.781
precision = 0.6394400207177451
roc auc = 0.7510276252314176


In [25]:
check_retrain(dt, X_train, X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.9093333333333333 --- 0.832
precision = 0.948905109489051 --- 0.8163265306122449
roc auc = 0.8979576929819435 --- 0.8229715489989462


### Sklearn Decision Tree

In [5]:
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

### Настройка параметров

In [6]:
clf = tree.DecisionTreeClassifier()
parameters = {'max_depth':np.arange(1,10,1), 'min_samples_leaf':np.arange(1,10,1)}
clf_cv = GridSearchCV(clf, parameters)


In [7]:
clf_cv.fit(X_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 

In [8]:
clf_cv.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [14]:
max_depth, min_samples_leaf = 4,4

In [15]:
clf = tree.DecisionTreeClassifier(max_depth = max_depth,min_samples_leaf=min_samples_leaf)

In [16]:
scores = cross_validate(clf,X,y,cv=5,scoring=scoring)

In [17]:
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.22300000000000003
test_precision_macro = 0.12443900732472804
test_roc_auc = 0.19829515229515232


In [18]:
check_retrain(clf,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.7573333333333333 --- 0.684
precision = 0.6642335766423357 --- 0.5408163265306123
roc auc = 0.7383245139059808 --- 0.6665608913334744
