In [1]:
import sys
sys.path.append('./modules/')
import warnings
warnings.simplefilter("ignore")


import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from knn_classificator import KNNClassificator
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score




In [2]:
def check_retrain(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на Train --- Test выборках')
    print(f'accuracy = {acc[0]} --- {acc[1]}')
    print(f'precision = {prec[0]} --- {prec[1]}' )
    print(f'roc auc = {roc[0]} --- {roc[1]}')

In [3]:
data = pd.read_csv('./data/hotel_data_review.csv')
X = data.drop(['is_canceled'],axis=1).values
y = data['is_canceled'].values
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)

# Implemented KNNClassificator

In [4]:
from knn import KNN

In [5]:
def search_n_neighbors(X,y,n_neighbors_list):
    best_n = None
    best_roc = -10
    for n_n in n_neighbors_list:
        model = KNN(n_neighbors=n_n)
        accuracy,precision,roc_auc = cross_val_score(model,X,y)
        if roc_auc > best_roc:
            best_n = n_n
            
    return best_n
def cross_val_score(model, X, y):
    acc_scores = []
    prec_scores = []
    roc_scores = []

    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test, y_pred)
        prec_score = precision_score(y_test, y_pred)
        roc_score = roc_auc_score(y_test, y_pred)
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score
    
def print_error_validation(accuracy,precission,roc_auc):
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    print(f'precision = {precision}')
    print(f'roc auc = {roc_auc}')

### Настройка параметров

In [6]:
n_neighbors = search_n_neighbors(X,y,np.arange(9,12,2))

In [7]:
print(n_neighbors)

11


### Обучение

In [8]:
knn = KNN(n_neighbors=5)

In [9]:
accuracy,precision,roc_auc = cross_val_score(knn,X,y)

In [10]:
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.5896000000000001
precision = 0.6939540727186643
roc auc = 0.5542814946770922


In [11]:
check_retrain(knn,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.8424 --- 0.808
precision = 0.9156530408773679 --- 0.8963700234192038
roc auc = 0.8306608081066507 --- 0.78451079214438


# Sklear KNeighborsClassifier

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

### Настройка параметров

In [13]:
knn = KNeighborsClassifier()
parameters = {'n_neighbors':np.arange(9,12,2)}
knn_gs = GridSearchCV(knn, parameters)


In [14]:
knn_gs.fit(X,y)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 9, 11])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [15]:
knn_gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [16]:
n_neighbors = 5

### Обучение

In [17]:
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

In [18]:
scores = cross_validate(knn,X,y,cv=5,scoring=scoring)
print('Ошибки на валидации')
for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

Ошибки на валидации
test_accuracy = 0.3471
test_precision_macro = 0.2829001463122244
test_roc_auc = 0.17431190110781372


In [19]:
check_retrain(knn,X_train,X_test, y_train, y_test)

Ошибки на Train --- Test выборках
accuracy = 0.8789333333333333 --- 0.834
precision = 0.9294117647058824 --- 0.9028103044496487
roc auc = 0.8694222817113569 --- 0.8134194715939831
