In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(palette='deep', style='darkgrid', rc={"figure.figsize": (15, 4)})
import scipy.stats as st

import warnings
warnings.simplefilter('ignore')

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [3]:
# Load the training set and turn every column into a numeric encoding.
data = pd.read_csv('flight_delays_train.csv')

# Binary target: 'Y' becomes 1, any other value becomes 0.
data['dep_delayed_15min'] = data['dep_delayed_15min'].eq('Y').astype('int64')

# Calendar columns: drop the 'c-' marker and store the remainder as int16.
for date_col in ('Month', 'DayofMonth', 'DayOfWeek'):
    data[date_col] = data[date_col].str.replace('c-', '').astype('int16')

# Categorical columns: replace each distinct label with its integer code
# (codes are assigned in order of first appearance).
for label_col in ('UniqueCarrier', 'Origin', 'Dest'):
    codes, _ = pd.factorize(data[label_col])
    data[label_col] = codes

# Feature matrix (everything except the target) and target vector.
x = data.drop(columns='dep_delayed_15min')
y = data['dep_delayed_15min'].to_numpy()

data.shape

(100000, 9)

In [4]:
# Split the data into training and test sets (80% / 20%, shuffled, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=18,
)

In [7]:
# Classifier created with its default hyper-parameters.
knn = KNeighborsClassifier()

# Hyper-parameter values to check.
params = dict(
    n_neighbors=[3, 5, 7, 9, 15],
    weights=('uniform', 'distance'),
    metric=('euclidean', 'minkowski', 'manhattan', 'chebyshev'),
)

In [16]:
def cros_val(scoring, metric, weights, x, y, fold, n_neighbors=5):
    """Return the mean cross-validated score of a k-NN classifier.

    Parameters
    ----------
    scoring : str or callable
        Scoring specification forwarded to ``cross_val_score`` (e.g. ``'f1'``).
    metric : str
        Distance metric passed to ``KNeighborsClassifier``.
    weights : str
        Neighbour weighting scheme passed to ``KNeighborsClassifier``.
    x, y : array-like
        Feature matrix and target vector to cross-validate on.
    fold : int or cross-validation splitter
        Forwarded as ``cv`` to ``cross_val_score``. This controls how the data
        is split, NOT how many neighbours the classifier uses.
    n_neighbors : int, default 5
        Number of neighbours for the classifier. The default equals the
        scikit-learn default, so existing six-argument calls are unaffected.
        Pass it explicitly to actually compare different values of k.

    Returns
    -------
    float
        Mean of the per-fold scores, rounded to two decimals.
    """
    est = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights)
    result = cross_val_score(est, x, y, scoring=scoring, cv=fold)
    return np.round(np.mean(result), decimals=2)

In [32]:
# Mean F1 for metric='euclidean', weights='uniform'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'euclidean', 'uniform', x_train, y_train, fold))

F1:  0.25
F1:  0.26
F1:  0.26
F1:  0.27
F1:  0.27


In [20]:
# Mean F1 for metric='euclidean', weights='distance'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'euclidean', 'distance', x_train, y_train, fold))

F1:  0.28
F1:  0.29
F1:  0.3
F1:  0.3
F1:  0.3


In [21]:
# Mean F1 for metric='minkowski', weights='uniform'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'minkowski', 'uniform', x_train, y_train, fold))

F1:  0.25
F1:  0.26
F1:  0.26
F1:  0.27
F1:  0.27


In [22]:
# Mean F1 for metric='minkowski', weights='distance'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'minkowski', 'distance', x_train, y_train, fold))

F1:  0.28
F1:  0.29
F1:  0.3
F1:  0.3
F1:  0.3


In [23]:
# Mean F1 for metric='manhattan', weights='uniform'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'manhattan', 'uniform', x_train, y_train, fold))

F1:  0.25
F1:  0.26
F1:  0.26
F1:  0.27
F1:  0.27


In [24]:
# Mean F1 for metric='manhattan', weights='distance'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'manhattan', 'distance', x_train, y_train, fold))

F1:  0.29
F1:  0.3
F1:  0.31
F1:  0.31
F1:  0.31


In [25]:
# Mean F1 for metric='chebyshev', weights='uniform'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'chebyshev', 'uniform', x_train, y_train, fold))

F1:  0.25
F1:  0.26
F1:  0.27
F1:  0.27
F1:  0.27


In [26]:
# Mean F1 for metric='chebyshev', weights='distance'.
# The last argument of cros_val is forwarded as `cv`, so the value varied here
# is the number of cross-validation folds.
for fold in (3, 5, 7, 9, 15):
    print('F1: ', cros_val('f1', 'chebyshev', 'distance', x_train, y_train, fold))

F1:  0.28
F1:  0.29
F1:  0.29
F1:  0.3
F1:  0.3


In [23]:
# Final model: fit on the training split, then report F1 and accuracy on both
# the held-out test split and the training split.
knn = KNeighborsClassifier(n_neighbors=3, metric='manhattan', weights='distance')
knn.fit(x_train, y_train)

prediction = knn.predict(x_test)
# Predict on the training split once and reuse the result for both metrics;
# running a k-NN prediction over the whole training set twice is wasted work.
train_prediction = knn.predict(x_train)

print('на тесте: F1 - ', f1_score(y_test, prediction),
      'accuracy - ', accuracy_score(y_test, prediction))
# NOTE: with weights='distance' every training point is its own zero-distance
# neighbour, so near-perfect training scores are expected here and say nothing
# about generalisation; compare against the test-split numbers instead.
print('на трейне: F1 - ', f1_score(y_train, train_prediction),
      'accuracy - ', accuracy_score(y_train, train_prediction))

на тесте: F1 -  0.3425818242232929 accuracy -  0.80215
на трейне: F1 -  1.0 accuracy -  1.0


In [32]:
# Per-class probability estimates of the fitted k-NN model for the test split
# (one row per test sample, one column per class).
predictions_proba=knn.predict_proba(x_test)

In [33]:
# ROC AUC on the test split, scored with the second probability column
# (presumably the positive class 1, given the 0/1 target encoding — verify
# against knn.classes_).
roc_auc_score(y_test, predictions_proba[:,1])

0.6723214412271915