In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(palette='deep', style='darkgrid', rc={"figure.figsize": (15, 4)})
import scipy.stats as st

import warnings
warnings.simplefilter('ignore')

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [3]:
# Загрузим данные и проведем все предобработки как на семинаре: 
data = pd.read_csv('flight_delays_train.csv')
data['dep_delayed_15min'] = data['dep_delayed_15min'].apply(lambda x: 1 if x == 'Y' else 0)
data['Month'] = data['Month'].str.replace('c-', '').astype('int16')
data['DayofMonth'] = data['DayofMonth'].str.replace('c-', '').astype('int16')
data['DayOfWeek'] = data['DayOfWeek'].str.replace('c-', '').astype('int16')
data['UniqueCarrier'] = pd.factorize(data['UniqueCarrier'])[0]
data['Origin'] = pd.factorize(data['Origin'])[0]
data['Dest'] = pd.factorize(data['Dest'])[0]

x = data.drop('dep_delayed_15min', axis=1)
y = data['dep_delayed_15min'].values

data.shape

(100000, 9)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, \
                                                    shuffle=True, random_state=18)

In [6]:
tree = DecisionTreeClassifier(min_samples_split=100, max_depth=8)
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
features = list(x.columns) 
export_graphviz(tree, feature_names=features,
                      class_names=['Y','N'], out_file='tree.dot', filled=True)

In [9]:
!dot -Tpng 'tree.dot' -o 'tree.png'

"dot" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


In [10]:
print('Accuracy', accuracy_score(y_test, tree.predict(x_test)))
print('F1', f1_score(y_test, tree.predict(x_test)))

Accuracy 0.81605
F1 0.10638814670876852


In [12]:
tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=100, max_depth=4, min_samples_leaf=100)
tree.fit(x_train, y_train)
print('Accuracy', accuracy_score(y_test, tree.predict(x_test)))
print('F1', f1_score(y_test, tree.predict(x_test)))

Accuracy 0.81125
F1 0.14301929625425652


In [26]:
params = {'criterion': ['entropy', 'gini'],
        'max_depth': range(1,30)
        }

In [27]:
grid = GridSearchCV(tree, params, scoring='accuracy', cv=5, n_jobs=-1)

In [28]:
grid.fit(x,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'criterion': ['entropy', 'gini'], 'max_depth': range(1, 30)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [29]:
grid.cv_results_

{'mean_fit_time': array([0.03769517, 0.09235373, 0.17872128, 0.21342888, 0.28264384,
        0.30558372, 0.34348164, 0.38636651, 0.43024864, 0.45019536,
        0.48749647, 0.5132267 , 0.53875923, 0.57246852, 0.58124537,
        0.59780145, 0.6241302 , 0.61655073, 0.64367819, 0.63649821,
        0.65085802, 0.65185575, 0.65165796, 0.65524731, 0.66142936,
        0.65644474, 0.6564446 , 0.67459555, 0.70611248, 0.06343002,
        0.11828294, 0.16136885, 0.22280664, 0.25970635, 0.30119576,
        0.34108829, 0.36801591, 0.42067537, 0.46914539, 0.51183271,
        0.5218049 , 0.52798734, 0.55412226, 0.55072904, 0.56548781,
        0.56648512, 0.57326732, 0.59979692, 0.64308028, 0.62732201,
        0.59740267, 0.60219002, 0.60498219, 0.66003485, 0.66043253,
        0.61814756, 0.61395755, 0.51981578]),
 'std_fit_time': array([0.0013091 , 0.02385556, 0.01864128, 0.00574623, 0.01782508,
        0.02256364, 0.01948923, 0.01174254, 0.02700272, 0.01918989,
        0.01763236, 0.02342935, 0.016

In [30]:
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
grid.best_score_

0.81514

In [32]:
grid.best_params_

{'criterion': 'entropy', 'max_depth': 9}