In [135]:
import warnings
import numpy as np
import pandas as pd
import catboost

from catboost import cv, Pool
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [35]:
def cat_clean(train):
    train['DayOfWeek'] = train.DayOfWeek.apply(lambda x: int(x.split('-')[1]))
    train['Month'] = train.Month.apply(lambda x: int(x.split('-')[1]))
    train['DayofMonth'] = train.DayofMonth.apply(lambda x: int(x.split('-')[1]))
    
    train['Origin'] = train.Origin.apply(lambda x: x if x in origin_list else 'other')

    train['Dest'] = train.Dest.apply(lambda x: x if x in dest_list else 'other')
        
    train['UniqueCarrier'] = train.UniqueCarrier.apply(lambda x: x if x in carriers_list else 'other')

    train = pd.get_dummies(train)
    return train

In [143]:
def create_period_faetures(value, period):
    value = value * 2 * np.pi
    return np.sin(value/period), np.cos(value/period)


In [149]:
days_from_start = {
    1: 0, 2: 31, 3: 59, 4: 90, 5: 120, 6: 151, 7: 181,
    8: 212, 9: 243, 10: 273, 11: 304, 12: 334
}

days_in_month = {1: 32, 2: 30, 3: 32, 4: 31, 5: 32, 6: 31,
                 7: 32, 8: 32, 9: 31, 10: 32, 11: 31, 12: 32
                 }

to_weekend = {1: 0, 2:0, 3: 0, 4: 0, 5: 0, 6:1, 7:1}



def change_data(data):

    data.DepTime = (data.DepTime % 2400)
    
    data["hour"] = data.DepTime // 100
    data["minutes"] = data.DepTime % 100
    
    
    data["dep_in_minutes"] = data.hour * 60 + data.minutes
    
    data["day_year"] = (data.Month.map(days_from_start) + data.DayofMonth) * 24
    
    data["weekend"] = data.DayOfWeek.map(to_weekend)
    
    data['sin_dep'], data['cos_dep'] = create_period_faetures(data.dep_in_minutes, 24 * 60)
    
    data['sin_year'], data['cos_year'] = create_period_faetures(data.min_year, 365 * 24 * 60)
    
    
    # data.DepTime = data.DepTime  // 100 * 60 + data.DepTime % 100
    
    data.Distance = data.Distance.apply(np.log10)
        
    for name in "Month DayofMonth DayOfWeek".split():
        data[name] = data[name].astype(str)
        
    '''
    data["days"] = data.Month.map(days_from_start) + data.DayofMonth

    '''

**Read the data**

In [150]:
train_df = pd.read_csv('../../data/flight_delays_train.csv')
test_df = pd.read_csv('../../data/flight_delays_test.csv')

train_df.dep_delayed_15min = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0})

origin_list = train_df.Origin.value_counts().head(10).index
dest_list = train_df.Dest.value_counts().head(10).index
carriers_list = train_df.UniqueCarrier.value_counts().head(10).index


train_df = cat_clean(train_df)
test_df = cat_clean(test_df)

In [146]:
train_df['flight'] = train_df['Origin'] + ' --> ' + train_df['Dest']
test_df['flight'] = test_df['Origin'] + ' --> ' + test_df['Dest']

In [151]:
change_data(train_df)
change_data(test_df)

AttributeError: 'DataFrame' object has no attribute 'min_year'

In [122]:
train_df.DepTime = (train_df.DepTime % 2400)
for name in "Month DayofMonth DayOfWeek".split():
    train_df[name] = train_df[name].astype(str)
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,Distance,dep_delayed_15min,UniqueCarrier_AA,UniqueCarrier_DL,UniqueCarrier_MQ,UniqueCarrier_NW,...,Dest_CVG,Dest_DEN,Dest_DFW,Dest_EWR,Dest_IAH,Dest_LAS,Dest_LAX,Dest_ORD,Dest_PHX,Dest_other
0,8,21,7,1934,732,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,4,20,3,1548,834,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,9,2,5,1422,416,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,11,25,6,1015,872,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,10,7,6,1828,423,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [107]:
names = "DepTime hour minutes dep_in_minutes min_year".split()
drop = lambda x, y: x.drop(y, axis=1, inplace=True)
for data in (train_df, test_df):
    drop(data, names)
    

---
---
---

In [123]:
categ_feat_idx = np.where(train_df.drop('dep_delayed_15min', axis=1).dtypes == 'object')[0]
print(categ_feat_idx)

[0 1 2]


In [124]:
X_train = train_df.drop('dep_delayed_15min', axis=1).values
y_train = train_df['dep_delayed_15min'].values
X_test = test_df.values

In [125]:
X_train_part, X_valid, \
y_train_part, y_valid = train_test_split(X_train, y_train,
                                         test_size=0.3,
                                         random_state=17,
                                         shuffle=True
                                         )
                                        

In [126]:
ctb = CatBoostClassifier(random_seed=17, silent=True)

#### Train

In [127]:
%%time
ctb.fit(X_train_part,
        y_train_part, 
        cat_features=categ_feat_idx,
        eval_set=(X_valid, y_valid));

Wall time: 1min 13s


<catboost.core.CatBoostClassifier at 0x387c57f0>

In [128]:
ctb_valid_pred = ctb.predict_proba(X_train_part)[:, 1]
print(roc_auc_score(y_train_part, ctb_valid_pred))

ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
print(roc_auc_score(y_valid, ctb_valid_pred))

0.8165431115266721
0.7461249791678416


0.748 - some cat and many time features handle
0.738 - no cat
0.746 - no time work

In [58]:
X_train

array([[ 8.00000000e+00,  2.10000000e+01,  7.00000000e+00, ...,
         3.98749069e-01, -7.72870135e-01, -6.34564224e-01],
       [ 4.00000000e+00,  2.00000000e+01,  3.00000000e+00, ...,
        -5.44639035e-01,  9.44706129e-01, -3.27918174e-01],
       [ 9.00000000e+00,  2.00000000e+00,  5.00000000e+00, ...,
        -8.14115518e-01, -8.84859584e-01, -4.65857829e-01],
       ...,
       [ 1.00000000e+00,  2.40000000e+01,  2.00000000e+00, ...,
         2.63031214e-01,  4.13942522e-01,  9.10303020e-01],
       [ 4.00000000e+00,  2.70000000e+01,  4.00000000e+00, ...,
        -6.59345815e-01,  8.98610553e-01, -4.38747165e-01],
       [ 1.10000000e+01,  1.70000000e+01,  4.00000000e+00, ...,
        -1.83697020e-16, -6.77614789e-01,  7.35417023e-01]])

In [None]:
max_depthes = range(6, 13)

In [None]:
%%time

ac_test = []
ac_train = []

roc_test = []
roc_train = []


for max_depth in max_depthes:
    ctb = CatBoostClassifier(random_seed=17,
                             silent=True,
                             max_depth=max_depth).fit(X_train_part, y_train_part,
                                                      cat_features=categ_feat_idx)
    
    roc_train.append(roc_auc_score(y_train_part, ctb.predict_proba(X_train_part)[:, 1]))
    roc_test.append(roc_auc_score(y_valid, ctb.predict_proba(X_valid)[:, 1]))
    
    ac_train.append(accuracy_score(y_train_part, ctb.predict(X_train_part)))
    ac_test.append(accuracy_score(y_valid, ctb.predict(X_valid)))
    
    print(max_depth, end=" | ")
    
    
    

In [None]:
plt.plot(max_depthes, roc_train, label="roc_train")
plt.plot(max_depthes, roc_test, label="roc_test")
plt.grid(True)
plt.legend(fontsize=14)

In [None]:
plt.plot(max_depthes, ac_train, label="ac_train")
plt.plot(max_depthes, ac_test, label="ac_test")
plt.grid(True)
plt.legend(fontsize=14)

**K Fold**

In [None]:
test_score = []
train_score = []
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17)

i = 0
for test_index, train_index in skf.split(X_train, y_train):
    
    X_train_1, X_test_1 = X_train[train_index, :], X_train[test_index, :]
    y_train_1, y_test_1 = y_train[train_index], y_train[test_index]
    
    
    _ctb = CatBoostClassifier(random_seed=17,
                              silent=True
                             ).fit(X_train_1, y_train_1, cat_features=categ_feat_idx)
    
    test_score.append(roc_auc_score(y_test_1,
                                    _ctb.predict_proba(X_test_1)[:, 1]
                                   )
                     )
    train_score.append(roc_auc_score(y_train_1, _ctb.predict_proba(X_train_1)[:, 1]))
    
    i += 1
    print(i)
    

In [None]:
print(np.mean(test_score))
print(test_score)
print()
print(np.mean(train_score))
print(train_score)

params = \
{
# "iterations": 100,
"eval_metric": "AUC",
"verbose": False
#"devices": -1
}

pool = Pool(data=X_train,
            label=y_train,
            cat_features=categ_feat_idx)

scores = cv(pool,
            params=params,
            fold_count=5,
            plot="True",
            early_stopping_rounds=50)

----
----
----

Result

In [98]:
%%time
ctb = CatBoostClassifier(random_seed=17, silent=True)
ctb.fit(X_train, y_train, cat_features=categ_feat_idx);

Wall time: 1min 29s


In [99]:
ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

In [100]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv('sample_submission.csv', 
                             index_col='id')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred_2.csv')

In [77]:
!head ctb_pred_1.csv

id,dep_delayed_15min
0,0.03164773503074415
1,0.050111298222643645
2,0.03909599932299011
3,0.3010278785944811
4,0.23644800310746758
5,0.0990110277509689
6,0.051838484675625726
7,0.16266405568265022
8,0.14482774403798257


In [101]:
!head ctb_pred_2.csv

id,dep_delayed_15min
0,0.026621480830522817
1,0.06194317965063598
2,0.04195651507515151
3,0.32841884660315
4,0.32971569352112773
5,0.12167380434128895
6,0.0623029855854102
7,0.22504220022015664
8,0.15672503297973006
