In [1]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from catboost import Pool, cv
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")
    
PATH_TO_DATA = Path('../input/flight-delays-fall-2018/')
X_train = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')
X_test = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')
y_train = X_train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).copy()
X_train = X_train.drop('dep_delayed_15min', axis=1)
%matplotlib inline

In [2]:
# Simple feature engineering, applied identically to the train and test frames.

# Month labels (in the data's 'c-<n>' encoding) mapped to a season.
# Months not listed here fall back to 'Winter'.
SEASON_BY_MONTH = {
    'c-3': 'Spring', 'c-4': 'Spring', 'c-5': 'Spring',
    'c-6': 'Summer', 'c-7': 'Summer', 'c-8': 'Summer',
    'c-9': 'Autumn', 'c-10': 'Autumn', 'c-11': 'Autumn',
}

# Departure hour mapped to a part of the day.
# Hours not listed here (0-5) fall back to 'Night'.
DAYTIME_BY_HOUR = {
    **dict.fromkeys(range(6, 12), 'Morning'),
    **dict.fromkeys(range(12, 18), 'Afternoon'),
    **dict.fromkeys(range(18, 24), 'Evening'),
}


def add_flight_features(df):
    """Return a copy of ``df`` with the engineered flight features added.

    Reads the columns 'Origin', 'Dest', 'DepTime', 'Month' and
    'UniqueCarrier' and appends, in this order:

    * ``Flight``  - origin and destination codes concatenated.
    * ``H_c``     - departure hour as 'c-<hour>', wrapped into 0-23.
    * ``M_c``     - departure minute as 'c-<minute>'.
    * ``Daytime`` - 'Morning' / 'Afternoon' / 'Evening' / 'Night'.
    * ``Season``  - 'Spring' / 'Summer' / 'Autumn' / 'Winter'.
    * ``Delay``   - DepTime minus the smallest DepTime within the same
      (UniqueCarrier, Flight, Daytime, Season) group of ``df``.
    * ``Delay2``  - same as ``Delay`` but grouped by Month instead of Season.

    The input frame is not modified.
    """
    out = df.copy()

    out['Flight'] = out['Origin'] + out['Dest']

    # Integer division replaces `(DepTime / 100).apply(np.int)`: the `np.int`
    # alias no longer exists in current NumPy. The modulo maps hours 24 and 25
    # to 0 and 1; previously this was done for the training frame only, which
    # left the test frame with hour categories the model never saw.
    hour = (out['DepTime'] // 100).astype(int) % 24
    out['H_c'] = 'c-' + hour.astype(str)
    out['M_c'] = 'c-' + (out['DepTime'] % 100).astype(str)

    # Built in one pass with `.map` instead of chained-indexing assignments
    # (`df[col][mask] = value`), which are not guaranteed to write through.
    out['Daytime'] = hour.map(DAYTIME_BY_HOUR).fillna('Night')
    out['Season'] = out['Month'].map(SEASON_BY_MONTH).fillna('Winter')

    # Offset of each departure time from the earliest one in its group.
    for column, period in (('Delay', 'Season'), ('Delay2', 'Month')):
        group_keys = ['UniqueCarrier', 'Flight', 'Daytime', period]
        earliest = out.groupby(group_keys)['DepTime'].transform('min')
        out[column] = out['DepTime'] - earliest

    return out


X_train = add_flight_features(X_train)
X_test = add_flight_features(X_test)

In [3]:
# Preview the first five rows of the training frame with the new feature columns.
X_train.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,Flight,H_c,M_c,Daytime,Season,Delay,Delay2
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,ATLDFW,c-19,c-34,Evening,Summer,103,0
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,PITMCO,c-15,c-48,Afternoon,Spring,218,0
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,RDUCLE,c-14,c-22,Afternoon,Autumn,66,0
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,DENMEM,c-10,c-15,Morning,Autumn,10,0
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,MDWOMA,c-18,c-28,Evening,Autumn,2,0


In [4]:
# Positional indices of the object-dtype columns; these are passed to
# CatBoost as the categorical features.
categ_feat_idx = np.flatnonzero((X_train.dtypes == 'object').to_numpy())
categ_feat_idx

array([ 0,  1,  2,  4,  5,  6,  8,  9, 10, 11, 12])

In [5]:
# Wrap the training matrix for CatBoost; `cat_features` are positional
# column indices into `X_train.values`.
cv_dataset = Pool(data=X_train.values, label=y_train.values, cat_features=categ_feat_idx)

params = {"iterations": 1000,
          "depth": 3,
          "loss_function": "Logloss",
          "eval_metric": "AUC",
          "verbose": 10,
          'learning_rate': 0.3,
          'early_stopping_rounds': 10
          }
scores = cv(cv_dataset, params, fold_count=2)

# Number of boosting rounds up to and including the best mean test AUC.
# Taken from the position of the best score instead of `len(scores) - 10`,
# which silently duplicated the early-stopping patience and gave a wrong
# count whenever training was not stopped early or the patience changed.
mean_test_auc = scores['test-AUC-mean']
steps = int(mean_test_auc.values.argmax()) + 1
print('Score: %2.4f, steps: %2i' % (mean_test_auc.max(), steps))

0:	test: 0.6781795	best: 0.6781795 (0)	total: 431ms	remaining: 7m 10s
10:	test: 0.7453589	best: 0.7453589 (10)	total: 2.47s	remaining: 3m 42s
20:	test: 0.7571467	best: 0.7571467 (20)	total: 4.56s	remaining: 3m 32s
30:	test: 0.7647780	best: 0.7647780 (30)	total: 6.63s	remaining: 3m 27s
40:	test: 0.7703380	best: 0.7703380 (40)	total: 8.69s	remaining: 3m 23s
50:	test: 0.7739129	best: 0.7739129 (50)	total: 10.9s	remaining: 3m 22s
60:	test: 0.7755598	best: 0.7756328 (59)	total: 12.9s	remaining: 3m 19s
70:	test: 0.7769653	best: 0.7770341 (69)	total: 15.1s	remaining: 3m 16s
80:	test: 0.7785019	best: 0.7785019 (80)	total: 17.1s	remaining: 3m 14s
90:	test: 0.7796588	best: 0.7796588 (90)	total: 19.2s	remaining: 3m 11s
100:	test: 0.7803533	best: 0.7803533 (100)	total: 21.3s	remaining: 3m 9s
110:	test: 0.7810009	best: 0.7810009 (110)	total: 23.4s	remaining: 3m 7s
120:	test: 0.7813330	best: 0.7814116 (114)	total: 25.6s	remaining: 3m 6s
130:	test: 0.7822627	best: 0.7822761 (129)	total: 27.7s	remaini

In [6]:
# Train the classifier on the full training set with a fixed seed.
# NOTE(review): this uses CatBoost's default hyper-parameters rather than the
# `params` / `steps` explored in the cross-validation cell - confirm intended.
ctb1 = CatBoostClassifier(silent=True, random_seed=17)
ctb1.fit(
    X_train.values,
    y_train.values,
    cat_features=categ_feat_idx,
)
print('Fit completed')

Fit completed


In [7]:
# Per-class probabilities for every test row; column 1 is kept as the
# prediction (the target was encoded as 'Y' -> 1).
test_class_proba = ctb1.predict_proba(X_test.values)
ctb1_test_pred = test_class_proba[:, 1]
print('Predict completed')

Predict completed


In [8]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Use the sample submission as a template (indexed by 'id'), put the
    # predicted probabilities in the target column and write the result out.
    sample_sub = (
        pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', index_col='id')
        .assign(dep_delayed_15min=ctb1_test_pred)
    )
    sample_sub.to_csv('subm_11.csv')