In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [3]:
# Directory holding the competition CSV files (current working directory).
PATH_TO_DATA = Path('.')

# Read the labelled training set and the unlabelled test set.
train_csv = PATH_TO_DATA.joinpath('flight_delays_train.csv')
test_csv = PATH_TO_DATA.joinpath('flight_delays_test.csv')
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [4]:
# Preview the first rows of the training data (includes the dep_delayed_15min target).
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [5]:
# Preview the first rows of the test data (same feature columns, no target).
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


%%time
# The date-part columns are stored as strings with a two-character prefix
# (e.g. 'c-8'); drop the prefix and convert the remainder to int64.
date_part_cols = ['Month', 'DayofMonth', 'DayOfWeek']
for frame in (train_df, test_df):
    for col in date_part_cols:
        # Each frame is converted from its own column. Previously the
        # converted test columns were assigned into train_df, which
        # overwrote the training features and left test_df as strings.
        frame[col] = frame[col].str[2:].astype('int64')

# Encode the target as 0/1 (only the training file carries this column).
train_df['dep_delayed_15min'] = train_df['dep_delayed_15min'].map({'N': 0, 'Y': 1})

%%time
# Per-carrier share of delayed flights, used below as a numeric feature.
# Expects dep_delayed_15min to already be encoded as 0/1.
# NOTE: the rate is computed over all of train_df, including the rows that
# later end up in the hold-out split, so the validation score can be
# optimistic for this feature.
carrier_target = train_df.groupby('UniqueCarrier')['dep_delayed_15min']
allCarrier = carrier_target.size()    # number of flights per carrier
delayCarrier = carrier_target.sum()   # number of delayed flights per carrier

# Build the table from the groupby result instead of value_counts():
# the column name produced by value_counts() differs between pandas
# versions, which made carriers['UniqueCarrier'] raise KeyError on
# pandas >= 2.0. The resulting columns are fixed here: 'index' holds the
# carrier code and 'percent' the delayed fraction (a 0..1 ratio).
carriers = (
    (delayCarrier / allCarrier)
    .rename('percent')
    .rename_axis('index')
    .sort_values()
    .reset_index()
)

# Fallback rate for carriers that do not appear in the training data.
medianCompany = carriers['percent'].median()

# Ranking of carriers by delay rate ("latecomers"), lowest first
carriers

%%time
# Lookup table: carrier code -> delayed fraction. Mapping through a Series
# replaces the per-row boolean filter over `carriers`, which rescanned the
# whole table for every flight.
carrier_rate = carriers.set_index('index')['percent']

train_df['delCompPers'] = train_df['UniqueCarrier'].map(carrier_rate)
# Carriers that are absent from the training data get the median rate.
test_df['delCompPers'] = test_df['UniqueCarrier'].map(carrier_rate).fillna(medianCompany)

# The raw carrier code is replaced by its delay rate from here on.
del train_df['UniqueCarrier']
del test_df['UniqueCarrier']

In [6]:
# Route feature: origin and destination airport codes joined into one label.
for frame in (train_df, test_df):
    frame['flight'] = frame['Origin'] + '-->' + frame['Dest']

In [7]:
# Positions (within the feature columns, target excluded) of the
# object-dtype columns; these are passed to CatBoost as categorical features.
feature_dtypes = train_df.drop(columns='dep_delayed_15min').dtypes
is_object_col = (feature_dtypes == 'object').to_numpy()
categ_feat_idx = np.flatnonzero(is_object_col)
categ_feat_idx

array([0, 1, 2, 4, 5, 6, 8])

**Allocate a hold-out set (a.k.a. a validation set) to validate the model**

In [8]:
# Feature matrix: every training column except the target.
feature_frame = train_df.drop('dep_delayed_15min', axis=1)
X_train = feature_frame.values

# Target vector. The preprocessing cell already encodes the target as 0/1;
# mapping {'Y': 1, 'N': 0} a second time would turn every label into NaN,
# so only map when the column still holds the raw 'Y'/'N' strings.
target = train_df['dep_delayed_15min']
if not pd.api.types.is_numeric_dtype(target):
    target = target.map({'Y': 1, 'N': 0})
y_train = target.values

# Select test columns by name so their order matches the training features
# (categ_feat_idx is positional); a missing column fails loudly here.
X_test = test_df[feature_frame.columns].values

In [9]:
# Hold out 30% of the training rows for validation; the fixed seed keeps
# the split reproducible between runs.
holdout_split = train_test_split(X_train, y_train, test_size=0.3, random_state=17)
X_train_part, X_valid, y_train_part, y_valid = holdout_split

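**Train CatBoost with a few non-default hyperparameters (3000 iterations, depth 4, Bernoulli bootstrap with subsampling), passing the indexes of the categorical features, and score it on the hold-out set.**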

In [10]:
%%time
# Hyperparameters for the gradient-boosting classifier.
# learning_rate, l2_leaf_reg, random_strength and min_data_in_leaf are not
# set here, so the library defaults apply.
# Other loss_function names kept for reference: RMSE, Logloss, MAE,
# CrossEntropy, Quantile, LogLinQuantile, Lq, MultiClass, MultiClassOneVsAll,
# MAPE, Poisson, PairLogit, PairLogitPairwise, QueryRMSE, QuerySoftMax,
# YetiRank, YetiRankPairwise.
ctb_params = dict(
    random_seed=17,
    iterations=3000,
    eval_metric='AUC',
    depth=4,
    subsample=0.9607,
    loss_function='CrossEntropy',
    bootstrap_type='Bernoulli',
)
ctb = CatBoostClassifier(**ctb_params)

# Fit on the training part only, then score the hold-out part.
ctb.fit(X_train_part, y_train_part, cat_features=categ_feat_idx, silent=True)
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

# Hold-out ROC AUC (displayed as the cell output).
roc_auc_score(y_valid, ctb_valid_pred)

CPU times: user 12min 19s, sys: 56.8 s, total: 13min 16s
Wall time: 8min 12s


0.7561263809748852

In [None]:
# 0.7566888519708084
# 0.7566612438072499
# 0.7564531235180606
# 0.7564261899628076
# 0.7552956262350818
# 0.7566211517631125
# 0.7558435967793620
# 0.7552491000235568
# 0.7550907556719235
# 0.7532279445247103
# 0.7584961929060583

**Train on the whole training set and make predictions on the test set. We got ~0.734 in the competition – the "Catboost starter" baseline.**

In [14]:
%%time
# Refit the same classifier on every training row (no hold-out) before scoring the test set.
ctb.fit(X_train, y_train, cat_features=categ_feat_idx, silent=True);

CPU times: user 17min 1s, sys: 1min 4s, total: 18min 6s
Wall time: 10min 18s


<catboost.core.CatBoostClassifier at 0x7f924fdbe490>

In [15]:
# Keep column 1 of predict_proba -- presumably the probability of class 1 (delayed); confirm against ctb.classes_.
ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

In [16]:
# Write the submission file: take the ids from the sample submission and
# fill the target column with the predicted test-set scores.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    submission_template = pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', index_col='id')
    sample_sub = submission_template.assign(dep_delayed_15min=ctb_test_pred)
    sample_sub.to_csv('ctb_pred.csv')

In [17]:
!head ctb_pred.csv

id,dep_delayed_15min
0,0.02514519895045268
1,0.060717735791977125
2,0.04693371970792522
3,0.294992778669616
4,0.30360745932601835
5,0.08436634322639619
6,0.05638420222220752
7,0.19938805982429214
8,0.11740933491293162
