In [1]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train and test tables from the data directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (r'../data/flight_delays_train.csv',
                     r'../data/flight_delays_test.csv')
)

In [3]:
# Departure times of 2400 or more are wrapped by subtracting 2400.
# The result is assigned back to the column instead of calling
# `.where(..., inplace=True)` on `df_train['DepTime']`: that form is chained
# in-place assignment, which does not update the parent frame when pandas
# Copy-on-Write is active.
# The same wrap is applied to the test frame so both frames encode DepTime
# identically before the later cells derive hour features from it.
df_train['DepTime'] = df_train['DepTime'].where(df_train['DepTime'] < 2400,
                                                df_train['DepTime'] - 2400)
df_test['DepTime'] = df_test['DepTime'].where(df_test['DepTime'] < 2400,
                                              df_test['DepTime'] - 2400)

In [4]:
# Turn the HHMM-style number into fractional hours: the hundreds part is the
# hour, the remainder is minutes and is divided by 60.
df_train['DepTime'] = df_train['DepTime'] // 100 + (df_train['DepTime'] % 100) / 60
df_test['DepTime'] = df_test['DepTime'] // 100 + (df_test['DepTime'] % 100) / 60

In [5]:
# Build a combined "carrier:origin->destination" key for every row.
df_train['Flight'] = (
    df_train['UniqueCarrier'] + ':'
    + df_train['Origin'] + '->'
    + df_train['Dest']
)
df_test['Flight'] = (
    df_test['UniqueCarrier'] + ':'
    + df_test['Origin'] + '->'
    + df_test['Dest']
)

In [6]:
# Whole-hour label: the integer part of the departure time, stored as text.
df_train['DepTime_full_hour'] = df_train['DepTime'].map(lambda hours: str(int(hours)))
df_test['DepTime_full_hour'] = df_test['DepTime'].map(lambda hours: str(int(hours)))

In [7]:
# Group by the scalar key 'Flight' rather than the one-element list
# ['Flight']: with a list grouper, recent pandas yields one-element tuple keys
# on iteration and expects tuple keys in `get_group`.
flight_group_train = df_train.groupby('Flight')

# Sort each flight's departure times once up front. Doing this inside the
# row-wise function would rebuild and re-sort the same group for every row.
_sorted_times_train = {
    flight: sorted(dep_times)
    for flight, dep_times in flight_group_train['DepTime']
}

def get_prev_train(x):
    """Return the gap between a row's DepTime and the previous one for its Flight.

    Parameters
    ----------
    x : row of ``df_train`` exposing ``Flight`` and ``DepTime``.

    Returns
    -------
    0 when the row's DepTime is the smallest among training rows with the same
    Flight; otherwise the difference to the next-smaller DepTime in that group.
    Rows sharing an identical DepTime all receive the same value, because the
    position of the first occurrence in the sorted list is used.

    Raises
    ------
    KeyError if ``x.Flight`` is not a flight present in ``df_train``.
    """
    times = _sorted_times_train[x.Flight]
    ind = times.index(x.DepTime)
    return 0 if ind == 0 else times[ind] - times[ind-1]

df_train['Time_since_last'] = df_train.apply(get_prev_train, axis=1)

In [8]:
# Group by the scalar key 'Flight' rather than the one-element list
# ['Flight']: with a list grouper, recent pandas yields one-element tuple keys
# on iteration and expects tuple keys in `get_group`.
flight_group_test = df_test.groupby('Flight')

# Sort each flight's departure times once up front. Doing this inside the
# row-wise function would rebuild and re-sort the same group for every row.
_sorted_times_test = {
    flight: sorted(dep_times)
    for flight, dep_times in flight_group_test['DepTime']
}

def get_prev_test(x):
    """Return the gap between a row's DepTime and the previous one for its Flight.

    Parameters
    ----------
    x : row of ``df_test`` exposing ``Flight`` and ``DepTime``.

    Returns
    -------
    0 when the row's DepTime is the smallest among test rows with the same
    Flight; otherwise the difference to the next-smaller DepTime in that group.
    Rows sharing an identical DepTime all receive the same value, because the
    position of the first occurrence in the sorted list is used.

    Raises
    ------
    KeyError if ``x.Flight`` is not a flight present in ``df_test``.
    """
    times = _sorted_times_test[x.Flight]
    ind = times.index(x.DepTime)
    return 0 if ind == 0 else times[ind] - times[ind-1]

df_test['Time_since_last'] = df_test.apply(get_prev_test, axis=1)

In [9]:
# Put every predictor column first and the target last in the training frame,
# then give the test frame the same predictor column order.
features = df_train.drop(columns='dep_delayed_15min').columns.tolist()
df_train = df_train[[*features, 'dep_delayed_15min']]
df_test = df_test[features]

In [10]:
# Plain arrays for the model: predictors, the target mapped from Y/N to 1/0,
# and the test predictors.
X_train = df_train.drop(columns='dep_delayed_15min').values
y_train = df_train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = df_test.values

# Hold out 30% of the training rows for validation, with a fixed seed.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=17,
)

In [11]:
# Preview the first five rows of the engineered training frame.
df_train.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,Flight,DepTime_full_hour,Time_since_last,dep_delayed_15min
0,c-8,c-21,c-7,19.566667,AA,ATL,DFW,732,AA:ATL->DFW,19,0.916667,N
1,c-4,c-20,c-3,15.8,US,PIT,MCO,834,US:PIT->MCO,15,0.25,N
2,c-9,c-2,c-5,14.366667,XE,RDU,CLE,416,XE:RDU->CLE,14,0.05,N
3,c-11,c-25,c-6,10.25,OO,DEN,MEM,872,OO:DEN->MEM,10,0.166667,N
4,c-10,c-7,c-6,18.466667,WN,MDW,OMA,423,WN:MDW->OMA,18,0.033333,Y


In [12]:
# Positions of the object-dtype predictor columns (target excluded).
# np.where on a 1-D mask returns a one-element tuple, unpacked here directly.
(categ_feat_idx,) = np.where(
    df_train.drop(columns='dep_delayed_15min').dtypes == 'object'
)
categ_feat_idx

array([0, 1, 2, 4, 5, 6, 8, 9], dtype=int64)

In [13]:
# Fit CatBoost on the training part of the split, treating the columns at
# categ_feat_idx as categorical. The trailing semicolon hides the cell output.
ctb = CatBoostClassifier(silent=True, random_seed=7)
ctb.fit(
    X_train_part,
    y_train_part,
    cat_features=categ_feat_idx,
);

In [14]:
# Score the hold-out rows: take column 1 of predict_proba as the score and
# report ROC AUC against the held-out labels.
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
roc_auc_score(y_true=y_valid, y_score=ctb_valid_pred)

0.8272058213619072

In [16]:
# Refit with the same settings on the full training set, then score the test set.
ctb = CatBoostClassifier(random_seed=7, silent=True)
ctb.fit(X_train, y_train, cat_features=categ_feat_idx);

# Column 1 of predict_proba is used as the submitted score.
ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

with warnings.catch_warnings():
    # Every warning raised inside this block is suppressed.
    warnings.simplefilter("ignore")

    sample_sub = pd.read_csv(r'../data/sample_submission.csv',
                             index_col='id')
    sample_sub['dep_delayed_15min'] = ctb_test_pred

    # Create the output directory first: without it, to_csv fails on a
    # missing folder only after the full refit above has already run.
    submission_path = Path(r'../submissions/submission_9.csv')
    submission_path.parent.mkdir(parents=True, exist_ok=True)
    sample_sub.to_csv(submission_path)