In [1]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

In [2]:
PATH_TO_DATA = Path('.')
train_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')
test_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')

In [3]:
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [5]:
train_df['dep_delayed_15min'] = train_df['dep_delayed_15min'].map({'N':0, 'Y':1})
train_df['Month'] = train_df['Month'].apply(lambda x: x[2::]).astype('int64')
train_df['DayofMonth'] = train_df['DayofMonth'].apply(lambda x: x[2::]).astype('int64')
train_df['DayOfWeek'] = train_df['DayOfWeek'].apply(lambda x: x[2::]).astype('int64')

test_df['Month'] = test_df['Month'].apply(lambda x: x[2::]).astype('int64')
test_df['DayofMonth'] = test_df['DayofMonth'].apply(lambda x: x[2::]).astype('int64')
test_df['DayOfWeek'] = test_df['DayOfWeek'].apply(lambda x: x[2::]).astype('int64')


def timeToMinuts(tm):
    if tm < 100:
        return tm
    elif tm >= 2500:
        return tm % 100
    else:
        return tm % 100
    
    
def timeToHours(th):
    if th < 100:
        return 0
    elif th >= 2500:
        return th // 100 - 24
    else:
        return th // 100
    
    
def monthToSeason(m):
    if m in [1, 2, 12]:
        return 1
    elif m in [3, 4, 5]:
        return 2
    elif m in [6, 7, 8]:
        return 3
    elif m in [9, 10, 11]:
        return 4
    
def timeToDay(h):
    if h in [0, 1, 2]:
        return 1
    elif h in [3, 4, 5]:
        return 2
    elif h in [6, 7, 8]:
        return 3
    elif h in [9, 10, 11]:
        return 4
    elif h in [12, 13, 14]:
        return 5
    elif h in [15, 16, 17]:
        return 6
    elif h in [18, 19, 20]:
        return 7
    else:
        return 8
        

train_df['flight'] = train_df['Origin'] + '-->' + train_df['Dest']
test_df['flight'] = test_df['Origin'] + '-->' + test_df['Dest']

    
train_df['hours'] = train_df['DepTime'].apply(timeToHours)
train_df['minuts'] = train_df['DepTime'].apply(timeToMinuts)
train_df['season'] = train_df['Month'].apply(monthToSeason)
train_df['t_day'] = train_df['DayofMonth'].apply(timeToDay)
train_df['vec'] = train_df['Origin'] + test_df['Dest']

train_df = train_df.join(pd.get_dummies(train_df['season'], prefix='season'))
train_df = train_df.join(pd.get_dummies(train_df['t_day'], prefix='day'))
train_df = train_df.join(pd.get_dummies(train_df['UniqueCarrier'], prefix='car'))
train_df = train_df.join(pd.get_dummies(train_df['Month'], prefix='m'))

test_df['hours'] = test_df['DepTime'].apply(timeToHours)
test_df['minuts'] = test_df['DepTime'].apply(timeToMinuts)
test_df['season'] = test_df['Month'].apply(monthToSeason)
test_df['t_day'] = test_df['DayofMonth'].apply(timeToDay)
test_df['vec'] = test_df['Origin'] + test_df['Dest']
test_df = test_df.join(pd.get_dummies(test_df['season'], prefix='season'))
test_df = test_df.join(pd.get_dummies(test_df['t_day'], prefix='day'))
test_df = test_df.join(pd.get_dummies(test_df['UniqueCarrier'], prefix='car'))
test_df = test_df.join(pd.get_dummies(test_df['Month'], prefix='m'))


# train_df['flight'] = train_df['Origin'] + '-->' + train_df['Dest']
# test_df['flight'] = test_df['Origin'] + '-->' + test_df['Dest']

# del train_df['DepTime']
# del train_df['season']


In [6]:
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,flight,...,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,8,21,7,1934,AA,ATL,DFW,732,0,ATL-->DFW,...,0,0,0,0,0,1,0,0,0,0
1,4,20,3,1548,US,PIT,MCO,834,0,PIT-->MCO,...,0,1,0,0,0,0,0,0,0,0
2,9,2,5,1422,XE,RDU,CLE,416,0,RDU-->CLE,...,0,0,0,0,0,0,1,0,0,0
3,11,25,6,1015,OO,DEN,MEM,872,0,DEN-->MEM,...,0,0,0,0,0,0,0,0,1,0
4,10,7,6,1828,WN,MDW,OMA,423,1,MDW-->OMA,...,0,0,0,0,0,0,0,1,0,0


In [7]:
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,flight,hours,...,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,m_11,m_12
0,7,25,3,615,YV,MRY,PHX,598,MRY-->PHX,6,...,0,0,0,0,1,0,0,0,0,0
1,4,17,2,739,WN,LAS,HOU,1235,LAS-->HOU,7,...,0,1,0,0,0,0,0,0,0,0
2,12,2,7,651,MQ,GSP,ORD,577,GSP-->ORD,6,...,0,0,0,0,0,0,0,0,0,1
3,3,25,7,1614,WN,BWI,MHT,377,BWI-->MHT,16,...,1,0,0,0,0,0,0,0,0,0
4,6,6,3,1505,UA,ORD,STL,258,ORD-->STL,15,...,0,0,0,1,0,0,0,0,0,0


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 58 columns):
Month            100000 non-null int64
DayofMonth       100000 non-null int64
DayOfWeek        100000 non-null int64
DepTime          100000 non-null int64
UniqueCarrier    100000 non-null object
Origin           100000 non-null object
Dest             100000 non-null object
Distance         100000 non-null int64
flight           100000 non-null object
hours            100000 non-null int64
minuts           100000 non-null int64
season           100000 non-null int64
t_day            100000 non-null int64
vec              100000 non-null object
season_1         100000 non-null uint8
season_2         100000 non-null uint8
season_3         100000 non-null uint8
season_4         100000 non-null uint8
day_1            100000 non-null uint8
day_2            100000 non-null uint8
day_3            100000 non-null uint8
day_4            100000 non-null uint8
day_5            100000 non

In [17]:
categ_feat_idx = np.where(train_df.drop('dep_delayed_15min', axis=1).dtypes == 'object')[0]
categ_feat_idx

array([ 4,  5,  6,  8, 13])

In [10]:
X_train = train_df.drop('dep_delayed_15min', axis=1).values
y_train = train_df['dep_delayed_15min'].values
X_test = test_df.values

In [11]:
X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, 
                                                                test_size=0.3, 
                                                                random_state=17)

In [12]:
%%time
ctb = CatBoostClassifier(random_seed=17, silent=True)
ctb.fit(X_train_part, y_train_part, cat_features=categ_feat_idx);
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, ctb_valid_pred)

CPU times: user 5min 14s, sys: 25.8 s, total: 5min 39s
Wall time: 3min 5s


0.752847672175182

In [13]:
# 0.7515229663325119
# 0.7498169399585913
# 0.7477376303222651
# 0.7471746660643551
# 0.7444933446807770
# 0.7422859392587760
# 0.7439647491771864
# 0.7568802738909720 - origin

In [18]:
%%time
ctb.fit(X_train, y_train, cat_features=categ_feat_idx);
ctb_test_pred = ctb.predict_proba(X_test.iloc[58])#[:, 1]
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', index_col='id')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred.csv')

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [15]:
!head ctb_pred.csv

id,dep_delayed_15min
0,0.039515836607826454
1,0.05434017574898294
2,0.046400591123906314
3,0.31410682839870496
4,0.2934881990704208
5,0.08831835324779773
6,0.041247623829009816
7,0.22549455965021728
8,0.12538830787573005
