In [1]:
import os
import sys
import scipy.misc
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import csv
import pandas as pd

np.random.seed(1)

train_data = pd.read_csv("train.csv")
test_data =pd.read_csv("test.csv")

airport_data = pd.read_csv("airports.csv")
weather_data = pd.read_csv("weather.csv")
sample_submission_data = pd.read_csv("sample_submission.csv")

d = {'DATE - YEAR':'year','DATE - MONTH':'month','DATE - DAY':'day'}

train_data['date2'] = pd.to_datetime(train_data.rename(columns=d)[list(d.values())])
test_data['date2'] = pd.to_datetime(train_data.rename(columns=d)[list(d.values())])


  from ._conv import register_converters as _register_converters


In [2]:
train_data.head()

Unnamed: 0,id,is_delayed,year,month,day,sched_dep_time,sched_arr_time,carrier,origin,dest,distance,date2
0,0,0,2013,12,15,1030,1359,UA,EWR,LAX,2454,2013-12-15
1,2,1,2013,4,2,1630,1847,DL,LGA,DTW,502,2013-04-02
2,4,1,2013,5,13,2055,2215,MQ,EWR,ORD,719,2013-05-13
3,7,0,2013,6,1,855,1120,MQ,LGA,ATL,762,2013-06-01
4,8,0,2013,9,19,852,1207,UA,EWR,SNA,2434,2013-09-19


In [3]:
#train_data["carrier"].unique() # too many categories
#train_data["origin"].unique()
train_data["dest"].unique() #too many categories

array(['LAX', 'DTW', 'ORD', 'ATL', 'SNA', 'PHX', 'DEN', 'MIA', 'BOS',
       'BWI', 'DFW', 'FLL', 'RDU', 'MYR', 'STL', 'CVG', 'GRR', 'PHL',
       'TUL', 'BTV', 'SEA', 'CMH', 'BGR', 'MHT', 'CLT', 'LAS', 'MCO',
       'AUS', 'SAT', 'OAK', 'DCA', 'MSY', 'CHS', 'SJC', 'RIC', 'OMA',
       'TPA', 'SJU', 'JAX', 'PBI', 'MEM', 'LGB', 'SRQ', 'GSO', 'SFO',
       'MKE', 'PDX', 'RSW', 'MDW', 'XNA', 'BUF', 'ACK', 'CLE', 'SAV',
       'ROC', 'SAN', 'IND', 'TYS', 'BNA', 'DAY', 'MSP', 'PSE', 'MVY',
       'PWM', 'CAK', 'ORF', 'MCI', 'SMF', 'IAH', 'IAD', 'PIT', 'HNL',
       'HOU', 'SYR', 'BDL', 'STT', 'SLC', 'ALB', 'PVD', 'OKC', 'MSN',
       'SDF', 'CAE', 'EGE', 'GSP', 'BQN', 'CRW', 'BUR', 'DSM', 'ILM',
       'TVC', 'BHM', 'ABQ', 'HDN', 'AVL', 'BZN', 'SBN', 'CHO', 'MTJ',
       'PSP', 'JAC', 'EYW', 'ANC', 'LEX', 'LGA'], dtype=object)

In [4]:
# FEATURES TO ADD TO TRAINING AND TEST:

train_data = train_data.set_index('id')
train_data.head()

test_data = test_data.set_index('id')
test_data.head()

#%% Holiday to add as feature, on holidays we expect to see more flight delays
import holidays
us_holidays = holidays.US(years = 2013)
train_data['holiday'] =train_data['date2'].isin(us_holidays)
test_data['holiday'] =test_data['date2'].isin(us_holidays)

#add daypart of sched_arr_time
def dayPart(x):  
    if x < 600: 
        return 'night'
    elif x >= 600 and x < 1200:
        return  'morning'
    elif x >= 1200 and x < 1800:
        return 'afternoon'
    if x >= 1800 and x < 2400:
        return 'evening'
    
train_data['sched_dep_time_daypart']= train_data['sched_dep_time'].apply(dayPart)
test_data['sched_dep_time_daypart']= test_data['sched_dep_time'].apply(dayPart)
train_data['sched_arr_time_daypart']= train_data['sched_arr_time'].apply(dayPart)
test_data['sched_arr_time_daypart']= test_data['sched_arr_time'].apply(dayPart)

def season(x):  
    if x==1 or x==2 or x==3: 
        return 'winter'
    if x==4 or x==5 or x==6: 
        return 'spring'
    if x==7 or x==8 or x==9: 
        return 'summer'
    if x==10 or x==11 or x==12: 
        return 'autumn'
    
train_data['sched_dep_time_season']= train_data['sched_dep_time'].apply(dayPart)
test_data['sched_dep_time_season']= test_data['sched_dep_time'].apply(dayPart)

def dayNumber(x):
    import datetime
        
    return x.weekday()
train_data['sched_dep_time_daynumber']= train_data['date2'].apply(dayNumber)
test_data['sched_dep_time_daynumber']= test_data['date2'].apply(dayNumber)

train_data['flightime'] = np.where(train_data['sched_arr_time'] - train_data['sched_dep_time'] < 0, 
          2360 - train_data['sched_dep_time'] + train_data['sched_arr_time'], 
          train_data['sched_arr_time'] - train_data['sched_dep_time'])

test_data['flightime'] = np.where(test_data['sched_arr_time'] - test_data['sched_dep_time'] < 0, 
          2360 - test_data['sched_dep_time'] + test_data['sched_arr_time'], 
          test_data['sched_arr_time'] - test_data['sched_dep_time'])

train_data.drop('date2',axis=1,inplace=True)
test_data.drop('date2',axis=1,inplace=True)

In [5]:
#Idea's: Combine with airport dataset, calculate flight time, #flights on day (per location and total)
#normelize/transform temperature to make low temperatures come forward stronger. Other normelization factors??
#Binary encoding for carrier and des
#Cluster algorithms for carrier and des.
# Does it freeze?
# numbers per day of week | weekday/weekend
# 



In [6]:
train_data.head()

Unnamed: 0_level_0,is_delayed,year,month,day,sched_dep_time,sched_arr_time,carrier,origin,dest,distance,holiday,sched_dep_time_daypart,sched_arr_time_daypart,sched_dep_time_season,sched_dep_time_daynumber,flightime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,2013,12,15,1030,1359,UA,EWR,LAX,2454,False,morning,afternoon,morning,6,329
2,1,2013,4,2,1630,1847,DL,LGA,DTW,502,False,afternoon,evening,afternoon,1,217
4,1,2013,5,13,2055,2215,MQ,EWR,ORD,719,False,evening,evening,evening,0,160
7,0,2013,6,1,855,1120,MQ,LGA,ATL,762,False,morning,morning,morning,5,265
8,0,2013,9,19,852,1207,UA,EWR,SNA,2434,False,morning,afternoon,morning,3,355


In [7]:
airport_data.head()

Unnamed: 0,faa,name,lat,lon,alt,tz,dst,tzone
0,04G,Lansdowne Airport,41.130472,-80.619583,1044,-5,A,America/New_York
1,06A,Moton Field Municipal Airport,32.460572,-85.680028,264,-6,A,America/Chicago
2,06C,Schaumburg Regional,41.989341,-88.101243,801,-6,A,America/Chicago
3,06N,Randall Airport,41.431912,-74.391561,523,-5,A,America/New_York
4,09J,Jekyll Island Airport,31.074472,-81.427778,11,-5,A,America/New_York


In [8]:
#ADD FEATURES TO WEATHERDATA

# When it snows, there's more chance of a delay. Relative humidity is a possible estimator for this as discussed in this source:
# http://www.sciencebits.com/SnowAboveFreezing
import math 

def fahrenheitToCelcius(tf):
    return (tf - 32) / 1.8

def relativeHumidity(tc):
    return 9.5 * math.exp((-17.27*tc)/(tc+238.3)) * (10.5-tc)
    
weather_data['tempcelcius'] = fahrenheitToCelcius(weather_data['temp'])
weather_data['relative_humidity']  = weather_data['tempcelcius'].apply(relativeHumidity)

weather_data.head()

def freeze(x):  
    if x < 0:
        return 1
    if x> 0: 
        return 0

weather_data['freeze_at_departure']  = weather_data['tempcelcius'].apply(freeze)




In [9]:
weather_data.drop('time_hour',axis=1,inplace=True)
#weather_data.drop('hour',axis=1,inplace=True)

weather_data_trans = weather_data.groupby(['origin','year','month','day']).mean()
weather_data_trans2 = weather_data.groupby(['origin','year','month','day','hour']).mean()

weather_data_trans2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,temp,dewp,humid,wind_dir,wind_speed,wind_gust,precip,pressure,visib,tempcelcius,relative_humidity,freeze_at_departure
origin,year,month,day,hour,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
EWR,2013,1,1,1,39.02,26.06,59.37,270.0,10.35702,,0.0,1012.0,10.0,3.9,47.478315,0.0
EWR,2013,1,1,2,39.02,26.96,61.63,250.0,8.05546,,0.0,1012.3,10.0,3.9,47.478315,0.0
EWR,2013,1,1,3,39.02,28.04,64.43,240.0,11.5078,,0.0,1012.5,10.0,3.9,47.478315,0.0
EWR,2013,1,1,4,39.92,28.04,62.21,250.0,12.65858,,0.0,1012.2,10.0,4.4,42.371934,0.0
EWR,2013,1,1,5,39.02,28.04,64.43,260.0,12.65858,,0.0,1011.9,10.0,3.9,47.478315,0.0


In [10]:
train_data = pd.merge(train_data,weather_data_trans,  how='left', left_on=['origin','year','month','day'], right_index=True )
test_data = pd.merge(test_data,weather_data_trans,  how='left', left_on=['origin','year','month','day'], right_index=True )

In [11]:
train_data['sched_dep_time_hour'] = train_data['sched_dep_time'].divide(100)
train_data['sched_dep_time_hour'] = train_data['sched_dep_time_hour'].astype(np.int64)

test_data['sched_dep_time_hour'] = test_data['sched_dep_time'].divide(100)
test_data['sched_dep_time_hour'] = test_data['sched_dep_time_hour'].astype(np.int64)


train_data = pd.merge(train_data,weather_data_trans2,  how='left', left_on=['origin','year','month','day','sched_dep_time_hour'], right_index=True )
test_data = pd.merge(test_data,weather_data_trans2,  how='left', left_on=['origin','year','month','day','sched_dep_time_hour'], right_index=True )


#Change NAN values for mean of month in each group
def substituteNanNoncatMonth(data,columns):
    for i in columns:
        data[i] = data[["month",i]].groupby("month").transform(lambda x: x.fillna(x.mean()))
    return data

cols = ['sched_dep_time','flightime',
       'sched_arr_time','distance',
       'temp_x', 'dewp_x', 'humid_x', 'wind_dir_x', 'wind_speed_x', 'wind_gust_x',
       'precip_x', 'pressure_x', 'visib_x','tempcelcius_x','relative_humidity_x',
         'temp_y', 'dewp_y', 'humid_y', 'wind_dir_y', 'wind_speed_y', 'wind_gust_y',
       'precip_y', 'pressure_y', 'visib_y','tempcelcius_y','relative_humidity_y','freeze_at_departure_x','freeze_at_departure_y']
train_data = substituteNanNoncatMonth(train_data,cols)
test_data = substituteNanNoncatMonth(test_data,cols)

def substituteNanNoncatHour(data,columns):
    for i in columns:
        data[i] = data[['sched_dep_time_hour',i]].groupby('sched_dep_time_hour').transform(lambda x: x.fillna(x.mean()))
    return data

cols2 = ['temp_y', 'dewp_y', 'humid_y', 'wind_dir_y', 'wind_speed_y', 'wind_gust_y',
       'precip_y', 'pressure_y', 'visib_y','tempcelcius_y','relative_humidity_y','freeze_at_departure_x','freeze_at_departure_y']
train_data = substituteNanNoncatHour(train_data,cols2)
test_data = substituteNanNoncatHour(test_data,cols2)

train_data.drop('hour',axis=1,inplace=True)
test_data.drop('hour',axis=1,inplace=True)
train_data.drop('year',axis=1,inplace=True)
test_data.drop('year',axis=1,inplace=True)


In [12]:
print(train_data[train_data.isnull().any(axis=1)])
x = pd.DataFrame([2359])
x = x.divide(100)
x.astype(np.int64)


Empty DataFrame
Columns: [is_delayed, month, day, sched_dep_time, sched_arr_time, carrier, origin, dest, distance, holiday, sched_dep_time_daypart, sched_arr_time_daypart, sched_dep_time_season, sched_dep_time_daynumber, flightime, temp_x, dewp_x, humid_x, wind_dir_x, wind_speed_x, wind_gust_x, precip_x, pressure_x, visib_x, tempcelcius_x, relative_humidity_x, freeze_at_departure_x, sched_dep_time_hour, temp_y, dewp_y, humid_y, wind_dir_y, wind_speed_y, wind_gust_y, precip_y, pressure_y, visib_y, tempcelcius_y, relative_humidity_y, freeze_at_departure_y]
Index: []

[0 rows x 40 columns]


Unnamed: 0,0
0,23


In [13]:
print(train_data.loc[1224])

is_delayed                        1
month                             3
day                               6
sched_dep_time                 1910
sched_arr_time                 2228
carrier                          B6
origin                          JFK
dest                            SLC
distance                       1990
holiday                       False
sched_dep_time_daypart      evening
sched_arr_time_daypart      evening
sched_dep_time_season       evening
sched_dep_time_daynumber          2
flightime                       318
temp_x                       39.515
dewp_x                      30.5375
humid_x                     70.3692
wind_dir_x                    51.25
wind_speed_x                24.1664
wind_gust_x                 37.7335
precip_x                          0
pressure_x                   1012.5
visib_x                     9.91667
tempcelcius_x                 4.175
relative_humidity_x         45.1171
freeze_at_departure_x             0
sched_dep_time_hour         

In [15]:
from sklearn.preprocessing import OneHotEncoder

cat = ["origin","holiday",'carrier','dest','sched_dep_time_daypart','sched_dep_time_season','sched_dep_time_daynumber']
noncat = ['month','day', 'sched_dep_time','flightime',
       'sched_arr_time','distance',
       'temp_x', 'dewp_x', 'humid_x', 'wind_dir_x', 'wind_speed_x', 'wind_gust_x',
       'precip_x', 'pressure_x', 'visib_x','tempcelcius_x','relative_humidity_x',
         'temp_y', 'dewp_y', 'humid_y', 'wind_dir_y', 'wind_speed_y', 'wind_gust_y',
       'precip_y', 'pressure_y', 'visib_y','tempcelcius_y','relative_humidity_y', 'freeze_at_departure_x','freeze_at_departure_y']
label = ['is_delayed']

enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([train_data[cat],test_data[cat]]))
train_data_cat_sparse=enc.transform(train_data[cat])
test_data_cat_sparse=enc.transform(test_data[cat])

from scipy.sparse import hstack

train_data_sparse=hstack((train_data[noncat], train_data_cat_sparse))
test_data_sparse=hstack((test_data[noncat], test_data_cat_sparse))

train_label = train_data['is_delayed'] 

print(train_data_sparse.tocsr())
"""
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=cat2)
df_binary = encoder.fit(train_data[cat2])
train_data_cat_sparse2=enc.transform(train_data[cat2])
test_data_cat_sparse2=enc.transform(test_data[cat2])

train_data_sparse=hstack((train_data_sparse, train_data_cat_sparse2))
test_data_sparse=hstack((test_data_sparse, test_data_cat_sparse2))




#If some categories don't come forward in the test set, remove them
trainlist = list(train_data)
testlist = list(test_data)

removelist = [x for x in trainlist if x not in testlist]
removelist = removelist +[x for x in testlist if x not in trainlist]

removelist.remove("is_delayed")

try:
    train_data.drop(columns=removelist,axis=1)
except:
    print("do nothing")
try:
    test_data.drop(columns=removelist,axis=1)
except:
    print("do nothing")
    """

  (0, 0)	12.0
  (0, 1)	15.0
  (0, 2)	1030.0
  (0, 3)	329.0
  (0, 4)	1359.0
  (0, 5)	2454.0
  (0, 6)	34.527499999999996
  (0, 7)	28.602500000000003
  (0, 8)	80.52374999999999
  (0, 9)	243.33333333333334
  (0, 10)	12.658579999999999
  (0, 11)	23.59099
  (0, 12)	0.018333333333333333
  (0, 13)	1005.1263157894733
  (0, 14)	8.375
  (0, 15)	1.4041666666666666
  (0, 16)	79.73984615343194
  (0, 17)	33.98
  (0, 18)	30.02
  (0, 19)	86.39
  (0, 20)	270.0
  (0, 21)	13.80936
  (0, 22)	23.759510817008923
  (0, 24)	1019.7060884143726
  (0, 25)	10.0
  :	:
  (168572, 9)	210.43478260869566
  (168572, 10)	11.076257500000004
  (168572, 11)	21.673023333333333
  (168572, 12)	0.0029166666666666664
  (168572, 13)	1011.4411764705883
  (168572, 14)	9.666666666666666
  (168572, 15)	26.38333333333334
  (168572, 16)	-26.648680374822533
  (168572, 17)	75.92
  (168572, 18)	71.96
  (168572, 19)	87.55
  (168572, 20)	170.0
  (168572, 21)	12.65858
  (168572, 22)	22.01042889795897
  (168572, 24)	1011.7
  (168572, 25)	10.0

'\nimport category_encoders as ce\n\nencoder = ce.BinaryEncoder(cols=cat2)\ndf_binary = encoder.fit(train_data[cat2])\ntrain_data_cat_sparse2=enc.transform(train_data[cat2])\ntest_data_cat_sparse2=enc.transform(test_data[cat2])\n\ntrain_data_sparse=hstack((train_data_sparse, train_data_cat_sparse2))\ntest_data_sparse=hstack((test_data_sparse, test_data_cat_sparse2))\n\n\n\n\n#If some categories don\'t come forward in the test set, remove them\ntrainlist = list(train_data)\ntestlist = list(test_data)\n\nremovelist = [x for x in trainlist if x not in testlist]\nremovelist = removelist +[x for x in testlist if x not in trainlist]\n\nremovelist.remove("is_delayed")\n\ntry:\n    train_data.drop(columns=removelist,axis=1)\nexcept:\n    print("do nothing")\ntry:\n    test_data.drop(columns=removelist,axis=1)\nexcept:\n    print("do nothing")\n    '

In [15]:
"""
scalevariables = ['sched_dep_time','sched_arr_time','distance','temp','dewp','humid','wind_dir','wind_speed','wind_gust','precip','pressure','visib']
from sklearn.preprocessing import MinMaxScaler

train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

#We use the standard minmaxscaler as normelization factor
for col in scalevariables:
    scaler = MinMaxScaler()

    train_data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(train_data[col])),columns=[col])
    test_data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(test_data[col])),columns=[col])
"""

"\nscalevariables = ['sched_dep_time','sched_arr_time','distance','temp','dewp','humid','wind_dir','wind_speed','wind_gust','precip','pressure','visib']\nfrom sklearn.preprocessing import MinMaxScaler\n\ntrain_data = train_data.fillna(0)\ntest_data = test_data.fillna(0)\n\n#We use the standard minmaxscaler as normelization factor\nfor col in scalevariables:\n    scaler = MinMaxScaler()\n\n    train_data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(train_data[col])),columns=[col])\n    test_data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(test_data[col])),columns=[col])\n"

In [16]:
""" 
#Decision Tree
from sklearn import tree
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop('is_delayed',axis=1),train_data["is_delayed"], test_size=0.4, random_state=0)

clf = tree.DecisionTreeClassifier(min_samples_leaf=100,max_depth = 15)

clf = clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

AUC = roc_auc_score(y_test, predictions)
print("AUC = " + str(AUC))

"""

' \n#Decision Tree\nfrom sklearn import tree\nfrom sklearn.model_selection import train_test_split\n\nX_train, X_test, y_train, y_test = train_test_split(\n    train_data.drop(\'is_delayed\',axis=1),train_data["is_delayed"], test_size=0.4, random_state=0)\n\nclf = tree.DecisionTreeClassifier(min_samples_leaf=100,max_depth = 15)\n\nclf = clf.fit(X_train, y_train)\ny_predict = clf.predict(X_test)\n\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.metrics import roc_auc_score\n\nAUC = roc_auc_score(y_test, predictions)\nprint("AUC = " + str(AUC))\n\n'

In [16]:
# Convert dataframes to DMatrices for optimized xgboost performance
from sklearn.model_selection import train_test_split
import xgboost as xgb

X_train, X_test, y_train, y_test = train_test_split(
    train_data_sparse,train_label, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(X_train,label=y_train)
dtest = xgb.DMatrix(X_test,label = y_test)

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from scipy import sparse
from scipy.sparse import csc_matrix
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

#source:https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

train_data_sparse_csr = train_data_sparse.tocsr()
test_data_sparse_csr = test_data_sparse.tocsr()

params = {
        'min_child_weight': [1,3, 5,7, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
        #'reg_alpha':[0,3,5],
        'reg_lambda':[1,3]
        }

folds = 3
param_comb = 15

model = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

skf = StratifiedKFold(n_splits=folds, shuffle = True)

#add early_stopping_rounds here?
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=3, cv=skf.split(train_data_sparse_csr,train_label), verbose=3 )

random_search.fit(train_data_sparse_csr,train_label)

print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-05.csv', index=False)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  30 out of  30 | elapsed: 155.0min finished



 All results:
{'mean_fit_time': array([1113.67024287, 1395.98677031,  808.14971066,  476.9962105 ,
       1376.0057056 ,  463.88114206, 1287.70429476,  856.42309197,
        455.55644377,  434.48569576]), 'std_fit_time': array([ 6.55250698,  1.57904979,  3.34411473,  4.16953679, 11.40357136,
        1.81381846,  7.97637969,  3.7121089 ,  2.83670963,  1.28914487]), 'mean_score_time': array([26.1798776 , 30.71114723, 37.30920625, 11.88498878, 34.13819742,
        8.43349179, 12.51541018, 22.29850332,  6.67864148,  6.96582659]), 'std_score_time': array([1.06344548, 0.78652864, 8.76183977, 0.32843658, 2.22638231,
       0.24494347, 0.56521787, 2.37924983, 0.04125416, 1.33723087]), 'param_subsample': masked_array(data=[1.0, 0.8, 0.8, 0.6, 1.0, 0.8, 1.0, 0.8, 0.8, 0.8],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'param_reg_lambda': masked_array(data=[5, 3, 1, 3, 1, 3, 3, 3, 1,

In [17]:
#Best XGBoost model with CV to leave as baseline - Overall AUC test/train data: 0.7869682439073855

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

train_data_sparse_csr = train_data_sparse.tocsr()
test_data_sparse_csr = test_data_sparse.tocsr()

model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=1.5, learning_rate=0.02,
       max_delta_step=0, max_depth=18, min_child_weight=3, missing=None,
       n_estimators=600, n_jobs=1, nthread=1, objective='binary:logistic',
       random_state=0, reg_alpha=5, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1.0)

## Or only use the train and validation set, so no cross validation:
model.fit(train_data_sparse_csr,train_label, eval_metric='auc')

# make predictions for test data
#Set prediction of test set for submission
pred_prob = model.predict_proba(test_data_sparse)

#predictions = [round(value) for value in y_pred_prob[:,1]]
predictions= [value for value in pred_prob[:,1]]

#print('Overall AUC test/train data:', roc_auc_score(y_test, predictions))


In [None]:
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

train_data_sparse_csr = train_data_sparse.tocsr()
test_data_sparse_csr = test_data_sparse.tocsr()

params = {
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3,6,9,12,15],
        'max_features':[None,10,15],
        'n_estimators':[100,125,150]
        }

folds = 3
param_comb = 10

model = GradientBoostingClassifier()

skf = StratifiedKFold(n_splits=folds, shuffle = True)

#add early_stopping_rounds here?
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=3, cv=skf.split(train_data_sparse_csr,train_label), verbose=3 )

random_search.fit(train_data_sparse_csr,train_label)

print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('grad-boost-grid-search-results-01.csv', index=False)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


In [78]:
"""
#If we don't want to work with sparse matrices:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

train_data2 = pd.get_dummies(train_data, columns=["year","month","day","carrier","origin","dest"], prefix=["year","month","day","carrier", "origin","dest"])
test_data2 = pd.get_dummies(test_data, columns=["year","month","day","carrier","origin","dest"], prefix=["year","month","day","carrier", "origin","dest"])

trainlist = list(train_data2)
testlist = list(test_data2)

removelist1 = [x for x in trainlist if x not in testlist]
removelist2 = [x for x in testlist if x not in trainlist]

removelist1.remove("is_delayed")
if removelist2 == None & removelist1 != None:
    finalremovelist = removelist1
elif removelist2 != None & removelist1 ==None:
    finalremovelist = removelist2
else: finalremovelistremovelist1.add(removelist2)

try:
    train_data2 = train_data2.drop(columns=finalremovelist,axis=1)
except:
    print("do nothing")

try:
    test_data2 = test_data2.drop(columns=finalremovelist,axis=1) 
except:
    print("do nothing")
    
"""
    

do nothing


In [80]:
#Best XGBoost model with CV to leave as baseline - Overall AUC test/train data: 0.7869682439073855

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(
    train_data2.drop('is_delayed',axis=1),train_data2["is_delayed"], test_size=0.4, random_state=0)

model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.5, learning_rate=0.02,
       max_delta_step=0, max_depth=15, min_child_weight=5, missing=None,
       n_estimators=600, n_jobs=1, nthread=1, objective='binary:logistic',
       random_state=0, reg_alpha=3, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1.0)

## Or only use the train and validation set, so no cross validation:
model.fit(X_train, y_train, eval_metric='auc')

# make predictions for test data
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

predictions = [value for value in y_pred_prob[:,1]]

print('Overall AUC test/train data:', roc_auc_score(y_test, predictions))


Overall AUC test/train data: 0.7861772949679726


In [35]:
print(test_data2)

        sched_dep_time  sched_arr_time  distance  holiday   temp_x   dewp_x  \
id                                                                            
1            47.478315            1430       944    False   821.28   443.64   
3           -19.007343            1745       937    False  1266.06  1015.32   
5           131.212063            2145       425    False   620.22   290.28   
6           -16.767415            1857       746    False  1304.76   796.98   
11          -26.944272            2115      2153    False  1852.68  1708.50   
13          -26.066040            1518       937    False  1961.94  1612.02   
15           36.639550            1103       828    False   973.20   640.20   
17          -26.988207            2256      1990    False  2061.84  1710.48   
18          -27.011672            1139       200    False  1900.02  1639.38   
19           99.750000            1135       764    False   747.30   440.04   
20           -9.059831            1842       762    

In [None]:
import catboost as cb
cat_features_index = [0,1,2,3,4,5,6]

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(test)[:,1]))

#params = {'depth': [4, 7, 10],
#          'learning_rate' : [0.03, 0.1, 0.15],
#         'l2_leaf_reg': [1,4,9],
#         'iterations': [300]}
#cb = cb.CatBoostClassifier()
#cb_model = GridSearchCV(cb, params, scoring="roc_auc", cv = 3)
#cb_model.fit(train, y_train)

# Sem categoricas
clf = cb.CatBoostClassifier(eval_metric="AUC", depth=10, iterations= 100, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(train,y_train)
auc(clf, train, test)

# Com categoricas
clf = cb.CatBoostClassifier(eval_metric="AUC",one_hot_max_size=31, \
                            depth=10, iterations= 100, l2_leaf_reg= 9, learning_rate= 0.15)
clf.fit(train,y_train, cat_features= cat_features_index)
auc(clf, train, test)

In [None]:
def get_results(model, X, y):

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from sklearn.model_selection import cross_val_score
        compute = cross_val_score(model, X, y, cv=10)
        mean = compute.mean()
        std = compute.std()
        return mean, std

def display_classifier_results(X,y):

    models = []

    from xgboost import XGBClassifier
    models += [XGBClassifier()]
    
    from sklearn.neighbors import KNeighborsClassifier
    models += [KNeighborsClassifier()]

    from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
    models += [GaussianNB(), MultinomialNB(), BernoulliNB()]

    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier#, VotingClassifier
    models += [RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier(), ExtraTreesClassifier()]

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
    models += [LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()]

    from sklearn.svm import SVC, LinearSVC
    models += [SVC(),LinearSVC()]

    from sklearn.linear_model import SGDClassifier
    models += [SGDClassifier()]

    from sklearn.neighbors.nearest_centroid import NearestCentroid
    models += [NearestCentroid()]

    output = {}

    for m in models:
        try:
            model_name = type(m).__name__
            from time import time
            start = time()
            scores = get_results(m,X,y)
            finish = time() - start
            time_finished = "%d minutes %2d seconds" % (int(finish / 60), finish % 60) 
            row = {"Mean Accuracy" : scores[0], "(+/-)" : scores[1], "Processing Time": time_finished}
            output[model_name] = row
        except:
            pass

    from pandas import DataFrame
    from IPython.display import display

    result = DataFrame(data=output).T
    result = result[["Mean Accuracy", "(+/-)", "Processing Time"]]
    display(result.sort_values("Mean Accuracy", ascending=False))

In [None]:
#Save model if worth it
import pickle

# save model to file
train.to_pickle("./train_pickle.pkl")
test.to_pickle("./test_pickle.pkl")
 
# load model from file
train = read_pickle("./train_pickle.pkl")
test =  read_pickle("./test_pickle.pkl")

In [20]:
# Make submission and save to file

#Set prediction of test set for submission
pred_prob = model.predict_proba(test_data_sparse)

#predictions = [round(value) for value in y_pred_prob[:,1]]
pred= [value for value in pred_prob[:,1]]

submission = pd.DataFrame()
submission["id"] = test_data.index
submission["is_delayed"] = pred
submission.to_csv("submission #10.csv", index=False)

In [18]:
# Make submission and save to file

y_pred_prob = model.predict_proba(test_data_sparse)

predictions = [value for value in y_pred_prob[:,1]]

submission = pd.DataFrame()
submission["id"] = test_data2.index
submission["is_delayed"] = predictions
submission.to_csv("submission #8.csv", index=False)

NameError: name 'test_data2' is not defined

In [90]:
print(len(test_data.index))

168203
