In [201]:
import pandas as pd
import datetime, warnings, scipy 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly import tools

from sklearn import metrics, linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from scipy.optimize import curve_fit
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [172]:
df = pd.read_csv('input/train_features.csv')
df_target = pd.read_csv('input/train_target.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1355730 entries, 0 to 1355729
Data columns (total 13 columns):
Year              1355730 non-null int64
Month             1355730 non-null int64
DayofMonth        1355730 non-null int64
DayOfWeek         1355730 non-null int64
CRSDepTime        1355730 non-null int64
CRSArrTime        1355730 non-null int64
UniqueCarrier     1355730 non-null object
FlightNum         1355730 non-null int64
TailNum           1355726 non-null object
CRSElapsedTime    1355591 non-null float64
Origin            1355730 non-null object
Dest              1355730 non-null object
Distance          1355730 non-null int64
dtypes: float64(1), int64(8), object(4)
memory usage: 134.5+ MB


In [173]:
df_target.head()

Unnamed: 0,id,DelayTime
0,0,15.0
1,1,40.0
2,2,31.0
3,3,71.0
4,4,15.0


In [174]:
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,Origin,Dest,Distance
0,2008,4,23,3,700,907,US,1195,N119US,307.0,PHL,PHX,2075
1,2008,7,18,5,2154,2349,US,401,N637AW,115.0,PHX,SFO,651
2,2008,12,21,7,1921,1949,NW,1767,N782NC,88.0,DTW,MSN,312
3,2008,4,24,4,1955,2020,WN,1865,N272WN,145.0,SAT,PHX,843
4,2008,6,27,5,1700,1848,OH,5186,N804CA,108.0,ORF,CVG,485


In [175]:
my_dict = {
    'Year': df.Year,
    'Month': df.Month,
    'Day': df.DayofMonth
}

data = pd.to_datetime(pd.DataFrame(my_dict))
df['DATA'] = data

In [176]:
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,Origin,Dest,Distance,DATA
0,2008,4,23,3,700,907,US,1195,N119US,307.0,PHL,PHX,2075,2008-04-23
1,2008,7,18,5,2154,2349,US,401,N637AW,115.0,PHX,SFO,651,2008-07-18
2,2008,12,21,7,1921,1949,NW,1767,N782NC,88.0,DTW,MSN,312,2008-12-21
3,2008,4,24,4,1955,2020,WN,1865,N272WN,145.0,SAT,PHX,843,2008-04-24
4,2008,6,27,5,1700,1848,OH,5186,N804CA,108.0,ORF,CVG,485,2008-06-27


In [177]:
def format_time(chaine):
    if pd.isnull(chaine):
        return np.nan
    else:
        if chaine == 2400:
            chaine = 0
        chaine = '{0:04d}'.format(int(chaine))
        time = datetime.time(int(chaine[0:2]), int(chaine[2:4]))
        return time
    
def combine_data_time(x):
    if pd.isnull(x[0]) or pd.isnull(x[1]):
        return np.nan
    else:
        return datetime.datetime.combine(x[0], x[1])

def create_flight_time(df, col):
    l = []
    for index, cols in df[['DATA', col]].iterrows():
        if pd.isnull(cols[1]):
            l.append(np.nan)
        elif float(cols[1]) == 2400:
            cols[0] += datetime.timedelta(days=1)
            cols[1] = datetime.time(0,0)
            l.append(combine_data_time(cols))
        else:
            cols[1] = format_time(cols[1])
            l.append(combine_data_time(cols))
    return pd.Series(l)

In [178]:
%%time

df['SCHEDULED_DEPARTURE'] = create_flight_time(df, 'CRSDepTime')
df['DEPARTURE_TIME'] = df['CRSDepTime'].apply(format_time)
df['SCHEDULED_ARRIVAL'] = df['CRSArrTime'].apply(format_time)
df['ARRIVAL_TIME'] = df['CRSArrTime'].apply(format_time)

Wall time: 2min 45s


In [179]:
variables_to_remove = ['Year', 'Month', 'DayofMonth', 
                       'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
                       'DEPARTURE_TIME', 'ARRIVAL_TIME']
df.drop(variables_to_remove, axis=1, inplace=True)

df.head()

Unnamed: 0,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,Origin,Dest,Distance,DATA,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL
0,US,1195,N119US,307.0,PHL,PHX,2075,2008-04-23,2008-04-23 07:00:00,09:07:00
1,US,401,N637AW,115.0,PHX,SFO,651,2008-07-18,2008-07-18 21:54:00,23:49:00
2,NW,1767,N782NC,88.0,DTW,MSN,312,2008-12-21,2008-12-21 19:21:00,19:49:00
3,WN,1865,N272WN,145.0,SAT,PHX,843,2008-04-24,2008-04-24 19:55:00,20:20:00
4,OH,5186,N804CA,108.0,ORF,CVG,485,2008-06-27,2008-06-27 17:00:00,18:48:00


In [180]:
df['DEPARTURE_DELAY'] = df_target['DelayTime']
df.head()

Unnamed: 0,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,Origin,Dest,Distance,DATA,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_DELAY
0,US,1195,N119US,307.0,PHL,PHX,2075,2008-04-23,2008-04-23 07:00:00,09:07:00,15.0
1,US,401,N637AW,115.0,PHX,SFO,651,2008-07-18,2008-07-18 21:54:00,23:49:00,40.0
2,NW,1767,N782NC,88.0,DTW,MSN,312,2008-12-21,2008-12-21 19:21:00,19:49:00,31.0
3,WN,1865,N272WN,145.0,SAT,PHX,843,2008-04-24,2008-04-24 19:55:00,20:20:00,71.0
4,OH,5186,N804CA,108.0,ORF,CVG,485,2008-06-27,2008-06-27 17:00:00,18:48:00,15.0


In [181]:
#__________________________________________________________________
# function that extract statistical parameters from a grouby objet:
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
#_______________________________________________________________
# Creation of a dataframe with statitical infos on each airline:
global_stats = df['DEPARTURE_DELAY'].groupby(df['UniqueCarrier']).apply(get_stats).unstack()
global_stats = global_stats.sort_values('count')
global_stats

Unnamed: 0_level_0,count,max,mean,min
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AQ,520.0,336.0,26.969231,6.0
HA,5366.0,963.0,33.182818,6.0
F9,19682.0,805.0,27.709328,6.0
AS,27577.0,920.0,38.369692,6.0
9E,36308.0,1127.0,47.483998,6.0
OH,36963.0,960.0,49.045343,6.0
B6,38544.0,976.0,55.057519,6.0
YV,46932.0,607.0,55.263189,6.0
FL,49763.0,939.0,42.535177,6.0
NW,55378.0,2457.0,41.33842,6.0


In [193]:
def create_df(df):
    df2 = df[['SCHEDULED_DEPARTURE','SCHEDULED_ARRIVAL',
                                    'Origin','Dest','DEPARTURE_DELAY', 'Distance']]
    df2.dropna(how = 'any', inplace = True)
    df2['weekday'] = df2['SCHEDULED_DEPARTURE'].apply(lambda x:x.weekday())
    #____________________
    # delete delays > 1h
    df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply(lambda x:x if x < 60 else np.nan)
    df2.dropna(how = 'any', inplace = True)
    #_________________
    # formating times
    fct = lambda x:x.hour*3600+x.minute*60+x.second
    df2['heure_depart'] = df2['SCHEDULED_DEPARTURE'].apply(lambda x:x.time())
    df2['heure_depart'] = df2['heure_depart'].apply(fct)
    df2['heure_arrivee'] = df2['SCHEDULED_ARRIVAL'].apply(fct)
    df3 = df2.groupby(['heure_depart', 'heure_arrivee', 'Distance', 'Origin'],
                      as_index = False).mean()
    return df3

In [212]:
df3 = create_df(df)    
df3[:5]

Unnamed: 0,heure_depart,heure_arrivee,Distance,Origin,DEPARTURE_DELAY,weekday
0,0,3480,156,MFR,9.0,5.0
1,0,3600,258,XNA,23.0,5.0
2,60,22920,1989,SLC,11.0,5.0
3,60,23280,1989,SLC,22.333333,0.666667
4,60,23340,1989,SLC,17.3125,3.0625


## CatBoost 

In [247]:
from catboost import CatBoostRegressor
df.head()

Unnamed: 0,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,Origin,Dest,Distance,DATA,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_DELAY,weekday,heure_depart,heure_arrivee
0,17,1195,253,307.0,220,223,2075,2008-04-23,2008-04-23 07:00:00,09:07:00,15.0,2,25200,32820
1,17,401,3226,115.0,221,261,651,2008-07-18,2008-07-18 21:54:00,23:49:00,40.0,4,78840,85740
2,13,1767,4167,88.0,88,199,312,2008-12-21,2008-12-21 19:21:00,19:49:00,31.0,6,69660,71340
4,14,5186,4306,108.0,212,74,485,2008-06-27,2008-06-27 17:00:00,18:48:00,15.0,4,61200,67680
5,13,1424,4150,91.0,176,183,392,2008-09-28,2008-09-28 06:40:00,08:11:00,21.0,6,24000,29460


In [239]:
data = df
data.dropna(inplace=True)

cols = ["UniqueCarrier","TailNum","Dest","Origin"]

for item in cols:
    data[item] = data[item].astype("category").cat.codes +1

x_train, x_test, y_train, y_test = train_test_split(data.drop(["DEPARTURE_DELAY", 
                                                           'DATA',
                                                          'SCHEDULED_DEPARTURE',
                                                          'SCHEDULED_ARRIVAL'], axis=1), data["DEPARTURE_DELAY"],
                                                random_state=10, test_size=0.3)

In [244]:
x_train.head()

Unnamed: 0,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,Origin,Dest,Distance,weekday,heure_depart,heure_arrivee
1073069,2,2229,2903,65.0,288,83,237,5,33900,37800
588425,1,2941,103,92.0,88,1,424,1,36480,42000
130834,12,3955,2447,100.0,211,183,491,2,76500,82500
1261164,17,321,4375,226.0,156,213,1515,4,58620,79380
465,8,4571,5001,122.0,87,19,743,3,40800,51720


In [246]:
clf = CatBoostRegressor(eval_metric="AUC", 
                         depth=10, iterations= 100,
                         task_type='GPU')


cat.fit(x_train, y_train, eval_set=(x_test, y_test), plot=True, verbose_eval=False)

TypeError: unhashable type: 'numpy.ndarray'