In [1]:
import problem
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from merge_transformer import MergeTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


from six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import os


In [2]:
def _encode_dates(X, drop=False):
    # With pandas < 1.0, we wil get a SettingWithCopyWarning
    # In our case, we will avoid this warning by triggering a copy
    # More information can be found at:
    # https://github.com/scikit-learn/scikit-learn/issues/16191
    X_encoded = X.copy()

    # Make sure that DateOfDeparture is of datetime format
    X_encoded.loc[:, 'DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
    # Encode the DateOfDeparture
    X_encoded.loc[:, 'year'] = X_encoded['DateOfDeparture'].dt.year
    X_encoded.loc[:, 'month'] = X_encoded['DateOfDeparture'].dt.month
    X_encoded.loc[:, 'day'] = X_encoded['DateOfDeparture'].dt.day
    X_encoded.loc[:, 'weekday'] = X_encoded['DateOfDeparture'].dt.weekday
    X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
    X_encoded.loc[:, 'n_days'] = X_encoded['DateOfDeparture'].apply(
        lambda date: (date - pd.to_datetime("1970-01-01")).days)
    X_encoded.loc['day_nb'] = X_encoded['DateOfDeparture'].dt.dayofyear
    
    X_encoded['day_nb'] = X_encoded['DateOfDeparture'].dt.dayofyear

    X_encoded['leap_year_year'] = X_encoded['year'].apply(
        lambda x: True if x == 2012 else False)
    X_encoded['leap_year_month'] = X_encoded['month'].apply(
        lambda x: True if x > 2 else False)
    X_encoded.loc[:, 'leap_year'] = X_encoded.loc[:, 'leap_year_year'] & X_encoded.loc[:, 'leap_year_month']

    X_encoded['day_nb_leap'] = X_encoded.apply(lambda x: 
                x.day_nb - 1 if x.leap_year == True else x.day_nb, axis=1)
    X_encoded.drop(['leap_year_year', 'leap_year_month', 'leap_year', 'day_nb'], inplace=True, axis=1)
    X_encoded.rename({'day_nb_leap': 'day_nb'}, axis=1, inplace=True)
    
    if drop:
        X_encoded.drop('DateOfDeparture', inplace=True, axis=1)
        
    return X_encoded

## Get Train Test

In [3]:
X_train, y_train = problem.get_train_data('..')
X_test, y_test = problem.get_test_data('..')

X_train.loc[:, 'DateOfDeparture'] = pd.to_datetime(X_train.loc[:, 'DateOfDeparture'])
X_test.loc[:, 'DateOfDeparture'] = pd.to_datetime(X_test.loc[:, 'DateOfDeparture'])

y_train = pd.DataFrame(y_train, columns=['Passengers'])
y_test = pd.DataFrame(y_test, columns=['Passengers'])

Xy_train = X_train.copy()
Xy_test = X_test.copy()
Xy_train['Passengers'] = y_train
Xy_test['Passengers'] = y_test

In [4]:
Xy_merged = pd.concat([Xy_train, Xy_test], ignore_index=True)

In [5]:
Xy_merged.head()

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,Passengers
0,2012-06-19,ORD,DFW,12.875,9.812647,12.331296
1,2012-09-10,LAS,DEN,14.285714,9.466734,10.775182
2,2012-10-05,DEN,LAX,10.863636,9.035883,11.083177
3,2011-10-09,ATL,ORD,11.48,7.990202,11.169268
4,2012-02-21,DEN,SFO,11.45,9.517159,11.269364


In [6]:
date_encoder = FunctionTransformer(_encode_dates)
Xy_merged = date_encoder.fit_transform(Xy_merged)

  X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week


## Get Route Means

In [7]:
dep_list = Xy_merged['Departure'].unique()
arr_list = Xy_merged['Arrival'].unique()
routes = list(itertools.product(dep_list, arr_list))

# print(routes)

dict_df = {}
for route in routes:
    df1 = Xy_merged.loc[Xy_merged['Arrival'] == route[1]][Xy_merged['Departure'] == route[0]]
    df2 = Xy_merged.loc[Xy_merged['Departure'] == route[1]][Xy_merged['Arrival'] == route[0]]
    df = pd.concat([df1, df2])
    if not (df.empty):
        dict_df[route] = df

print(len(dict_df))
print(dep_list)

  df1 = Xy_merged.loc[Xy_merged['Arrival'] == route[1]][Xy_merged['Departure'] == route[0]]
  df2 = Xy_merged.loc[Xy_merged['Departure'] == route[1]][Xy_merged['Arrival'] == route[0]]


128
['ORD' 'LAS' 'DEN' 'ATL' 'SFO' 'EWR' 'IAH' 'LAX' 'DFW' 'SEA' 'JFK' 'PHL'
 'MIA' 'DTW' 'BOS' 'MSP' 'CLT' 'MCO' 'PHX' 'LGA' nan]


In [8]:
df_routes = pd.DataFrame(routes, columns=['Departure', 'Arrival'])

In [9]:
df_routes.describe()

Unnamed: 0,Departure,Arrival
count,420,420
unique,20,20
top,LGA,EWR
freq,21,21


In [10]:
mean_list = []
std_list = []
routes_list = []
routes_str = []
weeks_to_dep = []

for k, v in dict_df.items():
    dep, arr = k
    std_list.append(v.Passengers.std())
    weeks_to_dep.append(v.WeeksToDeparture.mean())
    routes_list.append(k)
    routes_str.append(dep + "-" + arr)
    tuple_ = (dep, arr, v['Passengers'].mean())
    mean_list.append(tuple_)

df_means_routes = pd.DataFrame(mean_list, columns=['Departure', 'Arrival', 'route_mean'])

In [11]:
df_means_routes

Unnamed: 0,Departure,Arrival,route_mean
0,ORD,DFW,11.919010
1,ORD,DEN,10.737025
2,ORD,LAX,12.008992
3,ORD,SFO,11.719065
4,ORD,LAS,10.678536
...,...,...,...
123,LGA,CLT,10.773446
124,LGA,DTW,10.590409
125,LGA,MIA,10.972960
126,LGA,BOS,11.168998


In [12]:
# df_means_routes.to_csv('../data/routes_means.csv')

## Get Day Means

In [13]:
day_list = Xy_merged['weekday'].unique()
mean_list = []

for day in day_list:
    df = Xy_merged.loc[Xy_merged['weekday'] == day]
    tuple_ = (day, df['Passengers'].mean())
    mean_list.append(tuple_)

df_means_days = pd.DataFrame(mean_list, columns=['weekday', 'day_mean'])
df_means_days

Unnamed: 0,weekday,day_mean
0,1.0,11.080302
1,0.0,11.316519
2,4.0,11.22139
3,6.0,10.812631
4,3.0,11.358502
5,5.0,9.963934
6,2.0,11.222148
7,,


In [14]:
# df_means_days.to_csv('../data/weekdays_means.csv')

## Merge on day means

In [15]:
date_encoder = FunctionTransformer(_encode_dates)
X_train = date_encoder.fit_transform(X_train)
X_test = date_encoder.fit_transform(X_test)

merge_transform = MergeTransformer(
    X_ext=df_means_days, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['weekday'])

X_train = merge_transform.fit_transform(X_train)

merge_transform = MergeTransformer(
    X_ext=df_means_days, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['weekday'])

X_test = merge_transform.fit_transform(X_test)

X_train.head()

  X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week


Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,year,month,day,weekday,week,n_days,day_nb,day_mean
0,2012-06-19,ORD,DFW,12.875,9.812647,2012.0,6.0,19.0,1.0,25.0,15510.0,170.0,11.080302
1,2012-09-10,LAS,DEN,14.285714,9.466734,2012.0,9.0,10.0,0.0,37.0,15593.0,253.0,11.316519
2,2012-10-05,DEN,LAX,10.863636,9.035883,2012.0,10.0,5.0,4.0,40.0,15618.0,278.0,11.22139
3,2011-10-09,ATL,ORD,11.48,7.990202,2011.0,10.0,9.0,6.0,40.0,15256.0,282.0,10.812631
4,2012-02-21,DEN,SFO,11.45,9.517159,2012.0,2.0,21.0,1.0,8.0,15391.0,52.0,11.080302


## Get Month Means

In [16]:
month_list = Xy_merged['month'].unique()
mean_list = []

for month in month_list:
    df = Xy_merged.loc[Xy_merged['month'] == month]
    tuple_ = (month, df['Passengers'].mean())
    mean_list.append(tuple_)

df_means_month = pd.DataFrame(mean_list, columns=['month', 'month_mean'])
print(df_means_month.head())

merge_transform = MergeTransformer(
    X_ext=df_means_month, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['month'])

X_train = merge_transform.fit_transform(X_train)

merge_transform = MergeTransformer(
    X_ext=df_means_month, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['month'])

X_test = merge_transform.fit_transform(X_test)

X_train.head()

   month  month_mean
0    6.0   11.240980
1    9.0   11.163944
2   10.0   11.243521
3    2.0   10.926466
4    1.0   10.834784


Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,year,month,day,weekday,week,n_days,day_nb,day_mean,month_mean
0,2012-06-19,ORD,DFW,12.875,9.812647,2012.0,6.0,19.0,1.0,25.0,15510.0,170.0,11.080302,11.24098
1,2012-09-10,LAS,DEN,14.285714,9.466734,2012.0,9.0,10.0,0.0,37.0,15593.0,253.0,11.316519,11.163944
2,2012-10-05,DEN,LAX,10.863636,9.035883,2012.0,10.0,5.0,4.0,40.0,15618.0,278.0,11.22139,11.243521
3,2011-10-09,ATL,ORD,11.48,7.990202,2011.0,10.0,9.0,6.0,40.0,15256.0,282.0,10.812631,11.243521
4,2012-02-21,DEN,SFO,11.45,9.517159,2012.0,2.0,21.0,1.0,8.0,15391.0,52.0,11.080302,10.926466


In [17]:
# df_means_month.to_csv('../data/months_means.csv')

## Get Week Mean

In [18]:
week_list = Xy_merged['week'].unique()
mean_list = []

for week in week_list:
    df = Xy_merged.loc[Xy_merged['week'] == week]
    tuple_ = (week, df['Passengers'].mean())
    mean_list.append(tuple_)

df_means_week = pd.DataFrame(mean_list, columns=['week', 'week_mean'])
print(df_means_week.head())

merge_transform = MergeTransformer(
    X_ext=df_means_week, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['week'])

X_train = merge_transform.fit_transform(X_train)

merge_transform = MergeTransformer(
    X_ext=df_means_week, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['week'])

X_test = merge_transform.fit_transform(X_test)

X_train.head()

   week  week_mean
0  25.0  11.301741
1  37.0  11.282255
2  40.0  11.210159
3   8.0  10.950214
4   4.0  10.817967


Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,year,month,day,weekday,week,n_days,day_nb,day_mean,month_mean,week_mean
0,2012-06-19,ORD,DFW,12.875,9.812647,2012.0,6.0,19.0,1.0,25.0,15510.0,170.0,11.080302,11.24098,11.301741
1,2012-09-10,LAS,DEN,14.285714,9.466734,2012.0,9.0,10.0,0.0,37.0,15593.0,253.0,11.316519,11.163944,11.282255
2,2012-10-05,DEN,LAX,10.863636,9.035883,2012.0,10.0,5.0,4.0,40.0,15618.0,278.0,11.22139,11.243521,11.210159
3,2011-10-09,ATL,ORD,11.48,7.990202,2011.0,10.0,9.0,6.0,40.0,15256.0,282.0,10.812631,11.243521,11.210159
4,2012-02-21,DEN,SFO,11.45,9.517159,2012.0,2.0,21.0,1.0,8.0,15391.0,52.0,11.080302,10.926466,10.950214


In [19]:
# df_means_week.to_csv('../data/weeks_means.csv')

## Merge on Routes means

In [20]:
merge_transform = MergeTransformer(
    X_ext=df_means_routes, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['Departure', 'Arrival'])

X_train = merge_transform.fit_transform(X_train)

merge_transform = MergeTransformer(
    X_ext=df_means_routes, 
#     cols_to_rename={'Date': 'DateOfDeparture', 'Close': 'oil_stock_price', 'Volume': 'oil_stock_volume'},
    how='left',
    on=['Departure', 'Arrival'])

X_test = merge_transform.fit_transform(X_test)

In [21]:
X_train

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,year,month,day,weekday,week,n_days,day_nb,day_mean,month_mean,week_mean,route_mean
0,2012-06-19,ORD,DFW,12.875000,9.812647,2012.0,6.0,19.0,1.0,25.0,15510.0,170.0,11.080302,11.240980,11.301741,11.919010
1,2012-09-10,LAS,DEN,14.285714,9.466734,2012.0,9.0,10.0,0.0,37.0,15593.0,253.0,11.316519,11.163944,11.282255,10.451326
2,2012-10-05,DEN,LAX,10.863636,9.035883,2012.0,10.0,5.0,4.0,40.0,15618.0,278.0,11.221390,11.243521,11.210159,11.049831
3,2011-10-09,ATL,ORD,11.480000,7.990202,2011.0,10.0,9.0,6.0,40.0,15256.0,282.0,10.812631,11.243521,11.210159,11.300291
4,2012-02-21,DEN,SFO,11.450000,9.517159,2012.0,2.0,21.0,1.0,8.0,15391.0,52.0,11.080302,10.926466,10.950214,10.822171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8898,2012-09-25,DFW,ORD,12.772727,10.641034,2012.0,9.0,25.0,1.0,39.0,15608.0,268.0,11.080302,11.163944,11.254786,11.919010
8899,2012-01-19,SFO,LAS,11.047619,7.908705,2012.0,1.0,19.0,3.0,3.0,15358.0,19.0,11.358502,10.834784,10.926686,10.769626
8900,2013-02-03,ORD,PHL,6.076923,4.030334,2013.0,2.0,3.0,6.0,5.0,15739.0,34.0,10.812631,10.926466,10.829387,11.179790
8901,2011-11-26,DTW,ATL,9.526316,6.167733,2011.0,11.0,26.0,5.0,47.0,15304.0,330.0,9.963934,10.939110,10.183350,10.414953


In [22]:
X_test

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,year,month,day,weekday,week,n_days,day_nb,day_mean,month_mean,week_mean,route_mean
0,2012-05-21,LAS,ORD,12.000000,9.860938,2012.0,5.0,21.0,0.0,21.0,15481.0,141.0,11.316519,11.218335,11.091592,10.678536
1,2012-12-20,SFO,DEN,10.600000,9.954634,2012.0,12.0,20.0,3.0,51.0,15694.0,354.0,11.358502,10.604162,10.464457,10.822171
2,2012-11-01,LGA,DTW,11.950000,9.207977,2012.0,11.0,1.0,3.0,44.0,15645.0,305.0,11.358502,10.939110,10.878689,10.590409
3,2012-01-03,DEN,LAS,11.476190,9.352107,2012.0,1.0,3.0,1.0,1.0,15342.0,3.0,11.080302,10.834784,10.343277,10.451326
4,2012-11-19,LAX,ATL,13.444444,10.363892,2012.0,11.0,19.0,0.0,47.0,15663.0,323.0,11.316519,10.939110,10.183350,11.288425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2222,2012-09-01,ATL,DEN,11.000000,7.937254,2012.0,9.0,1.0,5.0,35.0,15584.0,244.0,9.963934,11.163944,10.791055,10.365239
2223,2012-08-10,ORD,EWR,9.476190,6.903760,2012.0,8.0,10.0,4.0,32.0,15562.0,222.0,11.221390,11.094082,11.097547,11.356683
2224,2012-07-03,ORD,IAH,10.250000,7.107261,2012.0,7.0,3.0,1.0,27.0,15524.0,184.0,11.080302,10.985364,10.325949,10.714472
2225,2012-02-25,DFW,PHL,6.727273,6.388911,2012.0,2.0,25.0,5.0,8.0,15395.0,56.0,9.963934,10.926466,10.950214,10.201987


In [23]:
# print("Testing RMSE: ", mean_squared_error(y_test, X_test['route_mean'], squared=False))

In [24]:
# print("Testing RMSE: ", mean_squared_error(y_train, X_train['route_mean'], squared=False))

## Get day mean

In [25]:
Xy_merged.head()

Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,Passengers,year,month,day,weekday,week,n_days,day_nb
0,2012-06-19,ORD,DFW,12.875,9.812647,12.331296,2012.0,6.0,19.0,1.0,25.0,15510.0,170.0
1,2012-09-10,LAS,DEN,14.285714,9.466734,10.775182,2012.0,9.0,10.0,0.0,37.0,15593.0,253.0
2,2012-10-05,DEN,LAX,10.863636,9.035883,11.083177,2012.0,10.0,5.0,4.0,40.0,15618.0,278.0
3,2011-10-09,ATL,ORD,11.48,7.990202,11.169268,2011.0,10.0,9.0,6.0,40.0,15256.0,282.0
4,2012-02-21,DEN,SFO,11.45,9.517159,11.269364,2012.0,2.0,21.0,1.0,8.0,15391.0,52.0


In [26]:
# print(X_train['DateOfDeparture'][0].dayofyear)

# Xy_merged['day_nb'] = Xy_merged['DateOfDeparture'].dt.dayofyear

# Xy_merged['leap_year_year'] = Xy_merged['year'].apply(
#     lambda x: True if x == 2012 else False)
# Xy_merged['leap_year_month'] = Xy_merged['month'].apply(
#     lambda x: True if x > 2 else False)
# Xy_merged.loc[:, 'leap_year'] = Xy_merged.loc[:, 'leap_year_year'] & Xy_merged.loc[:, 'leap_year_month']

# Xy_merged['day_nb_leap'] = Xy_merged.apply(lambda x: 
#             x.day_nb - 1 if x.leap_year == True else x.day_nb, axis=1)
# Xy_merged.head()

In [27]:
day_nb_list = Xy_merged['day_nb'].unique()
mean_list = []

for day in day_nb_list:
    df = Xy_merged.loc[Xy_merged['day_nb'] == day]
    tuple_ = (day, df['Passengers'].mean())
    mean_list.append(tuple_)

df_means_day_nb = pd.DataFrame(mean_list, columns=['day_nb', 'day_nb_mean'])

merge_transform = MergeTransformer(
    X_ext=df_means_day_nb, 
    how='left',
    on=['day_nb'])

X_train = merge_transform.fit_transform(X_train)

merge_transform = MergeTransformer(
    X_ext=df_means_day_nb, 
    how='left',
    on=['day_nb'])

X_test = merge_transform.fit_transform(X_test)

X_train.head()


Unnamed: 0,DateOfDeparture,Departure,Arrival,WeeksToDeparture,std_wtd,year,month,day,weekday,week,n_days,day_nb,day_mean,month_mean,week_mean,route_mean,day_nb_mean
0,2012-06-19,ORD,DFW,12.875,9.812647,2012.0,6.0,19.0,1.0,25.0,15510.0,170.0,11.080302,11.24098,11.301741,11.91901,11.203774
1,2012-09-10,LAS,DEN,14.285714,9.466734,2012.0,9.0,10.0,0.0,37.0,15593.0,253.0,11.316519,11.163944,11.282255,10.451326,10.768827
2,2012-10-05,DEN,LAX,10.863636,9.035883,2012.0,10.0,5.0,4.0,40.0,15618.0,278.0,11.22139,11.243521,11.210159,11.049831,11.528612
3,2011-10-09,ATL,ORD,11.48,7.990202,2011.0,10.0,9.0,6.0,40.0,15256.0,282.0,10.812631,11.243521,11.210159,11.300291,11.201538
4,2012-02-21,DEN,SFO,11.45,9.517159,2012.0,2.0,21.0,1.0,8.0,15391.0,52.0,11.080302,10.926466,10.950214,10.822171,11.330403


In [28]:
df_means_day_nb.to_csv('../data/day_nb_means.csv')

In [29]:
gewkgweijfew

NameError: name 'gewkgweijfew' is not defined

## Encode Dates and RandomForestRegressor

In [None]:
X_test.drop(['DateOfDeparture', 'weekday', 'month', 'week'], axis=1, inplace=True)
X_train.drop(['DateOfDeparture', 'weekday', 'month', 'week'], axis=1, inplace=True)

In [None]:
X_train_lol = X_train.drop(['Departure', 'Arrival'], axis=1)
X_test_lol = X_test.drop(['Departure', 'Arrival'], axis=1)

In [None]:
X_train_lol.head()
X_test_lol.head()

In [None]:
grid_params = {
    'n_estimators': [100],
    'min_samples_split': [0.001, 0.005, 0.01],
#     'oob_score': [True]
#     'max_features': [0.5, 0.75]
}

gs = GridSearchCV(estimator=RandomForestRegressor(),
                  param_grid=grid_params,
                  n_jobs=-1,
                  cv=5,
                  verbose=0)

gs.fit(X_train_lol, y_train)        
print(gs.best_score_)
print(gs.best_params_) 

In [None]:
print("Training RMSE: ", mean_squared_error(y_train, gs.predict(X_train_lol), squared=False))
print("Testing RMSE: ", mean_squared_error(y_test, gs.predict(X_test_lol), squared=False))

In [None]:
gs.score(X_test_lol, y_test)

In [None]:
os.environ['PATH'] = os.environ['PATH']+';'+os.environ['CONDA_PREFIX']+r"\Library\bin\graphviz"
dot_data = StringIO()
export_graphviz(gs.best_estimator_[0], out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
Image(graph.create_png())