In [8]:
%matplotlib inline
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import problem
from sklearn.preprocessing import FunctionTransformer
pd.set_option('display.max_columns', None)

In [4]:
data = pd.read_csv(
    os.path.join('data', 'train.csv.bz2')
)
data.loc[:, 'DateOfDeparture'] = pd.to_datetime(data.loc[:, 'DateOfDeparture'])

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8902 entries, 0 to 8901
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   DateOfDeparture   8902 non-null   datetime64[ns]
 1   Departure         8902 non-null   object        
 2   Arrival           8902 non-null   object        
 3   WeeksToDeparture  8902 non-null   float64       
 4   log_PAX           8902 non-null   float64       
 5   std_wtd           8902 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 417.4+ KB


In [2]:
X, y = problem.get_train_data()

In [11]:
from sklearn.preprocessing import FunctionTransformer

def _encode_dates(X):
    # With pandas < 1.0, we wil get a SettingWithCopyWarning
    # In our case, we will avoid this warning by triggering a copy
    # More information can be found at:
    # https://github.com/scikit-learn/scikit-learn/issues/16191
    X_encoded = X.copy()

    # Make sure that DateOfDeparture is of datetime format
    X_encoded.loc[:, 'DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
    # Encode the DateOfDeparture
    X_encoded.loc[:, 'year'] = X_encoded['DateOfDeparture'].dt.year
    X_encoded.loc[:, 'month'] = X_encoded['DateOfDeparture'].dt.month
    X_encoded.loc[:, 'day'] = X_encoded['DateOfDeparture'].dt.day
    X_encoded.loc[:, 'weekday'] = X_encoded['DateOfDeparture'].dt.weekday
    X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
    X_encoded.loc[:, 'n_days'] = X_encoded['DateOfDeparture'].apply(
        lambda date: (date - pd.to_datetime("1970-01-01")).days
    )
    # Once we did the encoding, we will not need DateOfDeparture
    return X_encoded.drop(columns=["DateOfDeparture"])

date_encoder = FunctionTransformer(_encode_dates(X))

  X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week


In [13]:
__file__ = os.path.join('submissions', 'starting_kit', 'estimator.py')
filepath = os.path.join(os.path.dirname(__file__), 'external_data.csv')
filepath

'submissions/starting_kit/external_data.csv'

In [111]:
from sklearn.base import BaseEstimator, TransformerMixin

class MergeTransformer(FunctionTransformer):
    """Custom scaling transformer"""
    
    def read_csv_ramp(self, parse_dates=["Date"]):
        self.filepath = os.path.join(
            self.filepath, self.filename
        )
        
        data = pd.read_csv(os.path.join('data', 'train.csv.bz2'))
        if parse_dates is not None:
            ext_data = pd.read_csv(self.filepath, parse_dates=parse_dates)
        else:
            ext_data = pd.read_csv(self.filepath)
        return ext_data
    
    def merge_external_data(self):
#         print('on', on) 
#         print('X_ext', X_ext) 
#         print('cols_to_keep', cols_to_keep)
#         print('cols_to_rename', cols_to_rename)

        X = self.X.copy()  # to avoid raising SettingOnCopyWarning
        # Make sure that DateOfDeparture is of dtype datetime
        X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])

        if not(self.filename is None):
            self.X_ext = self.read_csv_ramp(parse_dates=self.parse_dates)

        if self.cols_to_keep != 'all':
            self.X_ext = self.X_ext[self.cols_to_keep]

        if self.cols_to_rename != None:
            self.X_ext = self.X_ext.rename(columns=self.cols_to_rename)

        X_merged = pd.merge(
            X, self.X_ext, how=self.how, on=self.on, sort=False
        )
        return X_merged

    
    def __init__(self, X_ext=None, filename=None, filepath='submissions/starting_kit/', cols_to_keep='all', cols_to_rename=None, how='left', on=None, parse_dates=None):
#         super().__init__(func)
        self.X_ext = X_ext
        self.filename = filename
        self.filepath = filepath
        self.cols_to_keep = cols_to_keep
        self.cols_to_rename = cols_to_rename
        self.how = how
        self.on = on
        self.parse_dates = parse_dates
        
    def fit_transform(self, X):
        self.fit(X)
        return self.transform()

    def fit(self, X):
        self.X = X

    def transform(self):
        return self.merge_external_data()

In [112]:
merge_transform = MergeTransformer(
    X_ext=None, 
    filename='external_data.csv',
    filepath='submissions/starting_kit/',
    cols_to_keep=['Date', 'AirPort', 'Max TemperatureC'], 
    cols_to_rename={'Date': 'DateOfDeparture', 'AirPort': 'Arrival'}, 
    how='left',
    on=['DateOfDeparture', 'Arrival'],
    parse_dates=['Date'])

X_transformed = merge_transform.fit_transform(X)
print(X_transformed)

ValueError: You are trying to merge on datetime64[ns] and object columns. If you wish to proceed you should use pd.concat

In [110]:
coordinates_data = pd.read_csv('data/list-of-airports-in-united-states-of-america-hxl-tags-1.csv', index_col=0)
# print(coordinates_data.columns)
coordinates_data[coordinates_data.loc[:, 'iata_code'] == 'ORD']
# print(coordinates_data.loc[:, 'ident'])

merge_transform = MergeTransformer(
    X_ext=None, 
    filename='list-of-airports-in-united-states-of-america-hxl-tags-1.csv',
    filepath='data/',
    cols_to_keep=['lattitude_deg', 'longitude_deg', 'iata_code'], 
    cols_to_rename=None, 
    how='left',
    on=['Departure', 'iata_code'])

X_transformed_dep = merge_transform.fit_transform(X)
X_transformed_dep

ValueError: Missing column provided to 'parse_dates': 'Date'

In [63]:
# merge_transform = MergeTransformer(
#     X_ext=None, 
#     filename='list-of-airports-in-united-states-of-america-hxl-tags-1.csv',
#     filepath='data/',
#     cols_to_keep=['Date', 'AirPort', 'Max TemperatureC'], 
#     cols_to_rename={'Date': 'DateOfDeparture', 'AirPort': 'Arrival'}, 
#     how='left',
#     on=['DateOfDeparture', 'Arrival'])

# X_transformed = merge_transform.fit_transform(X)
# print(X_transformed)

     DateOfDeparture Departure Arrival  WeeksToDeparture    std_wtd  \
0         2012-06-19       ORD     DFW         12.875000   9.812647   
1         2012-09-10       LAS     DEN         14.285714   9.466734   
2         2012-10-05       DEN     LAX         10.863636   9.035883   
3         2011-10-09       ATL     ORD         11.480000   7.990202   
4         2012-02-21       DEN     SFO         11.450000   9.517159   
...              ...       ...     ...               ...        ...   
8897      2011-10-02       DTW     ATL          9.263158   7.316967   
8898      2012-09-25       DFW     ORD         12.772727  10.641034   
8899      2012-01-19       SFO     LAS         11.047619   7.908705   
8900      2013-02-03       ORD     PHL          6.076923   4.030334   
8901      2011-11-26       DTW     ATL          9.526316   6.167733   

      Max TemperatureC  
0                   34  
1                   33  
2                   22  
3                   27  
4                   16