In [1]:
%matplotlib inline
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import problem
from sklearn.preprocessing import FunctionTransformer
pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'rampwf'

In [None]:
# Load the raw training table. The departure date is parsed while reading so
# the column is datetime64 from the start; assigning the parsed values back
# with ``data.loc[:, col] = ...`` writes into the existing object column on
# pandas >= 2.0 and may leave the dtype as ``object``.
data = pd.read_csv(
    os.path.join('data', 'train.csv.bz2'),
    parse_dates=['DateOfDeparture'],
)

In [None]:
# Print the column names, dtypes and non-null counts of the raw table.
data.info()

In [None]:
# Fetch the training data through the local `problem` module; the call returns
# a pair that is unpacked as X and y (presumably features and target --
# TODO confirm against problem.py).
X, y = problem.get_train_data()

In [None]:
from sklearn.preprocessing import FunctionTransformer

def _encode_dates(X):
    # With pandas < 1.0, we wil get a SettingWithCopyWarning
    # In our case, we will avoid this warning by triggering a copy
    # More information can be found at:
    # https://github.com/scikit-learn/scikit-learn/issues/16191
    X_encoded = X.copy()

    # Make sure that DateOfDeparture is of datetime format
    X_encoded.loc[:, 'DateOfDeparture'] = pd.to_datetime(X_encoded['DateOfDeparture'])
    # Encode the DateOfDeparture
    X_encoded.loc[:, 'year'] = X_encoded['DateOfDeparture'].dt.year
    X_encoded.loc[:, 'month'] = X_encoded['DateOfDeparture'].dt.month
    X_encoded.loc[:, 'day'] = X_encoded['DateOfDeparture'].dt.day
    X_encoded.loc[:, 'weekday'] = X_encoded['DateOfDeparture'].dt.weekday
    X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
    X_encoded.loc[:, 'n_days'] = X_encoded['DateOfDeparture'].apply(
        lambda date: (date - pd.to_datetime("1970-01-01")).days
    )
    # Once we did the encoding, we will not need DateOfDeparture
#     return X_encoded.drop(columns=["DateOfDeparture"])
    return X_encoded

# Wrap the encoder in a FunctionTransformer so it exposes fit/transform.
date_encoder = FunctionTransformer(_encode_dates)
# NOTE: this rebinds X to the encoded frame, so re-running the cell feeds the
# already-encoded frame back into the encoder.
X = date_encoder.fit_transform(X)

In [None]:
# Set __file__ by hand to the starting-kit estimator path so the expression
# below can be written relative to it (presumably mirroring how estimator.py
# locates its external data -- TODO confirm).
__file__ = os.path.join('submissions', 'starting_kit', 'estimator.py')
# Path of external_data.csv in the same directory as that estimator file.
filepath = os.path.join(os.path.dirname(__file__), 'external_data.csv')
filepath

In [None]:
class MergeTransformer():
    """Merge an external table onto the feature frame.

    The external table is either passed in directly (``X_ext``) or read from
    ``os.path.join(filepath, filename)`` on every transform. It is optionally
    reduced to ``cols_to_keep``, renamed with ``cols_to_rename`` and then
    joined onto the frame with ``pd.merge(..., how=how, on=on, sort=False)``.

    Parameters
    ----------
    X_ext : pandas.DataFrame or None
        External table to merge. Ignored when ``filename`` is given.
    filename : str or None
        CSV file name inside ``filepath``. Takes precedence over ``X_ext``.
    filepath : str
        Directory containing ``filename``.
    cols_to_keep : 'all' or list
        Columns of the external table to keep (original names).
    cols_to_rename : dict or None
        ``{old: new}`` mapping applied after the column selection.
    how, on
        Forwarded to ``pd.merge``.
    parse_dates : list or None
        Forwarded to ``pd.read_csv`` when reading from ``filename``.

    Attributes
    ----------
    X : pandas.DataFrame
        Frame stored by ``fit``.
    X_ext_ : pandas.DataFrame
        External table as it was actually merged (selected and renamed) in the
        most recent transform. The constructor arguments are never modified,
        so the transformer can be applied any number of times.
    """

    def __init__(self, X_ext=None, filename=None, filepath='submissions/starting_kit/', cols_to_keep='all', cols_to_rename=None, how='left', on=None, parse_dates=None):
        self.X_ext = X_ext
        self.filename = filename
        self.filepath = filepath
        self.cols_to_keep = cols_to_keep
        self.cols_to_rename = cols_to_rename
        self.how = how
        self.on = on
        self.parse_dates = parse_dates

    def read_csv_ramp(self, parse_dates=["Date"]):
        """Read ``filename`` from ``filepath`` and return it as a DataFrame.

        ``parse_dates`` is forwarded to ``pd.read_csv``; ``None`` disables date
        parsing. The default list is never mutated.
        """
        # Build the path locally: storing it back on ``self.filepath`` would
        # append the file name again on every call.
        csv_path = os.path.join(self.filepath, self.filename)
        if parse_dates is not None:
            return pd.read_csv(csv_path, parse_dates=parse_dates)
        return pd.read_csv(csv_path)

    def _prepare_external_data(self):
        """Return the external table with selection and renaming applied."""
        if self.filename is not None:
            ext_data = self.read_csv_ramp(parse_dates=self.parse_dates)
        else:
            ext_data = self.X_ext

        if ext_data is None:
            # pd.merge would raise a TypeError for None as well; fail earlier
            # with a message that names the actual problem.
            raise TypeError(
                "MergeTransformer needs either X_ext or filename to be set"
            )

        # Guard with isinstance so array-like selections (numpy array, pandas
        # Index) are not compared element-wise against the string 'all'.
        keep_all = (
            isinstance(self.cols_to_keep, str) and self.cols_to_keep == 'all'
        )
        if not keep_all:
            ext_data = ext_data[self.cols_to_keep]

        if self.cols_to_rename is not None:
            ext_data = ext_data.rename(columns=self.cols_to_rename)

        return ext_data

    def merge_external_data(self, X=None):
        """Merge the external table onto ``X`` (default: the fitted frame)."""
        frame = self.X if X is None else X
        frame = frame.copy()  # to avoid raising SettingWithCopyWarning

        # Make sure that DateOfDeparture is of dtype datetime. The column is
        # replaced outright because ``frame.loc[:, col] = ...`` keeps an
        # existing object dtype on pandas >= 2.0.
        if "DateOfDeparture" in frame.columns:
            frame["DateOfDeparture"] = pd.to_datetime(frame["DateOfDeparture"])

        # Keep the prepared table separate from ``self.X_ext`` so repeated
        # transforms always start from the unmodified input.
        self.X_ext_ = self._prepare_external_data()

        return pd.merge(
            frame, self.X_ext_, how=self.how, on=self.on, sort=False
        )

    def fit(self, X, y=None):
        """Remember ``X`` as the frame to merge onto and return ``self``."""
        self.X = X
        return self

    def transform(self, X=None):
        """Return ``X`` (default: the fitted frame) merged with the external table."""
        return self.merge_external_data(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the merged frame."""
        return self.fit(X, y).transform(X)

In [None]:
# Join the 'Max TemperatureC' column of the external data onto X, matching the
# external Date/AirPort columns to DateOfDeparture/Arrival.
merge_transform = MergeTransformer(
    # source of the external table
    filepath='submissions/starting_kit/',
    filename='external_data.csv',
    parse_dates=['Date'],
    X_ext=None,
    # column selection and mapping onto X's column names
    cols_to_keep=['Date', 'AirPort', 'Max TemperatureC'],
    cols_to_rename={'Date': 'DateOfDeparture', 'AirPort': 'Arrival'},
    # join definition
    on=['DateOfDeparture', 'Arrival'],
    how='left',
)

X = merge_transform.fit_transform(X)

In [None]:
# Airport reference table: coordinates, IATA code and ISO region per airport.
coordinates_data = pd.read_csv('data/list-of-airports-in-united-states-of-america-hxl-tags-1.csv', index_col=0)

# Turn 'US-XX' region codes into the bare state code. ``str.strip('US-')``
# must not be used here: it strips any of the characters 'U', 'S' and '-' from
# BOTH ends, so 'US-SC' would become 'C' and 'US-MS' would become 'M'.
coordinates_data['iso_region'] = coordinates_data['iso_region'].str.replace(
    r'^US-', '', regex=True
)

# Coordinates and state of the departure airport.
merge_transform = MergeTransformer(
    X_ext=coordinates_data,
    filename=None,
    filepath=None,
    cols_to_keep=['latitude_deg', 'longitude_deg', 'iata_code', 'iso_region'],
    cols_to_rename={'iata_code': 'Departure',
                    'latitude_deg': 'latitude_departure',
                    'longitude_deg': 'longitude_departure',
                    'iso_region': 'state'},
    how='left',
    on=['Departure'],
    parse_dates=None)

X = merge_transform.fit_transform(X)

# Coordinates of the arrival airport.
merge_transform = MergeTransformer(
    X_ext=coordinates_data,
    filename=None,
    filepath=None,
    cols_to_keep=['latitude_deg', 'longitude_deg', 'iata_code'],
    cols_to_rename={'iata_code': 'Arrival',
                    'latitude_deg': 'latitude_arrival',
                    'longitude_deg': 'longitude_arrival'},
    how='left',
    on=['Arrival'],
    parse_dates=None)

X = merge_transform.fit_transform(X)

import geopy.distance

# Geodesic distance in kilometres between the two airports of each row.
X['distance'] = X.apply(
    lambda row: geopy.distance.geodesic(
        (row.latitude_departure, row.longitude_departure),
        (row.latitude_arrival, row.longitude_arrival),
    ).km,
    axis=1,
)

# Show a preview only; displaying the whole frame bloats the notebook output.
X.head()

In [None]:
# Join the 'value' column of data/oil_price.csv onto X as 'OilPrice', keyed on
# the departure date.
merge_transform = MergeTransformer(
    # source of the external table
    filepath='data/',
    filename='oil_price.csv',
    parse_dates=['date'],
    X_ext=None,
    # column selection and mapping onto X's column names
    cols_to_keep=['date', 'value'],
    cols_to_rename={'date': 'DateOfDeparture', 'value': 'OilPrice'},
    # join definition
    on=['DateOfDeparture'],
    how='left',
)

X = merge_transform.fit_transform(X)

In [None]:
import holidays

# Exploration: build the 2011 US holiday calendar for California and list it.
# NOTE(review): recent `holidays` releases name this argument `subdiv` rather
# than `state` -- confirm against the installed version.
us_holidays = holidays.US(years=2011, state='CA')
# Each item is a key/value pair (presumably date -> holiday name).
for key, value in us_holidays.items():
    print(f"key = {key}, value = {value}")

In [None]:
# Preview the first rows of X before adding the holiday features.
X.head()

In [None]:
states = X.loc[:, 'state'].unique()
years = [2011, 2012, 2013]

# Flag rows whose departure date is a public holiday in the departure state.
# Building a `holidays.US` calendar is costly, so build one per distinct
# (year, state) pair and reuse it, instead of constructing a new calendar for
# every row.
_calendar_keys = list(zip(X['year'], X['state']))
_us_calendars = {
    key: holidays.US(years=int(key[0]), state=key[1])
    for key in set(_calendar_keys)
}

X['bank_holidays'] = [
    departure_date in _us_calendars[key]
    for departure_date, key in zip(X['DateOfDeparture'], _calendar_keys)
]

In [None]:
# Read the semicolon-separated holidays table, parsing its 'date' column.
school_holidays = pd.read_csv('data/holidays.csv', sep=';', parse_dates=['date'])

In [None]:
# Join the 'is_vacation' flag of the school holidays table onto X as
# 'school_holidays', keyed on the departure date.
merge_transform = MergeTransformer(
    # source of the external table (already loaded, nothing read from disk)
    X_ext=school_holidays,
    filepath=None,
    filename=None,
    parse_dates=None,
    # column selection and mapping onto X's column names
    cols_to_keep=['date', 'is_vacation'],
    cols_to_rename={'date': 'DateOfDeparture', 'is_vacation': 'school_holidays'},
    # join definition
    on=['DateOfDeparture'],
    how='left',
)

X = merge_transform.fit_transform(X)

In [None]:
# Combine both holiday flags into a single boolean feature.
# The school holiday flag comes from a left merge, so dates missing from the
# holidays table are NaN; treat those as "not a holiday" and force a real
# boolean dtype before OR-ing (NaN would otherwise break or corrupt the `|`).
_school_flag = X['school_holidays'].fillna(False).astype(bool)
X['holidays'] = X['bank_holidays'].astype(bool) | _school_flag

# Drop the two source columns by rebinding X rather than mutating in place.
X = X.drop(columns=['bank_holidays', 'school_holidays'])

In [None]:
# Preview X after the two holiday columns were merged into 'holidays'.
X.head()

In [None]:
# Load the semicolon-separated, UTF-8 encoded airport passengers table and
# preview its first rows.
airports_rank = pd.read_csv('data/airports_passengers.csv', sep=';', encoding = "utf-8")
airports_rank.head()