In [1]:
import pandas as pd
import geopy.distance
import problem
from merge_transformer import MergeTransformer
from sklearn.preprocessing import FunctionTransformer

from create_db import create_db, _encode_dates
import os

In [2]:
# Load the train split and parse the departure dates.
# The parsed dates are assigned as a whole new column instead of through
# `.loc[:, col] = ...`: pandas >= 2.0 writes `.loc[:, col]` into the existing
# column in place, which can leave an object-dtype column, while the date
# arithmetic in `merge_dfs` subtracts a timedelta from this column.
X_train, y_train = problem.get_train_data('..')
X_train['DateOfDeparture'] = pd.to_datetime(X_train['DateOfDeparture'])

# Same treatment for the test split.
X_test, y_test = problem.get_test_data('..')
X_test['DateOfDeparture'] = pd.to_datetime(X_test['DateOfDeparture'])

In [3]:
def clean_df(X):
    """Encode the departure and booking dates, then drop the raw columns.

    The frame is expected to be the result of the merges done in
    ``merge_dfs``: it must contain ``DateOfDeparture`` and ``DateBooked`` as
    well as the ``Departure``, ``Arrival``, ``Days_to_departure``,
    ``state_departure`` and ``state_arrival`` columns removed at the end.

    No ``inplace=True`` operation is applied, so this function does not itself
    modify the frame it receives.

    Parameters
    ----------
    X : pandas.DataFrame
        Merged feature frame.

    Returns
    -------
    pandas.DataFrame
        Frame whose generated date columns carry a ``_departure`` or
        ``_booked`` suffix and from which the raw columns are removed.

    Raises
    ------
    KeyError
        If one of the columns to drop is missing from the frame.
    """
    # NOTE(review): `_encode_dates` (imported from create_db) presumably reads
    # `DateOfDeparture` and adds these six columns -- the renames below rely
    # on exactly these names; confirm against create_db.
    date_parts = ('year', 'month', 'day', 'weekday', 'week', 'n_days')

    def _encode_with_suffix(frame, suffix):
        # Run the date encoder, then tag the generated columns with `suffix`.
        encoded = FunctionTransformer(_encode_dates).fit_transform(frame)
        return encoded.rename(
            columns={part: part + '_' + suffix for part in date_parts})

    # First pass: encode the departure date.
    X = _encode_with_suffix(X, 'departure')

    # Put the booking date under the `DateOfDeparture` name so that the second
    # encoder pass processes it (the encoder is applied to the same column name).
    X = (X.drop(columns=['DateOfDeparture'])
          .rename(columns={'DateBooked': 'DateOfDeparture'}))

    # Second pass: encode the booking date.
    X = _encode_with_suffix(X, 'booked')

    # Raw date, identifier and helper columns that are not kept as features.
    raw_columns = ['DateOfDeparture', 'Departure', 'Arrival',
                   'Days_to_departure', 'state_departure', 'state_arrival']
    return X.drop(columns=raw_columns)

In [4]:
def merge_dfs(database, X):
    """Join the external tables of ``database`` onto ``X`` and clean the result.

    Neither ``X`` nor ``database`` is modified by this function itself: the
    derived columns are added to a copy of ``X`` and the state table is
    renamed on a local copy.

    Parameters
    ----------
    database : mapping
        Must provide the tables stored under the keys ``'Date'``,
        ``'Airport'``, ``'AirportStatistics'`` and ``'StateFeatures'``.
    X : pandas.DataFrame
        Must contain ``DateOfDeparture`` (supporting subtraction of a
        timedelta), ``WeeksToDeparture``, ``Departure`` and ``Arrival``.

    Returns
    -------
    pandas.DataFrame
        The merged frame after it has been passed through ``clean_df``.
    """
    # Work on a copy so the caller's frame does not gain the helper columns.
    X = X.copy()

    # Booking date = departure date minus the (rounded) number of days implied
    # by WeeksToDeparture.
    X['Days_to_departure'] = (X['WeeksToDeparture'] * 7).round()
    X['DateBooked'] = X['DateOfDeparture'] - pd.to_timedelta(X['Days_to_departure'], unit='d')

    # NOTE(review): MergeTransformer is project code; it presumably renames
    # X_ext with `cols_to_rename`, restricts it to `cols_to_keep` and joins it
    # onto X using `how` / `on` -- confirm in merge_transformer.

    # 'Date' table, keyed on the booking date.
    X = MergeTransformer(
        X_ext=database['Date'],
        how='left',
        cols_to_rename={'DateOfDeparture': 'DateBooked'},
        cols_to_keep=['DateBooked', 'oil_stock_price', 'oil_stock_volume', 'AAL_stock_price', 'AAL_stock_volume', 'SP_stock_price', 'SP_stock_volume'],
        on=['DateBooked']).fit_transform(X)

    # 'Airport' table, once per end of the route (departure first, then arrival).
    for endpoint in ('Departure', 'Arrival'):
        suffix = endpoint.lower()
        X = MergeTransformer(
            X_ext=database['Airport'],
            cols_to_rename={'iata': endpoint,
                            'latitude_deg': 'latitude_' + suffix,
                            'longitude_deg': 'longitude_' + suffix,
                            'state': 'state_' + suffix,
                            'capacity': 'capacity_' + suffix,
                            'pop2010': 'population_' + suffix},
            how='left', on=[endpoint]).fit_transform(X)

    # Geodesic distance in km between the two airports. Iterating over the
    # zipped coordinate columns avoids a row-wise DataFrame.apply and also
    # yields an empty column for an empty frame.
    X['distance'] = [
        geopy.distance.geodesic((lat_dep, lon_dep), (lat_arr, lon_arr)).km
        for lat_dep, lon_dep, lat_arr, lon_arr in zip(
            X['latitude_departure'], X['longitude_departure'],
            X['latitude_arrival'], X['longitude_arrival'])]

    # 'AirportStatistics' table, once per end of the route, keyed on the
    # departure date and the airport code.
    for endpoint in ('Departure', 'Arrival'):
        X = MergeTransformer(
            X_ext=database['AirportStatistics'],
            cols_to_rename={'Date': 'DateOfDeparture', 'AirPort': endpoint},
            how='left',
            on=['DateOfDeparture', endpoint]).fit_transform(X)

    # Rename on a local copy: the shared `database` entry is left untouched,
    # so repeated calls always start from the same table.
    state_features = database['StateFeatures'].rename(
        columns={'Abbreviation': 'state_departure'})

    # State-level features at the departure date.
    X = MergeTransformer(
        X_ext=state_features,
        cols_to_keep=['DateOfDeparture', 'state_departure', 'holidays', 'GDP_per_cap'],
        how='left',
        on=['DateOfDeparture', 'state_departure']).fit_transform(X)

    # Unemployment rate at the booking date.
    X = MergeTransformer(
        X_ext=state_features,
        cols_to_keep=['DateBooked', 'UnemploymentRateBooked'],
        cols_to_rename={'DateOfDeparture': 'DateBooked', 'UnemploymentRate': 'UnemploymentRateBooked'},
        how='left',
        on=['DateBooked', 'state_departure']).fit_transform(X)

    # Encode the dates and drop the raw helper columns.
    return clean_df(X)

In [5]:
# Build the external tables; `merge_dfs` reads the entries stored under
# 'Date', 'Airport', 'AirportStatistics' and 'StateFeatures'.
database = create_db()

oil col Index(['Date', 'Close', 'Volume'], dtype='object')
Index(['rank', 'name', 'usps', 'pop2020', 'pop2010', 'change', 'density',
       'aland'],
      dtype='object')



Index(['iata', 'municipality', 'latitude_deg', 'longitude_deg', 'state'], dtype='object')
['pop2010', 'municipality']
['UnemploymentRate', 'year', 'month', 'State']
['State', 'GDP_per_cap']
['DateOfDeparture', 'school_holidays']


In [6]:
# `DataFrame.info()` prints its summary itself and returns None, so wrapping
# it in print() only adds a stray "None" line to the output.
X_train.info()

# Enrich the training features with the external tables.
X_train = merge_dfs(database, X_train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8902 entries, 0 to 8901
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   DateOfDeparture   8902 non-null   datetime64[ns]
 1   Departure         8902 non-null   object        
 2   Arrival           8902 non-null   object        
 3   WeeksToDeparture  8902 non-null   float64       
 4   std_wtd           8902 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 347.9+ KB
None
['DateBooked', 'oil_stock_price', 'oil_stock_volume', 'AAL_stock_price', 'AAL_stock_volume', 'SP_stock_price', 'SP_stock_volume']
['DateOfDeparture', 'state_departure', 'holidays', 'GDP_per_cap']
In if if :  ['DateBooked', 'UnemploymentRateBooked']
['DateBooked', 'UnemploymentRateBooked', 'state_departure']


In [7]:
# Apply the same merge-and-clean pipeline to the test features.
X_test = merge_dfs(database, X_test)

['DateBooked', 'oil_stock_price', 'oil_stock_volume', 'AAL_stock_price', 'AAL_stock_volume', 'SP_stock_price', 'SP_stock_volume']
['DateOfDeparture', 'state_departure', 'holidays', 'GDP_per_cap']
In if if :  ['DateBooked', 'UnemploymentRateBooked']
['DateBooked', 'UnemploymentRateBooked', 'state_departure']


In [8]:
# Persist both processed feature frames as CSV, without the index column.
for frame, csv_path in ((X_train, '../data/X_train.csv'),
                        (X_test, '../data/X_test.csv')):
    frame.to_csv(csv_path, index=False)

In [9]:
# Wrap both target arrays in Series so they can be written with `to_csv`.
y_train_pd, y_test_pd = pd.Series(y_train), pd.Series(y_test)

# Store the targets next to the feature files, without the index column.
y_train_pd.to_csv('../data/y_train.csv', index=False)
y_test_pd.to_csv('../data/y_test.csv', index=False)