In [1]:
import pandas as pd
import geopy.distance
import problem
from merge_transformer import MergeTransformer
from create_db import create_db
import os

In [2]:
# Load the raw train/test splits through the project's `problem` module and
# parse the departure date column into real datetimes.
#
# The column is replaced via `assign` rather than `X.loc[:, col] = ...`:
# on pandas >= 2.0 a full-column `.loc` assignment writes the new values into
# the existing array, so the column would keep its original (object) dtype
# instead of becoming datetime64. `assign` always rebinds the column, and it
# returns a new frame so the object returned by `problem` is left untouched.
# NOTE(review): the external tables are later merged on 'DateOfDeparture';
# presumably their key is datetime64 too -- confirm in create_db.
X_train, y_train = problem.get_train_data('..')
X_train = X_train.assign(DateOfDeparture=pd.to_datetime(X_train['DateOfDeparture']))

X_test, y_test = problem.get_test_data('..')
X_test = X_test.assign(DateOfDeparture=pd.to_datetime(X_test['DateOfDeparture']))

In [3]:
def _merge_airport(X, airports, key):
    """Join the airport table onto ``X`` for one end of the flight.

    Parameters
    ----------
    X : DataFrame-like
        Flight rows containing the column named by ``key``.
    airports : DataFrame-like
        The external airport table (``database['Airport']``).
    key : str
        Column of ``X`` holding the airport code: ``'Departure'`` or
        ``'Arrival'``. The renamed airport columns get the lower-cased key
        as suffix (e.g. ``latitude_departure``).

    Returns
    -------
    The result of ``MergeTransformer.fit_transform`` with ``how='inner'``.
    """
    suffix = key.lower()
    merger = MergeTransformer(
        X_ext=airports,
        cols_to_rename={'iata': key,
                        'latitude_deg': 'latitude_' + suffix,
                        'longitude_deg': 'longitude_' + suffix,
                        'state': 'state_' + suffix,
                        'capacity': 'capacity_' + suffix},
        how='inner', on=[key])
    return merger.fit_transform(X)


def merge_dfs(X, database=None):
    """Enrich the flight frame ``X`` with the external tables.

    Steps, in order:

    1. merge ``database['Date']`` on ``DateOfDeparture`` (``how='left'``);
    2. merge ``database['Airport']`` twice (``how='inner'``), once keyed on
       ``Departure`` and once on ``Arrival``;
    3. add a ``distance`` column: geodesic distance in kilometres between
       the departure and arrival coordinates;
    4. merge ``database['Weather']`` on ``DateOfDeparture`` and ``Arrival``
       (``how='left'``);
    5. merge selected columns of ``database['StateFeatures']`` on
       ``DateOfDeparture`` and ``state_departure`` (``how='left'``).

    Parameters
    ----------
    X : DataFrame-like
        Must contain ``DateOfDeparture``, ``Departure`` and ``Arrival``.
    database : mapping, optional
        Pre-built external tables, indexable by ``'Date'``, ``'Airport'``,
        ``'Weather'`` and ``'StateFeatures'``. When omitted, ``create_db()``
        is called, which is the original behaviour. Passing it lets callers
        build the tables once and reuse them for several frames.
        NOTE(review): reuse is only safe if MergeTransformer does not modify
        ``X_ext`` in place -- confirm before sharing one database.

    Returns
    -------
    The merged frame.

    Notes
    -----
    NOTE(review): the airport merges use ``how='inner'``; presumably rows
    whose airport code is missing from the airport table are dropped. The
    target vector is not filtered here, so confirm that no rows are lost
    (e.g. compare lengths before and after).
    """
    if database is None:
        database = create_db()

    date_merger = MergeTransformer(
        X_ext=database['Date'], how='left', on=['DateOfDeparture'])
    X = date_merger.fit_transform(X)

    # Same airport table joined twice, once per end of the route.
    X = _merge_airport(X, database['Airport'], 'Departure')
    X = _merge_airport(X, database['Airport'], 'Arrival')

    # Row-wise because geopy works on one pair of points at a time.
    X['distance'] = X.apply(lambda x: geopy.distance.geodesic(
        (x.latitude_departure, x.longitude_departure),
        (x.latitude_arrival, x.longitude_arrival)).km, axis=1)

    # Weather is attached for the arrival airport on the departure date.
    weather_merger = MergeTransformer(
        X_ext=database['Weather'],
        cols_to_rename={'Date': 'DateOfDeparture', 'AirPort': 'Arrival'},
        how='left',
        on=['DateOfDeparture', 'Arrival'])
    X = weather_merger.fit_transform(X)

    # State-level features are attached for the departure state only.
    state_merger = MergeTransformer(
        X_ext=database['StateFeatures'],
        cols_to_keep=['DateOfDeparture', 'state', 'UnemploymentRate', 'holidays'],
        cols_to_rename={'state': 'state_departure'},
        how='left',
        on=['DateOfDeparture', 'state_departure'])
    X = state_merger.fit_transform(X)

    return X

In [4]:
# Enrich the training features with the external tables via merge_dfs.
# The result is bound back to X_train, so re-running this cell alone would
# pass the already-merged frame through merge_dfs a second time; re-run the
# loading cell first if this cell needs to be executed again.
X_train = merge_dfs(X_train)

In [5]:
# Apply the same external-table merges to the test features.
# The result is bound back to X_test, so re-running this cell alone would
# pass the already-merged frame through merge_dfs a second time; re-run the
# loading cell first if this cell needs to be executed again.
X_test = merge_dfs(X_test)

In [6]:
# Write the merged feature frames to CSV without the index column.
# Destination path -> frame; train is written first, then test.
feature_outputs = {
    '../data/X_train.csv': X_train,
    '../data/X_test.csv': X_test,
}
for csv_path, frame in feature_outputs.items():
    frame.to_csv(csv_path, index=False)

In [7]:
# Wrap the target arrays in Series so they can be written with to_csv.
y_train_pd, y_test_pd = (pd.Series(target) for target in (y_train, y_test))

# Write the targets to CSV without the index column; train first, then test.
for y_series, csv_path in ((y_train_pd, '../data/y_train.csv'),
                           (y_test_pd, '../data/y_test.csv')):
    y_series.to_csv(csv_path, index=False)