In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Merge base input and feature data

- Reads the base model input (`base_file`)
- Reads one or more feature files and inner-joins them on `id`
- Inner-joins the base model input with the combined features on `id`
- Writes the merged data to a single CSV file (`output_file`)

### Parameters

-------------------
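- `base_file`: Filepath of the base model input; must contain at least the column `id`
- `features`: List of feature files, or a single string of feature file paths separated by `+`; every feature file must contain the column `id`
- `output_file`: Filepath of the CSV file the merged data is written to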


### Returns

-----------------

Output CSV file with the extended model input: the base input joined with all feature columns.


          id             |  aircraftRegistration   |  airlineCode   |  terminal   |  ...   |  year   | ...
    123414481790510775   |         PHPXB           |     148.0     |     NaN      | ...    |  2018   | ...
    123414479288269149   |         PHHSJ           |     164.0     |     1.0      | ...    |  2018   | ...
    123414479666542945   |         PHHSG           |     100.0     |     1.0      | ...    |  2018   | ...
    123414479288365061   |         PHHSG           |     164.0     |     1.0      | ...    |  2018   | ...
    123414479288274329   |         PHHXB           |     164.0     |     1.0      | ...    |  2018   | ...


## File parameters

In [None]:
# input parameters cell
# Default values for the notebook inputs.
# NOTE(review): the "parameters cell" marker suggests these values may be
# overridden by an external runner — confirm how the notebook is invoked.

# Base model input; joined with the feature data on column "id" further down.
base_file = "../lvt-schiphol-assignment-snakemake/data/model_input/delays_base_input.csv"
# Feature files, merged one by one on "id". May also be supplied as a single
# string with the paths separated by '+' (it is split into a list in a later cell).
features = [
    "../lvt-schiphol-assignment-snakemake/data/model_input/features/route_destinations.csv",
    "../lvt-schiphol-assignment-snakemake/data/model_input/features/schedule_time_features.csv"
]


# Destination of the merged CSV written at the end of the notebook.
output_file = "../lvt-schiphol-assignment-snakemake/data/model_input/delays_extended_input.csv"

In [None]:
# `features` may arrive as a single '+'-separated string instead of a list
# (e.g. "a.csv+b.csv"); normalise it to a list of file paths.
if isinstance(features, str):
    # Strip surrounding whitespace and drop empty entries so that inputs such
    # as "a.csv + b.csv" or a trailing '+' do not produce unusable file paths.
    features = [path.strip() for path in features.split('+') if path.strip()]
    print("Parsed features from string instead of List object")
    print(features)

In [None]:
# Column names to ignore.
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
columns_to_ignore = [
    "scheduleDateTime",
    "scheduleDate",
    "scheduleTime",
    "actualOffBlockTime",
]

## Libraries

In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append("../")

from src.data.google_storage_io import read_csv_data, write_csv_data

## Read data

In [None]:
%%time
# Load the base model input from `base_file` (the cell is timed by %%time).
df_base = read_csv_data(base_file)
# Show the first rows as the cell output for a quick visual check.
df_base.head()

In [None]:
%%time

# Read the feature data from every file in `features` and combine the files
# into a single frame by inner-joining them on 'id'.

# Fail early with a clear message rather than an IndexError on `features[0]`.
if not features:
    raise ValueError(
        "No feature files given: 'features' must contain at least one file path"
    )

print(f"Reading features from first file: {features[0]}")
df_features = read_csv_data(features[0])

# Slicing from index 1 yields nothing when only one file is given, so no
# extra length check is needed around the loop.
for feature_file in features[1:]:
    print(f"Merging features from file: {feature_file}")
    old_shape = df_features.shape
    next_features = read_csv_data(feature_file)
    # Inner join: only ids present in every feature file read so far are kept.
    df_features = pd.merge(
        df_features,
        next_features,
        on="id",
        how="inner"
    )
    print(f"Merged features. Shape {old_shape} -> {df_features.shape}")
df_features.head()

## Merge base model input with features

- One large file to pass on to the model notebooks

Downside: One large file with a lot of copied values

Upside: Easier to verify downstream model notebooks use the same data

In [None]:
# Attach the feature columns to the base model input. The inner join on "id"
# keeps only rows whose id occurs in both frames.
df_output = df_base.merge(df_features, on="id", how="inner")
print(f"Data shape: {df_output.shape}")
df_output.head()

## Write output to CSV

Both local and Google Storage destinations are handled.

In [None]:
# Write the merged frame to `output_file`.
# NOTE(review): `index=False` presumably keeps the DataFrame index out of the
# CSV — confirm in `write_csv_data`.
write_csv_data(df_output, output_file, index=False)

### Overview of the output data

In [None]:
# Print a summary of the merged frame (columns, dtypes, non-null counts) as a
# final sanity check of what was written.
df_output.info()

In [None]:
# NOTE: everything below is disabled exploratory plotting code (distribution
# of flight counts per aircraft, and scheduleDelaySeconds over
# scheduleDateTime for aircraft with more than 1000 flights). It refers to
# `df_model_input`, which is not defined in any cell visible in this notebook,
# so it would need adapting (possibly to `df_output`) before being re-enabled.

# from collections import Counter
# import matplotlib.pyplot as plt

# aircraft_flight_counts = Counter(df_model_input["aircraftRegistration"])
# import seaborn as sns
# sns.distplot(list(aircraft_flight_counts.values()))
# plt.show()

# df_aircraft_groups = df_model_input.groupby("aircraftRegistration")
# for group, group_data in list(df_aircraft_groups):
#     if aircraft_flight_counts[group] > 1000:
#         group_data.plot(x='scheduleDateTime', y='scheduleDelaySeconds')
#         plt.show()

# # df_aircraft_groups