In [7]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create features from destinations

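- Takes the flights data and the airports lookup table
- Parses each flight's route into a list of destinations and derives the route length and the first and final destination
- Joins the final destination to the airports table (country, city, coordinates, altitude, DST) and computes its distance to Schiphol
- Writes one row of final-destination features per flight to CSV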

### Parameters

-------------------
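- `flights_file`: Filepath of flights data in format received from Schiphol
- `airports_file`: Filepath of the airports lookup table (IATA code, country, city, coordinates, altitude, DST)
- `output_file`: Filepath to write the output CSV file with the final-destination features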


### Returns

-----------------

Output CSV file with the final-destination features of each flight

    
    id                   |  final_destination  |  Country         |  City                    |  Latitude   |  Longitude  |  Altitude  |  DST  |  destination_distance
    123414478192901837   |  AMS                |  Netherlands     |  Amsterdam               |  52.308601  |  4.76389    |  -11.0     |  E    |  4.338444e-07
    123414479288269149   |  SPC                |  Spain           |  Santa Cruz De La Palma  |  28.626499  |  -17.7556   |  107.0     |  E    |  3.267980e+01
    124896773782315507   |  MAN                |  United Kingdom  |  Manchester              |  53.353699  |  -2.27495   |  257.0     |  E    |  7.116003e+00
    124896773782912169   |  SEN                |  United Kingdom  |  Southend                |  51.571400  |  0.695556   |  49.0      |  E    |  4.134587e+00
    124896745325235371   |  DUB                |  Ireland         |  Dublin                  |  53.421299  |  -6.27007   |  242.0     |  E    |  1.108992e+01


# File parameters

In [8]:
# input parameters cell
# raw flights export; the "id" and "route.destinations" columns are used below
flights_file = "../lvt-schiphol-assignment-snakemake/data/raw/flights.csv"
# airports lookup table; joined to the flights on its "IATA" column
airports_file = "../lvt-schiphol-assignment-snakemake/data/raw/airports.csv"
# where the per-flight final-destination features are written at the end of the notebook
output_file = "processed_flights.csv"

## Libraries

In [9]:
import ast
import sys

import numpy as np
import pandas as pd

sys.path.append("../")

from src.data.google_storage_io import read_csv_data, write_csv_data

## Read data

In [11]:
%%time
# flights table: supplies the "id" and "route.destinations" columns used for the route features
df_flights = read_csv_data(flights_file)
# airports lookup table: supplies location attributes per IATA code for the destination join
df_airports = read_csv_data(airports_file)

Reading file from local directory
File:	../lvt-schiphol-assignment-snakemake/data/raw/flights.csv

Reading file from local directory
File:	../lvt-schiphol-assignment-snakemake/data/raw/airports.csv

Wall time: 1.8 s


# Destination features

- Parse the route to the destination into a column of lists
- Get the first and final destination and the number of destinations

In [17]:
%%time

# Parse the stringified route (e.g. "['AMS']") into a real list, then derive its length and
# its first/final stop. ast.literal_eval only accepts Python literals, so a malformed or
# malicious value in the CSV cannot execute code the way the built-in eval() would.
# The .str accessor works element-wise on lists and yields NaN (instead of raising
# IndexError) when a route is empty.
df_routes = (
    df_flights[["id", "route.destinations"]]
    .assign(route_list=lambda d: d["route.destinations"].apply(ast.literal_eval))
    .assign(
        route_length=lambda d: d["route_list"].str.len(),
        first_destination=lambda d: d["route_list"].str[0],
        final_destination=lambda d: d["route_list"].str[-1],
    )
)

# one output column per stop of the longest route; int() turns the NumPy scalar into a
# plain Python int so it can safely be used for range() and list repetition
max_route_length = int(df_routes["route_length"].max())
destination_columns = [f"destination_{i}" for i in range(max_route_length)]

# pad every route with NaN up to the longest route so the lists form a rectangular table
padded_routes = [
    route + [np.nan] * (max_route_length - len(route))
    for route in df_routes["route_list"]
]
df_destinations = pd.DataFrame(
    padded_routes, index=df_routes.index, columns=destination_columns
)

# attach the per-stop columns without mutating the frame in place
df_routes = df_routes.join(df_destinations)
df_routes

Wall time: 4.32 s


Unnamed: 0,id,route.destinations,route_list,route_length,first_destination,final_destination,destination_0,destination_1,destination_2,destination_3,destination_4
0,123414478192901837,['AMS'],[AMS],1,AMS,AMS,AMS,,,,
1,123414481790516475,['AMS'],[AMS],1,AMS,AMS,AMS,,,,
2,123414478192901991,['AMS'],[AMS],1,AMS,AMS,AMS,,,,
3,123414481790510775,['AMS'],[AMS],1,AMS,AMS,AMS,,,,
4,123414479288269149,['SPC'],[SPC],1,SPC,SPC,SPC,,,,
...,...,...,...,...,...,...,...,...,...,...,...
523270,124896773782315507,['MAN'],[MAN],1,MAN,MAN,MAN,,,,
523271,124896773782912169,['SEN'],[SEN],1,SEN,SEN,SEN,,,,
523272,124896745325235371,['DUB'],[DUB],1,DUB,DUB,DUB,,,,
523273,124896745995906173,['NCL'],[NCL],1,NCL,NCL,NCL,,,,


In [19]:
# preview the airports lookup table to see which attributes are available per IATA code
df_airports.head()

Unnamed: 0,Airport,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz,Type,Source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [26]:
# Schiphol's own record: its Latitude/Longitude are the reference point hard-coded in distance_to_schiphol
df_airports.query("IATA == 'AMS'")

Unnamed: 0,Airport,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz,Type,Source
574,580,Amsterdam Airport Schiphol,Amsterdam,Netherlands,AMS,EHAM,52.308601,4.76389,-11,1,E,Europe/Amsterdam,airport,OurAirports


In [54]:
def distance_to_schiphol(lat, lon):
    """Planar distance, in degrees, between a point and Amsterdam Airport Schiphol.

    The latitude and longitude differences are treated as Cartesian offsets, so
    the result is expressed in degrees rather than kilometres. It is a rough
    proxy for geographic distance: a degree of longitude covers less ground the
    further a point lies from the equator, and no great-circle correction is
    applied.

    Parameters
    ----------
    lat, lon : float or array-like of float
        Latitude and longitude in decimal degrees. Scalars, NumPy arrays and
        pandas Series are accepted; array-likes are processed element-wise, so
        whole columns can be passed without a row-wise ``apply``.

    Returns
    -------
    float or array-like of float
        Distance to Schiphol in degrees, matching the shape of the input
        (a pandas Series keeps its index). NaN wherever a coordinate is NaN.
    """
    # coordinates of AMS as listed in the airports file (Latitude, Longitude)
    schiphol_lat, schiphol_lon = 52.308601, 4.76389
    # NumPy ufuncs keep this element-wise for arrays/Series and still return a
    # NumPy float for scalar input
    delta_lat = np.subtract(lat, schiphol_lat)
    delta_lon = np.subtract(lon, schiphol_lon)
    return np.sqrt(np.square(delta_lat) + np.square(delta_lon))

In [66]:
%%time

# airport attributes attached to every flight's final destination
airport_columns = ["IATA", "Country", "City", "Latitude", "Longitude", "Altitude", "DST", "Type"]

# The distance only depends on the airport, so compute it once per airport row instead of
# once per flight after the merge (the airports lookup is presumably far smaller than the
# 500k+ flights). Iterating with zip() also avoids positional indexing (x[0], x[1]) on a
# label-indexed row, which recent pandas versions deprecate.
df_airport_features = df_airports[airport_columns].assign(
    destination_distance=lambda d: [
        distance_to_schiphol(lat=lat, lon=lon)
        for lat, lon in zip(d["Latitude"], d["Longitude"])
    ]
)

# left join keeps every flight; a final destination that is missing from the airports
# file gets NaN for all airport attributes, including destination_distance
df_final_destination_features = pd.merge(
    df_routes[["id", "final_destination"]],
    df_airport_features,
    how="left",
    left_on=["final_destination"],
    right_on=["IATA"],
)
df_final_destination_features

Wall time: 22.9 s


Unnamed: 0,id,final_destination,IATA,Country,City,Latitude,Longitude,Altitude,DST,Type,destination_distance
0,123414478192901837,AMS,AMS,Netherlands,Amsterdam,52.308601,4.763890,-11.0,E,airport,4.338444e-07
1,123414481790516475,AMS,AMS,Netherlands,Amsterdam,52.308601,4.763890,-11.0,E,airport,4.338444e-07
2,123414478192901991,AMS,AMS,Netherlands,Amsterdam,52.308601,4.763890,-11.0,E,airport,4.338444e-07
3,123414481790510775,AMS,AMS,Netherlands,Amsterdam,52.308601,4.763890,-11.0,E,airport,4.338444e-07
4,123414479288269149,SPC,SPC,Spain,Santa Cruz De La Palma,28.626499,-17.755600,107.0,E,airport,3.267980e+01
...,...,...,...,...,...,...,...,...,...,...,...
523270,124896773782315507,MAN,MAN,United Kingdom,Manchester,53.353699,-2.274950,257.0,E,airport,7.116003e+00
523271,124896773782912169,SEN,SEN,United Kingdom,Southend,51.571400,0.695556,49.0,E,airport,4.134587e+00
523272,124896745325235371,DUB,DUB,Ireland,Dublin,53.421299,-6.270070,242.0,E,airport,1.108992e+01
523273,124896745995906173,NCL,NCL,United Kingdom,Newcastle,55.037498,-1.691670,266.0,E,airport,7.008647e+00


In [68]:
# columns written to the output file: the flight id plus the features of its final destination
output_columns = [
    "id",
    "final_destination",
    "Country",
    "City",
    "Latitude",
    "Longitude",
    "Altitude",
    "DST",
    "destination_distance",
]

# restrict the merged frame to the output columns (drops the helper columns IATA and Type)
df_output = df_final_destination_features.loc[:, output_columns]
df_output.head()

Unnamed: 0,id,final_destination,Country,City,Latitude,Longitude,Altitude,DST,destination_distance
0,123414478192901837,AMS,Netherlands,Amsterdam,52.308601,4.76389,-11.0,E,4.338444e-07
1,123414481790516475,AMS,Netherlands,Amsterdam,52.308601,4.76389,-11.0,E,4.338444e-07
2,123414478192901991,AMS,Netherlands,Amsterdam,52.308601,4.76389,-11.0,E,4.338444e-07
3,123414481790510775,AMS,Netherlands,Amsterdam,52.308601,4.76389,-11.0,E,4.338444e-07
4,123414479288269149,SPC,Spain,Santa Cruz De La Palma,28.626499,-17.7556,107.0,E,32.6798


## Write output to CSV

Both local paths and Google Storage paths are handled.

In [64]:
# write output file
# NOTE(review): index=False is presumably forwarded to pandas' to_csv so the row index is not written — confirm in write_csv_data
write_csv_data(df_output, output_file, index=False)

### Overview of the output data

In [65]:
# dtypes and non-null counts of the output; destinations that found no match in the airports file show up as missing values
df_output.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 523275 entries, 0 to 523274
Data columns (total 7 columns):
id                      523275 non-null int64
final_destination       523275 non-null object
Latitude                522336 non-null float64
Longitude               522336 non-null float64
Altitude                522336 non-null float64
DST                     522336 non-null object
destination_distance    522336 non-null float64
dtypes: float64(4), int64(1), object(2)
memory usage: 31.9+ MB
