In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.append("../")

from src.data.google_storage_io import read_csv_data

# File parameters

In [3]:
# input parameters cell
input_file = "../lvt-schiphol-assignment-snakemake/data/raw/flights.csv"
output_file = "processed_flights.csv"

# utility functions

In [4]:
def missing_values_percentages(df):
    """Calculate summary of missing values per column"""
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_missing': percent_missing})

    missing_value_df = missing_value_df.sort_values('percent_missing', ascending=False)
    return missing_value_df


def check_col_singular(x):
    """check if pd.Series contains more than 1 unique value excluding NaN"""
    return x.dropna().nunique() <= 1


def drop_singular_columns(df, verbose=False):
    """Drop DataFrame columns with 1 or fewer unique values excluding NaN"""
    col_singular = df.apply(check_col_singular, axis=0)
    if verbose:
        n_singular = sum(col_singular)
        print(f"Dropping {n_singular} columns")
        print(f"{col_singular[col_singular].index}")
        
    df_output = df[[col for col, is_singular in col_singular.items() 
                    if not is_singular]]
    return df_output


def clean_flights(df_flights, verbose=True):
    """Clean flights data by removing singular columns and formatting dates"""
    df = df_flights
    df = df.dropna(subset=["scheduleDate", "scheduleTime", "actualOffBlockTime"]).reset_index()
    
    # remove singular columns
    df = drop_singular_columns(df, verbose=verbose)
    
    # format datetime fields
    df["actualOffBlockTime"] = pd.to_datetime(df["actualOffBlockTime"], utc=True).dt.tz_convert('Europe/Amsterdam')
    
    series_datetime_str = df['scheduleDate'].astype(str) + " " + df['scheduleTime'].astype(str)
    df["scheduleDateTime"] = pd.to_datetime(series_datetime_str, format="%Y%m%d %H:%M:%S").dt.tz_localize('Europe/Amsterdam')
    
#     calculate delay as difference between scheduled and realized departure
    df["scheduleDelaySeconds"] = pd.to_timedelta(df["actualOffBlockTime"] - df["scheduleDateTime"]).dt.total_seconds()

    return df


def read_flights_data(filename):
    """Read data local or from Google Storage bucket and clean it"""
    df = read_csv_data(input_file)
    print(f"Loaded data from: {input_file}\n"
          f"Shape of data: {df.shape}")
    
    df = clean_flights(df)
    print(f"Cleaned flights data\n"
          f"Shape of data: {df.shape}")
    
    return df

### Read data

In [5]:
df = read_flights_data(input_file)

Reading file from local directory
File:	../lvt-schiphol-assignment-snakemake/data/raw/flights.csv

Loaded data from: ../lvt-schiphol-assignment-snakemake/data/raw/flights.csv
Shape of data: (523275, 28)
Dropping 5 columns
Index(['baggageClaim', 'estimatedLandingTime', 'expectedTimeOnBelt',
       'flightDirection', 'transferPositions'],
      dtype='object')
Cleaned flights data
Shape of data: (487716, 26)


In [6]:
df = df.sample(10000)

# Destination features

- Route to destination in a column of lists
- Get final destination and number of destination

In [28]:
%%time

# route destinations parsed as a list then calculate length and expand list to columns
df_routes = df[["id", "route.destinations"]] \
    .assign(route_list = df["route.destinations"].apply(eval)) \
    .assign(length = lambda d: d["route_list"].apply(len),
            final_destination = lambda d: d["route_list"].apply(lambda x: x[-1]))
    
max_route_length = df_routes["length"].max()

destination_columns = [f"destination_{i}" for i in range(max_route_length)]
df_routes[destination_columns] =  pd.DataFrame(df_routes["route_list"] \
                                               .apply(lambda x: (x + [np.nan] * max_route_length)[:max_route_length]).tolist(),
                                               index= df.index)
df_routes

Unnamed: 0,id,route.destinations,route_list,length,final_destination,destination_0,destination_1,destination_2,destination_3
288877,124257473326719795,['SOU'],[SOU],1,SOU,SOU,,,
397668,124538476600837715,['HAJ'],[HAJ],1,HAJ,HAJ,,,
31890,123512829091050355,"['JRO', 'DAR']","[JRO, DAR]",2,DAR,JRO,DAR,,
122861,123786805997701057,['MUC'],[MUC],1,MUC,MUC,,,
447697,124664922607744671,['HEL'],[HEL],1,HEL,HEL,,,
...,...,...,...,...,...,...,...,...,...
359194,124440123685717595,['LPI'],[LPI],1,LPI,LPI,,,
338532,124383922939316193,['DUS'],[DUS],1,DUS,DUS,,,
361078,124447148626387397,['DTW'],[DTW],1,DTW,DTW,,,
198547,124004576470391157,['OSL'],[OSL],1,OSL,OSL,,,


In [35]:
# px.sunburst(df_routes, path=["final_destination"])

# Output prediction target

In [23]:
df.columns

Index(['index', 'actualOffBlockTime', 'aircraftRegistration',
       'aircraftType.iatamain', 'aircraftType.iatasub', 'airlineCode',
       'expectedTimeBoarding', 'expectedTimeGateClosing',
       'expectedTimeGateOpen', 'flightName', 'flightNumber', 'gate', 'id',
       'mainFlight', 'prefixIATA', 'prefixICAO', 'publicEstimatedOffBlockTime',
       'publicFlightState.flightStates', 'route.destinations', 'scheduleDate',
       'scheduleTime', 'serviceType', 'terminal',
       'transferPositions.transferPositions', 'scheduleDateTime',
       'scheduleDelaySeconds'],
      dtype='object')

In [34]:
# meta columns for utility for columns we will often merge by
output_columns = ["id", "aircraftRegistration", "airlineCode", "terminal", 
                  "serviceType", "scheduleDateTime", "actualOffBlockTime", "scheduleDelaySeconds"]

# DataFrame with id + merging columns + prediction target
df_target = df[output_columns]
df_target.head()

Unnamed: 0,id,aircraftRegistration,airlineCode,terminal,serviceType,scheduleDateTime,actualOffBlockTime,scheduleDelaySeconds
288877,124257473326719795,PHEXI,80.0,2.0,J,2018-05-01 16:35:00+02:00,2018-05-01 16:58:16+02:00,1396.0
397668,124538476600837715,PHEXL,2481.0,1.0,J,2018-06-10 13:00:00+02:00,2018-06-10 13:11:25+02:00,685.0
31890,123512829091050355,PHBQO,100.0,2.0,J,2018-01-15 10:15:00+01:00,2018-01-15 10:35:10+01:00,1210.0
122861,123786805997701057,PHEXG,2481.0,1.0,J,2018-02-23 17:45:00+01:00,2018-02-23 17:55:52+01:00,652.0
447697,124664922607744671,PHBXP,1551.0,2.0,J,2018-06-28 20:50:00+02:00,2018-06-28 22:09:23+02:00,4763.0


## Write output to CSV

Local or Google Storage is both handled

In [24]:
# write output file
write_csv_data(df_target, output_file, index=False)

NameError: name 'df_to_save' is not defined

In [None]:
df_output.columns

### Overview of the output data

In [53]:
df_output.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342366 entries, 125574 to 75688
Data columns (total 7 columns):
data_id    342366 non-null object
RaceId     342366 non-null object
step       342366 non-null int32
Passed     342366 non-null float64
Loop       342366 non-null int64
Speed      342366 non-null float64
seconds    342366 non-null float64
dtypes: float64(3), int32(1), int64(1), object(2)
memory usage: 19.6+ MB


In [None]:
def enrich_airport_features(df_airports):

In [9]:
import plotly.express as px

In [10]:
# df_routes["parent"] = "schiphol"
# sunburst_path_columns = ["parent"] + destination_columns

# for src, target in zip(sunburst_path_columns[:-1], sunburst_path_columns[1:]):
    
#     df_path_counts = df_routes[[src, target, "id"]].dropna().groupby([src,target]).count()
    
    

In [11]:
# px.sunburst(df_routes, path=destination_columns)

In [12]:
# # route destinations parsed as a list then calculate length and expand list to columns
# df_sunburst = df[["id", "route.destinations"]] \
#     .assign(route_list = df["route.destinations"].apply(eval)) \
#     .assign(length = lambda d: d["route_list"].apply(len))

# df_sunburst

In [13]:
# df_sunburst = df.assign("route")

In [14]:
# df = read_flights_data(input_file)

In [15]:
df.head()

Unnamed: 0,index,actualOffBlockTime,aircraftRegistration,aircraftType.iatamain,aircraftType.iatasub,airlineCode,expectedTimeBoarding,expectedTimeGateClosing,expectedTimeGateOpen,flightName,...,publicEstimatedOffBlockTime,publicFlightState.flightStates,route.destinations,scheduleDate,scheduleTime,serviceType,terminal,transferPositions.transferPositions,scheduleDateTime,scheduleDelaySeconds
288877,298940,2018-05-01 16:58:16+02:00,PHEXI,EMJ,E75,80.0,2018-05-01T16:24:00.000+02:00,2018-05-01T16:39:00.000+02:00,2018-05-01T15:54:00.000+02:00,AF8406,...,2018-05-01T16:54:00.000+02:00,"['DEP', 'DEL']",['SOU'],2018-05-01,16:35:00,J,2.0,,2018-05-01 16:35:00+02:00,1396.0
397668,410781,2018-06-10 13:11:25+02:00,PHEXL,EMJ,E75,2481.0,2018-06-10T12:30:00.000+02:00,2018-06-10T12:45:00.000+02:00,2018-06-10T12:00:00.000+02:00,9W8648,...,,['DEP'],['HAJ'],2018-06-10,13:00:00,J,1.0,,2018-06-10 13:00:00+02:00,685.0
31890,33140,2018-01-15 10:35:10+01:00,PHBQO,777,772,100.0,2018-01-15T09:30:00.000+01:00,2018-01-15T10:00:00.000+01:00,2018-01-15T08:45:00.000+01:00,KL0569,...,,['DEP'],"['JRO', 'DAR']",2018-01-15,10:15:00,J,2.0,,2018-01-15 10:15:00+01:00,1210.0
122861,127004,2018-02-23 17:55:52+01:00,PHEXG,EMJ,E75,2481.0,2018-02-23T17:15:00.000+01:00,2018-02-23T17:30:00.000+01:00,2018-02-23T16:45:00.000+01:00,9W8705,...,,['DEP'],['MUC'],2018-02-23,17:45:00,J,1.0,,2018-02-23 17:45:00+01:00,652.0
447697,461657,2018-06-28 22:09:23+02:00,PHBXP,739,73J,1551.0,2018-06-28T21:37:00.000+02:00,2018-06-28T21:52:00.000+02:00,2018-06-28T21:07:00.000+02:00,MF9917,...,2018-06-28T22:07:00.000+02:00,"['DEP', 'DEL']",['HEL'],2018-06-28,20:50:00,J,2.0,,2018-06-28 20:50:00+02:00,4763.0


In [16]:
# df.query("`transferPositions.transferPositions` == `transferPositions.transferPositions`")

## Data description

- <font color='red'>**Flight Direction (flightDirection):**</font>  
Indicates whether the commercial flight is a departure flight (d) or an arrival flight (a).  

- <font color='red'>**Flightname (flightName):**</font>    
The flight name of the commercial flight as indicated on the ticket of the passenger.  

- <font color='red'>**Flightnumber (flightNumber):**</font>    
Nummeric part of the flight name.  

- <font color='red'>**IATA flight number prefix (prefixIATA):**</font>    
The two character IATA prefix of the airline that operates the commercial flight.  

- <font color='red'>**ICAO flight number prefix (prefixICAO):**</font>    
The three character ICAO prefix of the airline that operates the commercial flight.  

- <font color='red'>**Schedule date (scheduleDate):**</font>   
The date on which the scheduled commercial flight will be operated.  

- <font color='red'>**Schedule time (scheduleTime):**</font>    
The time the commercial flight is scheduled to depart or arrive.  

- <font color='red'>**Service type of a flight. (serviceType):**</font>    
Category of the commercial flight. J = Passenger Line, C= Passenger Charter, F = Freight Line and H = Freight Charter.

- <font color='red'>**Main flight (mainFlight):**</font>    
In case of a codeshare (one flight with several flight numbers of different airlines) the main flight shows the flight number of the airline that actually operates the flight.  

- <font color='red'>**Codeshares (codeshares):**</font>    
All the flight numbers that are related to the main flight.  

- <font color='red'>**Route (route):**</font>    
The complete route of the commercial flight. In most cases this is a direct route between two airports. There are actually also flights that have more than one stopover. Example: GCK-AMS-LGW (Jakarta – Amsterdam – London Gatwick)  

- <font color='red'>**Public Flightstate (publicFlightState):**</font>    
The status of a public flight  

- <font color='red'>**Terminal (terminal):**</font>    
The section of the Schiphol terminal from which the commercial flight will be leaving or will arrive.  

- <font color='red'>**Gate (gate):**</font>    
The number of the gate from which the scheduled flight will depart and the passengers will be boarding.  

- <font color='red'>**Baggage belt (baggageClaim):**</font>    
The luggage belt number in the reclaim hall (reclaim 1,2,3 or 4) the arrived passengers can pick up their luggage.  

- <font color='red'>**Check-in desks (checkinAllocations):**</font>    
The check-in desks that are allocated for the flight (or flights in case of a codeshare).  

- <font color='red'>**Check-in class:**</font>    
The flight classes that are allocated for a specific flight.  

- <font color='red'>**Transfer desks (transferPositions):**</font>    
The transfer desks that are allocated to the flight.  

- <font color='red'>**Aircraft Type, IATA main (aircraftType, iatamain):**</font>    
The IATA main category code of the aircraft that operates the main flight.  

- <font color='red'>**Aircraft Type, IATA sub (aircraftType, iatasub):**</font>    
The IATA sub category code of the aircraft that operates the main flight.  

- <font color='red'>**Aircraft registration (aircraftRegistration):**</font>    
The unique alphanumeric string that identifies the aircraft that operates the main scheduled flight.  

- <font color='red'>**Estimated Landing Time, ELDT. (estimatedLandingTime):**</font>    
The expected time the (arriving) flight will be landing at Amsterdam Airport Schiphol.  

- <font color='red'>**Actual Landing Time, ALDT. (actualLandingTime):**</font>    
The time the aircraft landed on the runway.  

- <font color='red'>**Public estimated off block time (publicEstimatedOffBlockTime):**</font>    
The expected departure time of a departing flight that is made publically. Made visible on the flight displays in the terminal, on the website, on the Schiphol app, on the information Terminals in the lounges etc.  

- <font color='red'>**Actual Off Block time, AOBT (actualOffBlockTime):**</font>    
The actual time of departure of a flight from Amsterdam Airport Schiphol.  

- <font color='red'>**Expected time gate open (expectedTimeGateOpen):**</font>    
The time the gated is expected to be opened by the flight handler.  

- <font color='red'>**Expected time boarding (expectedTimeBoarding):**</font>    
The expected time the boarding of passengers for a flight will start.  

- <font color='red'>**Expected time closing (expectedTimeGateClosing):**</font>  
The time it is expected that the gate will be closed by the flight handler.

- <font color='red'>**Expected time on belt (expectedTimeOnBelt):**</font>    
The time it is expected that the first luggage of a flight will be on the reclaim belt.  

- <font color='red'>**Airline (airlineCode):**</font>    
NVLS code of the airline  

- <font color='red'>**Routes Eu (routesEu):**</font>    
S (Schengen), E (Europe) or N (non-Europe)  

- <font color='red'>**Visa (visa):**</font>    
Destination requires visa (true) or not (false)  

- <font color='red'>**Last Updated (lastUpdated):**</font>    
Time of last update to flight in flight information system.  

- <font color='red'>**Flight ID (id):**</font>    
Unique identifier of the flight.  

- <font color='red'>**Schema Version (schemaVersion):**</font>    
Schema version of the API.  

In [17]:
df_missing_prc = missing_values_percentages(df.query("serviceType == 'J'"))
df_missing_prc.plot(kind='bar', x="column_name", y="percent_missing")

<matplotlib.axes._subplots.AxesSubplot at 0x21df4704b48>

In [18]:
import matplotlib.pyplot as plt

# Counts per airline

In [19]:
df.groupby(["airlineCode"])["index"].count().reset_index(name="n_obs") 

Unnamed: 0,airlineCode,n_obs
0,6.0,11
1,7.0,386
2,12.0,208
3,21.0,10
4,30.0,1
...,...,...
131,5071.0,1
132,5074.0,2
133,5077.0,1
134,5081.0,5


In [20]:
df.head()

Unnamed: 0,index,actualOffBlockTime,aircraftRegistration,aircraftType.iatamain,aircraftType.iatasub,airlineCode,expectedTimeBoarding,expectedTimeGateClosing,expectedTimeGateOpen,flightName,...,publicEstimatedOffBlockTime,publicFlightState.flightStates,route.destinations,scheduleDate,scheduleTime,serviceType,terminal,transferPositions.transferPositions,scheduleDateTime,scheduleDelaySeconds
288877,298940,2018-05-01 16:58:16+02:00,PHEXI,EMJ,E75,80.0,2018-05-01T16:24:00.000+02:00,2018-05-01T16:39:00.000+02:00,2018-05-01T15:54:00.000+02:00,AF8406,...,2018-05-01T16:54:00.000+02:00,"['DEP', 'DEL']",['SOU'],2018-05-01,16:35:00,J,2.0,,2018-05-01 16:35:00+02:00,1396.0
397668,410781,2018-06-10 13:11:25+02:00,PHEXL,EMJ,E75,2481.0,2018-06-10T12:30:00.000+02:00,2018-06-10T12:45:00.000+02:00,2018-06-10T12:00:00.000+02:00,9W8648,...,,['DEP'],['HAJ'],2018-06-10,13:00:00,J,1.0,,2018-06-10 13:00:00+02:00,685.0
31890,33140,2018-01-15 10:35:10+01:00,PHBQO,777,772,100.0,2018-01-15T09:30:00.000+01:00,2018-01-15T10:00:00.000+01:00,2018-01-15T08:45:00.000+01:00,KL0569,...,,['DEP'],"['JRO', 'DAR']",2018-01-15,10:15:00,J,2.0,,2018-01-15 10:15:00+01:00,1210.0
122861,127004,2018-02-23 17:55:52+01:00,PHEXG,EMJ,E75,2481.0,2018-02-23T17:15:00.000+01:00,2018-02-23T17:30:00.000+01:00,2018-02-23T16:45:00.000+01:00,9W8705,...,,['DEP'],['MUC'],2018-02-23,17:45:00,J,1.0,,2018-02-23 17:45:00+01:00,652.0
447697,461657,2018-06-28 22:09:23+02:00,PHBXP,739,73J,1551.0,2018-06-28T21:37:00.000+02:00,2018-06-28T21:52:00.000+02:00,2018-06-28T21:07:00.000+02:00,MF9917,...,2018-06-28T22:07:00.000+02:00,"['DEP', 'DEL']",['HEL'],2018-06-28,20:50:00,J,2.0,,2018-06-28 20:50:00+02:00,4763.0


# Features

## Destination routes and distances

Use `route.destinations`

In [21]:
def calculate_destination_features(df_flights, df_airports):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-21-e76846bf33e6>, line 2)

## Airline categories and deal with rare airlines

...

## Previous flights of same plane

Use `aircraftRegistration`

## Windowing delays by destination, gate, airline

Find out which gates are close

In [None]:
df.columns

In [None]:
df.prefixIATA

In [None]:
df.prefixIATA.fillna("").astype(str).apply(len)