In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
import pandas as pd
import pandas_profiling as pp

import sys
sys.path.append("../")

from src.data.google_storage_io import read_csv_data

# File parameters

In [38]:
# input parameters cell
# Location of the raw flights CSV (a gs:// path into the project bucket).
input_file = "gs://lvt-schiphol-assignment-snakemake/data/raw/flights.csv"
# File name intended for the processed data; NOTE(review): not referenced in
# the visible part of this notebook -- confirm it is still needed.
output_file = "processed_flights.csv"

# utility functions

In [39]:
def missing_values_percentages(df):
    """Summarise the share of missing values in every column.

    Returns a DataFrame indexed by the original column names with two
    columns, ``column_name`` and ``percent_missing`` (on a 0-100 scale),
    ordered from the most to the least incomplete column.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()

    summary = pd.DataFrame(
        {
            'column_name': df.columns,
            'percent_missing': null_counts * 100 / n_rows,
        }
    )
    return summary.sort_values(by='percent_missing', ascending=False)

def check_col_singular(x):
    """Tell whether a pd.Series holds at least two distinct non-NaN values.

    Despite the name, a True result means the column is *not* singular,
    i.e. it carries variation and is worth keeping.
    """
    distinct_count = x.nunique(dropna=True)
    return distinct_count > 1

def drop_singular_columns(df, verbose=False):
    """Drop DataFrame columns with 1 or fewer unique values excluding NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter; it is not modified.
    verbose : bool, default False
        When True, print how many columns are dropped and their names.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` restricted to the columns that hold at least two
        distinct non-NaN values, in their original order.
    """
    # nunique() skips NaN, so an all-NaN column counts 0 and a constant
    # column counts 1; True marks a column worth keeping.
    has_variation = df.nunique(dropna=True) > 1

    if verbose:
        # The previous `sum(not col_singular)` raised "truth value of a Series
        # is ambiguous"; collect the dropped names element by element instead.
        dropped = [col for col, keep in has_variation.items() if not keep]
        print(f"Dropping {len(dropped)} singular column(s): {dropped}")

    kept = [col for col, keep in has_variation.items() if keep]
    # Return an independent copy so callers can add or overwrite columns
    # without triggering pandas' chained-assignment warning.
    return df[kept].copy()

def clean_flights(df_flights):
    """Clean flights data by removing singular columns and formatting dates.

    Steps: drop rows lacking a schedule date, schedule time or actual
    off-block time; drop columns without variation; parse the timestamps as
    Europe/Amsterdam datetimes; derive the departure delay in seconds.

    Parameters
    ----------
    df_flights : pd.DataFrame
        Raw flights frame. Must contain ``scheduleDate``, ``scheduleTime``
        and ``actualOffBlockTime``. It is not modified.

    Returns
    -------
    pd.DataFrame
        Cleaned frame in which ``actualOffBlockTime`` is a tz-aware datetime
        and two columns are added: ``scheduleDateTime`` and
        ``scheduleDelaySeconds`` (actual minus scheduled; negative values
        mean the flight left before its scheduled time).
    """
    local_tz = 'Europe/Amsterdam'
    required = ["scheduleDate", "scheduleTime", "actualOffBlockTime"]

    # reset_index() without drop=True keeps the original row labels in an
    # "index" column, which later cells rely on for counting.
    df = df_flights.dropna(subset=required).reset_index()

    # remove singular columns
    df = drop_singular_columns(df)

    # format datetime fields
    actual_off_block = pd.to_datetime(df["actualOffBlockTime"], utc=True).dt.tz_convert(local_tz)

    # The notebook output shows scheduleDate rendered as YYYY-MM-DD, so the
    # format spells out the dashes; the earlier dash-less "%Y%m%d" only matched
    # on pandas versions that parse ISO-like formats leniently.
    schedule_str = df['scheduleDate'].astype(str) + " " + df['scheduleTime'].astype(str)
    # NOTE(review): tz_localize raises by default for wall-clock times that are
    # ambiguous or nonexistent around DST changes -- confirm the schedule data
    # never contains such times.
    schedule_dt = pd.to_datetime(schedule_str, format="%Y-%m-%d %H:%M:%S").dt.tz_localize(local_tz)

    # calculate delay as difference between scheduled and realized departure
    delay_seconds = (actual_off_block - schedule_dt).dt.total_seconds()

    # assign() builds a new frame instead of writing into a column-sliced one.
    return df.assign(
        actualOffBlockTime=actual_off_block,
        scheduleDateTime=schedule_dt,
        scheduleDelaySeconds=delay_seconds,
    )

def read_flights_data(filename):
    """Read the flights CSV at ``filename`` and clean it.

    Parameters
    ----------
    filename : str
        Path handed to ``read_csv_data`` (a local path or a ``gs://`` URI,
        as far as that helper supports them).

    Returns
    -------
    pd.DataFrame
        The output of ``clean_flights`` applied to the loaded data.

    The shape of the data is printed after loading and after cleaning.
    """
    # Use the argument, not the notebook-level `input_file` global, so the
    # function honours whatever path the caller passes in.
    df_raw = read_csv_data(filename)
    print(f"Loaded data from: {filename}\n"
          f"Shape of data: {df_raw.shape}")

    df_clean = clean_flights(df_raw)
    print(f"Cleaned flights data\n"
          f"Shape of data: {df_clean.shape}")

    return df_clean

### Read data

In [40]:
# Load the raw flights CSV and run it through clean_flights; prints the shape
# before and after cleaning.
df = read_flights_data(input_file)

Reading file from Google Storage
Bucket:	lvt-schiphol-assignment-snakemake
File:	data/raw/flights.csv

Loaded data from: gs://lvt-schiphol-assignment-snakemake/data/raw/flights.csv
Shape of data: (523275, 28)
Cleaned flights data
Shape of data: (487716, 26)


In [41]:
# Preview the first rows of the cleaned flights frame.
df.head()

Unnamed: 0,index,actualOffBlockTime,aircraftRegistration,aircraftType.iatamain,aircraftType.iatasub,airlineCode,expectedTimeBoarding,expectedTimeGateClosing,expectedTimeGateOpen,flightName,...,publicEstimatedOffBlockTime,publicFlightState.flightStates,route.destinations,scheduleDate,scheduleTime,serviceType,terminal,transferPositions.transferPositions,scheduleDateTime,scheduleDelaySeconds
0,3,2018-01-01 03:22:00+01:00,PHPXB,,,148.0,,,,ZXP022,...,,['DEP'],['AMS'],2018-01-01,03:30:00,,,,2018-01-01 03:30:00+01:00,-480.0
1,4,2018-01-01 05:58:22+01:00,PHHSJ,73H,73H,164.0,,,,HV5641,...,,['DEP'],['SPC'],2018-01-01,06:00:00,J,1.0,,2018-01-01 06:00:00+01:00,-98.0
2,5,2018-01-01 06:00:00+01:00,PHHSG,73H,73H,100.0,,,,KL2533,...,,['DEP'],['LPA'],2018-01-01,06:05:00,J,1.0,,2018-01-01 06:05:00+01:00,-300.0
3,6,2018-01-01 06:00:00+01:00,PHHSG,73H,73H,164.0,,,,HV6455,...,,['DEP'],['LPA'],2018-01-01,06:05:00,J,1.0,,2018-01-01 06:05:00+01:00,-300.0
4,7,2018-01-01 06:26:34+01:00,PHHXB,73H,73H,164.0,,,,HV5801,...,,['DEP'],['TLV'],2018-01-01,06:15:00,J,1.0,,2018-01-01 06:15:00+01:00,694.0


In [58]:
# Show only flights that have transfer positions: NaN never equals itself, so
# comparing the column with itself filters out the rows where it is missing.
# NOTE(review): this cell carries execution count 58, i.e. it was run out of
# order relative to the surrounding cells.
df.query("`transferPositions.transferPositions` == `transferPositions.transferPositions`")

Unnamed: 0,index,actualOffBlockTime,aircraftRegistration,aircraftType.iatamain,aircraftType.iatasub,airlineCode,expectedTimeBoarding,expectedTimeGateClosing,expectedTimeGateOpen,flightName,...,publicEstimatedOffBlockTime,publicFlightState.flightStates,route.destinations,scheduleDate,scheduleTime,serviceType,terminal,transferPositions.transferPositions,scheduleDateTime,scheduleDelaySeconds
61,64,2018-01-01 07:06:16+01:00,CSTNJ,320,32S,163.0,,,,TP669,...,,['DEP'],['LIS'],2018-01-01,07:00:00,J,1.0,[3],2018-01-01 07:00:00+01:00,376.0
65,68,2018-01-01 07:07:35+01:00,HBIJL,320,32S,50.0,,,,LX737,...,,['DEP'],['ZRH'],2018-01-01,07:00:00,J,1.0,[3],2018-01-01 07:00:00+01:00,455.0
68,71,2018-01-01 06:53:16+01:00,EIIMD,319,319,12.0,,,,AZ0119,...,,['DEP'],['LIN'],2018-01-01,07:00:00,J,1.0,[2],2018-01-01 07:00:00+01:00,-404.0
109,112,2018-01-01 07:34:20+01:00,GEUPR,319,319,31.0,2018-01-01T07:05:00.000+01:00,2018-01-01T07:15:00.000+01:00,2018-01-01T06:35:00.000+01:00,BA423,...,,['DEP'],['LHR'],2018-01-01,07:35:00,J,3.0,[5],2018-01-01 07:35:00+01:00,-40.0
120,124,2018-01-01 07:37:36+01:00,VPBAX,321,321,7.0,,,,SU2793,...,,['DEP'],['SVO'],2018-01-01,07:40:00,J,3.0,[5],2018-01-01 07:40:00+01:00,-144.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487403,502075,2018-07-12 16:08:22+02:00,GEUUS,320,32S,31.0,2018-07-12T15:40:00.000+02:00,2018-07-12T15:50:00.000+02:00,2018-07-12T15:10:00.000+02:00,BA439,...,,['DEP'],['LHR'],2018-07-12,16:10:00,J,3.0,[5],2018-07-12 16:10:00+02:00,-98.0
487406,502078,2018-07-12 16:34:41+02:00,A7BEB,777,77W,356.0,2018-07-12T15:15:00.000+02:00,2018-07-12T15:55:00.000+02:00,2018-07-12T15:00:00.000+02:00,QR274,...,,['DEP'],['DOH'],2018-07-12,16:15:00,J,3.0,[5],2018-07-12 16:15:00+02:00,1181.0
487574,502247,2018-07-12 17:06:57+02:00,CSTTK,319,319,65.0,,,,MS9425,...,,['DEP'],['OPO'],2018-07-12,16:45:00,J,1.0,[3],2018-07-12 16:45:00+02:00,1317.0
487579,502252,2018-07-12 17:06:57+02:00,CSTTK,319,319,163.0,,,,TP659,...,,['DEP'],['OPO'],2018-07-12,16:45:00,J,1.0,[3],2018-07-12 16:45:00+02:00,1317.0


## Data description

- <font color='red'>**Flight Direction (flightDirection):**</font>  
Indicates whether the commercial flight is a departure flight (d) or an arrival flight (a).  

- <font color='red'>**Flightname (flightName):**</font>    
The flight name of the commercial flight as indicated on the ticket of the passenger.  

- <font color='red'>**Flightnumber (flightNumber):**</font>    
Numeric part of the flight name.  

- <font color='red'>**IATA flight number prefix (prefixIATA):**</font>    
The two character IATA prefix of the airline that operates the commercial flight.  

- <font color='red'>**ICAO flight number prefix (prefixICAO):**</font>    
The three character ICAO prefix of the airline that operates the commercial flight.  

- <font color='red'>**Schedule date (scheduleDate):**</font>   
The date on which the scheduled commercial flight will be operated.  

- <font color='red'>**Schedule time (scheduleTime):**</font>    
The time the commercial flight is scheduled to depart or arrive.  

- <font color='red'>**Service type of a flight. (serviceType):**</font>    
Category of the commercial flight. J = Passenger Line, C= Passenger Charter, F = Freight Line and H = Freight Charter.

- <font color='red'>**Main flight (mainFlight):**</font>    
In case of a codeshare (one flight with several flight numbers of different airlines) the main flight shows the flight number of the airline that actually operates the flight.  

- <font color='red'>**Codeshares (codeshares):**</font>    
All the flight numbers that are related to the main flight.  

- <font color='red'>**Route (route):**</font>    
The complete route of the commercial flight. In most cases this is a direct route between two airports, but some flights have one or more stopovers. Example: CGK-AMS-LGW (Jakarta – Amsterdam – London Gatwick)  

- <font color='red'>**Public Flightstate (publicFlightState):**</font>    
The status of a public flight  

- <font color='red'>**Terminal (terminal):**</font>    
The section of the Schiphol terminal from which the commercial flight will be leaving or will arrive.  

- <font color='red'>**Gate (gate):**</font>    
The number of the gate from which the scheduled flight will depart and the passengers will be boarding.  

- <font color='red'>**Baggage belt (baggageClaim):**</font>    
The number of the luggage belt in the reclaim hall (reclaim 1, 2, 3 or 4) where arriving passengers can pick up their luggage.  

- <font color='red'>**Check-in desks (checkinAllocations):**</font>    
The check-in desks that are allocated for the flight (or flights in case of a codeshare).  

- <font color='red'>**Check-in class:**</font>    
The flight classes that are allocated for a specific flight.  

- <font color='red'>**Transfer desks (transferPositions):**</font>    
The transfer desks that are allocated to the flight.  

- <font color='red'>**Aircraft Type, IATA main (aircraftType, iatamain):**</font>    
The IATA main category code of the aircraft that operates the main flight.  

- <font color='red'>**Aircraft Type, IATA sub (aircraftType, iatasub):**</font>    
The IATA sub category code of the aircraft that operates the main flight.  

- <font color='red'>**Aircraft registration (aircraftRegistration):**</font>    
The unique alphanumeric string that identifies the aircraft that operates the main scheduled flight.  

- <font color='red'>**Estimated Landing Time, ELDT. (estimatedLandingTime):**</font>    
The expected time the (arriving) flight will be landing at Amsterdam Airport Schiphol.  

- <font color='red'>**Actual Landing Time, ALDT. (actualLandingTime):**</font>    
The time the aircraft landed on the runway.  

- <font color='red'>**Public estimated off block time (publicEstimatedOffBlockTime):**</font>    
The expected departure time of a departing flight that is made public. It is shown on the flight displays in the terminal, on the website, in the Schiphol app, on the information terminals in the lounges, etc.  

- <font color='red'>**Actual Off Block time, AOBT (actualOffBlockTime):**</font>    
The actual time of departure of a flight from Amsterdam Airport Schiphol.  

- <font color='red'>**Expected time gate open (expectedTimeGateOpen):**</font>    
The time the gate is expected to be opened by the flight handler.  

- <font color='red'>**Expected time boarding (expectedTimeBoarding):**</font>    
The expected time the boarding of passengers for a flight will start.  

- <font color='red'>**Expected time closing (expectedTimeGateClosing):**</font>  
The time it is expected that the gate will be closed by the flight handler.

- <font color='red'>**Expected time on belt (expectedTimeOnBelt):**</font>    
The time it is expected that the first luggage of a flight will be on the reclaim belt.  

- <font color='red'>**Airline (airlineCode):**</font>    
NVLS code of the airline  

- <font color='red'>**Routes Eu (routesEu):**</font>    
S (Schengen), E (Europe) or N (non-Europe)  

- <font color='red'>**Visa (visa):**</font>    
Destination requires visa (true) or not (false)  

- <font color='red'>**Last Updated (lastUpdated):**</font>    
Time of last update to flight in flight information system.  

- <font color='red'>**Flight ID (id):**</font>    
Unique identifier of the flight.  

- <font color='red'>**Schema Version (schemaVersion):**</font>    
Schema version of the API.  

In [42]:
# Share of missing values per column, restricted to passenger line flights
# (serviceType 'J'), shown as a bar chart.
df_missing_prc = missing_values_percentages(df[df["serviceType"] == "J"])
df_missing_prc.plot.bar(x="column_name", y="percent_missing")

<matplotlib.axes._subplots.AxesSubplot at 0x21a3320e608>

# Counts per airline

In [43]:
# Number of rows per airline; "index" (created by reset_index in clean_flights)
# serves as an always-present column to count on.
df.groupby("airlineCode")["index"].count().rename("n_obs").reset_index()

Unnamed: 0,airlineCode,n_obs
0,6.0,639
1,7.0,18228
2,12.0,9289
3,16.0,55
4,19.0,14
...,...,...
354,5104.0,2
355,5105.0,1
356,5106.0,1
357,5107.0,1


In [44]:
# Repeat the preview of the cleaned flights frame before the feature notes.
df.head()

Unnamed: 0,index,actualOffBlockTime,aircraftRegistration,aircraftType.iatamain,aircraftType.iatasub,airlineCode,expectedTimeBoarding,expectedTimeGateClosing,expectedTimeGateOpen,flightName,...,publicEstimatedOffBlockTime,publicFlightState.flightStates,route.destinations,scheduleDate,scheduleTime,serviceType,terminal,transferPositions.transferPositions,scheduleDateTime,scheduleDelaySeconds
0,3,2018-01-01 03:22:00+01:00,PHPXB,,,148.0,,,,ZXP022,...,,['DEP'],['AMS'],2018-01-01,03:30:00,,,,2018-01-01 03:30:00+01:00,-480.0
1,4,2018-01-01 05:58:22+01:00,PHHSJ,73H,73H,164.0,,,,HV5641,...,,['DEP'],['SPC'],2018-01-01,06:00:00,J,1.0,,2018-01-01 06:00:00+01:00,-98.0
2,5,2018-01-01 06:00:00+01:00,PHHSG,73H,73H,100.0,,,,KL2533,...,,['DEP'],['LPA'],2018-01-01,06:05:00,J,1.0,,2018-01-01 06:05:00+01:00,-300.0
3,6,2018-01-01 06:00:00+01:00,PHHSG,73H,73H,164.0,,,,HV6455,...,,['DEP'],['LPA'],2018-01-01,06:05:00,J,1.0,,2018-01-01 06:05:00+01:00,-300.0
4,7,2018-01-01 06:26:34+01:00,PHHXB,73H,73H,164.0,,,,HV5801,...,,['DEP'],['TLV'],2018-01-01,06:15:00,J,1.0,,2018-01-01 06:15:00+01:00,694.0


# Features

## Destination routes and distances

Use `route.destinations`

## Airline categories and deal with rare airlines

...

## Previous flights of same plane

Use `aircraftRegistration`

## Windowing delays by destination, gate, airline

TODO: find out which gates are physically close to each other, so that delays can be windowed over neighbouring gates.

In [45]:
# List the columns that remain after cleaning, as input for feature ideas.
df.columns

Index(['index', 'actualOffBlockTime', 'aircraftRegistration',
       'aircraftType.iatamain', 'aircraftType.iatasub', 'airlineCode',
       'expectedTimeBoarding', 'expectedTimeGateClosing',
       'expectedTimeGateOpen', 'flightName', 'flightNumber', 'gate', 'id',
       'mainFlight', 'prefixIATA', 'prefixICAO', 'publicEstimatedOffBlockTime',
       'publicFlightState.flightStates', 'route.destinations', 'scheduleDate',
       'scheduleTime', 'serviceType', 'terminal',
       'transferPositions.transferPositions', 'scheduleDateTime',
       'scheduleDelaySeconds'],
      dtype='object')

In [46]:
# Inspect the IATA airline prefix column; flights without a prefix show NaN.
df.prefixIATA

0         NaN
1          HV
2          KL
3          HV
4          HV
         ... 
487711     MF
487712     AF
487713     CA
487714    NaN
487715     CI
Name: prefixIATA, Length: 487716, dtype: object

In [52]:
# Character length of each IATA prefix; missing prefixes count as 0.
df["prefixIATA"].fillna("").astype(str).str.len()

0         0
1         2
2         2
3         2
4         2
         ..
487711    2
487712    2
487713    2
487714    0
487715    2
Name: prefixIATA, Length: 487716, dtype: int64