## Import Packages

In [2]:
import pandas as pd
import numpy as np

## Generate Data

We decided to merge the datasets ourselves to better practice preprocessing.

In [None]:
# Names of the airlines
names = pd.read_csv("data/raw_data/CARRIER_DECODE.csv")
names.drop_duplicates(inplace=True)
names.drop_duplicates(subset=['OP_UNIQUE_CARRIER'], inplace=True)

In [None]:
# Passenger handlings
employees = pd.read_csv('data/raw_data/P10_EMPLOYEES.csv')
employees = employees[['OP_UNIQUE_CARRIER', 'PASS_GEN_SVC_ADMIN', 'PASSENGER_HANDLING']]
employees = employees.groupby('OP_UNIQUE_CARRIER').sum().reset_index()

In [None]:
# Cities Data
cities = pd.read_csv('data/raw_data/airports_list.csv')

In [None]:
# Weather Data
weather = pd.read_csv('data/raw_data/airport_weather_2019.csv')

# In the DATE column, only show rows which have the format YYYY-MM-DD using regex
slash_indices = weather[weather['DATE'].str.contains(r'\d+/\d+/\d{4}')].index

weather['DATE'].loc[slash_indices] =  weather.loc[slash_indices]['DATE'].apply(lambda x: x.split('/')[2] + '-' + x.split('/')[0] + '-' + x.split('/')[1] if len(x.split('/')) > 1 else x)

weather_merge = pd.merge(cities, weather, how='left', on='NAME')
weather_merge.drop(weather_merge.loc[weather_merge['ORIGIN_AIRPORT_ID'].isna()].index, axis=0, inplace=True)

weather_merge['MONTH'] = pd.DatetimeIndex(weather_merge['DATE']).month
weather_merge['DAY_OF_MONTH'] = pd.DatetimeIndex(weather_merge['DATE']).day

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather['DATE'].loc[slash_indices] =  weather.loc[slash_indices]['DATE'].apply(lambda x: x.split('/')[2] + '-' + x.split('/')[0] + '-' + x.split('/')[1] if len(x.split('/')) > 1 else x)


In [None]:
# Aircraft Inventory 
aircraft = pd.read_csv("data/raw_data/B43_AIRCRAFT_INVENTORY.csv",encoding='latin1')

In [None]:
def month_cleanup(monthly_data, aircraft, names, weather, cities):
    # Dont include aircraft data, monthly airport passengers, 
    monthly_data.drop(monthly_data.loc[monthly_data['DEP_TIME'].isna()].index, axis=0, inplace=True)
    monthly_data.drop(monthly_data.loc[monthly_data['TAIL_NUM'].isna()].index, axis=0, inplace=True)
    # MERGING
    # Merge to get proper carrier name
    print("Applying Carrier Names - CARRIER_NAME")  
    monthly_data = pd.merge(monthly_data, names, how='left', on=['OP_UNIQUE_CARRIER'])
    print(len(monthly_data))
    
    # FEATURE ENGINEERING - PLANE AGE
    # Calculate age of plane
    print("Calculate Fleet Age - PLANE_AGE")
    monthly_data = pd.merge(monthly_data, aircraft, how="left", on='TAIL_NUM')
    monthly_data['MANUFACTURE_YEAR'].fillna((monthly_data['MANUFACTURE_YEAR'].mean()), inplace=True)
    monthly_data['PLANE_AGE'] = 2020 - monthly_data['MANUFACTURE_YEAR']
    print(len(monthly_data))

    # FEATURE ENGINEERING - PREVIOUS AIRPORT
    # Get previous airport for tail number
    print("Adding airports - PREVIOUS_AIRPORT")
    monthly_data = pd.merge(monthly_data, cities, how='left', on=['ORIGIN_AIRPORT_ID'])
    monthly_data["SEGMENT_NUMBER"] = monthly_data.groupby(["TAIL_NUM", 'DAY_OF_MONTH'])["DEP_TIME"].rank("dense", ascending=True)
    segment_temp = monthly_data[['DAY_OF_MONTH', 'TAIL_NUM', 'DISPLAY_AIRPORT_NAME', 'SEGMENT_NUMBER']]
    monthly_data = pd.merge_asof(monthly_data.sort_values('SEGMENT_NUMBER'), segment_temp.sort_values('SEGMENT_NUMBER'), on='SEGMENT_NUMBER', by=['DAY_OF_MONTH', 'TAIL_NUM'], allow_exact_matches=False)
    monthly_data['DISPLAY_AIRPORT_NAME_y'].fillna('NONE', inplace=True)
    monthly_data.rename(columns={"DISPLAY_AIRPORT_NAME_y": "PREVIOUS_AIRPORT", "DISPLAY_AIRPORT_NAME_x": "DEPARTING_AIRPORT"}, inplace=True)  

    # MERGING
    # Merge weather data
    print("Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND")
    monthly_data = pd.merge(monthly_data, weather, how='inner', on=['ORIGIN_AIRPORT_ID', 'MONTH', 'DAY_OF_MONTH'])
    print(len(monthly_data))
    
    # reset index
    monthly_data.reset_index(inplace=True, drop=True)
    
    # print elapsed time
    #print(f'Elapsed Time: {time.time() - start}')
    
    print("FINISHED")
    
    # return cleaned file
    return monthly_data

In [None]:
# Select files with ONTIME_REPORTING in their name

for i in range(12):
    df = pd.read_csv('data/raw_data/ONTIME_REPORTING_{:02d}.csv'.format(i+1))
    dfs = month_cleanup(df, aircraft, names, weather_merge)
    # concat the dataframes every iteration
    if i == 0:
        df_final = dfs
    else:
        df_final = pd.concat([df_final, dfs], axis=0)

Applying Carrier Names - CARRIER_NAME
583985
Calculate Fleet Age - PLANE_AGE
585967
Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
529867
FINISHED
Applying Carrier Names - CARRIER_NAME
533175
Calculate Fleet Age - PLANE_AGE
535114
Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
483331
FINISHED
Applying Carrier Names - CARRIER_NAME
632074
Calculate Fleet Age - PLANE_AGE
634249
Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
572502
FINISHED
Applying Carrier Names - CARRIER_NAME
612023
Calculate Fleet Age - PLANE_AGE
613640
Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
555593
FINISHED
Applying Carrier Names - CARRIER_NAME
636390
Calculate Fleet Age - PLANE_AGE
637068
Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
575980
FINISHED
Applying Carrier Names - CARRIER_NAME
636691
Calculate Fleet Age - PLANE_AGE
637418
Adding daily weather data - PRCP, SNOW, SNWD, SMAX, TMIN, AWND
572987
FINISHED
Applying Carrier Names

## Exploratory Data Analysis

In [3]:
df = pd.read_csv('data/full_data_flightdelay.csv')
df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91


## Preprocessing

In [3]:
print("Let's preprocess")
print("NO")

Let's preprocess


## Training

## Evaluation