In [1]:
import pandas as pd
import re

In [2]:
def trim_tail_number(s):
    """Strip a single leading uppercase letter from a tail number.

    Parameters
    ----------
    s : str or None
        Raw tail number. Missing values may arrive as ``None`` or as a
        float NaN when the column is read through pandas.

    Returns
    -------
    str or None
        ``s`` without its first character when that character is an
        uppercase ASCII letter; ``s`` unchanged when it starts with anything
        else; ``None`` when ``s`` is empty, missing, or not a string.
    """
    # NaN is truthy, so a plain falsiness check would let it through to
    # re.sub and raise TypeError; treat every non-string as "missing".
    if not isinstance(s, str) or not s:
        return None
    # Only the first character is considered: the pattern is anchored at the
    # start and matches exactly one letter.
    return re.sub("^[A-Z]", "", s)

In [3]:

# Years for which both an FAA extract and a delay extract are read from ./data.
years = [2018, 2019, 2020, 2021, 2022]

for year in years:
    _faa_df = pd.read_parquet(f"./data/faa_{year}.parquet")

    # Keep only the delay/cancellation measures plus the join key, then drop
    # the preceding identification letter from each tail number, since the
    # FAA data does not include it.
    _delay_df = (
        pd.read_parquet(f"./data/delay_{year}.parquet")
        .loc[
            :,
            [
                "Cancelled",
                "Diverted",
                "DepDelayMinutes",
                "ArrDelayMinutes",
                "Tail_Number",
            ],
        ]
        .assign(
            Tail_Number=lambda frame: frame["Tail_Number"].apply(trim_tail_number)
        )
    )

    # Inner join against the FAA data: flights whose trimmed tail number has
    # no matching N-NUMBER are dropped from the result.
    _clean_df = pd.merge(
        _delay_df,
        _faa_df,
        how="inner",
        left_on="Tail_Number",
        right_on="N-NUMBER",
        suffixes=("__DELAY__", "__FAA__"),
    )

    # Report how many flight rows survived the join, then persist the result.
    print(f"original: {len(_delay_df)} rows | joined: {len(_clean_df)} rows")
    _clean_df.to_parquet(f"./data/clean_joined_{year}.parquet")

original: 5689512 rows | joined: 5502497 rows
original: 8091684 rows | joined: 7803194 rows
original: 5022397 rows | joined: 4695086 rows
original: 6311871 rows | joined: 6021567 rows
original: 4078318 rows | joined: 3913835 rows


~~Yikes. That's one hell of a reduction.~~ Fixed!

In [4]:
# Reload the 2018 joined output from disk and summarise its numeric columns.
clean_2018_df = pd.read_parquet("./data/clean_joined_2018.parquet")
# NOTE(review): the summary below shows a YEAR MFR minimum of 0 and an AGE
# maximum of 2018 — presumably missing manufacture years stored as 0; confirm
# and filter before using AGE in any analysis.
clean_2018_df.describe()

Unnamed: 0,DepDelayMinutes,ArrDelayMinutes,YEAR MFR,SPEED,TYPE-ACFT,NO-SEATS,NO-ENG,AGE
count,5438374.0,5422618.0,5502497.0,5502497.0,5502497.0,5502497.0,5502497.0,5502497.0
mean,13.44554,13.66774,2006.321,0.043658,4.997466,136.7629,1.996581,11.67943
std,44.18298,44.00786,16.26092,2.110273,0.05991162,69.88713,0.06132817,16.26092
min,0.0,0.0,0.0,0.0,4.0,1.0,1.0,0.0
25%,0.0,0.0,2002.0,0.0,5.0,88.0,2.0,5.0
50%,0.0,0.0,2005.0,0.0,5.0,140.0,2.0,13.0
75%,7.0,8.0,2013.0,0.0,5.0,179.0,2.0,16.0
max,2625.0,2635.0,2018.0,126.0,6.0,563.0,3.0,2018.0


In [5]:
# Distinct manufacturer names present in the 2018 joined data.
manufacturers = clean_2018_df['MFR'].unique()
# Sorts the array in place so the listing below is ordered.
manufacturers.sort()
# NOTE(review): the displayed values are right-padded with spaces, and
# near-duplicates such as 'EMBRAER' / 'EMBRAER S A' are listed separately —
# presumably fixed-width source fields; consider stripping/normalising MFR
# before grouping or comparing on it.
manufacturers

array(['AIR TRACTOR INC               ', 'AIRBUS                        ',
       'AIRBUS INDUSTRIE              ', 'AMERICAN CHAMPION AIRCRAFT    ',
       'AVEKO SRO                     ', 'BARD JOHN A                   ',
       'BENHAM JOHN                   ', 'BETTS JOHN P                  ',
       'BIGHAM JACK L                 ', 'BOEING                        ',
       'BOMBARDIER INC                ', 'CANADAIR                      ',
       'CESSNA                        ', 'CHRISTOPHER LORD              ',
       'CIRRUS DESIGN CORP            ', 'DASSAULT                      ',
       'DIAMOND AIRCRAFT IND GMBH     ', 'DIAMOND AIRCRAFT IND INC      ',
       'DOUGLAS                       ', 'EADS/ALENIA ATR               ',
       'EMBRAER                       ', 'EMBRAER S A                   ',
       'EMBRAER-EMPRESA BRASILEIRA DE ', 'GLEASON BRENT L               ',
       'GULFSTREAM AEROSPACE          ', 'GULFSTREAM AMERICAN CORP.     ',
       'MCDONNELL DOUGLAS