In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.metrics import r2_score

In [None]:
# Helper functions for data cleaning

def diff(a, b):
    """Return ``a`` minus ``b`` for any operands that support the ``-`` operator."""
    difference = a - b
    return difference

def drop_col(df):
    """Return a new frame without the columns listed below.

    The input frame is not modified. Every listed column must be present;
    ``DataFrame.drop`` raises ``KeyError`` for a missing label by default.
    """
    unused_columns = [
        'OP_CARRIER_FL_NUM',
        'TAXI_OUT',
        'WHEELS_OFF',
        'WHEELS_ON',
        'TAXI_IN',
        'CANCELLATION_CODE',
        'DISTANCE',
        'CARRIER_DELAY',
        'WEATHER_DELAY',
        'NAS_DELAY',
        'SECURITY_DELAY',
        'LATE_AIRCRAFT_DELAY',
        'Unnamed: 27',
    ]
    return df.drop(columns=unused_columns)

def remove_na(df):
    """Return only the rows whose ``ACTUAL_ELAPSED_TIME`` is not missing.

    The original index labels of the surviving rows are kept.
    """
    has_elapsed_time = df['ACTUAL_ELAPSED_TIME'].notna()
    return df.loc[has_elapsed_time]

def total_delay(df):
    """Add a ``TOTAL_DELAY`` column and return the same frame.

    ``TOTAL_DELAY`` is ``ACTUAL_ELAPSED_TIME - CRS_ELAPSED_TIME`` for each row
    (CRS_ELAPSED_TIME is presumably the scheduled duration -- confirm against
    the data dictionary). A missing value in either column gives a missing
    result.

    The column is written onto ``df`` itself, so the caller's frame is
    modified in place.
    """
    # Whole-column subtraction instead of a per-row ``apply``: same values,
    # far cheaper on large frames, and it also works on an empty frame.
    df['TOTAL_DELAY'] = df['ACTUAL_ELAPSED_TIME'] - df['CRS_ELAPSED_TIME']
    return df

def to_dt(df):
    """Convert the ``FL_DATE`` column to pandas datetimes and return ``df``.

    The column is overwritten on the frame that was passed in.
    """
    parsed_dates = pd.to_datetime(df["FL_DATE"])
    df["FL_DATE"] = parsed_dates
    return df

def to_days(df):
    """Add ``FL_DATE_DAYS``, the day-of-year of ``FL_DATE``, and return ``df``.

    ``FL_DATE`` must already be a datetime column (the ``.dt`` accessor is
    used). The new column is written onto the frame that was passed in.
    """
    day_of_year = df['FL_DATE'].dt.dayofyear
    df["FL_DATE_DAYS"] = day_of_year
    return df

# Run on the year files that cover a leap year.
def clean_leapyear_data(df):
    """Clean one leap-year file and return the cleaned frame.

    Steps, in order: drop the unused columns, drop rows without
    ``ACTUAL_ELAPSED_TIME``, add ``TOTAL_DELAY``, parse ``FL_DATE``, remove
    the Feb 29 rows, move every remaining date back one calendar year, and
    finally add ``FL_DATE_DAYS``.

    NOTE(review): the one-year shift presumably exists so that day-of-year
    values line up with the non-leap years (with Feb 29 gone and a non-leap
    year in the date, ``FL_DATE_DAYS`` tops out at 365). It also means
    ``FL_DATE`` no longer carries the true year of the flight -- confirm that
    nothing downstream groups or filters on the year of ``FL_DATE``.
    """
    df = drop_col(df)
    df = remove_na(df)
    df = total_delay(df)
    df = to_dt(df)

    # Boolean-mask selection returns a new frame, so the column assignment
    # below does not touch the frame produced by the earlier steps.
    is_feb_29 = (df["FL_DATE"].dt.month == 2) & (df["FL_DATE"].dt.day == 29)
    df = df[~is_feb_29]

    # Vectorised year shift. With Feb 29 already removed, subtracting one
    # calendar year gives the same month/day as Timestamp.replace(year=y - 1)
    # without a Python call per row, and keeps the datetime dtype even when
    # the frame is empty.
    df["FL_DATE"] = df["FL_DATE"] - pd.DateOffset(years=1)

    df = to_days(df)
    return df

# Run on every other (non-leap-year) file.
def clean_data(df):
    """Clean one year file and return the cleaned frame.

    Applies, in order: ``drop_col``, ``remove_na``, ``total_delay``,
    ``to_dt`` and ``to_days``.
    """
    return (
        df.pipe(drop_col)
        .pipe(remove_na)
        .pipe(total_delay)
        .pipe(to_dt)
        .pipe(to_days)
    )

In [None]:
# Import data sets: one CSV per year file, read from the working directory.
# The order of the paths must match the order of the names they are bound to.
csv_paths = [
    "./df09.csv",
    "./df10.csv",
    "./df11.csv",
    "./df12.csv",
    "./df13.csv",
    "./df14.csv",
    "./df15.csv",
    "./df16.csv",
    "./df17.csv",
    "./df18.csv",
]
(
    df09, df10, df11, df12, df13,
    df14, df15, df16, df17, df18,
) = [pd.read_csv(path) for path in csv_paths]

In [None]:
# Stack the ten year frames into one frame with a fresh 0..n-1 index.
yearly_frames = [df09, df10, df11, df12, df13, df14, df15, df16, df17, df18]
df = pd.concat(yearly_frames, axis=0, ignore_index=True)

# Keep only the rows of the selected carriers. Despite its name, df6 holds
# the four carrier codes listed here.
selected_carriers = ['WN', 'DL', 'AA', 'UA']
df6 = df[df["OP_CARRIER"].isin(selected_carriers)]

In [None]:
# Split the carrier subset into one frame per carrier code.
# Each mask is built from df6 itself, the frame being indexed. Building it
# from the full `df` only worked because .loc re-aligned the longer mask by
# index label; it scanned the whole data set once per carrier and would break
# or mis-select if `df` were ever rebound or re-indexed.
dfWN = df6.loc[df6["OP_CARRIER"] == "WN"]
dfDL = df6.loc[df6["OP_CARRIER"] == "DL"]
dfAA = df6.loc[df6["OP_CARRIER"] == "AA"]
dfUA = df6.loc[df6["OP_CARRIER"] == "UA"]