## Multiclass Prediction

The purpose of this notebook is to predict the cause of a flight delay from the provided flight information.

---

**INPUT**: flights_test_week.csv <br>
**INPUT**: label encoders (Revamp/origin_encoder.pkl, Revamp/dest_encoder.pkl, Revamp/mkt_op_combo.pickle), Revamp/scaler.pkl, pca.pkl and model_reg.pkl <br>
**OUTPUT**: multiclass_submission.csv <br>

---

#### Packages

In [7]:
import pandas as pd
import numpy as np
import pickle 
import datetime
import calendar
import joblib

#### Reading the Data
Create two data frames: <br>
1. `df_model` - transformed and passed to the model for prediction
2. `df_submission` - the skeleton of the output file

In [None]:
# Read the test flights and discard the "Unnamed: 0" column
# (presumably an index column saved by an earlier export - TODO confirm).
df = pd.read_csv("flights_test_week.csv").drop(columns="Unnamed: 0")

In [None]:
# Independent copy holding only the identifying columns that appear in the output file.
df_submission = df.loc[:, ["fl_date", "mkt_carrier", "mkt_carrier_fl_num", "origin", "dest"]].copy()

In [None]:
# Working copy that receives every feature transformation; df itself stays untouched.
df_model = df.copy(deep=True)

#### Prepping the Data
On `df_model` we apply the same preprocessing steps that we applied to our training data.

In [None]:
def get_month(date):
    """Extract the calendar month from a flight date.

    INPUT date string in the form YYYY-MM-DD
    RETURNS month as an int (1 = January ... 12 = December)
    RAISES ValueError when the string does not match YYYY-MM-DD
    """
    return datetime.datetime.strptime(date, "%Y-%m-%d").month

def get_day_of_week(date):
    """Extract the weekday from a flight date.

    INPUT date string in the form YYYY-MM-DD
    RETURNS weekday number as an int, where 0 = Monday and 6 = Sunday
    RAISES ValueError when the string does not match YYYY-MM-DD
    """
    return datetime.datetime.strptime(date, "%Y-%m-%d").weekday()

# Module-wide Sunday-first setting, kept for any other code that relies on it.
# get_week_of_month below builds its own Sunday-first calendar and does not
# depend on this global state.
calendar.setfirstweekday(6)

def get_week_of_month(date):
    """Return the week of the month in which a date falls.

    Weeks run Sunday to Saturday; week 1 is the (possibly partial) week that
    contains the 1st of the month.

    INPUT date in the form YYYY-MM-DD (str() is applied first)
    RETURNS week number as an int, starting at 1
    RAISES IndexError when the day does not exist in that month
    """
    parts = str(date).split("-")
    year, month, day = int(parts[0]), int(parts[1]), int(parts[2])

    # A local Calendar keeps the result independent of calendar.setfirstweekday().
    weeks = calendar.Calendar(firstweekday=calendar.SUNDAY).monthdayscalendar(year, month)
    for week_number, week in enumerate(weeks, start=1):
        if day in week:
            return week_number
    raise IndexError(f"day {day} does not occur in {year}-{month:02d}")

def check_codeshare(branded_codeshare):
    """Flag values that consist of exactly two "_"-separated parts.

    INPUT branded_codeshare string
    RETURNS 1 when the string contains exactly one underscore, otherwise 0
    """
    # Exactly one underscore <=> splitting on "_" yields exactly two pieces.
    return 1 if branded_codeshare.count("_") == 1 else 0

def check_time_block(time):
    """Convert an HHMM clock time into the hour block used as a model feature.

    INPUT time HHMM (int, numeric string, or a one-element numpy array)
    RETURNS block 0-22 for times 0000-2359: the clock hour minus one, with
            hour 0 also mapped to 0 (e.g. 0030 -> 0, 0130 -> 0, 1430 -> 13,
            2359 -> 22)

    The minus-one offset is kept on purpose: the historical delay lookup
    tables in this file are keyed 0-22 and expect this encoding.
    """
    # Unwrap one-element arrays to a plain Python int before the arithmetic.
    if isinstance(time, np.ndarray):
        time = time.astype(int).item()
    time = int(time)

    hour = time // 100
    if hour == 0:
        return 0
    return hour - 1

def define_haul_length(distance):
    """Classify a flight distance into a haul category.

    INPUT distance (in miles)
    RETURNS {0: short haul (< 700), 1: medium haul (< 3000), 2: long haul}
    """
    # Thresholds are checked in ascending order; the first upper bound that
    # exceeds the distance decides the category.
    for category, upper_bound in ((0, 700), (1, 3000)):
        if distance < upper_bound:
            return category
    return 2
    
def mkt_op_combo(mkt_unique, op_unique):
    """Build one key from the marketing and operating carrier codes.

    INPUT strings mkt_unique and op_unique
    RETURNS mkt_unique immediately followed by op_unique (no separator)
    """
    return mkt_unique + op_unique

def flight_type(fl_num):
    """Classify a flight by the number of characters in its flight number.

    INPUT fl_num (converted with str() before measuring its length)
    RETURNS flight_type
            2: premium (fewer than 3 characters)
            1: regular (exactly 3 characters)
            0: regional affiliate, ferry, codeshare, etc. (more than 3)
    """
    digits = len(str(fl_num))
    if digits > 3:
        return 0  # regional affiliate, ferry, codeshare, etc.
    return 2 if digits < 3 else 1  # premium / regular

def get_hist_m_dep_delay(dep_hour):
    """Look up the historical mean dep_delay for an hour block.

    The table is described in the original notes as based on 2018-2019 data.

    INPUT departure hour block (anything int() accepts; table keys are 0-22)
    OUTPUT historical_mean_dep_delay
    RAISES KeyError for a block outside 0-22
    """
    mean_dep_delay_by_hour = {
        0: 6.711716493969764,
        1: 9.675249362096961,
        2: 10.619793205317578,
        3: 6.630692167577413,
        4: -0.4136475056746801,
        5: 0.11429594186434255,
        6: 1.6161646966248562,
        7: 2.9157571077032296,
        8: 4.135963064069914,
        9: 5.26489329589765,
        10: 6.13392546097791,
        11: 7.0181790875984875,
        12: 8.131680643107183,
        13: 9.510919588229374,
        14: 10.304524506138106,
        15: 11.336050018124949,
        16: 12.382205588783819,
        17: 13.536146413960237,
        18: 13.756275088751552,
        19: 13.289175306396258,
        20: 12.634817154337778,
        21: 10.544540416974117,
        22: 7.987786063255336,
    }
    return mean_dep_delay_by_hour[int(dep_hour)]

def get_hist_m_arr_delay(dep_hour):
    """Look up the historical mean arr_delay for an hour block.

    The table is described in the original notes as based on 2018-2019 data.

    INPUT departure hour block (anything int() accepts; table keys are 0-22)
    OUTPUT historical_mean_arr_delay
    RAISES KeyError for a block outside 0-22
    """
    mean_arr_delay_by_hour = {
        0: -1.6642602344148123,
        1: 3.440501043841336,
        2: 4.343870014771049,
        3: 1.3802367941712204,
        4: -5.484170909895949,
        5: -4.606989162142505,
        6: -2.907391092361358,
        7: -1.8630207558845115,
        8: -1.085556598557783,
        9: 0.08580366114366288,
        10: 0.5798795108102116,
        11: 2.115288435274042,
        12: 3.1173279785990315,
        13: 4.805705640109731,
        14: 5.449820220021156,
        15: 6.739257741261025,
        16: 7.6573414085094385,
        17: 8.877695016385436,
        18: 8.378205975005564,
        19: 7.965689735642226,
        20: 6.8922934459978995,
        21: 4.922862682118726,
        22: 0.5166495242993058,
    }
    return mean_arr_delay_by_hour[int(dep_hour)]

def get_hist_med_dep_delay(dep_hour):
    """Look up the historical median dep_delay for an hour block.

    The table is described in the original notes as based on 2018-2019 data.

    INPUT departure hour block (anything int() accepts; table keys are 0-22)
    OUTPUT historical_median_dep_delay
    RAISES KeyError for a block outside 0-22
    """
    median_dep_delay_by_hour = {
        0: -2.0, 1: -2.0, 2: -3.0, 3: -4.0,
        4: -4.0, 5: -4.0, 6: -4.0, 7: -3.0,
        8: -3.0, 9: -3.0, 10: -2.0, 11: -2.0,
        12: -2.0, 13: -1.0, 14: -1.0, 15: -1.0,
        16: -1.0, 17: -1.0, 18: -1.0, 19: -1.0,
        20: -1.0, 21: -1.0, 22: -2.0,
    }
    return median_dep_delay_by_hour[int(dep_hour)]

def get_hist_med_arr_delay(dep_hour):
    """Look up the historical median arr_delay for an hour block.

    The table is described in the original notes as based on 2018-2019 data.

    INPUT departure hour block (anything int() accepts; table keys are 0-22)
    OUTPUT historical_med_arr_delay
    RAISES KeyError for a block outside 0-22
    """
    median_arr_delay_by_hour = {
        0: -9.0, 1: -6.0, 2: -6.0, 3: -7.0,
        4: -9.0, 5: -9.0, 6: -8.0, 7: -8.0,
        8: -7.0, 9: -7.0, 10: -7.0, 11: -6.0,
        12: -6.0, 13: -5.0, 14: -5.0, 15: -4.0,
        16: -4.0, 17: -3.0, 18: -3.0, 19: -3.0,
        20: -4.0, 21: -4.0, 22: -7.0,
    }
    return median_arr_delay_by_hour[int(dep_hour)]

def get_delay_type(carrier_delay, weather_delay, nas_delay, security_delay, late_aircraft_delay):
    """Pick the primary delay cause, i.e. the cause with the largest value.

    When several causes share the largest value, the one listed first wins.

    INPUT carrier_delay, weather_delay, nas_delay, security_delay, late_aircraft_delay
    RETURNS {0 : no delay cause noted (every value equals 0),
            1 : carrier_delay,
            2 : weather_delay,
            3 : NAS_delay,
            4 : security_delay,
            5 : late_aircraft_delay}
    """
    minutes_by_cause = (carrier_delay, weather_delay, nas_delay,
                        security_delay, late_aircraft_delay)

    if all(minutes == 0 for minutes in minutes_by_cause):
        return 0

    # enumerate() numbers the causes 1-5 in argument order; max() keeps the
    # first entry it meets among equal maxima, which gives the tie-break above.
    code, _ = max(enumerate(minutes_by_cause, start=1), key=lambda pair: pair[1])
    return code


In [None]:
# NOTE: new columns are appended in the order written here. That order
# determines the column layout handed to the scaler later on, so do not
# reorder these statements.

# Columns that are dropped before modelling.
df_model.drop(
    columns=[
        'mkt_carrier', 'tail_num', 'op_carrier_fl_num',
        'origin_airport_id', 'dest_airport_id', 'dup',
        'flights',
    ],
    inplace=True,
)

# Calendar features derived from fl_date (month, day of week, week of month);
# the raw date column is removed once they exist.
df_model['fl_month'] = df_model['fl_date'].apply(get_month)
df_model['fl_day_of_week'] = df_model['fl_date'].apply(get_day_of_week)
df_model['fl_week_of_month'] = df_model['fl_date'].apply(get_week_of_month)
df_model.drop(columns='fl_date', inplace=True)

# Marketing + operating carrier pair, built row by row; the two source
# columns are removed afterwards.
df_model['mkt_op_combo'] = df_model.apply(
    lambda row: mkt_op_combo(row.mkt_unique_carrier, row.op_unique_carrier),
    axis=1,
)
df_model.drop(columns=['mkt_unique_carrier', 'op_unique_carrier'], inplace=True)

# Overwrite the existing column with its 0/1 codeshare flag.
df_model.loc[:, "branded_code_share"] = df_model["branded_code_share"].apply(check_codeshare)

# Flight type from the marketing flight number, which is then dropped.
df_model['fl_type'] = df_model["mkt_carrier_fl_num"].apply(flight_type)
df_model.drop(columns='mkt_carrier_fl_num', inplace=True)

# Scheduled departure time -> hour block, followed by the four historical
# delay statistics looked up from that block.
df_model.loc[:, 'crs_dep_time'] = df_model['crs_dep_time'].apply(check_time_block)
df_model['m_hist_dep_delay'] = df_model['crs_dep_time'].apply(get_hist_m_dep_delay)
df_model['med_hist_dep_delay'] = df_model['crs_dep_time'].apply(get_hist_med_dep_delay)
df_model['m_hist_arr_delay'] = df_model['crs_dep_time'].apply(get_hist_m_arr_delay)
df_model['med_hist_arr_delay'] = df_model['crs_dep_time'].apply(get_hist_med_arr_delay)

# Scheduled arrival time -> hour block.
df_model.loc[:, 'crs_arr_time'] = df_model['crs_arr_time'].apply(check_time_block)

`air_time` was kept as a feature during training even though it is not available at prediction time. Due to time limitations, we fill that column with its average value (106.1) so that the pipeline runs and the assignment can be completed; given more time, we would have gone back and corrected this error in the training setup.

In [None]:
# Constant placeholder for air_time, inserted at column position 6.
df_model.insert(loc=6, column="air_time", value=106.1)

In [None]:
# load in encoders
# Each file is opened in a with-block so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('Revamp/origin_encoder.pkl', 'rb') as encoder_file:
    loaded_origin = pickle.load(encoder_file)
with open('Revamp/dest_encoder.pkl', 'rb') as encoder_file:
    loaded_dest = pickle.load(encoder_file)
with open('Revamp/mkt_op_combo.pickle', 'rb') as encoder_file:
    loaded_mktopcombo = pickle.load(encoder_file)

# Replace the three categorical columns with the loaded encoders' output.
df_model.loc[:,"origin"] = loaded_origin.transform(df_model.origin)
df_model.loc[:,"dest"] = loaded_dest.transform(df_model.dest)
df_model.loc[:,"mkt_op_combo"] = loaded_mktopcombo.transform(df_model.mkt_op_combo)

#### Scaling the Data
We must scale our features using the same scaler that was used on our training data.

In [None]:
# load in the scaler
# The with-block closes the file handle as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('Revamp/scaler.pkl', 'rb') as scaler_file:
    loaded_scale = pickle.load(scaler_file)

# Column order of df_model must match what the scaler saw when it was fitted.
X_scaled = loaded_scale.transform(df_model)

#### Dimensionality Reduction
We must reduce our features using the same PCA that was fitted on our training data.

In [None]:
data_values = X_scaled.copy()
#load in the pca
# The with-block closes the file handle as soon as the object is loaded.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('pca.pkl', 'rb') as pca_file:
    loaded_pca = pickle.load(pca_file)

# Projection produced by the loaded object's own transform().
component = loaded_pca.transform(X_scaled)

# Manual projection of the scaled data onto the transposed component matrix.
# NOTE(review): if loaded_pca is a scikit-learn PCA, transform() centers the
# data with its stored mean before projecting, so `component` and
# `reduced_feat` can differ - confirm which one the model was trained on.
w_transpose = np.transpose(loaded_pca.components_)
reduced_feat = np.matmul(data_values, w_transpose)

#### Running our Model
now we will run our trained model 

In [None]:
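# TODO: load the trained model and compute y_pred here - this cell is empty, so the y_pred used below is never defined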

#### Prepping the Submission File
We merge `y_pred` with `df_submission` to build the output CSV.

In [None]:
# Wrap the predictions in a one-column frame. `columns` has to be list-like;
# a bare string makes the DataFrame constructor raise a TypeError.
y_col = pd.DataFrame(data=y_pred,
                     columns=["predicted_delay_type"])

In [5]:
def define_delay_type(delay):
    """Translate a numeric delay-cause code into its text label.

    INPUT delay code
    RETURNS label string for codes 0-4; every other value maps to
            "late_aircraft_delay"
    """
    labelled_codes = (
        (0, "unknown or unreported cause"),
        (1, "carrier_delay"),
        (2, "weather_delay"),
        (3, "nas_delay"),
        (4, "security_delay"),
    )
    for code, label in labelled_codes:
        if delay == code:
            return label
    return "late_aircraft_delay"

In [None]:
# Replace the numeric class codes with their text labels. Plain column
# assignment is used because `y_col[:, "col"] = ...` is not valid DataFrame
# indexing, and because the column changes from numbers to strings.
y_col["predicted_delay_type"] = y_col["predicted_delay_type"].apply(define_delay_type)

In [None]:
# Attach the predicted labels to the identifying columns, matching rows by index.
output = pd.merge(df_submission, y_col, left_index=True, right_index=True)

In [None]:
# Write the submission file (the index is written as the first column).
output.to_csv(path_or_buf="multiclass_submission.csv")