<a href="https://colab.research.google.com/github/HeatherDriver/IU-Model-Engineering/blob/main/05_ClassFn_for_Test_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages this notebook imports below.
# NOTE(review): only pandas carries a version constraint; the other installs are unpinned,
# so a fresh runtime may resolve different versions than the ones this notebook was written against.
! pip install ordered_set
! pip install wandb
! pip install category_encoders
! pip install "pandas<2.0.0"
! pip install -U kaleido

Collecting ordered_set
  Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)
Installing collected packages: ordered_set
Successfully installed ordered_set-4.1.0
Collecting wandb
  Downloading wandb-0.17.1-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.5.1-py2.py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.6/289.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
from ordered_set import OrderedSet
import datetime as dt
import warnings
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing as pp
import pickle
import joblib
from sklearn.preprocessing import OneHotEncoder
from itertools import product
from sklearn.model_selection import KFold, ParameterGrid, GridSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
import wandb
import joblib
from xgboost import XGBRegressor
import json
import os
import ast
import category_encoders
import torch
from sklearn.multioutput import MultiOutputRegressor
from itertools import zip_longest

In [3]:
# Mount Google Drive (Colab only) and make the processed-data folder the working directory.
# The relative open() calls in DataPrep ('onehot_encoder.pkl', 'target_encoded.pkl',
# 'minmax_scaler.pkl') resolve against this working directory.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/01_Data/02_Processed'

# Folder constants. `pkl_path` is read as a module-level global by
# FlightPrep._loads_pkl and FlightPrep._sgd_model.
data_path = '/content/drive/MyDrive/01_Data/02_Processed/'
pkl_path = '/content/drive/MyDrive/01_Data/03_Modelling/'
results_path = '/content/drive/MyDrive/01_Data/03_Modelling/'
images_folder = '/content/drive/MyDrive/02_Docs'

Mounted at /content/drive
/content/drive/MyDrive/01_Data/02_Processed


In [4]:
# Display all columns and rows, and never truncate the contents of a cell
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Switch off pandas' chained-assignment (SettingWithCopy) check
pd.options.mode.chained_assignment = None

# Suppress all Python warnings and set the seaborn colour palette
warnings.filterwarnings('ignore')
sns.set(palette="Dark2")

In [5]:
class FlightPrep:
    """The Flight_prep class represents all the loading and transformation steps carried out with the flight data prior to running the model.
    """
    # Initiates new constructor
    def __init__(self, data_dir, flight_pkl):
        """Store the constructor arguments on the instance.

        Args:
            data_dir: Kept as ``self.data_dir``; none of the FlightPrep methods read it
                (pickles are located through the module-level ``pkl_path`` instead).
            flight_pkl: File name of the flight pickle, later passed to ``_loads_pkl``.
        """
        self.data_dir = data_dir
        self.flight_pkl = flight_pkl

    #### Start of helper load functions ####
    def _loads_pkl(self, pickle_file_name):
        """Unpickle and return the object stored at ``pkl_path + pickle_file_name``.

        ``pkl_path`` is the module-level folder constant; the file name is appended to it
        verbatim, so the constant has to end with a path separator.

        NOTE: ``pickle.load`` can execute arbitrary code - only load trusted files.
        """
        full_path = pkl_path + pickle_file_name
        with open(full_path, 'rb') as handle:
            return pickle.load(handle)

    def _flight_cleanse(self, df):
        # Drop 'dep_sched_date', 'dep_sched_time', 'arr_sched_date', 'arr_sched_time' because data in 'dep_sched_datetime' and 'arr_sched_datetime'
        # respectively. Drop 'fn_carrier' as the information is redundant
        df.drop(['fn_carrier', 'dep_sched_date', 'dep_sched_time', 'arr_sched_date', 'arr_sched_time'], axis=1, inplace=True)
        df['leg_no'] = df['leg_no'].astype('int')

        # Sort by ac_reg, then by dep_sched_datetime for window calculations per ac_reg over a timeframe
        df = df.sort_values(['ac_registration', 'dep_sched_datetime'], ascending = [True, True]).reset_index(drop=True)
        return df

    def _sgd_model(self, X):
        """Append the predictions of the stored SGD pipeline to ``X`` as column 'y_pred'.

        The pipeline is read from ``pkl_path + 'sgd_pipeline.pkl'`` on every call.

        Args:
            X: Feature frame handed to the pipeline's ``predict``.

        Returns:
            ``X`` itself, with the extra 'y_pred' column.
        """
        model_file = pkl_path + 'sgd_pipeline.pkl'
        pipeline = joblib.load(model_file)
        X['y_pred'] = pipeline.predict(X)
        return X

    def _arr_delay_m_onblockdt_imputed(self, df):
        # Evaluate the time difference in minutes between 'arr_sched_time' and 'm_onblockdt' (expected arriving time versus actual arriving time)
        # This is effectively the 'arrival delay' (we already have the 'departure delay')
        df['arr_delay_imputed'] = df['m_onblockdt'] - df['arr_sched_datetime']
        df['arr_delay_imputed'] = df['arr_delay_imputed'] / np.timedelta64(1, 'm')

        # Add column indicating if 'arr_delay_imputed' is null.
        df['arr_delay_imputed_null'] = df['m_onblockdt'].isna().astype('int')

        # Get predictions
        # X = df[['change_reason_code', 'Ac_Type_Code', 'day_of_week', 'trans_time']]
        X = df[df['arr_delay_imputed_null'] == 1][['change_reason_code', 'Ac_Type_Code', 'day_of_week', 'trans_time']]
        mydf2 = self._sgd_model(X)
        mydf2 = mydf2.reset_index(drop=False)

        # Replace the 'arr_delay_imputed' with y_pred where necessary.
        mydict = dict()
        for index, row in df.iterrows():
            if index in mydf2['index'].to_list():
               mydict.update({index : mydf2.loc[mydf2['index'] == index]['y_pred'].values[0]})
            else:
                mydict.update({index : row.arr_delay_imputed})

        df['arr_delay_imputed'] = df.index.map(mydict)
        df['arr_delay_imputed'] = pd.to_timedelta(df['arr_delay_imputed'], unit='m')

        # Reverse calculate the m_onblockdt
        df['m_onblockdt_imputed'] = df['arr_delay_imputed'] + df['arr_sched_datetime']

        # Re-calculate the 'arr_delay_imputed' to use the 'm_onblockdt_imputed' column instead of the 'm_onblockdt'.
        df['arr_delay_imputed'] = df['m_onblockdt_imputed'] - df['arr_sched_datetime']
        df['arr_delay_imputed'] = df['arr_delay_imputed'] / np.timedelta64(1, 'm')
        df['arr_delay_imputed'] = df['arr_delay_imputed'].astype('int')

        return df

    #### End of helper load functions ####

    def _prepare_loaded_data(self):
        flight_file = self._loads_pkl(self.flight_pkl)
        flight = self._flight_cleanse(flight_file)
        flight = self._arr_delay_m_onblockdt_imputed(flight)
        return flight


    #### Start of cleaning functions ####

    def _creates_duration_col(self, df, start_col, end_col):
        # creates duration calculation, this is the difference in time for the arrival versus the departure.
        df2 = df.copy(deep=True)
        if start_col == 'dep_sched_time' :
            # Convert 'dep_sched_date' and 'arr_sched_date' to datetime if not already
            df2[start_col] = pd.to_datetime(df2['dep_sched_date'].astype('str') + ' ' + df2['dep_sched_time'].astype('str'), format="%Y-%m-%d %H:%M")
        if end_col == 'arr_sched_time':
            df2[end_col] = pd.to_datetime(df2['arr_sched_date'].astype('str') + ' ' + df2['arr_sched_time'].astype('str'), format="%Y-%m-%d %H:%M")

        if df2[start_col].dtype != 'datetime64[ns]':
            df2[start_col] = pd.to_datetime(df2[start_col])
        if df2[end_col].dtype != 'datetime64[ns]':
            df2[end_col] = pd.to_datetime(df2[end_col])

        duration = (df2[end_col] -  df2[start_col])/ np.timedelta64(1, 'm')
        return duration.values

    def _TLC_trans_to_crew(self, df):
        mylist = []
        to_replace = ["_nan", "Mrs. ", "Dr. ", "Mr. ", "Miss ", "MD", "PhD", "DDS", "DVM", "Jr.", "Ms.", "II", "_\d+", "_-\d+", "]", "[", " '", "'"]
        TLC_trans_cleaned_split = df['TLC_trans'].array

        for split in TLC_trans_cleaned_split:
            split_string = str(split)
            for item in to_replace:
                try:
                    split_string = re.sub(item, "", split_string)
                except:
                    split_string = split_string.replace(item, "")
            newlist = split_string.split(',')
            newlist.sort()
            mylist.append(newlist)

        df['TLC_trans_pos'] = mylist

        mylist = []
        for reg in OrderedSet(df['ac_registration'].values):
            subset = df.loc[df['ac_registration'] == reg][['TLC_trans_pos']]
            subset_ord = subset.iloc[:-1].reset_index(drop=True)
            subset_lag = subset.iloc[1:].reset_index(drop=True)
            maximum = max(subset_ord.index)
            for x, y, idx in zip(subset_ord.values, subset_lag.values, subset_ord.index):
                diff = set(x.tolist()[0]).difference(y.tolist()[0])
                if diff:
                    z = re.findall("_[a-z][a-z]",str(diff))
                    z = re.sub("_","", str(z))
                    mylist.append((idx, z))
                else:
                    mylist.append((idx, "[]"))
                if idx == maximum:
                    mylist.append((idx + 1, "[]"))

        mylist = [ii for i, ii in mylist]

        mylist_i = []
        for string_line in mylist:
            string_line = re.sub("\[","", string_line)
            string_line = re.sub("\]", "", string_line)
            string_line = re.sub("\'", "", string_line)
            string_line = re.sub("\,", "", string_line)
            string_line = string_line.strip()
            new_list = y = string_line.split()
            new_list.sort()
            mylist_i.append(new_list)

        df['crew_type_change_imputed'] = mylist_i
        df.drop(['crew_type_change', 'TLC_trans'], axis=1, inplace=True)
        return df

    def _crew_group_imputed(self, df):
        mylist = []

        for reg in OrderedSet(df['ac_registration'].values):
            subset = df.loc[df['ac_registration'] == reg][['Crew_Group', 'crew_type_change_imputed']]
            subset_ord = subset.iloc[:-1].reset_index(drop=True)['crew_type_change_imputed']
            mylist_ = ['start']
            for val, idx in zip(subset_ord, subset_ord.index):
                if len(val) == 0:
                    crew = 'no change'
                else:
                    crew = 'change'
                mylist_.extend([crew])
            mylist.append(mylist_)

        mylist = [i for item in mylist for i in item]
        df['Crew_Group_imputed'] = mylist
        df['change_reason_code_imputed'] = df['change_reason_code'].where(df['dep_delay'] != 0, 'no reason')
        return df

    def _calculates_time_diff_to_line_below(self, df, lagged_col, normal_col, new_col_name):
        mydict = dict()
        df = df.sort_values(['ac_registration', 'dep_sched_datetime'], ascending = [True, True]).reset_index(drop=True)
        for reg in OrderedSet(df['ac_registration'].values):
            subset = df.loc[df['ac_registration'] == reg][[lagged_col, normal_col, 'last_flight_indicator']].reset_index(drop=False)
            subset_ord = subset.iloc[:-1].reset_index(drop=True)[normal_col]
            subset_lag = subset.iloc[1:].reset_index(drop=True)[lagged_col]
            subset['diff'] = (subset_lag - subset_ord) / np.timedelta64(1, 'm')
            subset['diff'] = subset['diff'].fillna(0)
            subset.loc[subset['last_flight_indicator'] == 1, 'diff'] = 0
            mydict.update({index: value for (index, value) in zip(subset['index'].values, subset['diff'].values)})
        df[new_col_name] =  df.index.map(mydict)
        df[new_col_name] = df[new_col_name].astype('int')
        return df

    #### End of helper cleaning functions ####

    def _clean_data(self, df):
        # Recalculate actual_duration to use imputed values
        df['actual_duration'] = self._creates_duration_col(df, 'm_offblockdt', 'm_onblockdt_imputed')
        df['actual_duration'] = np.round(df['actual_duration'],0)
        df = self._TLC_trans_to_crew(df)
        df = self._crew_group_imputed(df)
        df = self._calculates_time_diff_to_line_below(df, 'dep_sched_datetime', 'arr_sched_datetime', 'Sched_Groundtime_imputed')
        df = self._calculates_time_diff_to_line_below(df, 'm_offblockdt', 'm_onblockdt_imputed', 'Act_Groundtime_imputed')
        return df

    def create_flight_df(self):
        dataframe = self._prepare_loaded_data()
        dataframe = self._clean_data(dataframe)
        return dataframe

In [6]:
class GroundPrep(FlightPrep):
    """The Ground_prep class represents all the loading and transformation steps applied to the ground data prior to running the model.
        Inherited functions from Flight_prep
    """
    # Initiates constructor for inheritance
    def __init__(self, data_dir, flight_pkl, ground_pkl):
        """Initialise the FlightPrep part and remember the ground pickle file name.

        Args:
            data_dir: Forwarded to ``FlightPrep.__init__``.
            flight_pkl: File name of the flight pickle, forwarded to ``FlightPrep.__init__``.
            ground_pkl: File name of the ground pickle, loaded in ``create_ground_df``.
        """
        super().__init__(data_dir, flight_pkl)
        self.ground_pkl = ground_pkl

    def _replaces_blank_arr_leg_outbound(self, df, flight_lookup):
        """Fill blank 'arr_leg_outbound' values in the ground frame from the flight frame.

        For every ground row without 'arr_leg_outbound', the matching inbound flight is
        located in ``flight_lookup`` (same date, flight number, departure and arrival airport
        and aircraft). The 'arr_ap_sched' of the flight stored directly after it (original
        index + 1) is used as the outbound destination.
        NOTE(review): this presumes ``flight_lookup`` stores consecutive legs of an aircraft
        on consecutive index values - confirm against the source file.

        Each looked-up value is written to the row it was looked up for. Rows for which no
        inbound flight, or no following flight, exists are left blank.

        Args:
            df: Ground frame.
            flight_lookup: Flight frame used as lookup; the caller's object is not modified.

        Returns:
            The sorted ground frame with an extra 'index' column (its row order after
            sorting) and 'arr_leg_outbound' filled where possible.
        """
        df = df.sort_values(['ac_registration', 'day_of_origin', 'sched_inbound_dep'], ascending = [True, True, True]).reset_index(drop=True)
        df['day_of_origin'] = pd.to_datetime(df['day_of_origin'])

        # Keep each frame's index as an 'index' column; for the flight frame it identifies
        # the row that follows a matched inbound flight.
        df, flight_lookup = df.reset_index(drop=False), flight_lookup.reset_index(drop=False)

        destination_by_label = dict(zip(flight_lookup['index'], flight_lookup['arr_ap_sched']))
        flight_dates = flight_lookup['dep_sched_datetime'].dt.date  # hoisted: identical for every row
        target_col = df.columns.get_loc('arr_leg_outbound')

        blanks = df.loc[df['arr_leg_outbound'].isna()]
        for position, row in blanks.iterrows():
            inbound = flight_lookup.loc[(flight_dates == row.day_of_origin.date()) &
            (flight_lookup['fn_number'] == row.fn_number) & (flight_lookup['dep_ap_sched'] == row.dep_leg_inbound) &
            (flight_lookup['arr_ap_sched'] == row.arr_leg_inbound) & (flight_lookup['ac_registration'] == row.ac_registration)]
            if inbound.empty:
                continue
            following_label = inbound['index'].values[0] + 1
            if following_label in destination_by_label:
                # `df` carries a fresh 0..n-1 index here, so the label is also the position.
                df.iloc[position, target_col] = destination_by_label[following_label]
        return df

    def _mingt_mode_imputer(self, df):
        """Add 'mingt_mode_imputed': the most frequent 'mingt' of each 'ac_registration'.

        When several values tie, the first one returned by ``Series.mode`` is used; a
        registration without any 'mingt' value gets NaN.

        Args:
            df: Frame with 'ac_registration' and 'mingt'.

        Returns:
            The same frame with the 'mingt_mode_imputed' column.
        """
        def _most_frequent(values):
            modes = values.mode()
            return np.nan if modes.empty else modes.iloc[0]

        mode_per_reg = df.groupby('ac_registration')['mingt'].apply(_most_frequent).reset_index()
        lookup = dict(zip(mode_per_reg['ac_registration'], mode_per_reg['mingt']))
        df['mingt_mode_imputed'] = df['ac_registration'].map(lookup)
        return df

    def _clean_dedupe_data(self, df, flight_lookup):
        """Clean the ground frame: drop unused columns, fill blanks and impute 'mingt'.

        Args:
            df: Raw ground frame (columns are dropped from it in place).
            flight_lookup: Flight frame used to fill blank 'arr_leg_outbound' values; an
                'index' column, if present, is dropped from it in place.

        Returns:
            The cleaned ground frame without an 'index' column.
        """
        # 'cleaning_duration' holds data in only 3 rows, so it is dropped entirely
        # (together with 'pax_boarding_duration').
        df.drop(columns=['cleaning_duration', 'pax_boarding_duration'], inplace=True)

        # Fill blank outbound destinations, then add the per-aircraft mode of 'mingt'.
        df = self._mingt_mode_imputer(self._replaces_blank_arr_leg_outbound(df, flight_lookup))

        # Remove helper index columns if present.
        for frame in (df, flight_lookup):
            if 'index' in frame.columns:
                frame.drop(columns=['index'], inplace=True)

        return df

    def _calcs_duration_between_lines(self, df, col1, col2, name_new_col):
        """Minutes from this row's ``col2`` to the next row's ``col1``, per aircraft.

        The last row of every 'ac_registration' gets 0. Values are written back
        positionally, registration by registration in order of first appearance, so the rows
        of each registration must be contiguous in ``df``.

        Args:
            df: Frame with 'ac_registration', ``col1`` and ``col2``.
            col1: Column read from the following row.
            col2: Column read from the current row.
            name_new_col: Name of the column to create.

        Returns:
            The same frame with ``name_new_col``.
        """
        durations = []
        for reg in OrderedSet(df['ac_registration'].values):
            legs = df.loc[df['ac_registration'] == reg, [col1, col2]]
            next_start = pd.to_datetime(legs[col1].iloc[1:].reset_index(drop=True))
            current_end = pd.to_datetime(legs[col2].iloc[:-1].reset_index(drop=True))
            gap = (next_start - current_end) / np.timedelta64(1, 'm')
            durations.extend(gap.to_list())
            # Nothing follows the last row of an aircraft.
            durations.append(0)

        df[name_new_col] = durations
        return df

    def _combine_ground_vertically(self, df, flight_lookup):
        """Stack the inbound and outbound half of every ground row into one long frame.

        Each ground row describes an inbound and an outbound leg side by side. Both halves
        are renamed to a common schema, concatenated, sorted per aircraft by scheduled
        departure and extended with 'taround_imputed' and 'fn_number_flight' (the flight
        number ``flight_lookup`` holds for the leg number).

        Args:
            df: Cleaned ground frame.
            flight_lookup: Flight frame; its 'leg_no' column is cast to int in place.

        Returns:
            The combined frame with normalised 'Ac_Type_Code' values.
        """
        shared_cols = ['day_of_origin', 'ac_type', 'fn_number', 'ac_registration']

        # Inbound half: dep_leg_inbound -> arr_leg_inbound.
        inbound = df[shared_cols + ['dep_leg_inbound', 'arr_leg_inbound', 'sched_inbound_dep',
                                    'sched_inbound_arr', 'leg_inbound', 'mingt_mode_imputed']].rename(columns={
            'leg_inbound': 'leg_no',
            'dep_leg_inbound': 'dep_ap_sched',
            'arr_leg_inbound': 'arr_ap_sched',
            'sched_inbound_dep': 'dep_sched_datetime',
            'sched_inbound_arr': 'arr_sched_datetime',
            'ac_type': 'Ac_Type_Code',
        })

        # Outbound half: arr_leg_inbound -> arr_leg_outbound.
        outbound = df[shared_cols + ['arr_leg_inbound', 'arr_leg_outbound', 'sched_outbound_dep',
                                     'sched_outbound_arr', 'leg_outbound', 'mingt_mode_imputed']].rename(columns={
            'leg_outbound': 'leg_no',
            'arr_leg_inbound': 'dep_ap_sched',
            'arr_leg_outbound': 'arr_ap_sched',
            'sched_outbound_dep': 'dep_sched_datetime',
            'sched_outbound_arr': 'arr_sched_datetime',
            'ac_type': 'Ac_Type_Code',
        })

        base_cols = ['day_of_origin', 'leg_no', 'fn_number', 'ac_registration', 'dep_ap_sched', 'arr_ap_sched',
                     'dep_sched_datetime', 'arr_sched_datetime', 'Ac_Type_Code', 'mingt_mode_imputed']
        df_composite = pd.concat([inbound, outbound])[base_cols]
        df_composite = df_composite.sort_values(['ac_registration', 'dep_sched_datetime'], ascending = [True, True]).reset_index(drop=True)
        df_composite = self._calcs_duration_between_lines(df_composite, 'dep_sched_datetime', 'arr_sched_datetime', 'taround_imputed')

        # Mapping the correct leg_nos since there are duplicates due to the vertical combination
        df_composite['leg_no'] = df_composite['leg_no'].astype('int')
        flight_lookup['leg_no'] = flight_lookup['leg_no'].astype('int')
        fn_by_leg = dict(zip(flight_lookup['leg_no'].values, flight_lookup['fn_number'].values))
        df_composite['fn_number_flight'] = df_composite['leg_no'].map(fn_by_leg)

        df_composite = df_composite[['day_of_origin', 'leg_no', 'fn_number', 'fn_number_flight', 'ac_registration', 'dep_ap_sched',
                  'arr_ap_sched', 'dep_sched_datetime', 'arr_sched_datetime', 'Ac_Type_Code', 'mingt_mode_imputed', 'taround_imputed']]

        # Since DHX is DH4 and 319, 321 and 322 are 320, these will be replaced in the ground file.
        is_320_family = df_composite['Ac_Type_Code'].isin(['319', '321', '322'])
        df_composite.loc[is_320_family, ['Ac_Type_Code']] = '320'
        df_composite.loc[df_composite['Ac_Type_Code'] == 'DHX', ['Ac_Type_Code']] = 'DH4'
        return df_composite

    #### End of prepare loaded data calcs ####

    def create_ground_df(self):
        """Load the ground and flight pickles and build the vertically combined ground frame.

        Returns:
            The frame produced by ``_combine_ground_vertically``.
        """
        raw_ground = super()._loads_pkl(self.ground_pkl)
        raw_flights = super()._loads_pkl(self.flight_pkl)
        cleaned_ground = self._clean_dedupe_data(raw_ground, raw_flights)
        return self._combine_ground_vertically(cleaned_ground, raw_flights)


In [7]:
class DataPrep(GroundPrep):
    """The Data prep class finally combines the flight and ground data prior to running the model.
        Inherited functions from GroundPrep and FlightPrep
    """
    # Initiates constructor for inheritance
    def __init__(self, data_dir, flight_pkl, ground_pkl):
        """Forward all arguments unchanged to ``GroundPrep.__init__``."""
        super().__init__(data_dir, flight_pkl, ground_pkl)

    def _combine_files(self):
        """Left-join the ground frame onto the flight frame and keep the modelling columns.

        Only 'mingt_mode_imputed' is taken from the ground side; every other kept column is
        the flight-side ('_x') version. 'mingt_mode_imputed' is then recomputed as the
        per-aircraft mode so that flights without a ground match are filled too, and is
        hard-set to 35 for aircraft 'ECLGEX'.

        NOTE(review): the ground frame is built from two stacked halves, so a 'leg_no' may
        occur more than once there - verify that the left join cannot multiply flight rows.

        Returns:
            The merged frame.
        """
        flights = super().create_flight_df()
        ground = super().create_ground_df()
        merged = flights.merge(ground, how='left', left_on='leg_no', right_on='leg_no')

        keep_cols = ['day_of_origin_x', 'leg_no', 'fn_number_x', 'ac_registration_x', 'dep_ap_sched_x', 'arr_ap_sched_x',
                     'dep_sched_datetime_x', 'arr_sched_datetime_x', 'm_offblockdt', 'm_onblockdt', 'change_reason_code',
                     'dep_delay', 'arr_delay_imputed', 'arr_delay_imputed_null', 'Ac_Type_Code_x', 'trans_time',
                     'sched_trans_time', 'Crew_Group', 'Sched_Groundtime', 'Act_Groundtime', 'm_onblockdt_imputed',
                     'TLC_trans_pos', 'last_flight_indicator', 'first_flight_indicator', 'sched_duration',
                     'actual_duration', 'crew_type_change_imputed', 'Sched_Groundtime_imputed', 'Act_Groundtime_imputed',
                     'Crew_Group_imputed', 'change_reason_code_imputed', 'mingt_mode_imputed', 'day_of_week']
        merged = merged[keep_cols]

        # Strip the merge suffix from the flight-side columns.
        merged = merged.rename(columns={col: col[:-2] for col in keep_cols if col.endswith('_x')})

        # mingt_mode_imputed can be recalculated for empty cells - first rename before running function.
        merged = merged.rename(columns={'mingt_mode_imputed': 'mingt'})
        merged = super()._mingt_mode_imputer(merged)

        merged.loc[merged['ac_registration'] == 'ECLGEX', 'mingt_mode_imputed'] = 35
        merged.drop(columns=['mingt', 'Sched_Groundtime', 'm_onblockdt', 'Act_Groundtime'], inplace=True)
        return merged

    def _creates_binned_act_duration(self, df):
        """Add a direction-independent 'route' label and a binned mean duration per route.

        'route' is a "<airport> : <airport>" string shared by both flight directions between
        the same two airports. 'duration_bin' is the string label of the equal-width bin
        (8 bins over the rounded mean 'actual_duration' per route); bins 6 and 7 are merged
        into '6+', and rows without a bin get 'NaN'.

        Args:
            df: Frame with 'dep_ap_sched', 'arr_ap_sched' and 'actual_duration'.

        Returns:
            The same frame with 'route' and 'duration_bin'.
        """
        # Register every label under both orientations of the airport pair; later rows
        # overwrite earlier ones, so both directions end up with the same label.
        route_label = {}
        labels = df['dep_ap_sched'] + ' : ' + df['arr_ap_sched']
        for origin, destination, label in zip(df['dep_ap_sched'], df['arr_ap_sched'], labels):
            route_label[(origin, destination)] = label
            route_label[(destination, origin)] = label

        # Look every row's (departure, arrival) pair up in that table.
        df['route'] = df.apply(lambda row: (row['dep_ap_sched'], row['arr_ap_sched']), axis=1)
        df['route'] = df['route'].map(route_label)

        # Mean duration per route, rounded to whole minutes and cut into 8 equal-width bins.
        mean_by_route = df[['route', 'actual_duration']].groupby('route').agg('mean').reset_index()
        mean_by_route['actual_duration'] = np.round(mean_by_route['actual_duration'], 0).astype('int')
        mean_by_route['duration_bin'] = pd.cut(mean_by_route['actual_duration'].values, bins=8, labels=False, retbins=False).tolist()
        mean_by_route['duration_bin'] = mean_by_route['duration_bin'].astype('int')

        # Add these mappings of duration_bins to the merged dataset.
        bin_by_route = dict(zip(mean_by_route['route'].to_list(), mean_by_route['duration_bin'].to_list()))
        df['duration_bin'] = df['route'].map(bin_by_route)

        def _bin_label(value):
            if value >= 6:
                return '6+'
            if value < 6:
                return str(int(value))
            # Neither comparison holds for NaN.
            return 'NaN'

        df['duration_bin'] = [_bin_label(value) for value in df['duration_bin'].values]
        return df

    def _cal_cols(self, df):
        """Add the calculated feature columns to the merged frame.

        Columns written: 'sched_duration', 'actual_duration' (minutes), 'hub_flight_indicator',
        'flight_number_of_day', 'low_flight_count', 'departure_hour', 'arrival_hour',
        'high_departure_hour', plus 'route' and 'duration_bin' via
        ``_creates_binned_act_duration``. 'leg_no' is cast to str, 'mingt_mode_imputed' to
        int, 'm_onblockdt_imputed' is reformatted to a '%Y-%m-%d %H:%M:%S' string, and
        'Crew_Group' and 'crew_type_change_imputed' are dropped.

        'flight_number_of_day' is computed with a grouped rank, so every value is aligned
        with its own row regardless of row order, and an empty frame is handled.

        Args:
            df: Frame returned by ``_combine_files``.

        Returns:
            The frame with the columns described above.
        """
        one_minute = np.timedelta64(1, 'm')

        # Flight durations can be calculated from subtracting the departure time from the arrival time
        df['sched_duration'] = (df['arr_sched_datetime'] - df['dep_sched_datetime']) / one_minute
        df['actual_duration'] = (df['m_onblockdt_imputed'] - df['m_offblockdt']) / one_minute
        df['hub_flight_indicator'] = np.where((df['dep_ap_sched'] == 'East Carmen') | (df['arr_ap_sched'] == 'East Carmen'), 1, 0)

        # Ordinal variable 'flight_number_of_day': the first flight of the day is '1', the
        # second '2', etc., per day and per aircraft registration, ranked by actual
        # off-block time (ties share the truncated average rank).
        daily_rank = df.groupby(['ac_registration', 'day_of_origin'], sort=False)['m_offblockdt'].rank()
        df['flight_number_of_day'] = daily_rank.astype('int').astype('str')

        df['leg_no'] = df['leg_no'].astype('str')
        df['mingt_mode_imputed'] = df['mingt_mode_imputed'].astype('int')

        # Adds low flight indicator - hardcoded to ECLGEX and ECLGNX
        df['low_flight_count'] = np.where((df['ac_registration'].values == 'ECLGEX') | (df['ac_registration'].values == 'ECLGNX'), 1, 0)
        df['low_flight_count'] = df['low_flight_count'].astype('int')

        # Adds high departure times: 1 when the departure hour is in the listed set, else 0
        high_departure_hours = {'1': [5, 8, 11, 13, 16, 18]}
        df['departure_hour'] = df['m_offblockdt'].dt.hour
        df['arrival_hour'] = df['m_onblockdt_imputed'].dt.hour
        flag_by_hour = {hour: flag for flag, hours in high_departure_hours.items() for hour in hours}
        df['high_departure_hour'] = df['departure_hour'].map(flag_by_hour).fillna(0).astype('int')

        # Drop Crew group and crew type change imputed columns
        df.drop(columns=['Crew_Group', 'crew_type_change_imputed'], inplace=True)

        # Add bins and route mapper
        df = self._creates_binned_act_duration(df)
        df['actual_duration'] = np.round(df['actual_duration'], 0)

        # Reformat m_onblockdt_imputed
        df['m_onblockdt_imputed'] = pd.to_datetime(df['m_onblockdt_imputed'])
        df['m_onblockdt_imputed'] = df['m_onblockdt_imputed'].dt.strftime('%Y-%m-%d %H:%M:%S')
        return df

    def _adds_quantile_transformer(self, df, list_of_cols):
        """Append quantile-transformed copies of ``list_of_cols`` as 'quant_transf_<col>'.

        The fitted scaler is loaded from 'quantile_scaler.pkl' through ``_loads_pkl``. The
        transformed values are given ``df``'s own index, so they stay row-aligned even when
        that index is not 0..n-1. 'quant_transf_arr_delay_imputed' is removed again if it was
        produced.

        Args:
            df: Frame holding the columns to transform.
            list_of_cols: Column names, in the order the scaler was fitted on.

        Returns:
            A new frame: ``df`` plus the transformed columns.
        """
        scaler_fitted = super()._loads_pkl('quantile_scaler.pkl')

        transformed = np.asarray(scaler_fitted.transform(df[list_of_cols]))
        scaled = pd.DataFrame(transformed, index=df.index, columns=['quant_transf_' + col for col in list_of_cols])
        df = pd.concat([df, scaled], axis=1)
        if 'quant_transf_arr_delay_imputed' in df.columns:
            df = df.drop(columns=['quant_transf_arr_delay_imputed'])
        return df

    def _one_hot_encoder(self, df, list_of_cols):
        """Append the one-hot encoding of ``list_of_cols`` to ``df``.

        The fitted encoder is unpickled from 'onehot_encoder.pkl' in the current working
        directory (unlike ``_loads_pkl``, no folder prefix is applied). The encoded columns
        are given ``df``'s own index so that they stay row-aligned in the concat.

        NOTE: ``pickle.load`` can execute arbitrary code - only load trusted files.

        Args:
            df: Frame holding the categorical columns.
            list_of_cols: Column names the encoder was fitted on.

        Returns:
            A new frame: ``df`` plus one column per encoded category.
        """
        with open('onehot_encoder.pkl', 'rb') as file:
            loaded_encoder = pickle.load(file)
        # The encoder is expected to return a sparse matrix, hence .toarray().
        encoded = loaded_encoder.transform(df[list_of_cols]).toarray()
        cols = loaded_encoder.get_feature_names_out(df[list_of_cols].columns)
        encoded_df = pd.DataFrame(encoded, index=df.index, columns=cols)
        return pd.concat([df, encoded_df], axis=1)

    def _target_encoder(self, df, X_col_list, y_col_list):
        """Append target-encoded versions of the categorical columns to ``df``.

        The fitted encoders are unpickled from 'target_encoded.pkl' in the current working
        directory and addressed as ``enc[x_col][y_col]``. One column named '<x_col>_<y_col>'
        is produced for every combination, x-major. All encodings are collected first and
        concatenated once.

        NOTE: ``pickle.load`` can execute arbitrary code - only load trusted files.

        Args:
            df: Frame holding the categorical columns.
            X_col_list: Categorical columns to encode.
            y_col_list: Targets each categorical column was encoded against.

        Returns:
            A new frame: ``df`` plus the encoded columns.

        Raises:
            ValueError: from ``pd.concat`` when either list is empty.
        """
        with open('target_encoded.pkl', 'rb') as file:
            enc = pickle.load(file)

        encoded_frames = []
        encoded_names = []
        for item_x, item_y in product(X_col_list, y_col_list):
            encoded_frames.append(enc[item_x][item_y].transform(df[[item_x]]))
            encoded_names.append(item_x + '_' + item_y)

        mydf = pd.concat(encoded_frames, axis=1)
        mydf.columns = encoded_names
        return pd.concat([df, mydf], axis=1)

    def _ordinal_normaliser(self, df, list_of_cols):
        """Append min-max scaled copies of ``list_of_cols`` as 'minmax_transf_<col>'.

        The fitted scaler is unpickled from 'minmax_scaler.pkl' in the current working
        directory. The scaled columns are given ``df``'s own index so that they stay
        row-aligned in the concat.

        NOTE: ``pickle.load`` can execute arbitrary code - only load trusted files.

        Args:
            df: Frame holding the ordinal columns.
            list_of_cols: Column names, in the order the scaler was fitted on.

        Returns:
            A new frame: ``df`` plus the scaled columns.
        """
        with open('minmax_scaler.pkl', 'rb') as file:
            enc = pickle.load(file)
        transformed = np.asarray(enc.transform(df[list_of_cols].values))
        names = ['minmax_transf_' + col for col in list_of_cols]

        scaled = pd.DataFrame(transformed, index=df.index, columns=names)
        return pd.concat([df, scaled], axis=1)

    def prepare_loaded_data(self):
        """Build the fully encoded modelling frame.

        Merges flight and ground data, adds the calculated columns and then applies, in
        order: the quantile transformer (numeric columns), the one-hot encoder (categorical
        columns), the target encoder (aircraft and route against three transformed targets)
        and the min-max scaler (ordinal columns).

        Returns:
            The merged and encoded frame.
        """
        frame = self._cal_cols(self._combine_files())

        numeric_cols = ['sched_duration', 'actual_duration', 'dep_delay', 'arr_delay_imputed', 'trans_time',
                        'sched_trans_time', 'Sched_Groundtime_imputed', 'Act_Groundtime_imputed']
        frame = self._adds_quantile_transformer(frame, numeric_cols)

        one_hot_cols = ['duration_bin', 'change_reason_code_imputed', 'Ac_Type_Code', 'mingt_mode_imputed', 'Crew_Group_imputed']
        frame = self._one_hot_encoder(frame, one_hot_cols)

        target_encoded_cols = ['ac_registration', 'route']
        target_cols = ['quant_transf_actual_duration', 'quant_transf_dep_delay', 'quant_transf_Act_Groundtime_imputed']
        frame = self._target_encoder(frame, target_encoded_cols, target_cols)

        ordinal_cols = ['flight_number_of_day', 'day_of_week', 'arrival_hour', 'departure_hour']
        return self._ordinal_normaliser(frame, ordinal_cols)

    def _prepares_cv_dict(self, train_dataset):
        """Build 5 cross-validation folds that keep each (aircraft, day) group together.

        The rows are grouped by 'ac_registration' and 'day_of_origin'; the groups, not the
        rows, are split with a shuffled 5-fold ``KFold`` (random_state=1). Each fold lists
        the index labels of all rows in its training and validation groups.

        The row labels of every group are collected once up front instead of re-scanning the
        whole frame for every group in every fold.

        Args:
            train_dataset: Training frame with 'ac_registration' and 'day_of_origin'.

        Returns:
            dict mapping fold number (0-4) to
            ``{'train_indices': [...], 'valid_indices': [...]}``.

        Raises:
            AssertionError: if a fold does not cover every row, e.g. because a row has a
                missing group key.
        """
        group_keys = ['ac_registration', 'day_of_origin']
        # Row labels per group, in sorted group-key order.
        labels_per_group = [group.index.to_list() for _, group in train_dataset.groupby(by=group_keys)]

        kfold = KFold(n_splits=5, shuffle=True, random_state=1)

        kfold_dict = dict()
        # KFold only needs the number of groups; it yields positions into `labels_per_group`.
        for i, (train_groups, validation_groups) in enumerate(kfold.split(np.zeros(len(labels_per_group)))):
            mydict = {
                'train_indices': [label for g in train_groups for label in labels_per_group[g]],
                'valid_indices': [label for g in validation_groups for label in labels_per_group[g]],
            }

            assert len(mydict['train_indices']) + len(mydict['valid_indices']) == train_dataset.shape[0], 'error'
            kfold_dict.update({i : mydict})
        return kfold_dict

In [8]:
class DataPrepPivot(DataPrep):
  """Pivots the prepared flight data into one wide row per aircraft and day.

  Every (day_of_origin, ac_registration) pair becomes a single row in which
  the per-leg values sit side by side as ``<column>_<leg position>``. Pairs
  with fewer legs than the longest rotation are padded with zeros so that
  variable flight counts are all contained within one matrix.
  """

  # Initiates constructor for inheritance
  def __init__(self, data_dir, flight_pkl, ground_pkl):
    super().__init__(data_dir, flight_pkl, ground_pkl)

  def _creates_airport_code(self, df_col):
    """Map every distinct airport name in ``df_col`` to a three-letter code.

    Candidate codes are tried in order until one is unused:
      1. first two letters of the first word + last letter of that word
         (one-word names) or + first letter of the second word,
      2. first three letters of the first word,
      3. for names starting with 'Lake', 'North', 'East' or 'Port':
         first letter of the first word + first two letters of the second word.

    Names for which every candidate is already taken get no code; they are
    printed (last candidate, name) so the gap is visible, because a later
    ``.map`` with the returned dict yields NaN for them.

    Args:
      df_col: Series of airport names (strings).

    Returns:
      dict mapping airport name -> three-letter code.
    """
    code_to_name = dict()
    for name in df_col.unique():
      words = name.split()
      first = words[0]
      if len(words) == 1:
        candidates = [first[:2].upper() + first[-1].upper()]
      else:
        candidates = [first[:2].upper() + words[1][0].upper()]
      candidates.append(first[:3].upper())
      # The third fallback needs a second word; guard so one-word names cannot raise.
      if first in ['Lake', 'North', 'East', 'Port'] and len(words) > 1:
        candidates.append(first[0].upper() + words[1][:2].upper())

      for code in candidates:
        if code not in code_to_name:
          code_to_name[code] = name
          break
      else:
        # No free code left for this name.
        print(code, name)
    return {name: code for code, name in code_to_name.items()}

  def _selects_rotation(self, merged, row):
    """Return the rows of ``merged`` belonging to the day/aircraft pair of ``row``."""
    same_day = merged['day_of_origin'] == row.day_of_origin
    same_aircraft = merged['ac_registration'] == row.ac_registration
    return merged.loc[same_day & same_aircraft]

  def _interim_run_i(self):
    """Load the prepared data, add airport codes and build the rotation index.

    Returns:
      (merged, sub_matrix): ``merged`` is the prepared frame with the extra
      columns 'dep_ap_sched_code' and 'arr_ap_sched_code'; ``sub_matrix`` has
      one row per (day_of_origin, ac_registration) with the leg 'count' and a
      running 'LineNumber' within each count, sorted by count (descending)
      and LineNumber.
    """
    merged = super().prepare_loaded_data()

    # NOTE(review): the two code tables are built independently and then merged,
    # so an airport can receive a different code from the arrival table than
    # from the departure table; the arrival table wins on conflict.
    airport_codes = self._creates_airport_code(merged['dep_ap_sched'])
    _airport_codes = self._creates_airport_code(merged['arr_ap_sched'])

    airport_codes.update(_airport_codes)

    merged['dep_ap_sched_code'] = merged['dep_ap_sched'].map(airport_codes)
    merged['arr_ap_sched_code'] = merged['arr_ap_sched'].map(airport_codes)

    sub_matrix = merged[['day_of_origin', 'ac_registration', 'leg_no']].groupby(by=['day_of_origin', 'ac_registration']).count().reset_index(drop=False)
    sub_matrix.columns = ['day_of_origin', 'ac_registration', 'count']
    sub_matrix['day_of_origin'] = pd.to_datetime(sub_matrix['day_of_origin'])
    sub_matrix['LineNumber'] = sub_matrix.groupby('count').cumcount() + 1
    sub_matrix = sub_matrix.sort_values(by=['count', 'LineNumber'], ascending=[False, True]).reset_index(drop=True)
    return merged, sub_matrix

  def _creates_airport_columns(self, merged, sub_matrix):
    """Build, per distinct leg count, a frame of the airports visited in order.

    Each row lists the departure airport codes of a rotation followed by the
    arrival airport code of its last leg, in columns 'airport_0', 'airport_1', ...

    Returns:
      list of DataFrames, one per distinct value of ``sub_matrix['count']``
      (in order of first appearance), rows in ``sub_matrix`` order.
    """
    frames_per_count = []
    for num in sub_matrix['count'].unique():
      rotations = sub_matrix.loc[sub_matrix['count'] == num]
      row_frames = []
      for idx, row in rotations.iterrows():
        subset = self._selects_rotation(merged, row)
        airport = subset['dep_ap_sched_code'].to_list()
        airport.append(subset['arr_ap_sched_code'].to_list()[-1])
        df = pd.DataFrame(airport).T
        df.columns = ['airport_' + str(item) for item in df.columns]
        row_frames.append(df)
      # Concatenate once per leg count rather than after every appended row.
      frames_per_count.append(pd.concat(row_frames, axis=0).reset_index(drop=True))
    return frames_per_count

  def _creates_transposed_columns(self, cols_to_pivot, merged, sub_matrix):
    """Pivot ``cols_to_pivot`` so that each rotation becomes one wide row.

    For every rotation the values of each column are laid out horizontally as
    ``<column>_0``, ``<column>_1``, ... in leg order.

    Returns:
      list of DataFrames, one per distinct value of ``sub_matrix['count']``
      (in order of first appearance), rows in ``sub_matrix`` order.
    """
    frames_per_count = []
    for num in sub_matrix['count'].unique():
      rotations = sub_matrix.loc[sub_matrix['count'] == num]
      row_frames = []
      for idx, row in rotations.iterrows():
        # Select the rotation once and reuse it for every pivoted column.
        flights = self._selects_rotation(merged, row)
        pieces = []
        for col_name in cols_to_pivot:
          df = pd.DataFrame(flights[col_name].to_list()).T
          df.columns = [col_name + '_' + str(item) for item in df.columns]
          pieces.append(df)
        row_frames.append(pd.concat(pieces, axis=1))
      frames_per_count.append(pd.concat(row_frames, axis=0).reset_index(drop=True))
    return frames_per_count

  def _creates_pivotted_data(self, pkl_file_name, cols_to_pivot, merged, sub_matrix):
    """Return the pivoted frames for ``cols_to_pivot``, cached in a pickle file.

    The file ``pkl_path + pkl_file_name`` is created on the first call and
    simply re-loaded afterwards. The cache is keyed by file name only: an
    existing file is returned even if ``cols_to_pivot`` or the data changed,
    so delete the file to force a rebuild.

    NOTE(review): ``pkl_path`` is a notebook-level variable, not the
    ``data_dir`` handed to the constructor.
    """
    output_file = pkl_path + pkl_file_name
    if not os.path.exists(output_file):
      pivotted_data = self._creates_transposed_columns(cols_to_pivot, merged, sub_matrix)
      with open(output_file, 'wb') as f:
        pickle.dump(pivotted_data, f)
        print(f"The pkl file '{output_file}' has been created.")
    else:
      print(f"The pkl file '{output_file}' already exists.")
      # Only load pickles this notebook wrote itself; unpickling untrusted files can run arbitrary code.
      with open(output_file, 'rb') as f:
        pivotted_data = pickle.load(f)
    return pivotted_data

  def _interim_run_ii(self, cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y):
    """Pivot the X and y columns and enrich ``sub_matrix`` with static attributes.

    Returns:
      (full_X_pivotted, full_y_pivotted, merged, sub_matrix, pivotted_x) where
      the two ``full_*`` frames are the stacked, zero-filled pivots,
      ``sub_matrix`` additionally carries the aircraft-level one-hot columns
      and 'minmax_transf_day_of_week', and ``pivotted_x`` is the un-stacked
      list of X frames (one per leg count).
    """
    merged, sub_matrix = self._interim_run_i()

    # Pivot x, y columns and fill nas with zeros for full matrix
    pivotted_x = self._creates_pivotted_data(pkl_file_name_x, cols_to_pivot_x, merged, sub_matrix)
    pivotted_y = self._creates_pivotted_data(pkl_file_name_y, cols_to_pivot_y, merged, sub_matrix)
    full_X_pivotted = pd.concat(pivotted_x, axis=0).fillna(0)
    full_y_pivotted = pd.concat(pivotted_y, axis=0).fillna(0)

    # Use sub_matrix and append the ['mingt_mode_imputed_40', 'mingt_mode_imputed_45', 'Ac_Type_Code_E95', 'Ac_Type_Code_DH4'] for each ac_registration; and the 'minmax_transf_day_of_week' for the day_of_origin
    aircraft_cols = ['ac_registration', 'mingt_mode_imputed_40', 'mingt_mode_imputed_45', 'Ac_Type_Code_E95', 'Ac_Type_Code_DH4']
    # Grouping by every column and counting leaves just the distinct combinations.
    aircraft_attrs = merged[aircraft_cols].groupby(by=aircraft_cols).count().reset_index(drop=False)
    sub_matrix = pd.merge(sub_matrix, aircraft_attrs, how="left", on="ac_registration")

    day_cols = ['day_of_origin', 'minmax_transf_day_of_week']
    day_attrs = merged[day_cols].groupby(by=day_cols).count().reset_index(drop=False)

    sub_matrix['day_of_origin'] = pd.to_datetime(sub_matrix['day_of_origin'])
    day_attrs['day_of_origin'] = pd.to_datetime(day_attrs['day_of_origin'])

    sub_matrix = pd.merge(sub_matrix, day_attrs, how="left", on="day_of_origin")
    return full_X_pivotted, full_y_pivotted, merged, sub_matrix, pivotted_x

  def _creates_full_pivotted_airports(self, cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y):
    """Combine one-hot encoded airports with the pivoted X data into one matrix.

    Returns:
      (full_pivotted_airports, full_y_pivotted): the feature matrix with
      'day_of_origin' and 'ac_registration' as its first two columns and the
      remaining columns in alphabetical order, and the stacked y matrix.
    """
    full_X_pivotted, full_y_pivotted, merged, sub_matrix, pivotted_x = self._interim_run_ii(cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y)
    airports = self._creates_airport_columns(merged, sub_matrix)

    col_to_list = airports[0].columns.to_list()
    three_letter_airport = airports[0].values.tolist()
    three_letter_airport = [item for items in three_letter_airport for item in items]
    a0 = [col + '_' + val for col, val in zip(col_to_list, three_letter_airport)]

    one_hot_airports = []
    for i, df in enumerate(airports):
      cols = df.columns.to_list()
      last = cols[-1].split('_')[-1]

      # Special case for the first frame when its last column is 'airport_11':
      # a single row of ones is built from its first row instead of dummies.
      # NOTE(review): presumably the longest rotation occurs only once - confirm.
      if i == 0 and last == '11':
        dummy = pd.DataFrame(1, index=range(1), columns=a0)
      else:
        dummy = pd.get_dummies(df, prefix=cols, drop_first=False)
      one_hot_airports.append(dummy)

    pivotted_airports = [pd.concat([airp, x], axis=1) for x, airp in zip(pivotted_x, one_hot_airports)]

    for df in pivotted_airports:
      # The leg count is recovered from the highest position suffix of this column.
      technical_cols = [element for element in df.columns.to_list() if 'change_reason_code_imputed_technical problem_' in element]
      num = technical_cols[-1].split("_")[-1]
      df['count'] = int(num) + 1
      df['count'] = df['count'].astype('int')

      # LineNumber is the 1-based row position inside this leg-count frame.
      df.reset_index(drop=False, inplace=True)
      df['LineNumber'] = df['index'] + 1
      df.drop(['index'], axis=1, inplace=True)

    full_pivotted_airports = pd.concat(pivotted_airports, axis=0).fillna(0)
    full_pivotted_airports = full_pivotted_airports.reindex(sorted(full_pivotted_airports.columns), axis=1)

    # ('count', 'LineNumber') identifies the rotation in sub_matrix.
    full_pivotted_airports = pd.merge(full_pivotted_airports, sub_matrix, how='left', on=['count', 'LineNumber'])
    full_pivotted_airports.drop(columns=['count', 'LineNumber'], inplace=True)

    id_cols = ['day_of_origin', 'ac_registration']
    full_pivotted_airports = full_pivotted_airports[id_cols + [col for col in full_pivotted_airports.columns if col not in id_cols]]
    return full_pivotted_airports, full_y_pivotted

  def _creates_data_for_pca(self, cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y):
    """Split the feature and target matrices into one subset per target variable.

    For each target name the X subset keeps the columns that contain that
    name plus all columns that contain none of the other target names; the y
    subset keeps the y columns that contain the name.

    Returns:
      (X_for_pca, y_subset): two lists of DataFrames ordered as
      ['actual_duration', 'dep_delay', 'Act_Groundtime_imputed'].
    """
    full_pivotted_airports, full_y_pivotted = self._creates_full_pivotted_airports(cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y)
    # y_names = ['sched_duration', 'actual_duration', 'dep_delay', 'Act_Groundtime_imputed', 'Sched_Groundtime_imputed']
    y_names = ['actual_duration', 'dep_delay', 'Act_Groundtime_imputed']
    selected_columns, y_selected = [], []

    for item in y_names:
      # Columns that contain item
      contains_item_X = [col for col in full_pivotted_airports.columns if item in col]
      y_selected.append([col for col in full_y_pivotted.columns if item in col])

      # Columns that do not contain rest of the names
      other_names = [name for name in y_names if name != item]
      dnc_rest = [col for col in full_pivotted_airports.columns if all(name not in col for name in other_names)]

      selected_columns.append(sorted(set(contains_item_X + dnc_rest)))

    X_for_pca = [full_pivotted_airports[col_subset] for col_subset in selected_columns]
    y_subset = [full_y_pivotted[y_vars] for y_vars in y_selected]
    return X_for_pca, y_subset

  def prepare_data(self, cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y):
    """Public entry point: return the per-target X and y frame lists.

    Args:
      cols_to_pivot_x: feature columns to pivot per leg.
      cols_to_pivot_y: target columns to pivot per leg.
      pkl_file_name_x: cache file name for the pivoted features.
      pkl_file_name_y: cache file name for the pivoted targets.

    Returns:
      (X_pca, full_y_pivotted) as produced by ``_creates_data_for_pca``.
    """
    X_pca, full_y_pivotted = self._creates_data_for_pca(cols_to_pivot_x, cols_to_pivot_y, pkl_file_name_x, pkl_file_name_y)
    return X_pca, full_y_pivotted

In [9]:
# Build the pivoted training matrices with the DataPrepPivot pipeline.

# Feature columns that are laid out per leg position.
X_pivot = [
    'route_quant_transf_actual_duration',
    'route_quant_transf_Act_Groundtime_imputed',
    'route_quant_transf_dep_delay',
    'Crew_Group_imputed_no change',
    'last_flight_indicator',
    'minmax_transf_departure_hour',
    'change_reason_code_imputed_rotational problem',
    'change_reason_code_imputed_technical problem',
    'change_reason_code_imputed_other problem',
    'ac_registration_quant_transf_actual_duration',
    'ac_registration_quant_transf_dep_delay',
    'ac_registration_quant_transf_Act_Groundtime_imputed',
    'duration_bin_1',
    'duration_bin_2',
    'duration_bin_3',
    'duration_bin_4',
    'duration_bin_5',
    'duration_bin_6+',
]

# Target columns that are laid out per leg position.
y_pivot = [
    'quant_transf_dep_delay',
    'quant_transf_actual_duration',
    'quant_transf_Act_Groundtime_imputed',
]

# Create the training data (the pivots are cached in the two pkl files).
data_obj = DataPrepPivot(pkl_path, 'flight_train.pkl', 'ground_train.pkl')
X_train, y_train = data_obj.prepare_data(
    cols_to_pivot_x=X_pivot,
    cols_to_pivot_y=y_pivot,
    pkl_file_name_x='X_train_transposed.pkl',
    pkl_file_name_y='y_train_transposed.pkl',
)

The pkl file '/content/drive/MyDrive/01_Data/03_Modelling/X_train_transposed.pkl' already exists.
The pkl file '/content/drive/MyDrive/01_Data/03_Modelling/y_train_transposed.pkl' already exists.


In [10]:
# Build the pivoted test matrices with exactly the same cleaning and
# transformation steps that were applied to the training data.
data_obj = DataPrepPivot(pkl_path, 'flight_test.pkl', 'ground_test.pkl')
X_test, y_test = data_obj.prepare_data(
    cols_to_pivot_x=X_pivot,
    cols_to_pivot_y=y_pivot,
    pkl_file_name_x='X_test_transposed.pkl',
    pkl_file_name_y='y_test_transposed.pkl',
)

The pkl file '/content/drive/MyDrive/01_Data/03_Modelling/X_test_transposed.pkl' already exists.
The pkl file '/content/drive/MyDrive/01_Data/03_Modelling/y_test_transposed.pkl' already exists.


In [11]:
class PCA_analysis():
  """Implements the PCA transformation steps for the pivoted matrices.

  The number of components per target is chosen as the largest count whose
  cumulative explained variance stays at or below 0.951 (about 95% of the
  variance). PCA models are fitted on the training matrices, stored in
  'pca_model_dict.pkl' and re-used to transform the test matrices.
  """
  # Identifier columns carried through the pivot; they are not PCA features.
  _ID_COLUMNS = ['day_of_origin', 'ac_registration']
  # Cumulative explained-variance cut-off used to choose the component count.
  _VARIANCE_CUTOFF = 0.951

  def __init__(self, X_train, X_test, y_train, y_test):
    self.X_train =  X_train
    self.X_test  =  X_test
    self.y_train =  y_train
    self.y_test  =  y_test

  def _creates_identical_cols(self, training_df, testing_df):
    """Give each train/test frame pair the same alphabetically sorted columns.

    Columns present in only one frame of a pair are added to the other one
    filled with zeros. New frames are returned; the frames passed in are not
    modified.

    Args:
      training_df: list of training DataFrames.
      testing_df: list of test DataFrames, paired by position.

    Returns:
      (training_df_sorted, testing_df_sorted): two lists of aligned frames.
    """
    training_df_sorted, testing_df_sorted = [], []

    for train_df, test_df in zip(training_df, testing_df):
      shared_columns = sorted(set(train_df.columns) | set(test_df.columns))
      # fill_value only applies to newly created columns; existing values are kept.
      training_df_sorted.append(train_df.reindex(columns=shared_columns, fill_value=0))
      testing_df_sorted.append(test_df.reindex(columns=shared_columns, fill_value=0))

    return training_df_sorted, testing_df_sorted

  def _feature_frame(self, frame):
    """Return a copy of ``frame`` without the identifier columns."""
    return frame.drop(columns=self._ID_COLUMNS)

  def _creates_PCA_numbers_dict(self, X_for_pca):
    """Determine how many principal components to keep for each target.

    A PCA with ``min(n_rows, n_features) - 1`` components is fitted per
    frame; the result is the number of leading components whose cumulative
    explained-variance ratio is <= 0.951, and at least 1 (a single dominant
    component would otherwise leave nothing to select).

    Args:
      X_for_pca: list of feature frames ordered as
        ['actual_duration', 'dep_delay', 'Act_Groundtime_imputed'].

    Returns:
      dict mapping target name -> number of components.
    """
    # y_names = ['sched_duration', 'actual_duration', 'dep_delay', 'Act_Groundtime_imputed', 'Sched_Groundtime_imputed']
    y_names = ['actual_duration', 'dep_delay', 'Act_Groundtime_imputed']
    mydict = {}

    for _df, y_name in zip(X_for_pca, y_names):
      X = self._feature_frame(_df)
      pca = PCA(min(X.shape) - 1)
      pca.fit(X)
      running_total = pd.Series(pca.explained_variance_ratio_).cumsum()
      # The running total never decreases, so counting the entries below the
      # cut-off gives the 1-based position of the last one that qualifies.
      n_components = int((running_total <= self._VARIANCE_CUTOFF).sum())
      mydict.update({y_name : max(n_components, 1)})
    return mydict

  def _is_training_input(self, X_for_pca):
    """Guess whether ``X_for_pca`` is the training list (legacy detection).

    The first notebook-level variable bound to this exact object decides:
    a name containing 'train' means training data. If no such variable
    exists, the lists handed to the constructor are compared by identity.

    Raises:
      ValueError: if the role of ``X_for_pca`` cannot be determined.
    """
    bound_names = [name for name, value in globals().items() if value is X_for_pca]
    if bound_names:
      return 'train' in bound_names[0]
    if X_for_pca is self.X_train:
      return True
    if X_for_pca is self.X_test:
      return False
    raise ValueError("Cannot tell whether X_for_pca is training or test data; pass is_train=True or is_train=False.")

  def pca_dimension_reduction(self, X_for_pca, pca_nums, is_train=None):
    """Project every frame of ``X_for_pca`` onto its principal components.

    Training data: a PCA is fitted per target and all fitted models are saved
    to 'pca_model_dict.pkl' (relative to the working directory).
    Test data: the saved models are loaded and only applied, so the training
    run has to happen first.

    Args:
      X_for_pca: list of feature frames (identifier columns included).
      pca_nums: dict target name -> number of components, in the same order
        as ``X_for_pca``.
      is_train: True to fit and save, False to load and transform. When left
        as None the role is inferred from the name of the notebook variable
        that holds ``X_for_pca``.

    Returns:
      list of numpy arrays with the transformed data, one per frame.

    Raises:
      ValueError: if ``is_train`` is None and the role cannot be inferred.
      AssertionError: if a transformed matrix does not have the expected
        number of components.
    """
    filename = 'pca_model_dict.pkl'
    if is_train is None:
      is_train = self._is_training_input(X_for_pca)

    pca_list = []
    if is_train:
      my_dict = {}
      for _df, (pca_key, pca_num) in zip(X_for_pca, pca_nums.items()):
        X = self._feature_frame(_df).to_numpy()

        pca = PCA(pca_num)
        pca.fit(X)
        X_pca = pca.transform(X)
        assert X_pca.shape[1] == pca_num, 'Error'
        pca_list.append(X_pca)
        my_dict.update({pca_key : pca})

      # Save all fitted PCA models once, after the last one has been fitted.
      with open(filename, 'wb') as file:
        pickle.dump(my_dict, file)
    else:
      # Load PCA models from the pickle file written by the training run.
      # Only load files this notebook created; unpickling untrusted data can execute code.
      with open(filename, 'rb') as file:
        my_dict = pickle.load(file)
      for _df, pca_key in zip(X_for_pca, my_dict.keys()):
        pca_num = pca_nums[pca_key]
        X = self._feature_frame(_df).to_numpy()

        pca = my_dict[pca_key]
        X_pca = pca.transform(X)
        assert X_pca.shape[1] == pca_num, 'Error'
        pca_list.append(X_pca)
    return pca_list

In [12]:
# Align the train/test columns, pick the component counts on the training
# data, then apply the PCA transformation to both data sets.

pca_obj = PCA_analysis(X_train, X_test, y_train, y_test)

X_train, X_test = pca_obj._creates_identical_cols(X_train, X_test)
y_train, y_test = pca_obj._creates_identical_cols(y_train, y_test)
pca_nums = pca_obj._creates_PCA_numbers_dict(X_train)

# Sanity check: every train/test pair must now share the same column order.
for train_frames, test_frames in ((X_train, X_test), (y_train, y_test)):
  for train_frame, test_frame in zip(train_frames, test_frames):
    assert train_frame.columns.to_list() == test_frame.columns.to_list(), 'error'

# The training call fits and saves the PCA models; the test call re-loads them.
X_train_pca = pca_obj.pca_dimension_reduction(X_train, pca_nums)
X_test_pca = pca_obj.pca_dimension_reduction(X_test, pca_nums)

In [13]:
# Load the artefacts saved by earlier notebooks that are needed to run the model.

y_names = ['actual_duration', 'dep_delay', 'Act_Groundtime_imputed']


def _read_pickle(file_name):
  """Return the unpickled content of ``file_name`` located in ``pkl_path``."""
  with open(pkl_path + file_name, 'rb') as f:
    return pickle.load(f)


loaded_models_dict = _read_pickle('models_dict.pkl')
baseline_df = _read_pickle('baselines.pkl')
k_fold_dict = _read_pickle('kfold_dict.pkl')
# Both correlation dictionaries map a column pair to (co_eff, intercept).
groundtime_correlation_dict = _read_pickle('Groundtime_corr_coeff.pkl')
duration_correlation_dict = _read_pickle('Duration_corr_coeff.pkl')

def calcs_rmse(y_actual, y_hat):
  """Return the root mean squared error of every column (mean taken over axis 0)."""
  squared_error = np.square(y_actual - y_hat)
  return np.sqrt(np.mean(squared_error, axis=0))

In [14]:
def y_test_name_generator():
  """Return one list of column names per target variable.

  The first lists are the column names of each frame in the notebook-level
  ``y_test`` with the entry at position 3 removed. They are followed by the
  generated names for 'sched_duration' and 'Sched_Groundtime_imputed' with
  the suffixes 0-10, sorted as strings.
  """
  names_per_target = []
  for y_frame in y_test:
    column_names = y_frame.columns.to_list()
    column_names.pop(3)
    names_per_target.append(column_names)

  prefixes = {
      'sched_duration': 'quant_transf_sched_duration_',
      'Sched_Groundtime_imputed': 'quant_transf_Sched_Groundtime_imputed_',
  }
  for y_name in ['sched_duration', 'Sched_Groundtime_imputed']:
    generated = sorted(prefixes[y_name] + str(i) for i in range(0, 11))
    names_per_target.append(generated)
  return names_per_target

# Uses results and returns these in a datframe
def _returns_prediction_df_pre(rmse_results):
  _mylist = []
  _mylist.append(rmse_results)
  mydf = pd.DataFrame(_mylist)
  cols = mydf.columns.to_list()
  mydf.columns = [name + '_' + str(i) for i in cols]
  mydf = mydf.T.reset_index(drop=False)
  mydf.columns = ['predicted_variable', 'rmse']
  mydf['mean_rmse'] = mydf['rmse'].mean()
  mydf['predicted'] = name
  return mydf

# For ground time and schedule duration: calculates adjusted predictions based on loaded correlation values.
def _calcs_pred_groundtime_sched_duration(key, input_matrix):
  # Empty array
  corr_vector = np.empty((1, 11, 2))

  # Iterate through the dictionary to create vector
  for i in range(0, 11):
    if 'sched_duration' in key:
      corr_vector[0, i] = groundtime_correlation_dict[('quant_transf_Act_Groundtime_imputed_' + str(i), 'quant_transf_Sched_Groundtime_imputed_' + str(i))]
    else:
      corr_vector[0, i] = duration_correlation_dict[('quant_transf_actual_duration_' + str(i), 'quant_transf_sched_duration_' + str(i))]

  if input_matrix.shape[1] != 11:
    input_matrix = input_matrix[:, :11]

  corr_vector_reshaped = corr_vector[:, :input_matrix.shape[1], :]
  result = input_matrix * corr_vector_reshaped[:,:,0] + corr_vector_reshaped[:,:,1]
  return result

In [15]:
# Adds placeholder entries for scheduled duration and scheduled groundtime so
# that all lists line up with y_names in the model loop. Each derived target
# re-uses the matrices of the actual variable it is calculated from.
# NOTE(review): y_train_pca and y_test_pca are not assigned in the visible
# cells of this section - confirm they are created before this cell runs.
derived_targets = {
    'sched_duration': 'actual_duration',
    'Sched_Groundtime_imputed': 'Act_Groundtime_imputed',
}

for derived_name, source_name in derived_targets.items():
  # Skip targets that are already present so re-running the cell does not
  # append the placeholders a second time.
  if derived_name in y_names:
    continue
  # Look the source position up before appending; indexing with [-1] after an
  # append would return the element that was just added.
  source_position = y_names.index(source_name)
  y_names.append(derived_name)
  for matrices in (X_train_pca, y_train_pca, X_test_pca, y_test_pca):
    matrices.append(matrices[source_position])

In [16]:
# Finally runs the model. The three actual targets are predicted with the
# loaded models (27-fold cross-validation on the training data, then a
# prediction for the test data); scheduled duration and scheduled groundtime
# are calculated from those predictions with the stored co-efficients.
mydict, mytestdict = dict(), dict()
mylist = []
kf = KFold(n_splits=27)

# Derived target -> actual target whose predictions it is calculated from.
# The source targets must appear earlier in y_names than the derived ones.
derived_from = {
    'sched_duration': 'actual_duration',
    'Sched_Groundtime_imputed': 'Act_Groundtime_imputed',
}

# `name` has to stay the loop variable: _returns_prediction_df_pre reads it.
for name, X_train_matrix, X_test_matrix, y_train_matrix, y_test_ in zip(y_names, X_train_pca, X_test_pca, y_train_pca, y_test_pca):
  if name in derived_from:
    # No model is fitted; the result does not depend on the folds, so it is
    # calculated once instead of once per fold.
    source_name = derived_from[name]
    all_predictions_validation = _calcs_pred_groundtime_sched_duration(name, mydict[source_name])
    predictions_test = _calcs_pred_groundtime_sched_duration(name, mytestdict[source_name])
  else:
    model = loaded_models_dict[name]
    regr = MultiOutputRegressor(model)

    all_predictions_validation = np.zeros((y_train_matrix.shape), dtype=float)  # Shape of zeros matches y_train_matrix

    for train_index, validation_index in kf.split(X_train_matrix):
      # Fold-local names, so the notebook-level X_train / y_train are not overwritten.
      X_fold_train, X_fold_valid = X_train_matrix[train_index], X_train_matrix[validation_index]
      y_fold_train = y_train_matrix[train_index]

      # Fit model on the fold and predict the held-out rows
      regr = regr.fit(X_fold_train, y_fold_train)
      all_predictions_validation[validation_index, :] = regr.predict(X_fold_valid)

    # NOTE(review): the test prediction uses the model fitted on the last fold's
    # training rows, not a model refitted on all training data - confirm intent.
    predictions_test = regr.predict(X_test_matrix)

  mydict.update({name : all_predictions_validation})
  mytestdict.update({name : predictions_test})

  # Prediction evaluation for test set (appended first so that 'test' stays
  # the first type in all_results, as the plotting cell expects).
  rmse_test = calcs_rmse(y_test_, predictions_test)
  test_df = _returns_prediction_df_pre(rmse_test)
  test_df['type'] = 'test'
  mylist.append(test_df)

  # Prediction evaluation for validation set (out-of-fold predictions)
  rmse_validation = calcs_rmse(y_train_matrix, all_predictions_validation)
  validation_df = _returns_prediction_df_pre(rmse_validation)
  validation_df['type'] = 'validation'
  mylist.append(validation_df)

all_results = pd.concat(mylist, axis=0).reset_index(drop=True)

In [17]:
# Visualises the results per variable: RMSE per predicted output for the test
# and validation sets, with their means and the baseline as horizontal lines.

# Colours are bound to the type label, so they do not depend on the order in
# which the traces happen to be created.
type_colours = {'test': 'rgb(37, 137, 190)', 'validation': 'rgb(57, 1, 74)'}

for name in y_names:
  to_plot = all_results.loc[all_results['predicted'] == name]
  mean_rmse_test = np.round(to_plot.loc[to_plot['type'] == 'test']['rmse'].mean(), 4)
  mean_rmse_valid = np.round(to_plot.loc[to_plot['type'] == 'validation']['rmse'].mean(), 4)

  baseline = np.round(baseline_df.loc[baseline_df['Predicted_Variable'] == name]['Average_Baseline'].head(1).values[0], 4)

  fig = px.line(to_plot, x="predicted_variable", y="rmse", color="type",
                color_discrete_map=type_colours, title=f"RMSE per output: {name}",
                width=1800, height=800)

  fig.add_hline(y=mean_rmse_test, line_width=2, line_dash="dot", line_color=type_colours['test'], annotation_text= f"mean test: {mean_rmse_test}")
  fig.add_hline(y=mean_rmse_valid, line_width=2, line_dash="dot", line_color=type_colours['validation'], annotation_text= f"mean validation: {mean_rmse_valid}")

  fig.add_hline(y=baseline, line_width=2, line_dash="dot", line_color="red", annotation_text= f"baseline rmse: {baseline}")
  # Thicker data lines; the horizontal lines are shapes and are not affected.
  fig.update_traces(line_width=4)
  fig.write_image(images_folder + "/results_" + name + '.png')
  fig.show()

In [18]:
# Ad-hoc inspection: prints the names of all attributes and methods of DataPrep.
print(dir(DataPrep))

['_TLC_trans_to_crew', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adds_quantile_transformer', '_arr_delay_m_onblockdt_imputed', '_cal_cols', '_calcs_duration_between_lines', '_calculates_time_diff_to_line_below', '_clean_data', '_clean_dedupe_data', '_combine_files', '_combine_ground_vertically', '_creates_binned_act_duration', '_creates_duration_col', '_crew_group_imputed', '_flight_cleanse', '_loads_pkl', '_mingt_mode_imputer', '_one_hot_encoder', '_ordinal_normaliser', '_prepare_loaded_data', '_prepares_cv_dict', '_replaces_blank_arr_leg_outbound', '_sgd_model', '_target_encoder', 'create_flight_df', 'create_ground_df', 'prepare_loaded_data']
