<a href="https://colab.research.google.com/github/HeatherDriver/IU-Model-Engineering/blob/main/02_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install ordered_set
! pip install -U kaleido

Collecting ordered_set
  Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)
Installing collected packages: ordered_set
Successfully installed ordered_set-4.1.0
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from ordered_set import OrderedSet
import datetime as dt
import warnings
import math
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [3]:
# Mount Google Drive into the Colab environment and change current directory
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/01_Data/02_Processed'
images_folder = '/content/drive/MyDrive/02_Docs'

Mounted at /content/drive
/content/drive/MyDrive/01_Data/02_Processed


In [4]:
# Display all columns, rows and import the data from 01_DataPrep
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.mode.chained_assignment = None

warnings.filterwarnings('ignore')
# plt.style.use('seaborn-whitegrid')
sns.set(palette="Dark2")

data_path = '/content/drive/MyDrive/01_Data/02_Processed/'

merged = pd.read_csv(data_path + 'merged.csv', index_col=0, infer_datetime_format=True, parse_dates=['day_of_origin', 'dep_sched_datetime',
'arr_sched_datetime', 'm_offblockdt', 'm_onblockdt_imputed'])
merged['m_onblockdt_imputed'] = pd.to_datetime(merged['m_onblockdt_imputed'], format='ISO8601')

### Distribution of predicted variables

In [5]:
# Univariate analysis: departure delay
mydf = merged.dropna()
fig = px.histogram(mydf, x='dep_delay', height=600, color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="Histogram of values across 'departure delay'")
fig.show()

In [6]:
# Univariate analysis: scheduled duration
fig = px.histogram(mydf, x='sched_duration', height=600, color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="Histogram of values across 'scheduled duration'")
fig.show()

In [7]:
# Univariate analysis: actual duration
fig = px.histogram(mydf, x='actual_duration', height=600, color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="Histogram of values across 'actual duration'")
fig.show()

In [8]:
# Univariate analysis: actual ground time
fig = px.histogram(mydf, x='Act_Groundtime_imputed', height=600, color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="Histogram of values across 'actual ground time'")
fig.show()

In [9]:
# Univariate analysis: scheduled ground time
fig = px.histogram(mydf, x='Sched_Groundtime_imputed', height=600, color_discrete_sequence=px.colors.qualitative.Bold)
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="Histogram of values across 'scheduled ground time'")
fig.show()

In [10]:
# All of the predicted variables are right skewed and the model data will require extensive transformation

In [11]:
# Univariate analysis of flight durations

# Function to concat two columns vertially so they can be displayed on a facet_plot
def concats_two_cols(list_of_cols):
    mylist = []
    for col in list_of_cols:
        subset = merged[[col]]
        subset.columns = ['value']
        subset['variable_name'] = col
        subset = subset[['variable_name', 'value']]
        mylist.append(subset)
    mydf = pd.concat(mylist, axis=0)
    return mydf

# Basic statistics of the numeric columns with the mode
def col_statistics(df, col_name, print_skew=True):
    x = df[[col_name]].describe()
    y = df.loc[df[col_name].notnull(), [col_name]].mode().iloc[[0]]
    y.index = ['mode']
    z = df.loc[df[col_name].notnull(), [col_name]].median()
    z.index = ['median']
    a = pd.DataFrame(z, columns=[col_name])
    summary = pd.concat([x, y, a], axis = 0)
    if print_skew:
    # Right skewed = mode < median < mean.
        if summary.loc['mode'].values[0] < summary.loc['median'].values[0] < summary.loc['mean'].values[0]:
            print(col_name + ' is right skewed (positively skewed)')
     # Left skewed = mean < median < mode.
        if summary.loc['mean'].values[0] < summary.loc['median'].values[0] < summary.loc['mode'].values[0]:
            print(col_name + ' is left skewed (negatively skewed)')

    return summary

In [12]:
# Graph of the data
investigation = concats_two_cols(['sched_duration', 'actual_duration'])
fig = px.histogram(investigation, x='value', color='variable_name', facet_col='variable_name', facet_col_wrap=2, height=400,
                   color_discrete_sequence=px.colors.qualitative.Bold,
                          title= '')
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_traces(opacity=0.75)
fig.update_layout(title=None)
fig.write_image(images_folder + "/Comparison of distributions for sched and actual durations.png")
fig.show()

In [13]:
# This indicates that in reality we are looking at a more dispersed array of values, scheduled durations almost appear
# categorical in nature - expected since durations would be defined based on the two airports the flight was travelling between.

In [14]:
pd.concat([col_statistics(merged, 'sched_duration', False), col_statistics(merged, 'actual_duration', False)], axis=1)

Unnamed: 0,sched_duration,actual_duration
count,6463.0,6463.0
mean,99.417453,94.784465
std,42.940296,41.802025
min,35.0,28.0
25%,75.0,68.0
50%,90.0,86.0
75%,115.0,110.0
max,325.0,311.0
mode,95.0,85.0
median,90.0,86.0


In [15]:
# It thus appears to be that actual flight durations are shorter than scheduled durations on average.
# Possibly the scheduled duration as an estimate has some built in buffer time.

In [16]:
# Plotting average flight durations
merged['combo'] = merged['dep_ap_sched'] + ' : ' + merged['arr_ap_sched']

mydf = {}
# Iterate through each row in the DataFrame
for index, row in merged.iterrows():
    dep_ap = row['dep_ap_sched']
    arr_ap = row['arr_ap_sched']

    # Add mappings for both 'dep_ap' and 'arr_ap'
    mydf[(dep_ap, arr_ap)] = row['combo']
    mydf[(arr_ap, dep_ap)] = row['combo']

merged.drop(columns=['combo'], inplace=True)

# Create a new column 'route' with a tuple in each cell
merged['route'] = merged.apply(lambda row: (row['dep_ap_sched'], row['arr_ap_sched']), axis=1)
merged['route'] = merged['route'].map(mydf)

In [17]:
# Plotting average flight durations between destinations and creating duration bins
investigation = merged[['route', 'actual_duration']].groupby('route').agg('mean')
investigation = investigation.reset_index()
investigation['actual_duration'] = np.round(investigation['actual_duration'],0).astype('int')
investigation = investigation.sort_values(by=['actual_duration'], ascending=True)
investigation['duration_bin'] = pd.cut(investigation['actual_duration'].values, bins=8, labels=False, retbins=False).tolist()
investigation['duration_bin'] = investigation['duration_bin'].astype('str')

fig = px.bar(investigation, x="route", y='actual_duration', color='duration_bin', title="Average duration of flights (minutes) between destinations. Durations binned to 8 groups",
             color_discrete_sequence=px.colors.qualitative.Bold, height=600)
fig.update_layout(title=None)
fig.write_image(images_folder + "/Binned_dest_groups.png")
fig.show()

In [18]:
# Note that majority of flights are less than 2 hours in duration

In [19]:
# Add these mappings of duration_bins to the merged dataset.
mydict = {k:v for (k, v) in zip(investigation['route'].to_list(), investigation['duration_bin'].to_list())}
merged['duration_bin'] = merged['route'].map(mydict)

In [20]:
# Look at the distribution of 'duration_bin'
fig = go.Figure()
fig.add_trace(
    go.Histogram(x=merged['duration_bin'], marker=dict(color=px.colors.qualitative.Bold[1]))
)
fig.update_layout(title_text="Histogram of values across the 'duration_bin' field")
fig.update_layout(title=None)
fig.write_image(images_folder + "/Duration_bin_histogram.png")
fig.show()

In [21]:
# Thus zones 6 and greater will be merged to '6+'.

merged['duration_bin'] = merged['duration_bin'].astype('int')

mylist = []
for val in merged['duration_bin'].values:
    if val >= 6:
        mylist.append('6+')
    elif val < 6:
         mylist.append(str(int(val)))
    else:
        mylist.append('NaN')

merged['duration_bin'] = mylist

In [22]:
# Add a binary column indicating if this is a hub flight, or flight between destinations
# 1: hub flight
# 0: flight between destinations (i.e. arrival or departure does not include East Carmen)

merged['hub_flight_indicator'] = np.where((merged['dep_ap_sched'] == 'East Carmen') | (merged['arr_ap_sched'] == 'East Carmen'), 1, 0)

# check done correctly
merged.loc[(merged['dep_ap_sched']!= 'East Carmen') & (merged['arr_ap_sched']!= 'East Carmen')]['hub_flight_indicator'].value_counts()

hub_flight_indicator
0    582
Name: count, dtype: int64

### Feature  engineering further columns

In [23]:
# Add ordinal variable 'flight_number_of_day', which will encode first flight of the day as '1', second as '2', etc.
# This will be done for each day, for each aircraft reg.

ac_reg = OrderedSet(merged['ac_registration'].values)
for reg in ac_reg:
    subset = merged.loc[merged['ac_registration'] == reg][['day_of_origin', 'm_offblockdt']]
    reg_list = []
    for day in OrderedSet(subset['day_of_origin'].values):
        subset_ranked = subset.loc[subset['day_of_origin'] == day]['m_offblockdt'].rank().astype('int').values
        reg_list.append(subset_ranked)
    reg_list = [r for reg in reg_list for r in reg]
    if reg == ac_reg[0]:
        mylist = reg_list
    else:
        mylist.extend(reg_list)

assert len(mylist) == merged.shape[0], "problem encountered"

merged['flight_number_of_day'] = mylist
merged['flight_number_of_day'] = merged['flight_number_of_day'].astype('str')

merged['leg_no'] = merged['leg_no'].astype('str')
merged['mingt_mode_imputed'] = merged['mingt_mode_imputed'].astype('int')

In [24]:
# Defining function to analyse histograms of a variable, by another variable (bivariate)

def plots_delay_by_variable(col_to_analyse, col_to_analyse_by, x_range=False):
    subset = merged[[col_to_analyse, col_to_analyse_by]]
    set_to_loop = OrderedSet(subset[col_to_analyse_by].values)

    fig = go.Figure()
    for idx, item in enumerate(set_to_loop):
        subset_selected = subset.loc[subset[col_to_analyse_by] == item][[col_to_analyse]]
        # fd = freedman_diaconis(subset_selected, col_to_analyse)
        for_chart = subset_selected[col_to_analyse]
        fig.add_trace(
            go.Histogram(x=for_chart.values, name=item, #nbinsy=fd,
                         marker=dict(color=px.colors.qualitative.Bold[idx]))
        )
    fig.update_layout(barmode='overlay', height=400, width=1000, title_text='Histograms of ' + col_to_analyse + ' by ' + col_to_analyse_by)
    fig.update_traces(opacity=0.75)
    if x_range:
        fig.update_xaxes(range=x_range)
    fig.show()

def plots_facet_histogram(col_to_analyse, categorical_col, x_range=False, save_file_name=False):
    mydf = merged.dropna()
    ordered_cats = sorted(list(OrderedSet(mydf[categorical_col].values)))
    length = len(set(mydf[categorical_col]))
    if length > 4:
        fig = px.histogram(mydf, x=col_to_analyse, color=categorical_col, facet_col=categorical_col, facet_col_wrap=4, height=600,
                   color_discrete_sequence=px.colors.qualitative.Bold, category_orders={categorical_col:ordered_cats},
                          title= 'Histograms of ' + col_to_analyse + ' by ' + categorical_col)
    else:
        fig = px.histogram(mydf, x=col_to_analyse, color=categorical_col, facet_col=categorical_col, facet_col_wrap=4,
                   color_discrete_sequence=px.colors.qualitative.Bold, category_orders={categorical_col:ordered_cats},
                          title= 'Histograms of ' + col_to_analyse + ' by ' + categorical_col)
    fig.update_traces(opacity=0.75)
    fig.update_layout(xaxis = {"categoryorder":"category ascending"})
    if x_range:
      fig.update_xaxes(range=x_range)
      fig.show()
    if save_file_name:
      fig.update_layout(title=None)
      fig.write_image(images_folder + "/" + save_file_name)
    fig.show()

def delay_data_plots_facet_histogram(col_to_analyse, categorical_col, x_range=False, save_file_name=False):
    mydf = merged.dropna()
    mydf = mydf.loc[mydf['dep_delay'] > 0, :]
    ordered_cats = sorted(list(OrderedSet(mydf[categorical_col].values)))
    length = len(set(mydf[categorical_col]))
    subset = mydf.loc[mydf['change_reason_code']!= 'no reason']
    if length > 4:
        fig = px.histogram(subset, x=col_to_analyse, color=categorical_col, facet_col=categorical_col, facet_col_wrap=4, height=600,
                   color_discrete_sequence=px.colors.qualitative.Bold, category_orders={categorical_col:ordered_cats})
    else:
        fig = px.histogram(subset, x=col_to_analyse, color=categorical_col, facet_col=categorical_col, facet_col_wrap=4,
                   color_discrete_sequence=px.colors.qualitative.Bold, category_orders={categorical_col:ordered_cats},
                          title= 'Histograms of ' + col_to_analyse + ' by ' + categorical_col)
    fig.update_traces(opacity=0.75)
    if x_range:
        fig.update_xaxes(range=x_range)
        fig.show()
    if save_file_name:
      fig.write_image(images_folder + "/" + save_file_name)
    fig.show()

def delay_data_plots_agg_bar(col_to_analyse, categorical_col, aggregate='mean', save_file_name=False):
    mydf = merged.dropna()
    mydf = mydf.loc[mydf['dep_delay'] > 0, :]
    ordered_categorical = list(OrderedSet(mydf[categorical_col].values))
    mydf = mydf[[col_to_analyse, categorical_col]]
    if aggregate == 'mean':
        mydf = mydf.groupby([categorical_col]).mean()
        title_start = 'Average '
    if aggregate == 'sum':
        mydf = mydf.groupby([categorical_col]).sum()
        title_start = 'Total '
    if aggregate == 'count':
        mydf = mydf.groupby([categorical_col]).count()
        title_start = 'Counted '
    mydf = mydf.reset_index()
    mydf.columns = [categorical_col, title_start + col_to_analyse]
    fig = px.bar(mydf, x=mydf.columns[0], y=mydf.columns[1], height=350, title= title_start + col_to_analyse + ' by ' + categorical_col,
                 color_discrete_sequence=px.colors.qualitative.Bold, color=mydf.columns[0])
    fig.update_layout(xaxis = {"categoryorder":"category ascending"})
    fig.update_traces(opacity=0.75)
    if save_file_name:
      fig.update_layout(title=None)
      fig.write_image(images_folder + "/" + save_file_name)
    fig.show()

def plots_agg_bar(col_to_analyse, categorical_col, aggregate='mean', save_file_name=False):
    mydf = merged.dropna()
    ordered_categorical = list(OrderedSet(mydf[categorical_col].values))
    mydf = mydf[[col_to_analyse, categorical_col]]
    if aggregate == 'mean':
        mydf = mydf.groupby([categorical_col]).mean()
        title_start = 'Average '
    if aggregate == 'sum':
        mydf = mydf.groupby([categorical_col]).sum()
        title_start = 'Total '
    if aggregate == 'count':
        mydf = mydf.groupby([categorical_col]).count()
        title_start = 'Counted '
    mydf = mydf.reset_index()
    mydf.columns = [categorical_col, title_start + col_to_analyse]
    fig = px.bar(mydf, x=mydf.columns[0], y=mydf.columns[1], height=350, title= title_start + col_to_analyse + ' by ' + categorical_col,
                 color_discrete_sequence=px.colors.qualitative.Bold, color=mydf.columns[0])
    fig.update_layout(xaxis = {"categoryorder":"category ascending"})
    fig.update_traces(opacity=0.75)
    if save_file_name:
      fig.write_image(images_folder + "/" + save_file_name)
    fig.show()

In [25]:
ac_reg = merged['ac_registration'].unique()

my_list = []
for reg in ac_reg:
  subset = merged.loc[merged['ac_registration'] == reg][['ac_registration', 'day_of_origin', 'flight_number_of_day']]
  max_flights_per_day = subset.groupby('day_of_origin')['flight_number_of_day'].max().reset_index(drop=False)
  max_flights_per_day['flight_number_of_day'] = max_flights_per_day['flight_number_of_day'].astype('int')
  average_flights_per_day = max_flights_per_day['flight_number_of_day'].mean(axis=0)
  my_list.append({'ac_registration': reg, 'average_flights_per_day': average_flights_per_day})
average_flights_df = pd.DataFrame(my_list)
average_flights_df = average_flights_df.sort_values(by="average_flights_per_day", ascending=False)
mean_flights = np.round(average_flights_df['average_flights_per_day'].mean(), 4)

fig = px.bar(average_flights_df, x='ac_registration', y='average_flights_per_day', height=350, title= 'Average flights per day, for each aircraft registration', color_continuous_scale=px.colors.sequential.Viridis,
             color='average_flights_per_day')
fig.add_hline(y=mean_flights, annotation_text='Mean: ' + f'{mean_flights}', line_dash="dot")
fig.update_layout(coloraxis_showscale=False, title=None, showlegend=False, width=1700, height=600)
fig.write_image(images_folder + "/avg_flights_per_day.png")
fig.show()

In [26]:
# Univariate analysis of number_of flights per day, for each day_of_week
mydf = merged.groupby(['day_of_week']).count()
mydf = mydf.reset_index()
mydf = mydf[['day_of_week', 'day_of_origin']]
mydf.columns = ['day_of_week', 'count']
fig = px.bar(mydf, x='day_of_week', y='count', color='count', color_continuous_scale=px.colors.sequential.Viridis)
fig.write_image(images_folder + "/flights_per_day_of_week.png")
fig.show()

In [27]:
# There appear to be more flights on the weekend. Analysing the max flight_number_of_day, per day_of_week will give
# further insight.
investigation = merged[['day_of_origin','ac_registration','flight_number_of_day']]
ac_reg = OrderedSet(investigation['ac_registration'].values)
days = OrderedSet(investigation['day_of_origin'].values)

mydict = dict()
for reg in ac_reg:
    mydict_ = dict()
    for day in days:
        max_subset = investigation.loc[(investigation['ac_registration'] == reg) & (investigation['day_of_origin'] == day)]['flight_number_of_day'].max()
        mydict_.update({day: max_subset})
    mydict.update({reg : mydict_})

In [28]:
mydf = pd.DataFrame(mydict)
mydf = mydf.fillna(0).sort_index(ascending=False)

for col in mydf:
    if col != 'index':
        mydf[col] = mydf[col].astype('int')

fig = px.imshow(mydf, color_continuous_scale='viridis', height=700, width=1100, aspect="equal", #text_auto=True,
                title='Heat map of number of flights per day per aircraft registration for June')
fig.update_yaxes(autorange="reversed")
fig.update_layout(xaxis = dict(tickmode = 'linear', tick0 = 0.5, dtick = 0.75))
fig.update_layout(title=None)
fig.write_image(images_folder + "/flights_per_ac_date.png")
fig.show()

In [29]:
# Obviously the testing data is showing up as zero here too, aircraft with low flights will be vertically low (horizontally
# low indicates the testing data).

# In cases where there are no flights, these appear to be buffered by a low number of flights on days leading to the date and
# days following the date. This could indicate a possible maintenance step being performed, if emergency or not cannot be
# accurately determined.

# ECLGEX and ECLGNX appear to be offline the majority of the time, possibly with the load being absorbed by other aircraft eg ECLGMX

In [30]:
# Further investigation of ECLGEX
mydf = mydf.reset_index()
mydf = mydf.rename(columns={'index':'date'})
fig = px.bar(mydf, x='date', y='ECLGEX', color='ECLGEX', color_continuous_scale=px.colors.sequential.Viridis, title='Flights per day for ECLGEX')
fig.show()

In [31]:
# Creating an indicator to show the aircraft has a low flight count
mydict = {}
for col in mydf.columns:
    if col!= 'date':
        total = mydf[col].sum()
        mydict.update({col: total})

mydf = pd.DataFrame.from_dict(mydict, orient='index', columns=['num_flights'])
mydf = mydf.sort_values(by=['num_flights'])

merged.loc[(merged['ac_registration'] == 'ECLGEX') | (merged['ac_registration'] == 'ECLGNX'), ['low_flight_count']] = 1

merged['low_flight_count'] = merged['low_flight_count'].where(merged['low_flight_count'].notnull(), 0)
merged['low_flight_count'] = merged['low_flight_count'].astype('int')
merged['low_flight_count'].value_counts()

low_flight_count
0    6418
1      45
Name: count, dtype: int64

### Looking at the distribution of arrival and departure times

In [32]:
merged['arrival_hour'] = merged['m_onblockdt_imputed'].dt.hour
merged['departure_hour'] = merged['m_offblockdt'].dt.hour

a_investigation = merged[['arrival_hour']]
a_investigation.columns = ['hour']
a_investigation['count'] = 1
a_investigation['type'] = 'arrival'

b_investigation = merged[['departure_hour']]
b_investigation.columns = ['hour']
b_investigation['count'] = 1
b_investigation['type'] = 'departure'

investigation = pd.concat([b_investigation, a_investigation], axis=0)

In [33]:
fig = px.histogram(investigation, x='hour', color='type', facet_col='type', facet_col_wrap=2, height=500,
                   color_discrete_sequence=px.colors.qualitative.Bold,
                          title= 'Histograms of number of arrivals and departures by hour')
fig.update_traces(opacity=0.75)
fig.write_image(images_folder + "/flights_per_ac_date.png")
fig.show()

In [34]:
# Doesn't indicate anything unexpected, peak travel times are 5, 8 AM and 4, 6PM for departures. Will look at departures per aircraft to see if there is
# a relaince on a particular aircraft.

In [35]:
def creates_heatmap_num_cat_col(numeric_col, categorical_col):
    investigation = merged[[numeric_col, categorical_col]]
    investigation['count'] = 1

    mydf = investigation.groupby([numeric_col, categorical_col]).count()
    mydf = mydf.reset_index()
    mydf = mydf.pivot(index=numeric_col, columns=categorical_col, values='count').fillna(0).sort_index(ascending=True)

    for col in mydf:
        mydf[col] = mydf[col].astype('int')

    fig = px.imshow(mydf, color_continuous_scale='viridis', height=700, width=1100, aspect="equal",
                title='Heatmap of number of flights per ' + numeric_col + ' per ' + categorical_col)
    fig.update_yaxes(autorange="reversed")
    fig.update_layout(xaxis = dict(tickmode = 'linear', tick0 = 0.5, dtick = 0.75))
    fig.write_image(images_folder + "/flights_per_ac_hour.png")
    fig.show()

In [36]:
creates_heatmap_num_cat_col('departure_hour', 'ac_registration')

In [37]:
# Better to classify a time with a count greater than 400 as a high departure time.
investigation = investigation.groupby(['hour', 'type']).sum().reset_index()
investigation = investigation.loc[(investigation['type'] == 'departure') & (investigation['count'] >= 400)]

mylist = []
for hour in merged['departure_hour'].to_list():
    if hour in investigation['hour'].to_list():
        mylist.append(1)
    else:
        mylist.append(0)
merged['high_departure_hour'] = mylist
# merged.to_excel('merged.xlsx')

### Analysing the volume per route

In [38]:
# Investigating whether there flight routes that are heavily travelled making up the majority of the data
investigation = merged[['route','day_of_origin']].groupby(by=['route']).count().reset_index()
investigation.columns = ['route', 'count']
investigation = investigation.sort_values(by=['count'], ascending=False)
investigation['perc'] = investigation['count']/investigation['count'].sum()
investigation['cumulative_perc'] = investigation['perc'].cumsum()

fig = go.Figure(
    data=go.Bar(
        x=investigation['route'].values,
        y=investigation['count'].values,
        name="Route",
        marker=dict(color="turquoise"),
    )
)
fig.add_trace(
    go.Scatter(
        x=investigation['route'].values,
        y=investigation['cumulative_perc'].values,
        yaxis="y2",
        name="Cumulative percentage",
        marker=dict(color="purple"),
    )
)
fig.update_layout(
    title="Count and Cumulative Percentage of Route Volume",
    legend=dict(yanchor="top", xanchor="center", y=1.15, x=0.5, orientation="h"),
    width=1100, height=700,
    yaxis=dict(
        title=dict(text="Count"),
        side="left",
        range=[0, 300],
    ),
    yaxis2=dict(
        title=dict(text="Cumulative percentage"),
        side="right",
        range=[0, 1.2],
        overlaying="y",
        tickmode="sync",
    ),
)
fig.update_layout(title=None)
fig.write_image(images_folder + "/flight_pareto.png")
fig.show()

In [39]:
# Although the top 48 routes constitute 80% of the flight volume there is not a definite skewness and relaince on certain routes

## Numeric variables analysed by categorical variables

### Actual duration analysed by various categories

In [40]:
merged['flight_number_of_day'] = merged['flight_number_of_day'].astype('str')
plots_agg_bar('actual_duration', 'flight_number_of_day', 'mean')

In [41]:
plots_facet_histogram('actual_duration', 'flight_number_of_day')

In [42]:
# Here it is noted that we generally see shorter duration flights as the day progresses.
# This does make sense since aircraft will be occupied to fly longer length flights, thus we are looking at a situation of
# many shorter flights versus a longer flight duration.

In [43]:
plots_agg_bar('actual_duration', 'day_of_week', 'mean')

In [44]:
plots_facet_histogram('actual_duration', 'day_of_week', save_file_name='act_dur_by_day.png')

In [45]:
# Saturday and Sunday flights appear longer in duration - check statistics in order to verify

In [46]:
my_list, my_list_2 = [], []
for i in merged['day_of_week'].unique():
  subset = merged.loc[merged['day_of_week'] == i][['actual_duration']]
  stats = col_statistics(subset, 'actual_duration', False)
  my_list.append(stats)
  my_list_2.append(stats.columns[0] + '_' + str(i))
stats = pd.concat(my_list, axis=1)
stats.columns = my_list_2
stats = stats.reindex(sorted(stats.columns), axis=1)
stats

Unnamed: 0,actual_duration_0,actual_duration_1,actual_duration_2,actual_duration_3,actual_duration_4,actual_duration_5,actual_duration_6
count,822.0,857.0,837.0,842.0,846.0,1106.0,1153.0
mean,91.390511,91.002334,90.962963,93.31829,92.485816,102.094033,98.535126
std,39.508454,38.915977,38.106716,41.007535,39.019148,47.63504,43.463894
min,30.0,30.0,32.0,30.0,29.0,28.0,29.0
25%,67.0,66.0,67.0,68.0,68.0,70.0,71.0
50%,84.0,84.0,84.0,85.0,85.0,91.0,89.0
75%,105.0,106.0,104.0,106.0,107.75,124.75,117.0
max,284.0,271.0,270.0,298.0,270.0,311.0,282.0
mode,74.0,63.0,82.0,82.0,90.0,85.0,75.0
median,84.0,84.0,84.0,85.0,85.0,91.0,89.0


In [47]:
# Does seem to be the case that there are longer flights on Saturdays and Sundays (day_of_week = 5 and 6).
# Possibly holiday destinations versus standard short commuter flights in the week?

In [48]:
plots_agg_bar('actual_duration', 'Crew_Group', 'mean')

In [49]:
plots_facet_histogram('actual_duration', 'Crew_Group')

In [50]:
# Flights by A crew group are the most numerous, however for all groups the general duration appears around 100 minutes.
# Thus there does not appear to be a definite situation where the starting or new crew group takes longer flights.

In [51]:
plots_agg_bar('actual_duration', 'Ac_Type_Code', 'mean')

In [52]:
plots_facet_histogram('actual_duration', 'Ac_Type_Code', save_file_name='act_dur_by_ac_code.png')

In [53]:
# The above indicates that the 320 aircraft is being used most frequently (most of the aircraft fleet are 320s).
# The DH4 aircraft might be smaller in size since they are generally used for shorter duration flights, while the E95 is used for medium
# duration flights, and the 320 is used for the longest duration flights (Can't say for sure so will not add indicator)

In [54]:
merged['mingt_mode_imputed'] = merged['mingt_mode_imputed'].astype('str')
plots_agg_bar('actual_duration', 'mingt_mode_imputed', 'mean')

In [55]:
plots_facet_histogram('actual_duration', 'mingt_mode_imputed')

In [56]:
# The above shows that flights with the mingt_mode as 40 are dominant and also take around 100 minutes while those at mingt_mode=35
# are far shorter in duration. mingt_mode as 45 have the longest durations. Thus these could be the larger aircraft?
# Difficult to deduce anything from this since a ground time of 35 minutes is close to 40 minutes and 45 minutes.

In [57]:
mydf = merged[['actual_duration', 'mingt_mode_imputed', 'Ac_Type_Code']]
mydf = mydf.groupby(['Ac_Type_Code', 'mingt_mode_imputed']).mean()
mydf = mydf.reset_index()

fig = px.bar(mydf, x="actual_duration", y="mingt_mode_imputed", color='Ac_Type_Code', orientation='h',
             height=400, color_discrete_sequence=px.colors.qualitative.Bold,
             title='Min Groundtime by Ac_type')
fig.update_traces(opacity=0.75)
fig.show()

In [58]:
# Thus it is noted that the 320's have 40-45 minutes minimum groundtime, while our smaller aircraft (DH4 and E95) have lower
# minimum groundtimes.

### Departure delay analysed by various categories

In [59]:
delay_data_plots_facet_histogram('dep_delay', 'Ac_Type_Code')

In [60]:
delay_data_plots_agg_bar('dep_delay', 'Ac_Type_Code', 'mean', save_file_name='delay_ac_code.png')

In [61]:
# The above shows that where a delay occurs, this is generally of the same duration across the aircraft types,
# roughly 24-26 minutes.

In [62]:
delay_data_plots_facet_histogram('dep_delay', 'duration_bin')

In [63]:
# This shows that flights in bin 1 are more likely to have delays, however this is not significantly longer than the
# other proximity zones.

In [64]:
delay_data_plots_facet_histogram('dep_delay', 'Crew_Group')

In [65]:
# This indicates that crew group A and Start are more likely to experience delays due to more flights being classified as
# this type. These do tend to be around the 15-20 minute time. Crew group C in comparison has delays of around 30 minutes.

In [66]:
delay_data_plots_facet_histogram('dep_delay', 'mingt_mode_imputed')

In [67]:
# No significant difference in delays across mingt_modes.

In [68]:
delay_data_plots_facet_histogram('dep_delay', 'day_of_week')

In [69]:
merged['day_of_week'] = merged['day_of_week'].astype('str')
delay_data_plots_agg_bar('dep_delay', 'day_of_week', 'mean', save_file_name='delay_day_of_week.png')

In [70]:
delay_data_plots_facet_histogram('arr_delay_imputed', 'change_reason_code')

In [71]:
# Similar to the departure delay it can be noted that there is a definite hierarchy or ordering of change reason codes in
# terms of their relative delays when these are analysed in terms of their arrival delays.

In [72]:
delay_data_plots_facet_histogram('arr_delay_imputed', 'Ac_Type_Code')

In [73]:
# It is noted that the arrival delays are on average shorter than the departure delays, interestingly across all aircraft types
# an average of around 10 minutes' delay is noted.

In [74]:
delay_data_plots_facet_histogram('arr_delay_imputed', 'Crew_Group')

In [75]:
# a higher arrival delay of around 30 minutes is noted for crew_group C versus the other crew groups, however the departure
# delay within this group is also around 30 minutes.

In [76]:
delay_data_plots_facet_histogram('dep_delay', 'Crew_Group_imputed')

In [77]:
delay_data_plots_agg_bar('dep_delay', 'Crew_Group_imputed', 'mean', save_file_name='delay_crew_grp.png')

In [78]:
delay_data_plots_facet_histogram('arr_delay_imputed', 'Crew_Group_imputed')

In [79]:
# Here it is noted that there is almost no delays for departure or arrival when the crew_group is the starting group, although it is
# also noted that there is not sufficient data to make this observation significant.

## Evaluation of variance in data

In [80]:
print('-' * 55)
print(f'{"Variance by Numeric Column" :^55}')
print('-' * 55)

for col in ['sched_duration', 'actual_duration', 'dep_delay', 'arr_delay_imputed', 'trans_time', 'trans_time', 'sched_trans_time',
            'Sched_Groundtime_imputed', 'Act_Groundtime_imputed']:
    print(f"{col:30}: {merged[col].var(ddof=0)}")

-------------------------------------------------------
              Variance by Numeric Column               
-------------------------------------------------------
sched_duration                : 1843.5837399473296
actual_duration               : 1747.1389076787902
dep_delay                     : 635.2979493669304
arr_delay_imputed             : 689.6445882486696
trans_time                    : 1020.7686104472767
trans_time                    : 1020.7686104472767
sched_trans_time              : 993.0214992833795
Sched_Groundtime_imputed      : 3268.2550745481803
Act_Groundtime_imputed        : 4761.279204691728


In [81]:
# Extreme variance is noted across all columns, need to check that this is the case per Ac_reg

ac_reg = OrderedSet(merged['ac_registration'])
mydict = dict()
for col in ['sched_duration', 'actual_duration', 'dep_delay', 'arr_delay_imputed','trans_time', 'trans_time', 'sched_trans_time',
            'Sched_Groundtime_imputed', 'Act_Groundtime_imputed']:
    mydict_ = dict()
    for reg in ac_reg:
        subset_var = merged.loc[merged['ac_registration'] == reg][col].var(ddof=0)
        mydict_[reg] = subset_var

    mydict[col] = {k : v for (k, v) in mydict_.items()}

mydf = pd.DataFrame(mydict)

cols = mydf.columns.to_list()
cols_per_col = [{column: 1} if column in cols[::2] else {column: 2} for column in cols]

rows_per_col = []
for column in cols:
    if column in cols[0:2]:
        _rows_per_col = {column: 1}
    elif column in cols[2:4]:
        _rows_per_col = {column: 2}
    elif column in cols[4:6]:
        _rows_per_col = {column: 3}
    else:
        _rows_per_col = {column: 4}
    rows_per_col.append(_rows_per_col)

col_dict = {}
for i in cols_per_col:
    col_dict.update(i)

row_dict = {}
for i in rows_per_col:
    row_dict.update(i)

nums = {ii:i for i, ii in enumerate(row_dict.keys())}

fig = make_subplots(rows=4, cols=2, subplot_titles=cols)

for key, row_num in row_dict.items():
    col_num = col_dict[key]
    fig.add_trace(
        go.Bar(x=mydf[key].index.to_list(),
               y=mydf[key].to_list(),
        marker=dict(color=px.colors.qualitative.Bold[nums[key]])),
        row = row_num, col = col_num
    )
fig.update_layout(height=1200, width=1000, showlegend=False, title_text="Variance by column, for each Ac_reg")
fig.show()

In [82]:
# High variance for groundtime for EXCLGEX is due to its servicing/low flight count.

## Evaluation of representation of each ac_registration

In [83]:
# The Task requires that the model be able to predict per ac_reg, the arrival times for a connection chain supplied.
# Current representation of the data is as below

subset = merged[['ac_registration', 'day_of_origin']]
subset['to_count'] = 1

ac_reg_count = pd.pivot_table(subset, index=['ac_registration'], values='to_count', aggfunc="sum")
ac_reg_count = ac_reg_count.reset_index()
ac_reg_count = ac_reg_count.sort_values(by=['to_count'], ascending=False)
ac_reg_count['diff_to_mean'] = ac_reg_count['to_count'] - ac_reg_count['to_count'].mean()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=ac_reg_count['ac_registration'].to_list(),
    y=ac_reg_count['diff_to_mean'].to_list(),
    marker=dict(color=ac_reg_count['diff_to_mean'].to_list(), colorscale="tealrose"),
))
fig.update_layout(height=400, width=1100, title_text='Difference to the overall mean for number of records per Ac Registration number')
fig.show()

In [84]:
merged.columns

Index(['day_of_origin', 'leg_no', 'fn_number', 'ac_registration',
       'dep_ap_sched', 'arr_ap_sched', 'dep_sched_datetime',
       'arr_sched_datetime', 'm_offblockdt', 'change_reason_code', 'dep_delay',
       'arr_delay_imputed', 'arr_delay_imputed_null', 'Ac_Type_Code',
       'trans_time', 'sched_trans_time', 'Crew_Group', 'm_onblockdt_imputed',
       'TLC_trans_pos', 'last_flight_indicator', 'first_flight_indicator',
       'sched_duration', 'actual_duration', 'crew_type_change_imputed',
       'Sched_Groundtime_imputed', 'Act_Groundtime_imputed',
       'Crew_Group_imputed', 'change_reason_code_imputed', 'day_of_week',
       'mingt_mode_imputed', 'route', 'duration_bin', 'hub_flight_indicator',
       'flight_number_of_day', 'low_flight_count', 'arrival_hour',
       'departure_hour', 'high_departure_hour'],
      dtype='object')

In [85]:
# Exporting for variable scaling and transformations
merged.to_csv(data_path + 'merged.csv')