Project by:

- Jack Chen 4427737
- Joost Litjes 4540700
- Felicia Hung 7568479

In [1]:
import os

import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

from dash import Dash, html, dcc, callback, Output, Input


In [2]:
# Helpers

def exportImage(plot, name):
    """Write a Plotly figure to ``plots/<name>.html``.

    Args:
        plot: The figure object handed to ``plotly.io.write_html``.
        name: File name WITHOUT extension; ``'.html'`` is appended here.

    The ``plots`` directory (relative to the current working directory) is
    created on demand, so the export also works on a fresh checkout.
    """
    # write_html opens the target path directly and does not create missing
    # parent directories, so make sure the output folder exists first.
    os.makedirs("plots", exist_ok=True)
    pio.write_html(plot, os.path.join("plots", name + '.html'))


In [3]:
# Load the filtered delayed-flights CSV (path is relative to the working
# directory) and display summary statistics of its numeric columns.
csv_path = "airlinedelaycauses_DelayedFlights_Filtered.csv"
db = pd.read_csv(csv_path)
db.describe()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,...,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0,1247486.0
mean,2008.0,6.065399,15.72542,3.980082,1558.832,1487.949,1616.749,1652.458,2276.526,135.3779,...,741.5867,7.297232,20.66033,0.0,0.0,19.17943,3.703355,15.02162,0.09013728,25.29649
std,0.0,3.508937,8.793008,1.99327,454.33,421.1782,583.9416,461.7372,1997.547,72.29636,...,559.3643,6.033239,16.67983,0.0,0.0,43.54624,21.49153,33.83308,2.022716,42.05489
min,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,14.0,...,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2008.0,3.0,8.0,2.0,1232.0,1150.0,1326.0,1340.0,637.0,83.0,...,334.0,4.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2008.0,6.0,16.0,4.0,1618.0,1529.0,1737.0,1722.0,1589.0,118.0,...,595.0,6.0,16.0,0.0,0.0,2.0,0.0,2.0,0.0,8.0
75%,2008.0,9.0,23.0,6.0,1924.0,1830.0,2048.0,2022.0,3676.0,167.0,...,972.0,8.0,24.0,0.0,0.0,21.0,0.0,15.0,0.0,33.0
max,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2359.0,9741.0,1114.0,...,4962.0,240.0,422.0,0.0,0.0,2436.0,1352.0,1357.0,392.0,1316.0


In [4]:
# Visualization 1: histogram of the ArrDelay column.
# The figure is only built here; it is neither displayed nor exported.
fig1 = px.histogram(
    db,
    x='ArrDelay',
    nbins=30,
    title='Distribution of Arrival Delay',
)



In [5]:
# Visualization 2: one point per row of db, DepDelay on x against ArrDelay on y.
# The figure is only built here; it is neither displayed nor exported.
fig2 = px.scatter(
    db,
    x='DepDelay',
    y='ArrDelay',
    title='Scatter plot of Departure Delay vs. Arrival Delay',
)


What do we want to visualize?

As a customer, I would want to know the following:
- Which carriers are most likely to have delayed flights?


In [6]:
# Mean arrival delay per carrier, ordered from the highest mean to the lowest.
delayed_carriers = (
    db.groupby('UniqueCarrier')['ArrDelay']
    .mean()
    .reset_index()
    .sort_values(by='ArrDelay', ascending=False)
)

# Bar chart of that ranking. The update_* methods return the figure itself,
# so the axis titles can be set in the same expression.
fig1 = (
    px.bar(
        delayed_carriers,
        x='UniqueCarrier',
        y='ArrDelay',
        title='Carriers with Highest Average Arrival Delay',
    )
    .update_xaxes(title='Carrier')
    .update_yaxes(title='Average Arrival Delay (minutes)')
)

# Save the chart as plots/most_delayed_carriers.html
exportImage(fig1, 'most_delayed_carriers')

What do we want to visualize?

As part of a flight carrier company, I would want to know the following:
- (TODO: no carrier-specific questions have been listed yet)

What do we want to visualize?

As part of an airplane manufacturing company, I would want to know the following:
- Which are the most common causes of aircraft delays?
- Which causes of aircraft delays produce the longest delays?
- If an aircraft is delayed by cause x, what is the average delay time for that delay type?

In [7]:
# For each delay cause, count the rows whose recorded delay for that cause
# is strictly greater than 0 minutes.
delay_causes = db[['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].gt(0).sum().reset_index()
delay_causes.columns = ['DelayCause', 'CountGreaterThanZero']

# Bar chart: one bar per cause, height = number of rows with a positive delay
fig5 = px.bar(delay_causes, x='DelayCause', y='CountGreaterThanZero', title='Count of Delays Greater Than 0 Minutes by Cause')
fig5.update_xaxes(title='Delay Cause')
fig5.update_yaxes(title='Count of Delays Greater Than 0 Minutes')

# Export the plot as HTML.
# exportImage appends '.html' itself, so pass the bare name; passing a name
# that already ends in '.html' produced 'how_often_delayed_by_cause.html.html'.
exportImage(fig5, 'how_often_delayed_by_cause')

In [8]:
# Mean delay (column-wise mean over every row of db) for each delay cause,
# reshaped into a two-column frame: DelayCause / AverageDelay.
cause_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
delay_causes = (
    db[cause_columns]
    .mean()
    .rename_axis('DelayCause')
    .reset_index(name='AverageDelay')
)

# One bar per cause; update_* return the figure, so the calls are chained.
fig6 = (
    px.bar(
        delay_causes,
        x='DelayCause',
        y='AverageDelay',
        title='Common Causes for Aircraft Delays',
    )
    .update_xaxes(title='Delay Cause')
    .update_yaxes(title='Average Delay (minutes)')
)

# Save the chart as plots/common_causes_for_aircraft_delays.html
exportImage(fig6, 'common_causes_for_aircraft_delays')


In [9]:
# Average delay per cause, counting only the rows where that cause actually
# contributed (delay > 0): total minutes for the cause / number of such rows.
sum_delays  = db[['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].sum().reset_index()
count_delays_gt_zero  = db[['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']].gt(0).sum().reset_index()
sum_delays.columns = ['DelayCause', 'SumOfDelays']
count_delays_gt_zero.columns = ['DelayCause', 'CountGreaterThanZero']

# Join totals and counts on the cause name, then divide.
# A cause with no positive rows would yield NaN/inf here rather than raise.
result = sum_delays.merge(count_delays_gt_zero, on='DelayCause')
result['Ratio'] = result['SumOfDelays'] / result['CountGreaterThanZero']

# The title previously duplicated the one used for the overall-average chart
# ('Common Causes for Aircraft Delays'); it now describes the plotted ratio.
fig7 = px.bar(result, x='DelayCause', y='Ratio', title='Average Delay Time When Delayed, by Cause')
fig7.update_xaxes(title='Delay Cause')
fig7.update_yaxes(title='Average Delay (minutes)')

# Export the plot as HTML
exportImage(fig7, 'delay_time_if_delayed_by_cause')

What do we want to visualize?

As a member of any of the three groups above, I would want to know the following:
- At what times of the year are flights most likely to be delayed?
- Which departure time slots are most likely to be delayed?
- On which days of the week are flights most likely to be delayed?
- Which arrival time slots are most likely to be delayed?
- Which departure airports are most likely to have delays?
- Which arrival airports are most likely to have delays?
- Which routes are most likely to be delayed?
- Which aircraft are most likely to be delayed?

In [10]:
# Mean arrival delay for every (Month, DayofMonth) pair; groupby sorts the
# keys, so the rows are in calendar order.
delay_by_month_day = db.groupby(['Month', 'DayofMonth'])['ArrDelay'].mean().reset_index()

# Create a date label by combining Month and DayofMonth ("M/D")
delay_by_month_day['Date'] = delay_by_month_day['Month'].astype(str) + '/' + delay_by_month_day['DayofMonth'].astype(str)

# Circular rolling average for smoother lines.
# The previous loop clamped the window at both ends of the table, so the first
# and last days were averaged over a truncated, one-sided window. Here the
# window wraps around instead: the end of the table continues at its start.
window_size = 50  # You can adjust this window size as needed
n = len(delay_by_month_day)
half_window = window_size // 2

daily_delay = delay_by_month_day['ArrDelay'].to_numpy(dtype=float)
# Row i of window_idx holds the positions i-half_window .. i+half_window-1
# (the same span the loop used), reduced modulo n so they wrap around.
# NOTE: if window_size exceeds n, wrapped positions repeat within one window.
window_idx = (np.arange(n)[:, None] + np.arange(-half_window, half_window)[None, :]) % n
# nanmean skips missing daily values, matching pandas' default mean().
smoothed_arr_delay = np.nanmean(daily_delay[window_idx], axis=1).tolist()

delay_by_month_day['Smoothed_ArrDelay'] = smoothed_arr_delay

# Create the line chart with both smoothed and unsmoothed lines
fig1 = px.line(delay_by_month_day, x='Date', y=['ArrDelay', 'Smoothed_ArrDelay'],
                labels={'ArrDelay': 'Unsmoothed ArrDelay', 'Smoothed_ArrDelay': 'Smoothed ArrDelay'},
                title='Average Arrival Delay by Month and Day (with Smoothing)')

fig1.update_xaxes(title_text='Date')
fig1.update_yaxes(title_text='Average Arrival Delay')

# Export the visualization
exportImage(fig1, 'average_arrival_delay_with_smoothing')


In [11]:
# Mean arrival delay for each DayOfWeek value, as a two-column frame.
delay_by_dayofweek = db.groupby('DayOfWeek', as_index=False)['ArrDelay'].mean()

# Line chart of the per-weekday means.
fig2 = px.line(
    delay_by_dayofweek,
    x='DayOfWeek',
    y='ArrDelay',
    title='DayOfWeek Arrival Delay Distribution',
)

# NOTE(review): the file name says "departure" although ArrDelay is plotted;
# kept unchanged here in case other material links to this file.
exportImage(fig2, 'dayofweek_departure_delay_distribution')

In [12]:
# Per-clock-value table (one row per distinct DepTime); kept for any later use.
delay_by_deptime = db.groupby('DepTime')['ArrDelay'].mean().reset_index()

# Hourly view for the chart.
# DepTime looks HHMM-encoded (values run 1..2400), so the hour is DepTime // 100;
# "% 24" folds 2400 onto hour 0. The mean is taken over the raw rows, so each
# hour is weighted by its number of flights. Averaging the per-minute means
# (as the earlier histogram with histfunc='avg' did) gave every distinct clock
# value equal weight regardless of how many flights it represented.
dep_hour = ((db['DepTime'] // 100) % 24).rename('DepHour')
delay_by_dephour = db['ArrDelay'].groupby(dep_hour).mean().reset_index()

fig3 = px.bar(delay_by_dephour, x='DepHour', y='ArrDelay', title='DepTime Arrival Delay Distribution',
              labels={'DepHour': 'Departure Hour', 'ArrDelay': 'Average Arrival Delay (minutes)'})
exportImage(fig3, 'deptime_departure_delay_distribution')

In [13]:
# Per-clock-value table (one row per distinct ArrTime); kept for any later use.
delay_by_arrtime = db.groupby('ArrTime')['ArrDelay'].mean().reset_index()

# Hourly view for the chart.
# ArrTime looks HHMM-encoded (values run 1..2400), so the hour is ArrTime // 100;
# "% 24" folds 2400 onto hour 0. The mean is taken over the raw rows, so each
# hour is weighted by its number of flights. Averaging the per-minute means
# (as the earlier histogram with histfunc='avg' did) gave every distinct clock
# value equal weight regardless of how many flights it represented.
arr_hour = ((db['ArrTime'] // 100) % 24).rename('ArrHour')
delay_by_arrhour = db['ArrDelay'].groupby(arr_hour).mean().reset_index()

fig3 = px.bar(delay_by_arrhour, x='ArrHour', y='ArrDelay', title='ArrTime Arrival Delay Distribution',
              labels={'ArrHour': 'Arrival Hour', 'ArrDelay': 'Average Arrival Delay (minutes)'})
exportImage(fig3, 'arrtime_arrival_delay_distribution')

In [14]:
# Mean arrival delay per Origin airport, highest mean first.
delay_by_origin = (
    db.groupby('Origin')['ArrDelay']
    .mean()
    .reset_index()
    .sort_values(by='ArrDelay', ascending=False)
)

# One bar per origin airport, in the sorted order above.
fig4 = px.bar(
    delay_by_origin,
    x='Origin',
    y='ArrDelay',
    title='Average Arrival Delay by Departure Airport',
)
exportImage(fig4, 'average_arrival_delay_by_departure_airport')

In [15]:
# Mean arrival delay per Dest airport, highest mean first.
delay_by_dest = (
    db.groupby('Dest')['ArrDelay']
    .mean()
    .reset_index()
    .sort_values(by='ArrDelay', ascending=False)
)

# One bar per destination airport, in the sorted order above.
fig5 = px.bar(
    delay_by_dest,
    x='Dest',
    y='ArrDelay',
    title='Average Arrival Delay by Arrival Airport',
)
exportImage(fig5, 'average_arrival_delay_by_arrival_airport')

In [16]:
# Add a Route column to db in place: "<Origin>-<Dest>".
db['Route'] = db['Origin'] + '-' + db['Dest']

# Mean arrival delay per route, highest mean first.
delay_by_route = (
    db.groupby('Route')['ArrDelay']
    .mean()
    .reset_index()
    .sort_values(by='ArrDelay', ascending=False)
)

# One bar per route, in the sorted order above.
fig6 = px.bar(
    delay_by_route,
    x='Route',
    y='ArrDelay',
    title='Average Arrival Delay by Route',
)
exportImage(fig6, 'average_arrival_delay_by_route')

In [17]:
# Mean arrival delay per TailNum value, highest mean first.
delay_by_aircraft = (
    db.groupby('TailNum')['ArrDelay']
    .mean()
    .reset_index()
    .sort_values(by='ArrDelay', ascending=False)
)

# One bar per tail number, in the sorted order above.
fig7 = px.bar(
    delay_by_aircraft,
    x='TailNum',
    y='ArrDelay',
    title='Average Arrival Delay by Aircraft',
)
exportImage(fig7, 'average_arrival_delay_by_aircraft')