In [13]:
!python3 -m pip install plotly
!python3 -m pip install chart_studio
!python3 -m pip install nbformat

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
from scipy import signal



def day_of_week_arrive_delay_pie_chart(data):
    """Show a pie chart of ``ARR_DELAY`` split by day of the week.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``DAY_OF_WEEK`` and ``ARR_DELAY`` columns. The
        caller's frame is left untouched: ``change_day_of_week_to_category``
        returns a labelled copy, so no extra copy is taken here.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    labelled = change_day_of_week_to_category(data)

    fig = px.pie(
        labelled,
        values='ARR_DELAY',
        names='DATE',
        # Title spelling fixed ("Dalay" -> "Delay").
        title='Arrive Delay for different day in the week',
    )
    # Put the percentage and the weekday label inside each slice.
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()

    
def heat_map_correlation_matrix(data):
    """Show a heat map of the pairwise correlation between numeric columns.

    The identifier-like columns ``YEAR``, ``MONTH``, ``DAY_OF_MONTH``,
    ``FL_NUM`` and ``AIRLINE_ID`` are removed before correlating.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the five columns listed above. It is not
        modified; ``drop`` returns a new frame.

    Raises
    ------
    KeyError
        If any of the five dropped columns is missing from ``data``.
    """
    features = data.drop(columns=['YEAR', 'MONTH', 'DAY_OF_MONTH', 'FL_NUM', 'AIRLINE_ID'])

    # Compute the correlation matrix on numeric/bool columns only.
    # DataFrame.corr() no longer silently skips non-numeric columns in
    # pandas >= 2.0 and raises on them, so they are filtered out explicitly.
    corr = features.select_dtypes(include=['number', 'bool']).corr()

    fig = px.imshow(corr)
    fig.update_layout(width=750, height=750, autosize=False)
    fig.show()

    
def carrier_arrive_delay_day_of_week_box_plot(data):
    """Show box plots of ``ARR_DELAY`` per carrier, one facet per weekday.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``CARRIER``, ``ARR_DELAY`` and ``DAY_OF_WEEK``. Only a
        copy is labelled and plotted.
    """
    # Facets are laid out Monday -> Sunday rather than in order of appearance.
    weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    labelled = change_day_of_week_to_category(data.copy())

    fig = px.box(
        labelled,
        x='CARRIER',
        y='ARR_DELAY',
        color='CARRIER',
        facet_col="DATE",
        category_orders={"DATE": weekday_order},
        title='Which day of the week got the most delay among all Carriers?',
    )
    fig.update_layout(autosize=False, width=1800, height=900)
    fig.show()


def every_carrier_arrive_delay_day_of_week_pie(data):
    """Show one arrival-delay-by-weekday pie chart per carrier.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``CARRIER``, ``ARR_DELAY`` and ``DAY_OF_WEEK``. The
        caller's frame is not modified (the labelling helper returns a copy).

    Notes
    -----
    Carriers are visited in sorted order via ``groupby``, so the figures come
    out in the same order on every run. The frame is split once instead of
    being re-filtered for each carrier.
    """
    labelled = change_day_of_week_to_category(data)

    for carrier, carrier_rows in labelled.groupby('CARRIER'):
        individual_carrier_pie(carrier_rows, str(carrier))


def individual_carrier_pie(data, carrier_name):
    """Show a pie chart of ``ARR_DELAY`` by weekday label for one carrier.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single carrier; must contain ``ARR_DELAY`` and ``DATE``.
    carrier_name : str
        Carrier identifier interpolated into the figure title.
    """
    fig = px.pie(
        data,
        values='ARR_DELAY',
        names='DATE',
        # Title spelling fixed ("Dalay" -> "Delay").
        title='Arrive Delay for carrier %s' % carrier_name,
    )
    # Put the percentage and the weekday label inside each slice.
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()


def change_day_of_week_to_category(new_data):
    """Return a copy of ``new_data`` with a ``DATE`` column of weekday names.

    ``DAY_OF_WEEK`` values 1..7 are translated to ``"Monday"``..``"Sunday"``.
    Any other value is carried over to ``DATE`` unchanged.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Must contain a ``DAY_OF_WEEK`` column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the additional ``DATE`` column.
    """
    day_names = {
        1: "Monday",
        2: "Tuesday",
        3: "Wednesday",
        4: "Thursday",
        5: "Friday",
        6: "Saturday",
        7: "Sunday",
    }

    labelled = new_data.copy()
    # Build the column in a single pass. Writing strings through .loc into a
    # copy of the integer column is an incompatible-dtype assignment, which
    # recent pandas versions deprecate.
    labelled['DATE'] = labelled['DAY_OF_WEEK'].map(lambda day: day_names.get(day, day))
    return labelled


def delay_corre_no_AIR_TIME(df):
    """Plot the per-weekday mean of each delay column in a 2x6 subplot grid.

    Row 1 holds one line per delay column (mean per ``DAY_OF_WEEK``, rounded
    to two decimals). Row 2, columns 2-6, holds ``signal.convolve`` of the
    ``ARR_DELAY`` means with the means of each delay-cause column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``DAY_OF_WEEK``, ``ARR_DELAY``, ``CARRIER_DELAY``,
        ``WEATHER_DELAY``, ``NAS_DELAY``, ``SECURITY_DELAY`` and
        ``LATE_AIRCRAFT_DELAY``. The frame is only read, never modified.
    """
    day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday',
                 6: 'Saturday', 7: 'Sunday'}
    # First entry is the reference series; the rest are compared against it.
    delay_columns = ['ARR_DELAY', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
                     'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']

    fig = make_subplots(rows=2, cols=6, shared_yaxes=True,
    subplot_titles=("ARR","CARRIER", "WEATHER", "NAS", "SECURITY", "LATE_AIRCRAFT",
    "", "ARR corr CARRIER", "ARR corr WEATHER", "ARR corr NAS", "ARR corr SECURITY", "ARR corr LATE_AIRCRAFT"))

    # Mean delay per weekday for every column, re-indexed by weekday name.
    by_day = df.groupby('DAY_OF_WEEK')
    weekday_means = {}
    for column in delay_columns:
        means = by_day[column].mean().round(2)
        means.index = means.index.map(day_names)
        weekday_means[column] = means

    # Row 1: one line per delay column, left to right.
    for position, column in enumerate(delay_columns, start=1):
        means = weekday_means[column]
        fig.add_trace(
            go.Scatter(x=means.index, y=means.values, name=column),
            row=1, col=position,
        )

    # Row 2: ARR_DELAY combined with each delay cause; column 1 stays empty.
    # NOTE(review): signal.convolve defaults to mode='full', so y has 2*N-1
    # points for the N weekday labels on x, and a convolution is not a
    # correlation coefficient -- confirm what this row is meant to show.
    arrival = weekday_means['ARR_DELAY'].values
    for position, column in enumerate(delay_columns[1:], start=2):
        means = weekday_means[column]
        fig.add_trace(
            go.Scatter(
                x=means.index,
                y=signal.convolve(arrival, means.values),
                name='Corre ARR_DELAY %s' % column,
            ),
            row=2, col=position,
        )

    fig.update_layout(height=1000, width=2000, title_text="Mean of the Delay (Day of Week)")
    fig.show()


def delay_corre_AIR_TIME(df):
    """Plot per-weekday mean delays as a percentage of mean ``AIR_TIME``.

    Row 1 holds one line per delay column: the mean per ``DAY_OF_WEEK``
    (rounded to two decimals) divided by the rounded mean ``AIR_TIME`` of the
    same weekday, times 100. Row 2, columns 2-6, holds ``signal.convolve`` of
    the ``ARR_DELAY`` percentages with the percentages of each delay cause.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``DAY_OF_WEEK``, ``AIR_TIME``, ``ARR_DELAY``,
        ``CARRIER_DELAY``, ``WEATHER_DELAY``, ``NAS_DELAY``,
        ``SECURITY_DELAY`` and ``LATE_AIRCRAFT_DELAY``. The frame is only
        read, never modified.
    """
    day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday',
                 6: 'Saturday', 7: 'Sunday'}
    # First entry is the reference series; the rest are compared against it.
    delay_columns = ['ARR_DELAY', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
                     'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']

    fig = make_subplots(rows=2, cols=6, shared_yaxes=True,
    subplot_titles=("ARR","CARRIER", "WEATHER", "NAS", "SECURITY", "LATE_AIRCRAFT",
    "", "ARR corr CARRIER", "ARR corr WEATHER", "ARR corr NAS", "ARR corr SECURITY", "ARR corr LATE_AIRCRAFT"))

    by_day = df.groupby('DAY_OF_WEEK')
    # Mean air time per weekday; identical for every column, so computed once.
    air_time = by_day['AIR_TIME'].mean().round(2).values

    # column -> (weekday-name index, delay as a percentage of air time)
    weekday_share = {}
    for column in delay_columns:
        means = by_day[column].mean().round(2)
        means.index = means.index.map(day_names)
        weekday_share[column] = (means.index, means.values / air_time * 100)

    # Row 1: one line per delay column, left to right.
    for position, column in enumerate(delay_columns, start=1):
        labels, share = weekday_share[column]
        fig.add_trace(
            go.Scatter(x=labels, y=share, name=column),
            row=1, col=position,
        )

    # Row 2: ARR_DELAY combined with each delay cause; column 1 stays empty.
    # NOTE(review): signal.convolve defaults to mode='full', so y has 2*N-1
    # points for the N weekday labels on x, and a convolution is not a
    # correlation coefficient -- confirm what this row is meant to show.
    arrival_share = weekday_share['ARR_DELAY'][1]
    for position, column in enumerate(delay_columns[1:], start=2):
        labels, share = weekday_share[column]
        fig.add_trace(
            go.Scatter(
                x=labels,
                y=signal.convolve(arrival_share, share),
                name='Corre ARR_DELAY %s' % column,
            ),
            row=2, col=position,
        )

    fig.update_layout(height=1000, width=2000, title_text="Mean/AIR_TIME * 100 of the Delay (Day of Week)")
    fig.show()



In [2]:

    ### load the CSV, drop the unused columns, then discard rows with missing values
    data = (
        pd.read_csv("./Flights dataset.csv")
        .drop(columns=['Unnamed: 35', 'CANCELLATION_CODE'])
        .dropna()
    )

    ### render the figures, in the same order as before
    heat_map_correlation_matrix(data)
    day_of_week_arrive_delay_pie_chart(data)
    carrier_arrive_delay_day_of_week_box_plot(data)
    delay_corre_no_AIR_TIME(data)
    delay_corre_AIR_TIME(data)

    ### optional: one pie chart per carrier (left disabled)
    # every_carrier_arrive_delay_day_of_week_pie(data)

FileNotFoundError: [Errno 2] File ./Flights dataset.csv does not exist: './Flights dataset.csv'