In [1]:
import numpy as np 
import pandas as pd 
import json
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
from datetime import datetime
import glob
import seaborn as sns
import re
import os

In [2]:
#For loop to combine them into one dataframe

files = ['./Resources/Flight_Data.csv']
all_dataframes = list()

for csv in files:
  df = pd.read_csv(csv)
  all_dataframes.append(df)


combined_data_v1 = pd.concat(all_dataframes)

In [3]:
#Drop unneccesary columns

combined_data_v2 = combined_data_v1.drop(['DELAY_CARRIER_(MINUTES)',
                                          'DELAY_WEATHER_(MINUTES)', 'DELAY_NATIONAL_AVIATION_SYSTEM_(MINUTES)', 'DELAY_SECURITY_(MINUTES)',
                                          'DELAY_LATE_AIRCRAFT_ARRIVAL_(MINUTES)'], axis=1)

#Drop all rows with NA
combined_data_v2 = combined_data_v2.dropna()
combined_data_v2 

Unnamed: 0,CARRIER_CODE,DATE_(MM/DD/YYYY),FLIGHT_NUMBER,TAIL_NUMBER,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE_TIME,ACTUAL_DEPARTURE_TIME,SCHEDULED_ELAPSED_TIME_(MINUTES),ACTUAL_ELAPSED_TIME_(MINUTES),DEPARTURE_DELAY_(MINUTES),WHEELS-OFF_TIME,TAXI-OUT_TIME_(MINUTES)
161,UA,01/01/1995,173,N7299U,IAD,07:00,06:59,69,67,-1,07:08,9
162,UA,01/01/1995,201,N511UA,DEN,16:35,17:02,258,235,27,17:18,16
163,UA,01/01/1995,289,N320UA,DEN,17:55,17:56,264,252,1,18:07,11
164,UA,01/01/1995,291,N321UA,DEN,09:00,08:58,275,262,-2,09:05,7
165,UA,01/01/1995,295,N319UA,DEN,13:30,13:29,266,248,-1,13:42,13
...,...,...,...,...,...,...,...,...,...,...,...,...
132458,UA,06/20/2013,503,N441UA,IAH,17:03,17:02,231,223,-1,17:27,25
132459,UA,06/20/2013,509,N570UA,DEN,17:29,17:28,262,243,-1,17:52,24
132460,UA,06/20/2013,515,N419UA,IAH,14:48,14:43,232,200,-5,14:52,9
132461,UA,06/20/2013,531,N448UA,IAH,10:38,10:42,232,216,4,11:03,21


#Exploratory Data Analysis (EDA)

In [4]:
#Getting summary statistics
dep_max = combined_data_v2['DEPARTURE_DELAY_(MINUTES)'].max()
dep_min = combined_data_v2['DEPARTURE_DELAY_(MINUTES)'].min()
dep_mean = combined_data_v2['DEPARTURE_DELAY_(MINUTES)'].mean()
dep_mode = combined_data_v2['DEPARTURE_DELAY_(MINUTES)'].mode()
dep_sd = combined_data_v2['DEPARTURE_DELAY_(MINUTES)'].std()
dep_md = combined_data_v2['DEPARTURE_DELAY_(MINUTES)'].median()

print("Departure Delay")
print("Max: ", dep_max)
print("Min: ", dep_min)
print("Mean: ", dep_mean)
#print(dep_mode)
print("SD: ", dep_sd)
print("Median: ", dep_md)


Departure Delay
Max:  1149
Min:  -24
Mean:  10.516202249468032
SD:  40.25624015466634
Median:  -1.0


In [6]:
#Function that classifies delay into 5 sections
def classify(num):
  if (num < 0):
    if (num < -30):
      return ('Early: > 30 mins') 
    else:
      return ('Early: < 30 mins')
  else:
    if (num < 30):
      return ('Late: < 30 mins')
    elif (num < 250):
      return ('Late: < 5 hours')
    else:
      return ('Late: > 5 hours')


#Creating new column for delay group
group_dept_df = combined_data_v2[[ 'DESTINATION_AIRPORT', 'DEPARTURE_DELAY_(MINUTES)']]
group_dept_df['Delay Amount'] = group_dept_df['DEPARTURE_DELAY_(MINUTES)'].apply(lambda x: classify(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
group_dept_df

Unnamed: 0,DESTINATION_AIRPORT,DEPARTURE_DELAY_(MINUTES),Delay Amount
161,IAD,-1,Early: < 30 mins
162,DEN,27,Late: < 30 mins
163,DEN,1,Late: < 30 mins
164,DEN,-2,Early: < 30 mins
165,DEN,-1,Early: < 30 mins
...,...,...,...
132458,IAH,-1,Early: < 30 mins
132459,DEN,-1,Early: < 30 mins
132460,IAH,-5,Early: < 30 mins
132461,IAH,4,Late: < 30 mins
