# American_Airline_Analysis

as presented by: Nateé Johnson & Joe McAllister

## Data Sources

https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236

## IMPORTant Libraries/Configs


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm
import statsmodels
import statsmodels.stats
import statsmodels.stats.proportion


# Start the notebook with seaborn's 'whitegrid' style.
sns.set_style('whitegrid')
# IPython magics: render matplotlib figures inline, using the 'retina' format.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# Do not truncate the column list when pandas displays a DataFrame.
pd.options.display.max_columns = None

## IMPORTant Functions & Data

TODO: figure out the syntax for importing the important helper functions from a single Python file.

In [None]:
## Defining variable:




# Plot customer ratings

In [None]:
# Customer ratings for five carriers (out of 10, per the chart title).
airline = ['American', 'Delta', 'United', 'Southwest', 'Spirit']
ratings = [3, 5, 3, 5, 3]

plt.figure(figsize=(5, 3))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while older releases accept the keyword form as well.
sns.barplot(x=airline, y=ratings).set_title('Customer Ratings out of 10')
plt.ylim(0, 10)  # fix the y-axis to the 0-10 range instead of auto-scaling
plt.savefig('ratings')

## Checking out distributions of raw data

In [None]:
# Histogram bins shared by both plots: width 10, covering 0 up to 240.
delay_bins = range(0, 250, 10)
aa_arr_del15 = AA.ARR_DEL15 == 1

# First plot: carrier delay for every AA row flagged ARR_DEL15 == 1
# (rows with a zero carrier delay are still included here).
sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
sns.distplot(AA.CARRIER_DELAY.loc[aa_arr_del15], bins=delay_bins)

# Second plot: same selection minus rows whose carrier delay equals zero.
# No new figure is created in between, so this presumably lands on the same
# axes as the first plot.
sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
sns.set_style("ticks", {"xtick.major.size": 50, "ytick.major.size": 50})
aa_nonzero_carrier_delay = AA.CARRIER_DELAY != 0
ax = sns.distplot(
    AA.CARRIER_DELAY.loc[aa_arr_del15 & aa_nonzero_carrier_delay],
    bins=delay_bins,
)
title_font = {
    'fontsize': 20,
    'fontweight': 12,
    'verticalalignment': 'baseline',
}
ax.set_title('American Airlines - Carrier Delay Time Distribution', title_font)
ax.set_xlabel('Delay Times (minutes)', fontdict={'fontsize': 15})

In [None]:
# Same two histograms as for AA, drawn for the comparison population `pop`.
pop_delay_bins = range(0, 250, 10)
pop_arr_del15 = pop.ARR_DEL15 == 1

# All rows flagged ARR_DEL15 == 1, zero carrier delays included.
sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
sns.distplot(pop.CARRIER_DELAY.loc[pop_arr_del15], bins=pop_delay_bins)

# Same rows with zero carrier delays removed.
sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
pop_nonzero_carrier_delay = pop.CARRIER_DELAY != 0
sns.distplot(
    pop.CARRIER_DELAY.loc[pop_arr_del15 & pop_nonzero_carrier_delay],
    bins=pop_delay_bins,
)

Looking at the distributions of the AA sample and the total carrier population, neither appears to be normally distributed. I believe we should use Welch's t-test to compare the means of the sample and population - specifically, whether or not AA's average carrier delay time is greater than the population's.

# Graph cancelled flights AA v Top 3 v All others

In [None]:
# Cancellation results per carrier group. calc_prop is defined elsewhere;
# only element [0] of each result is plotted (presumably the proportion -
# TODO confirm against calc_prop).
top_3_prop = calc_prop(top_3_df, 'cancelled')
AA_prop = calc_prop(AA_df, 'cancelled')
not_AA_prop = calc_prop(not_AA_df, 'cancelled')
not_top_4_prop = calc_prop(not_top_4_df, 'cancelled')

plt.figure(figsize=(10, 6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.barplot(
    x=['American', 'Delta/United/Southwest', 'all others'],
    y=[AA_prop[0], top_3_prop[0], not_top_4_prop[0]],
).set_title('Proportion of flights cancelled')
# NOTE(review): sns.set runs after the bars are drawn, so it presumably only
# affects later figures - confirm whether it was meant to precede the plot.
sns.set(context='poster')
plt.savefig('proportion_flights_cancelled_AA_Top3')

# Graph delayed flights

In [None]:
## Delayed-flight results for each carrier group (calc_prop is defined
## elsewhere). These are needed: they feed the bar charts and the
## proportions z-test.
AA_prop_delay = calc_prop(AA_df, 'delayed')
not_AA_prop_delay = calc_prop(not_AA_df, 'delayed')
top_3_prop_delay = calc_prop(top_3_df, 'delayed')
not_top_4_prop_delay = calc_prop(not_top_4_df, 'delayed')

plt.figure(figsize=(10, 6))
## x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
## positionally.
sns.barplot(
    x=['American', 'all others'],
    y=[AA_prop_delay[0], not_AA_prop_delay[0]],
).set_title('Proportion of flights delayed')
## NOTE(review): sns.set runs after the bars are drawn, so it presumably only
## affects later figures - confirm whether it was meant to precede the plot.
sns.set(context='poster')

# Graph proportion delayed flights American v top 3 v all others

In [None]:
# Delayed-flight proportion for American, the Delta/United/Southwest group,
# and all remaining carriers, using element [0] of each calc_prop result.
plt.figure(figsize=(10, 6))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.barplot(
    x=['American', 'Delta/United/Southwest', 'all others'],
    y=[AA_prop_delay[0], top_3_prop_delay[0], not_top_4_prop_delay[0]],
).set_title('Proportion of flights delayed')
# NOTE(review): sns.set runs after the bars are drawn, so it presumably only
# affects later figures - confirm whether it was meant to precede the plot.
sns.set(context='poster')

# Proportions Z-Test! Now we get to the good stuff!

In [None]:
# Display elements [0] and [1] of the delayed-flight calc_prop results for AA
# and for all other carriers.
AA_prop_delay[0],not_AA_prop_delay[0], AA_prop_delay[1],not_AA_prop_delay[1]

In [None]:
# Two-sample proportions z-test on delayed flights: AA vs. all other carriers.
# Element [2] of each calc_prop result is used as the count and element [1]
# as the number of observations.
delay_counts = [AA_prop_delay[2], not_AA_prop_delay[2]]
delay_nobs = [AA_prop_delay[1], not_AA_prop_delay[1]]
zstat, pvalue = statsmodels.stats.proportion.proportions_ztest(
    count=delay_counts,
    nobs=delay_nobs,
)
zstat, pvalue

P is approximately zero, so we reject the null hypothesis - the difference between the two proportions is significant.

## Method Review: 

- Isolated key data of interest (Carrier_Delay)
- Resample this data using "central limit theorem plotter" function
- Save resampled distribution as variable
- Use the saved variable in the Welch's t, Welch's degrees-of-freedom, and p-value functions. Then compare to using `stats.ttest_ind(group1, group2, equal_var=False)`
- Use Cohen's D to find how different they are

### Welch's t-test on raw data

In [None]:
## Series needed for Welch's functions: non-zero carrier delays on rows
## flagged ARR_DEL15 == 1, for AA and for the comparison population.
AA_C_Delays = AA.CARRIER_DELAY.loc[(AA.ARR_DEL15 == 1) & (AA.CARRIER_DELAY != 0)]
pop_C_Delays = pop.CARRIER_DELAY.loc[(pop.ARR_DEL15 == 1) & (pop.CARRIER_DELAY != 0)]

## welch_t / welch_df are defined elsewhere in the project.
t = welch_t(AA_C_Delays, pop_C_Delays)
dof = welch_df(AA_C_Delays, pop_C_Delays)
## Upper-tail p-value. stats.t.sf is the survival function (1 - cdf) and
## avoids the round-off that turns `1 - cdf` into exactly 0 for large t.
p = stats.t.sf(t, dof)
print(p)

## Perhaps this result isn't meaningful because this test assumes a normal 
## distribution, which we have not observed for this dataset.


## Making stuff Logarithmic!!

In [None]:
## The delay data is right-skewed, so ANOVA-style tests may not be useful on
## the raw values; add a natural-log column to work on a log scale instead.
AA_C_Delays_df = pd.DataFrame(AA_C_Delays)
aa_log_delay = np.log(AA_C_Delays_df['CARRIER_DELAY'])
AA_C_Delays_df['lognorm'] = aa_log_delay
AA_C_Delays_df.head()

In [None]:
# Distribution of the log-transformed AA carrier delays.
sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
sns.set_style("ticks", {"xtick.major.size": 50, "ytick.major.size": 50})
ax = sns.distplot(AA_C_Delays_df['lognorm'])
log_title_font = {
    'fontsize': 20,
    'fontweight': 12,
    'verticalalignment': 'baseline',
}
ax.set_title('American Airlines - Carrier Delay Time Distribution', log_title_font)
ax.set_xlabel('Log(Delay Times) (minutes)', fontdict={'fontsize': 15})

In [None]:
# Same log transform for the comparison population, followed by its histogram.
pop_C_Delays_df = pd.DataFrame(pop_C_Delays)
pop_log_delay = np.log(pop_C_Delays_df['CARRIER_DELAY'])
pop_C_Delays_df['lognorm'] = pop_log_delay

sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
sns.distplot(pop_C_Delays_df['lognorm'])

In [None]:
# Welch's t-test on the log-transformed carrier delays (AA vs. population).
# welch_t / welch_df are defined elsewhere in the project.
t_log = welch_t(AA_C_Delays_df['lognorm'], pop_C_Delays_df['lognorm'])
dof_log = welch_df(AA_C_Delays_df['lognorm'], pop_C_Delays_df['lognorm'])
# Upper-tail p-value. stats.t.sf is the survival function (1 - cdf) and avoids
# the round-off that turns `1 - cdf` into exactly 0 for large t.
p = stats.t.sf(t_log, dof_log)
print(p)

Still not a normal distribution.
After conversations with classmates, we've learned that for the Welch's t-test, we should apply a resampling method 

Going to use the central limit theorem applied to non-zero carrier delays - to quantify difference in length of delay

Hypothesis read as: "Given there is a carrier delay, is average carrier delay time greater for AA compared to the 16 other airlines"

The subsetted data frames to use: "AA_C_Delays_df" & "pop_C_Delays_df".

Now going to resample data!

In [None]:
## Resample AA's non-zero carrier delays with the project helper (defined
## elsewhere). Presumably it plots and returns the sample means - TODO confirm
## what the 10000 and 5000 arguments control.
AA_RSM = central_limit_theorem_plotter(AA_C_Delays_df.CARRIER_DELAY, 10000, 5000) ## RSM = Random Sample Means

In [None]:
## Same resampling for the comparison population's non-zero carrier delays,
## using the same two numeric arguments as for AA.
pop_RSM = central_limit_theorem_plotter(pop_C_Delays_df.CARRIER_DELAY, 10000, 5000)

In [None]:
# Welch's t-test on the resampled means (AA vs. population).
# welch_t / welch_df are defined elsewhere in the project.
t_RSM = welch_t(AA_RSM, pop_RSM)
print(t_RSM)
dof_RSM = welch_df(AA_RSM, pop_RSM)
# Upper-tail p-value. stats.t.sf is the survival function (1 - cdf) and avoids
# the round-off that turns `1 - cdf` into exactly 0 for large t.
p = stats.t.sf(t_RSM, dof_RSM)
print(p)

In [None]:
## Double checking Welch values using stats package.
## AA_RSM goes first so the sign of the statistic matches
## welch_t(AA_RSM, pop_RSM). Note that ttest_ind reports a two-sided p-value
## by default, whereas the manual calculation is upper-tail only.
stats.ttest_ind(AA_RSM, pop_RSM, equal_var=False)

A good result! - The large t-value shows that there is a statistical difference between the two means.

Now we can use Cohen's d to quantify the size of that difference.
This answers the question - is it just a small effect that never happens, or a large effect that never happens...


### Cohen's $d$, standardized metrics for effect size
Cohen’s $d$ is one of the most common ways to measure effect size. As an effect size, Cohen's d is typically used to represent the magnitude of differences between two (or more) groups on a given variable, with larger values representing a greater differentiation between the two groups on that variable.

$d$ = effect size (difference of means) / pooled standard deviation;

$d = \frac{\mu_1 - \mu_2}{\sigma_{pooled}}$

In [2]:
# Effect size between the AA and population resampled means
# (cohen_d is defined elsewhere in the project).
cohen_d(AA_RSM, pop_RSM)

With a Cohen's d of 5.49, essentially 100% of the treatment group will be above the mean of the control group (Cohen's U3).

### Subsetting to compare to major US Carriers:

#### Variables to plot: 

 - top_3_nonAA_DELAY_RSM
 - AA_RSM
 - everyone_else_DELAY_RSM
 - UA_DELAY_RSM
 - WN_DELAY_RSM 
 - DL_DELAY_RSM

In [None]:
# Resample the carrier delays of the top-3 (non-AA) group, then compare the
# resampled means with AA's. The helpers are defined elsewhere in the project.
top_3_nonAA_DELAY_RSM = central_limit_theorem_plotter(top_3_nonAA_DELAY.CARRIER_DELAY, 10000, 5000)

t_AA_vs_Top3 = welch_t(AA_RSM, top_3_nonAA_DELAY_RSM)
dof_AA_vs_Top3 = welch_df(AA_RSM, top_3_nonAA_DELAY_RSM)
# Upper-tail p-value. stats.t.sf is the survival function (1 - cdf) and avoids
# the round-off that turns `1 - cdf` into exactly 0 for large t.
p = stats.t.sf(t_AA_vs_Top3, dof_AA_vs_Top3)

print(t_AA_vs_Top3)
print(dof_AA_vs_Top3)
print(p)

# Print the scipy cross-check explicitly: only the last expression of a cell
# is displayed, so a bare call here would be silently discarded.
print(stats.ttest_ind(AA_RSM, top_3_nonAA_DELAY_RSM, equal_var=False))
cohen_d(AA_RSM, top_3_nonAA_DELAY_RSM)

In [None]:
# Build the Southwest ('WN') carrier-delay mask in this cell so that it does
# not depend on a later cell having been run first.
# NOTE(review): assumes the intended mask is the WN one - confirm.
mask = (
    (pop.OP_UNIQUE_CARRIER == 'WN')
    & (pop.DEP_DEL15 == 1)
    & (pop.CARRIER_DELAY != 0)
    & pop.CARRIER_DELAY.notna()
)
central_limit_theorem_mean(pop.loc[mask].CARRIER_DELAY, 10000, 5000)

In [None]:
# Testing mask to filter on individual airlines with Delays (at least 15min)
# and Carrier Delays, ignoring NaN values.
# Multi-line conditions are wrapped in parentheses; without them the dangling
# `&` / indented continuation line is a syntax error.
mask = (
    (pop.OP_UNIQUE_CARRIER == 'WN')
    & (pop.DEP_DEL15 == 1)
    & (pop.CARRIER_DELAY != 0)
    & pop.CARRIER_DELAY.notna()
)

pop.loc[mask]

# Every carrier except Delta, Southwest, United and American.
everyone_else = pop.loc[~pop.OP_UNIQUE_CARRIER.isin(['DL', 'WN', 'UA', 'AA'])]

# Checked to see that we have filtered correctly
# everyone_else.OP_UNIQUE_CARRIER.unique()

# NOTE(review): these masks filter on DEP_DEL15, while the AA / population
# series used for the Welch tests filter on ARR_DEL15 - confirm this is
# intended.
mask2 = (
    (everyone_else.DEP_DEL15 == 1)
    & (everyone_else.CARRIER_DELAY != 0)
    & everyone_else.CARRIER_DELAY.notna()
)

everyone_else_DELAY = everyone_else.loc[mask2]
everyone_else_DELAY_RSM = central_limit_theorem_plotter(
    everyone_else_DELAY.CARRIER_DELAY, 10000, 5000
)

In [None]:
# Delta ('DL') rows only; then keep rows flagged DEP_DEL15 == 1 with a
# recorded (non-NaN), non-zero carrier delay, and resample them.
DL = pop.loc[pop.OP_UNIQUE_CARRIER == 'DL']
mask3 = (
    (DL.DEP_DEL15 == 1)
    & (DL.CARRIER_DELAY != 0)
    & DL.CARRIER_DELAY.notna()
)
DL_DELAY = DL.loc[mask3]
DL_DELAY_RSM = central_limit_theorem_plotter(
    DL_DELAY.CARRIER_DELAY, 10000, 5000
)

In [None]:
# Southwest ('WN') rows only; then keep rows flagged DEP_DEL15 == 1 with a
# recorded (non-NaN), non-zero carrier delay, and resample them.
WN = pop.loc[pop.OP_UNIQUE_CARRIER == 'WN']
mask4 = (
    (WN.DEP_DEL15 == 1)
    & (WN.CARRIER_DELAY != 0)
    & WN.CARRIER_DELAY.notna()
)
WN_DELAY = WN.loc[mask4]
WN_DELAY_RSM = central_limit_theorem_plotter(
    WN_DELAY.CARRIER_DELAY, 10000, 5000
)

In [None]:
# United ('UA') rows only; then keep rows flagged DEP_DEL15 == 1 with a
# recorded (non-NaN), non-zero carrier delay, and resample them.
UA = pop.loc[pop.OP_UNIQUE_CARRIER == 'UA']
mask5 = (
    (UA.DEP_DEL15 == 1)
    & (UA.CARRIER_DELAY != 0)
    & UA.CARRIER_DELAY.notna()
)
UA_DELAY = UA.loc[mask5]
UA_DELAY_RSM = central_limit_theorem_plotter(
    UA_DELAY.CARRIER_DELAY, 10000, 5000
)

## Barplot of each airlines Resampled Mean Carrier Delay Time

In [None]:
## Resampled mean carrier delay for each individual airline listed in
## names_to_map.UNIQUE_CARRIER (central_limit_theorem_mean is defined
## elsewhere in the project).
## NOTE(review): this cell reads `df`, while the neighbouring cells read
## `pop` - confirm that is intended.

mean_carrier_delay_airlines_all = {}
for carrier_code in names_to_map.UNIQUE_CARRIER:
    # Dedicated names keep the loop from overwriting the notebook-level
    # `mask` and `airline` variables that other cells rely on.
    carrier_mask = (
        (df.OP_UNIQUE_CARRIER == carrier_code)
        & (df.DEP_DEL15 == 1)
        & (df.CARRIER_DELAY != 0)
        & df.CARRIER_DELAY.notna()
    )
    mean_carrier_delay_airlines_all[carrier_code] = central_limit_theorem_mean(
        df.loc[carrier_mask].CARRIER_DELAY, 10000, 5000
    )

## Materialise the dict views as lists: not every pandas release accepts
## dict_values / dict_keys objects in the DataFrame constructor.
mean_CarrierDelay_all_df = pd.DataFrame(
    data=list(mean_carrier_delay_airlines_all.values()),
    columns=['Mean Carrier Delay'],
    index=list(mean_carrier_delay_airlines_all.keys()),
)

mean_CarrierDelay_all_df


In [None]:
# Outer-join the resampled means with names_to_map on the index so each row
# carries the airline's Description for the x axis.
consol_df = mean_CarrierDelay_all_df.merge(
    names_to_map,
    how='outer',
    left_index=True,
    right_index=True,
)
## Missing American Airlines
plot = sns.barplot(
    x=consol_df['Description'],
    y=consol_df['Mean Carrier Delay'],
)
plot.set_title('Resampled Mean of Carrier Delays for Individual Airlines')
# Rotate the x tick labels by 45 degrees.
for tick_label in plot.get_xticklabels():
    tick_label.set_rotation(45)
    

## Figure: "Resampled Distribution of Means for Carrier Delays by Airline/Group"

In [None]:
plt.figure(figsize=(9, 6))

# One entry per overlaid group:
# (resampled means, legend label, ymax passed to axvline for the mean marker)
delay_groups = [
    (top_3_nonAA_DELAY_RSM, 'Top 3', .65),
    (AA_RSM, 'AA', .5),
    (everyone_else_DELAY_RSM, 'Bottom 13', .4),
    (UA_DELAY_RSM, 'UA', .5),
    (WN_DELAY_RSM, 'WN', .95),
    (DL_DELAY_RSM, 'DL', .4),
]

for group_means, group_label, marker_ymax in delay_groups:
    ax = sns.distplot(group_means, bins=50, kde=True, label=group_label)
ax.set_title('Resampled Distribution of Means for Carrier Delays by Airline/Group')

# Red vertical line at each group's mean.
for group_means, group_label, marker_ymax in delay_groups:
    plt.axvline(group_means.mean(), 0, marker_ymax, color='red', alpha=.8)

ax.legend()

## Figure: "Carrier Delay Distribution - American vs. Everyone Else"

In [None]:
plt.figure(figsize=(9, 6))

# Overlay AA's resampled means with those of the carriers outside the top four.
# Each entry: (resampled means, legend label, ymax for the mean marker)
aa_vs_rest = [
    (AA_RSM, 'AA', .5),
    (everyone_else_DELAY_RSM, 'Bottom 13', .4),
]
for rsm_values, legend_label, marker_ymax in aa_vs_rest:
    ax = sns.distplot(rsm_values, bins=50, kde=True, label=legend_label)
ax.set_title('Carrier Delay Distribution - American vs. Everyone Else')

# Red vertical line at each group's mean.
for rsm_values, legend_label, marker_ymax in aa_vs_rest:
    plt.axvline(rsm_values.mean(), 0, marker_ymax, color='red', alpha=.8)

ax.legend()

## Figure: "American vs. Top 3 (Carrier Delay Times)"

In [None]:
plt.figure(figsize=(9, 6))

# Overlay the top-3 group's resampled means with AA's.
# Each entry: (resampled means, legend label, colour, ymax for the mean marker)
top3_vs_aa = [
    (top_3_nonAA_DELAY_RSM, 'Top 3', 'darkblue', .9),
    (AA_RSM, 'AA', 'darkorange', .7),
]
for rsm_values, legend_label, hist_color, marker_ymax in top3_vs_aa:
    ax = sns.distplot(rsm_values, bins=50, kde=True, label=legend_label, color=hist_color)
ax.set_title('American vs. Top 3 (Carrier Delay Times)')

# Red vertical line at each group's mean.
for rsm_values, legend_label, hist_color, marker_ymax in top3_vs_aa:
    plt.axvline(rsm_values.mean(), 0, marker_ymax, color='red', alpha=.8)

ax.legend()

## Figure: "American vs. Everyone Else"

In [None]:
plt.figure(figsize=(9, 6))

# Overlay the population's resampled means with AA's.
# Each entry: (resampled means, legend label, colour)
pop_vs_aa = [
    (pop_RSM, 'Everyone Else', 'purple'),
    (AA_RSM, 'AA', 'darkorange'),
]
for rsm_values, legend_label, hist_color in pop_vs_aa:
    ax = sns.distplot(rsm_values, bins=50, kde=True, label=legend_label, color=hist_color)
ax.set_title('American vs. Everyone Else')

# Red vertical line at each group's mean; both use ymax = .9.
for rsm_values, legend_label, hist_color in pop_vs_aa:
    plt.axvline(rsm_values.mean(), 0, .9, color='red', alpha=.8)

ax.legend()

## Figure: "American vs. Top 3 Individually (Carrier Delay Times)"

In [None]:
plt.figure(figsize=(9, 6))

# Each entry: (resampled means, extra distplot keywords, ymax for the marker).
# United has no explicit colour, so seaborn picks one for it.
carrier_layers = [
    (AA_RSM, {'label': 'American', 'color': 'darkorange'}, .5),
    (UA_DELAY_RSM, {'label': 'United'}, .5),
    (WN_DELAY_RSM, {'label': 'Southwest', 'color': 'salmon'}, .95),
    (DL_DELAY_RSM, {'label': 'Delta', 'color': 'seagreen'}, .4),
]

for rsm_values, layer_kwargs, marker_ymax in carrier_layers:
    ax = sns.distplot(rsm_values, bins=50, kde=True, **layer_kwargs)

ax.set_title('American vs. Top 3 Individually (Carrier Delay Times)')

# Red vertical line at each carrier's mean.
for rsm_values, layer_kwargs, marker_ymax in carrier_layers:
    plt.axvline(rsm_values.mean(), 0, marker_ymax, color='red', alpha=.8)

ax.legend()

# Moving on to anova for delays at DC area airports DCA/IAD/BWI
Are carrier delay times uniform across airports?  

# Create dfs for Anova
##### Select data airport specific american airlines data

In [None]:
def _aa_carrier_delayed_departures(origin):
    """Return the lean_df rows for one origin airport.

    Keeps rows where ORIGIN equals *origin*, CANCELLED is 0, DEP_DELAY_NEW is
    at least 15, CARRIER_DELAY is positive and CARRIER_NAME is
    'American Airlines Inc.'.
    """
    keep = (
        (lean_df.ORIGIN == origin)
        & (lean_df.CANCELLED == 0)
        & (lean_df.DEP_DELAY_NEW >= 15)
        & (lean_df.CARRIER_DELAY > 0)
        & (lean_df.CARRIER_NAME == 'American Airlines Inc.')
    )
    return lean_df.loc[keep]


# One subset per DC-area airport, all taken from lean_df.
DCA = _aa_carrier_delayed_departures('DCA')
IAD = _aa_carrier_delayed_departures('IAD')
BWI = _aa_carrier_delayed_departures('BWI')

# Anova Testing

In [None]:
# Resample each airport's carrier delays (CLM_resampler is defined elsewhere;
# presumably it returns resampled means - TODO confirm the meaning of the
# 100 and 1000 arguments).
DCA_s = CLM_resampler(DCA.CARRIER_DELAY, 100, 1000)
IAD_s = CLM_resampler(IAD.CARRIER_DELAY, 100, 1000)
BWI_s = CLM_resampler(BWI.CARRIER_DELAY, 100, 1000)

print('DCA mean = ' + str(DCA_s.mean()) + ',IAD mean = ' + str(IAD_s.mean()) + ' BWI mean = '+ str(BWI_s.mean()))

# Pairwise Kruskal-Wallis H-tests between the three airports.
resultsD_I = stats.kruskal(DCA_s, IAD_s)
resultsD_B = stats.kruskal(DCA_s, BWI_s)
# Compare BWI with IAD here; this previously repeated the DCA/BWI test while
# being reported under the 'BWI/IAD' label.
resultsB_I = stats.kruskal(BWI_s, IAD_s)

print('DCA/IAD' + str(resultsD_I) + ', DCA/BWI' + str(resultsD_B) + ', BWI/IAD' + str(resultsB_I))

#### Check for homoscedasticity 

This appears to fail levene test for equal variance.  Bah.  Looks like we can't assume that.

we'll use Kruskal-Wallis H-test instead - this is non parametric version of one-way anova test.  

In [None]:
# Levene test for equal variances across the three airport samples.
stats.levene(DCA_s, IAD_s, BWI_s)

# Graph individual carrier delay distributions for airports

In [None]:
plt.figure(figsize=(12, 22))

pair_bins = range(0, 120, 2)
# One subplot per airport pair: (subplot position, first name, first sample,
# second name, second sample).
airport_pairs = [
    (311, 'DCA', DCA_s, 'IAD', IAD_s),
    (312, 'DCA', DCA_s, 'BWI', BWI_s),
    (313, 'BWI', BWI_s, 'IAD', IAD_s),
]

pair_axes = []
for position, first_name, first_sample, second_name, second_sample in airport_pairs:
    # Print the Kruskal-Wallis and Welch t-test results for the pair, then
    # overlay the two histograms in the pair's subplot.
    print(stats.kruskal(first_sample, second_sample))
    print(stats.ttest_ind(first_sample, second_sample, equal_var=False))
    plt.subplot(position)
    sns.distplot(first_sample, bins=pair_bins)
    pair_ax = sns.distplot(second_sample, bins=pair_bins)
    pair_ax.set_title(
        'Sampling distribution american airlines delays '
        + first_name + ' and ' + second_name
    )
    pair_axes.append(pair_ax)

# Keep the per-subplot axes available under their original names.
ax1, ax2, ax3 = pair_axes

plt.savefig('sampling_distribution_airport_subplots.jpg')

# Graph delay dist all airports

In [None]:
plt.figure(figsize=(9, 6))

airport_bins = range(0, 120, 2)
# Histogram (no KDE) with a fitted normal curve for each airport sample.
airport_layers = [
    (BWI_s, 'BWI', 'blue'),
    (IAD_s, 'IAD', 'orange'),
    (DCA_s, 'DCA', 'green'),
]
for airport_sample, airport_code, hist_color in airport_layers:
    ax = sns.distplot(
        airport_sample,
        bins=airport_bins,
        label=airport_code,
        color=hist_color,
        kde=False,
        fit=norm,
    )
ax.set_title('Sampling distribution american airlines delays')
ax.set(xlabel='Delay length', ylabel='Frequency')
ax.legend()

# Red vertical line at each sample's mean; the third value is axvline's ymax.
mean_markers = [
    (DCA_s, .815),
    (IAD_s, .43),
    (BWI_s, .46),
]
for airport_sample, marker_ymax in mean_markers:
    plt.axvline(airport_sample.mean(), 0, marker_ymax, color='red', alpha=.8)

plt.savefig('american_specific_airport_delays_sample_dist.jpg')