# Hypothesis testing

## Load libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.stats import ttest_1samp
import yaml
# Do not truncate columns when displaying wide DataFrames (None = no limit)
pd.set_option('display.max_columns', None)

In [2]:
# Load the project configuration from the YAML file in the parent directory.
# The path uses a forward slash: it works on Windows, Linux and macOS alike and
# avoids the invalid "\c" escape sequence that a backslash path would contain.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except Exception as e:
    print("Error reading the config file", e)
    # Re-raise so the notebook stops here instead of failing later with a
    # confusing NameError because `config` was never defined.
    raise

## Load data

In [3]:
# Read the CSV whose path is stored under data -> clean1 in the config,
# then preview its first rows.
clean_csv_path = config["data"]["clean1"]
delays_cancellations = pd.read_csv(clean_csv_path)
delays_cancellations.head()

Unnamed: 0,fl_date,op_carrier,airline,op_carrier_fl_num,origin,origin_airport,origin_city,origin_state,origin_country,origin_latitude,origin_longitude,dest,dest_airport,dest_city,dest_state,dest_country,dest_latitude,dest_longitude,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,crs_elapsed_time,actual_elapsed_time,air_time,distance
0,2018-01-01,UA,United Air Lines Inc.,2429,EWR,Newark Liberty International Airport,Newark,NJ,USA,40.6925,-74.16866,DEN,Denver International Airport,Denver,CO,USA,39.85841,-104.667,15:17:00,15:12:00,-5.0,15.0,15:27:00,17:12:00,10.0,17:45:00,17:22:00,-23.0,0.0,0,0.0,268.0,250.0,225.0,1605.0
1,2018-01-01,UA,United Air Lines Inc.,2427,LAS,McCarran International Airport,Las Vegas,NV,USA,36.08036,-115.15233,SFO,San Francisco International Airport,San Francisco,CA,USA,37.619,-122.37484,11:15:00,11:07:00,-8.0,11.0,11:18:00,12:23:00,7.0,12:54:00,12:30:00,-24.0,0.0,0,0.0,99.0,83.0,65.0,414.0
2,2018-01-01,UA,United Air Lines Inc.,2426,SNA,John Wayne Airport (Orange County Airport),Santa Ana,CA,USA,33.67566,-117.86822,DEN,Denver International Airport,Denver,CO,USA,39.85841,-104.667,13:35:00,13:30:00,-5.0,15.0,13:45:00,16:31:00,5.0,16:49:00,16:36:00,-13.0,0.0,0,0.0,134.0,126.0,106.0,846.0
3,2018-01-01,UA,United Air Lines Inc.,2425,RSW,Southwest Florida International Airport,Ft. Myers,FL,USA,26.53617,-81.75517,ORD,Chicago O'Hare International Airport,Chicago,IL,USA,41.9796,-87.90446,15:46:00,15:52:00,6.0,19.0,16:11:00,17:48:00,6.0,17:56:00,17:54:00,-2.0,0.0,0,0.0,190.0,182.0,157.0,1120.0
4,2018-01-01,UA,United Air Lines Inc.,2424,ORD,Chicago O'Hare International Airport,Chicago,IL,USA,41.9796,-87.90446,ALB,Albany International Airport,Albany,NY,USA,42.74812,-73.80298,06:30:00,06:50:00,20.0,13.0,07:03:00,09:26:00,10.0,09:22:00,09:36:00,14.0,0.0,0,0.0,112.0,106.0,83.0,723.0


## Hypothesis test

Test whether the population mean of the departure delay, in minutes, is 0 (i.e., on average, departures are neither early nor late).

In [4]:
# H0: population mean of the departure delay (in minutes) = 0
# H1: population mean of the departure delay (in minutes) != 0

In [26]:
# One-sample, two-sided t-test of the departure delays against a mean of 0.
# Missing delay values are dropped before converting to a NumPy array.
array = (
    delays_cancellations['dep_delay']
    .dropna()
    .to_numpy()
)
stat, pval = ttest_1samp(
    array,
    popmean = 0,
    alternative = "two-sided",
)
# Display the test statistic and the p-value as the cell output
(stat, pval)

(592.4299737496306, 0.0)

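The p-value is so small that it underflows to 0.0 in floating-point arithmetic. Since it is below our significance level $\alpha=0.05$, we reject the null hypothesis: the mean departure delay is significantly different from 0 minutes. The positive test statistic indicates that the sample mean delay is greater than 0.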