## Central Limit Theorem Resampler 

In [None]:
# set random seed for repeatable results for random sampling
#np.random.seed(1213145)
# this does the resampling for sample distributions
def CLM_resampler(distribution, sample_size, num_samples):
    """Draw repeated bootstrap samples and return the mean of each one.

    The global NumPy random state is re-seeded with a fixed value on every
    call, so calling this function twice with the same arguments returns
    identical results (and resets the global random stream as a side effect).

    Parameters
    ----------
    distribution : 1-D array-like
        Values to resample from (anything ``np.random.choice`` accepts).
    sample_size : int
        Number of observations drawn, with replacement, for each sample.
    num_samples : int
        Number of samples to draw.

    Returns
    -------
    numpy.ndarray
        Array of length ``num_samples`` holding the mean of each sample.
    """
    # Fixed seed: keeps the resampling repeatable from call to call.
    np.random.seed(1213145)
    sample_means = np.zeros(num_samples)
    for idx in range(num_samples):
        sample = np.random.choice(distribution, size=sample_size, replace=True)
        sample_means[idx] = sample.mean()
    return sample_means

## Central Limit Theorem Resampling Mean

In [None]:
def central_limit_theorem_mean(distribution, sample_size, num_samples):
    """Return the mean of the resampled sample means.

    Draws ``num_samples`` samples of ``sample_size`` observations each (with
    replacement) from ``distribution``, takes the mean of every sample, and
    returns the mean of those means. Uses the global NumPy random state, so
    results vary between calls unless the caller seeds it.

    Parameters
    ----------
    distribution : 1-D array-like
        Values to resample from, e.g. ``df.CARRIER_DELAY``.
    sample_size : int
        Number of observations per sample.
    num_samples : int
        Number of samples to draw.

    Returns
    -------
    float
        Mean of the ``num_samples`` sample means.
    """
    sample_means = np.zeros(num_samples)
    for idx in range(num_samples):
        sample = np.random.choice(distribution, size=sample_size, replace=True)
        sample_means[idx] = sample.mean()
    return sample_means.mean()

## Central Limit Theorem Plotter

In [None]:
def central_limit_theorem_plotter(distribution, sample_size, num_samples):
    """Plot the distribution of resampled sample means and return them.

    Draws ``num_samples`` samples of ``sample_size`` observations each (with
    replacement) from ``distribution``, plots a histogram/KDE of the sample
    means, prints their overall mean, and titles the plot with the sample
    size, the number of samples and the standard error of the mean.

    Parameters
    ----------
    distribution : array-like with a ``.std()`` method
        Values to resample from (NumPy array or pandas Series).
    sample_size : int
        Number of observations per sample.
    num_samples : int
        Number of samples to draw.

    Returns
    -------
    numpy.ndarray
        Array of length ``num_samples`` holding the mean of each sample.
    """
    sample_means = np.zeros(num_samples)
    pop_std = distribution.std()
    for idx in range(num_samples):
        sample = np.random.choice(distribution, size=sample_size, replace=True)
        sample_means[idx] = sample.mean()

    # Standard error of the mean is sigma / sqrt(n), where n is the size of
    # each sample -- not sigma divided by the number of samples drawn.
    std_error = pop_std / np.sqrt(sample_size)

    sns.set_style("ticks", {"xtick.major.size":50, "ytick.major.size":50})
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept
    # here so the plot matches what the rest of the notebook produces.
    ax = sns.distplot(sample_means, bins=50, kde=True)
    title = 'Sample Distribution n = {} and number of samples = {}, std error = {}'.format(
        sample_size, num_samples, round(std_error, ndigits = 3))
    ax.set_xlabel('Sample Mean Delay Times (minutes)', fontdict={'fontsize' : 15})
    print('mean = {}'.format(sample_means.mean()))
    plt.title(title, {'fontsize': 20,
        'fontweight' : 12,
        'verticalalignment': 'baseline'})
    return sample_means

## Proportion Calculator

In [None]:
# calculates proportion of delayed or cancelled flights given our dataframe - very narrow scope for this function, easy to break
def calc_prop(df, prop_type):
    """Compute a cancellation or delay proportion from the flights dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``CANCELLATION_CODE`` for ``'cancelled'``, or
        ``CARRIER_DELAY`` and ``CANCELLED`` for ``'delayed'``.
    prop_type : str
        ``'cancelled'``: rows whose ``CANCELLATION_CODE`` equals ``'A'``,
        as a share of all rows.
        ``'delayed'``: rows with ``CARRIER_DELAY > 0`` among rows where
        ``CANCELLED == 0``, as a share of the rows where ``CANCELLED == 0``.

    Returns
    -------
    tuple or str
        ``(proportion, number_of_observations, number_of_hits)`` for a
        recognised ``prop_type``; otherwise an explanatory message string.
    """
    if prop_type == 'cancelled':
        code_a_mask = df.CANCELLATION_CODE == 'A'
        n_rows = df.shape[0]
        n_code_a = int(code_a_mask.sum())
        return n_code_a / n_rows, n_rows, n_code_a

    if prop_type == 'delayed':
        not_cancelled = df.CANCELLED == 0
        delayed_mask = (df.CARRIER_DELAY > 0) & not_cancelled
        n_not_cancelled = int(not_cancelled.sum())
        n_delayed = int(delayed_mask.sum())
        return n_delayed / n_not_cancelled, n_not_cancelled, n_delayed

    return 'input in second argument must be either "cancelled" or "delayed"'

## Read in files and concatenate to single dataframe
TODO: come back and update this to use the appropriate file path.

In [None]:
def file_read_in(folder_name):
    """Read the six monthly CSV files and return them as one dataframe.

    Files must be named ``{month}_2019.csv`` for april, march, february,
    january, december and november, and are concatenated in that order with
    their original row indices kept.

    Parameters
    ----------
    folder_name : str
        Absolute path of the folder holding the files. An existing directory
        is accepted with or without a trailing path separator; any other
        value is used verbatim as a prefix in front of ``{month}_2019.csv``.

    Returns
    -------
    pandas.DataFrame
        All rows of the six files stacked on top of each other.
    """
    import os  # local import: only needed to normalise the folder path

    months = ['april', 'march', 'february', 'january', 'december', 'november']
    # Add the trailing separator when a bare directory path was supplied.
    if os.path.isdir(folder_name):
        prefix = os.path.join(folder_name, "")
    else:
        prefix = folder_name
    # Read everything first and concatenate once; growing a dataframe inside
    # the loop re-copies all previously read rows on every iteration.
    frames = [pd.read_csv(f"{prefix}{month}_2019.csv") for month in months]
    return pd.concat(frames)

## Calculating Welch's variables

### Welch's t-Test

Recall that Welch's t-Test is given by  

# $ t = \frac{\bar{X_1}-\bar{X_2}}{\sqrt{\frac{s_1^2}{N_1} + \frac{s_2^2}{N_2}}} = \frac{\bar{X_1}-\bar{X_2}}{\sqrt{se_1^2+se_2^2}}$

where $\bar{X_i}$, $s_i^2$, and $N_i$ are the sample mean, sample variance, and sample size, respectively, for sample $i$.

Write a function for calculating Welch's t-statistic using two samples, a and b. To help, 2 potential samples are defined below.

> **Important Note**: While the formula does not indicate it, it is appropriate to take the absolute value of the t-value.

In [None]:
def welch_t(a, b):
    """Return the absolute value of Welch's t statistic for samples a and b.

    Both inputs must expose ``.mean()``, ``.var(ddof=...)`` and ``.size``
    (NumPy arrays or pandas Series).
    """
    mean_gap = a.mean() - b.mean()

    # ddof=1 divides by N - 1, giving the unbiased sample variance; dividing
    # that by the sample size yields each sample's squared standard error.
    sq_err_a = a.var(ddof=1) / a.size
    sq_err_b = b.var(ddof=1) / b.size
    combined_err = np.sqrt(sq_err_a + sq_err_b)

    return np.abs(mean_gap / combined_err)

In [None]:
def welch_df(a, b):
    """Return the effective degrees of freedom for samples a and b.

    Both inputs must expose ``.var(ddof=...)`` and ``.size`` (NumPy arrays
    or pandas Series).
    """
    size_a, size_b = a.size, b.size

    # Unbiased sample variance of each group divided by its sample size.
    term_a = a.var(ddof=1) / size_a
    term_b = b.var(ddof=1) / size_b

    top = (term_a + term_b) ** 2
    bottom = term_a ** 2 / (size_a - 1) + term_b ** 2 / (size_b - 1)

    return top / bottom


## Calculating Cohen's D

In [None]:
def cohen_d(group1, group2):
    """Compute Cohen's d effect size between two groups.

    group1: Series or NumPy array
    group2: Series or NumPy array

    Returns a floating point number: the difference of the group means
    divided by the square root of the size-weighted average of the group
    variances.

    NOTE(review): ``.var()`` is called with its default ``ddof``, which is 0
    for NumPy arrays and 1 for pandas Series, so the same data gives slightly
    different results depending on the container -- confirm which is intended.
    """
    size1, size2 = len(group1), len(group2)

    # Pooled variance: each group's variance weighted by its sample size.
    weighted_var_sum = size1 * group1.var() + size2 * group2.var()
    pooled_sd = np.sqrt(weighted_var_sum / (size1 + size2))

    return (group1.mean() - group2.mean()) / pooled_sd