In [1]:
import os

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import glob

from pathlib import Path

import scipy.stats as stats

from scipy import stats

In [2]:
def t_test_values(input_BA, # give the input_BA as a string e.g. 'AEC'
                  hot_dir = r'C:\projects\CONUS_TGW_WRF_SSP585_HOT\CONUS_TGW_WRF_SSP585_HOT',
                  cold_dir = r'C:\projects\CONUS_TGW_WRF_SSP585_COLD\CONUS_TGW_WRF_SSP585_COLD'):
    """Compare the 'T2' column of paired hot/cold CSV files for one BA with t-tests.

    Files matching ``{input_BA}_*.csv`` are collected from ``hot_dir`` and
    ``cold_dir``, sorted by path, and paired position by position. For each
    pair a two-sample t-test is run on the 'T2' columns.

    The pooled-variance (Student) t-test is used when the larger variance is
    at most 4 times the smaller one. Otherwise a notice is printed and Welch's
    t-test (``equal_var = False``) is used, so that every file pair contributes
    a genuine p-value to the result.

    Parameters
    ----------
    input_BA : str
        Prefix of the file names to analyse, e.g. 'AEC'.
    hot_dir : str, optional
        Directory holding the hot-scenario CSV files.
    cold_dir : str, optional
        Directory holding the cold-scenario CSV files.

    Returns
    -------
    list
        One p-value per hot/cold file pair, in sorted file order. Empty when
        no file matches ``input_BA``.

    Raises
    ------
    ValueError
        If the number of matching hot and cold files differs, because the
        files could then not be paired reliably.
    """
    # glob returns paths in arbitrary order; sort both lists so that
    # position i in the hot list and position i in the cold list line up.
    list_of_hot_files = sorted(glob.glob(os.path.join(hot_dir, f'{input_BA}_*.csv')))
    list_of_cold_files = sorted(glob.glob(os.path.join(cold_dir, f'{input_BA}_*.csv')))

    if len(list_of_hot_files) != len(list_of_cold_files):
        raise ValueError(
            f"Cannot pair files for {input_BA}: found {len(list_of_hot_files)} hot "
            f"file(s) but {len(list_of_cold_files)} cold file(s)"
        )

    p_value_list = []
    for hot_file, cold_file in zip(list_of_hot_files, list_of_cold_files):

        # read in csv files for hot and cold
        hot_t2_array = pd.read_csv(hot_file, index_col = None, header = 0)['T2'].to_numpy()
        cold_t2_array = pd.read_csv(cold_file, index_col = None, header = 0)['T2'].to_numpy()

        # get var for both hot and cold, to check if they have "equal" variance
        hot_var = np.var(hot_t2_array)
        cold_var = np.var(cold_t2_array)
        larger_var = max(hot_var, cold_var)
        smaller_var = min(hot_var, cold_var)

        # Same rule as "larger / smaller > 4", written as a product so a zero
        # variance cannot cause a division by zero.
        variances_unequal = larger_var > 4 * smaller_var
        if variances_unequal:
            print("The variances are unequal(ratio > 4)")

        # Pooled t-test when the variances are comparable, Welch's otherwise.
        # A test is always run, so p_value is never unset or left over from
        # the previous file pair.
        _t_stat, p_value = stats.ttest_ind(a = hot_t2_array, b = cold_t2_array,
                                           equal_var = not variances_unequal)

        # add p value to list for every file pair
        p_value_list.append(p_value)

    # return a list of the p values of each t-test
    return p_value_list

In [3]:
# run through t_test_values for every BA. return a list of the BAs where there exists a p-value greater than the input
def p_value_BA(input_p_value): # input the input_p_value as a float e.g. 0.05
    """List the BAs that have at least one t-test p-value above ``input_p_value``.

    BA names are taken from the CSV file names in the hot-scenario directory:
    the name is the part of the file stem before the first underscore. Each
    distinct name is passed to ``t_test_values`` and kept when any returned
    p-value is strictly greater than ``input_p_value``.

    Parameters
    ----------
    input_p_value : float
        Threshold that the p-values are compared against, e.g. 0.05.

    Returns
    -------
    list of str
        BA names, in the order they are first seen in the file listing, for
        which some t-test would accept the null hypothesis that the difference
        in group means is zero.
    """
    hot_csv_pattern = os.path.join(r'C:\projects\CONUS_TGW_WRF_SSP585_HOT\CONUS_TGW_WRF_SSP585_HOT', "*.csv")

    # BA name = text before the first "_" of each file name (extension removed)
    ba_prefixes = (
        os.path.splitext(os.path.basename(csv_path))[0].split("_")[0]
        for csv_path in glob.glob(hot_csv_pattern)
    )

    # dict.fromkeys drops repeated names while keeping first-seen order
    BA_names = list(dict.fromkeys(ba_prefixes))

    # any() stops at the first p-value above the threshold, so each BA is
    # listed at most once
    return [
        BA
        for BA in BA_names
        if any(p_value > input_p_value for p_value in t_test_values(BA))
    ]

In [4]:
# List the BAs for which at least one hot-vs-cold T2 t-test p-value exceeds 0.04.
p_value_BA(0.04)

['AEC',
 'CPLE',
 'DUK',
 'ERCO',
 'FPC',
 'GVL',
 'JEA',
 'LGEE',
 'NSB',
 'OVEC',
 'PSCO',
 'SEPA',
 'SOCO',
 'SPA',
 'TAL',
 'TVA']