# Summarize Data

## Import

In [None]:
import os, sys
sys.path.append(os.path.abspath("."))

In [None]:
import json
import numpy as np
import pandas as pd

In [None]:
import SettingForFeatures

In [None]:
import importlib
importlib.reload(SettingForFeatures)

## Functions

In [None]:
def calculate_bootstrap_se(data_series, n_bootstraps=1000, statistic_func=np.mean, random_state=None):
    """
    Calculates the Bootstrap Standard Error (SE) for a statistic (default is mean).

    The data are resampled with replacement ``n_bootstraps`` times, each
    resample having the same length as the input. The statistic is evaluated
    on every resample and the standard deviation of those values
    (``np.std`` with its default ``ddof=0``) is returned.

    Parameters:
    data_series (pd.Series): The 1-D data to calculate the statistic from.
        Any 1-D array-like accepted by ``np.asarray`` works.
    n_bootstraps (int): The number of resamples (bootstraps).
    statistic_func (function): The statistic function to apply (e.g., np.mean, np.median).
        It receives each resample as a NumPy array.
    random_state (None | int | np.random.RandomState | np.random.Generator):
        Source of randomness. ``None`` (default) draws from NumPy's global
        random state, exactly as before this parameter existed. An int seeds
        a private ``np.random.RandomState``; an object exposing ``choice``
        (``RandomState`` or ``Generator``) is used directly.

    Returns:
    float: The Bootstrap Standard Error of the statistic.
    """
    # Convert once instead of letting np.random.choice re-convert the Series
    # on every iteration; the drawn values are unchanged.
    values = np.asarray(data_series)
    n_samples = len(values)

    # Pick the sampler. Falling back to np.random.choice keeps the default
    # behaviour (global random state) identical for existing callers.
    if random_state is None:
        draw = np.random.choice
    elif hasattr(random_state, "choice"):
        draw = random_state.choice
    else:
        draw = np.random.RandomState(random_state).choice

    # One statistic per resample (with replacement, size equal to the original sample).
    bootstrap_statistics = [
        statistic_func(draw(values, size=n_samples, replace=True))
        for _ in range(n_bootstraps)
    ]

    # The Bootstrap SE is the standard deviation of the bootstrap distribution
    return np.std(bootstrap_statistics)

## Runs

In [None]:
# Placeholder entry-point guard: the body is a bare `pass`, so nothing runs here.
if __name__ == '__main__':
    pass

In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Read environment variables (python-dotenv's load_dotenv), then make the
# project root the working directory so the relative paths used below
# ('./tables', 'results/...') resolve against it.
load_dotenv()
_project_root = os.getenv("PROJECT_ROOT")
if not _project_root:
    # os.chdir(None) would fail with an opaque TypeError; say what is missing.
    raise RuntimeError(
        "PROJECT_ROOT is not set; define it in the environment or in a .env file."
    )
os.chdir(_project_root)

In [None]:
# Directory that receives the exported Excel tables; created if it does not exist yet.
TABLES = './tables'
os.makedirs(TABLES, exist_ok=True)

In [None]:
all_data = SettingForFeatures.data_load_combine_dataset()

In [None]:
np.sum(all_data['Year'].notna())

In [None]:
wave_1_raw = all_data[all_data['Year']==2016][['Prov', 'EcoBelt']].value_counts(dropna=False).sort_index()

In [None]:
wave_2_raw = all_data[all_data['Year']==2022][['Prov', 'EcoBelt']].value_counts(dropna=False).sort_index()

In [None]:
merged_data = wave_1_raw.to_frame().reset_index().merge(wave_2_raw.to_frame().reset_index(), on = ['Prov', 'EcoBelt'], how = 'outer').replace('Sudurpaschim', 'Sudurpashchim').fillna(0)

In [None]:
merged_data.columns = ['Province', 'EcoBelt', 'Respondents in Wave 1', 'Respondents in Wave 2']

In [None]:
merged_data

In [None]:
merged_data.to_excel(os.path.join(TABLES, 'TableS1_respondentCount.xlsx'))

### Data Summary

In [None]:
always_inputs = SettingForFeatures.return_input_variables()

In [None]:
aim_variable = SettingForFeatures.return_output_variables()[0]

In [None]:
data_summary = all_data[[aim_variable] + always_inputs].describe().T.reset_index()

In [None]:
data_summary

In [None]:
VARIABLE_MAP_RENAMED = SettingForFeatures.return_beautiful_dict()

In [None]:
data_summary['index'] = data_summary['index'].map(VARIABLE_MAP_RENAMED)

In [None]:
data_summary

In [None]:
data_summary.to_excel(os.path.join(TABLES, 'Table1_DataSummary.xlsx'))

### Hyperparameter

In [None]:
with open(save_path := os.path.join('results', 'HumanDiseaseIncreasePast25_Dummy_accuracy_comparison.json'), 'r', encoding='utf-8') as f:
    accuracy_comparison = json.load(f)

In [None]:
df = pd.DataFrame({
    'model': accuracy_comparison
}).reset_index().rename(columns={'index': 'model'})

In [None]:
df

In [None]:
df.to_excel(os.path.join(TABLES, 'TableS2_HyperTable.xlsx'))