In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, kruskal, shapiro, levene

# Load data: read each country's CSV (parsing the Timestamp column as
# datetimes) and tag every row with the country it belongs to.
benin = pd.read_csv(
    "../data/benin-malanville.csv", parse_dates=['Timestamp']
).assign(Country='Benin')
sierraleone = pd.read_csv(
    "../data/sierraleone-bumbuna.csv", parse_dates=['Timestamp']
).assign(Country='Sierra Leone')
togo = pd.read_csv(
    "../data/togo-dapaong_qc.csv", parse_dates=['Timestamp']
).assign(Country='Togo')

# Stack the three frames into a single table with a fresh 0..n-1 index.
frames = [benin, sierraleone, togo]
combined = pd.concat(frames, ignore_index=True)

# Apply the white-grid seaborn theme to the figures drawn below.
sns.set_style('whitegrid')

# Boxplots: one figure per irradiance column, with countries on the x axis.
axis_labels = {
    'GHI': 'Global Horizontal Irradiance (W/m²)',
    'DNI': 'Direct Normal Irradiance (W/m²)',
    'DHI': 'Diffuse Horizontal Irradiance (W/m²)',
}
for metric, axis_label in axis_labels.items():
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(data=combined, x='Country', y=metric)
    ax.set_title(f'{metric} Comparison Across Countries')
    # Label the y axis with the full quantity name and unit.
    ax.set_ylabel(axis_label)
    plt.show()

# Summary stats with flattened columns: per-country mean, median and std of
# each irradiance column, with the (column, statistic) MultiIndex collapsed
# into single names such as "GHI_mean".
irradiance_cols = ['GHI', 'DNI', 'DHI']
stat_names = ['mean', 'median', 'std']
summary = combined.groupby('Country')[irradiance_cols].agg(stat_names)
summary.columns = [
    f'{metric}_{stat_name}'.strip() for metric, stat_name in summary.columns
]
print(summary)

# Test assumptions for GHI
ALPHA = 0.05  # significance level applied to every assumption check below
SHAPIRO_MAX_N = 5000  # SciPy: Shapiro-Wilk p-values may be inaccurate above this N

# Group once and drop missing readings: the SciPy tests propagate NaN by
# default, so a single missing value would turn every result into NaN.
ghi_by_country = {
    name: group['GHI'].dropna()
    for name, group in combined.groupby('Country')
}
groups = [series.values for series in ghi_by_country.values()]

# Normality check per country (an ANOVA assumption).
normality_ok = True
for name, series in ghi_by_country.items():
    # For large groups, test a reproducible random subsample so the reported
    # p-value stays within the range where Shapiro-Wilk is reliable.
    if len(series) > SHAPIRO_MAX_N:
        series = series.sample(n=SHAPIRO_MAX_N, random_state=0)
    stat, p = shapiro(series)
    print(f'Shapiro-Wilk for {name} GHI: p={p:.4f}')
    # A NaN p-value compares False here, which counts as a failed check.
    normality_ok = normality_ok and p >= ALPHA

# Equal-variance check across countries (the other ANOVA assumption).
stat, p = levene(*groups)
print(f'Levene’s test p-value: {p:.4f}')
equal_variance_ok = p >= ALPHA

# ANOVA or Kruskal-Wallis based on assumptions.
# ANOVA is always reported so existing output is preserved.
anova_result = f_oneway(*groups)
print("ANOVA p-value for GHI:", anova_result.pvalue)

# If assumptions fail, use Kruskal-Wallis instead: it is rank-based and does
# not rely on normality, so its p-value is the one to trust in that case.
if not (normality_ok and equal_variance_ok):
    kruskal_result = kruskal(*groups)
    print("Kruskal-Wallis p-value for GHI:", kruskal_result.pvalue)


FileNotFoundError: [Errno 2] No such file or directory: '../data/benin_clean.csv'