In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats  # For statistical tests

In [None]:
# Load the cleaned per-country datasets, keyed by the display name used in plots.
# NOTE(review): paths are relative to the notebook's working directory.
country_files = {
    'Benin': "../data/benin_clean.csv",
    'Sierra Leone': "../data/sierraleone_clean.csv",
    'Togo': "../data/togo_clean.csv",
}

# Tag each frame with its country as it is loaded, so every entry in
# `countries` carries a 'Country' column without a separate mutation pass.
countries = {
    country: pd.read_csv(path).assign(Country=country)
    for country, path in country_files.items()
}

# Stack into one long DataFrame for easier plotting.
# ignore_index=True rebuilds a unique 0..n-1 index; without it the default
# row labels of each CSV (0, 1, 2, ...) repeat once per country, leaving
# the combined frame with a non-unique index.
combined_df = pd.concat(countries.values(), ignore_index=True)

In [None]:
# Side-by-side boxplots: one panel per irradiance metric, one box per country.
metrics = ['GHI', 'DNI', 'DHI']
fig, axes = plt.subplots(1, len(metrics), figsize=(15, 5), squeeze=False)

for ax, metric in zip(axes.ravel(), metrics):
    # Colour the boxes through `hue`: passing `palette` without `hue` is
    # deprecated in seaborn 0.13 and scheduled for removal in 0.14.
    # dodge=False keeps each box centred on its tick when hue duplicates x.
    sns.boxplot(data=combined_df, x='Country', y=metric, hue='Country',
                palette='viridis', dodge=False, ax=ax)

    # The hue legend would only repeat the x tick labels; drop it if seaborn drew one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f'{metric} Distribution by Country')
    ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Build one NaN-free GHI sample per country for the group comparison tests
ghi_samples = []
for frame in countries.values():
    ghi_samples.append(frame['GHI'].dropna())

# Parametric comparison: one-way ANOVA (assumes normality)
anova_result = stats.f_oneway(*ghi_samples)
print(
    f"ANOVA p-value for GHI: {anova_result.pvalue:.4f}"
)

# Non-parametric counterpart: Kruskal-Wallis H-test on the same samples
kruskal_result = stats.kruskal(*ghi_samples)
print(
    f"Kruskal-Wallis p-value for GHI: {kruskal_result.pvalue:.4f}"
)

In [None]:
# Rank countries by their mean GHI (ascending, so the largest bar ends up on top).
mean_ghi = combined_df.groupby('Country')['GHI'].mean().sort_values()

fig, ax = plt.subplots(figsize=(8, 4))
# One palette colour per bar: size the palette from the number of countries
# actually present instead of a hard-coded 3.
mean_ghi.plot(
    kind='barh', color=sns.color_palette('viridis', len(mean_ghi)), ax=ax)
ax.set_title('Average GHI by Country (Ranked)')
ax.set_xlabel('GHI (W/m²)')
plt.show()