# Task-3 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, kruskal

# Read each country's cleaned CSV and tag every row with a 'Country' label,
# so the rows stay attributable to their source once the frames are stacked.
df_benin = pd.read_csv('../../data/clean/benin_clean.csv').assign(Country='Benin')
df_togo = pd.read_csv('../../data/clean/togo_clean.csv').assign(Country='Togo')
df_sierra = pd.read_csv('../../data/clean/sierra_leone_clean.csv').assign(Country='Sierra Leone')

# Stack the three frames into one; ignore_index gives the result a fresh
# 0..n-1 row index instead of repeating each file's own row numbers.
country_frames = [df_benin, df_togo, df_sierra]
df_all = pd.concat(country_frames, ignore_index=True)


# Metric Comparison with Boxplots

In [None]:
# Apply seaborn's white-grid theme to every figure drawn from here on
sns.set_theme(style="whitegrid")

# Draw one figure per irradiance metric, with one box per country
irradiance_metrics = ('GHI', 'DNI', 'DHI')
for metric in irradiance_metrics:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df_all, x='Country', y=metric, hue='Country', palette='Set2', ax=ax)

    # hue repeats the x variable, so a legend (if seaborn drew one) is redundant
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f'Boxplot of {metric} by Country')
    ax.set_xlabel('Country')
    ax.set_ylabel(metric)
    fig.tight_layout()
    plt.show()


# Metric Comparison with Summary Table 

In [None]:
# Per-country mean, median and standard deviation of each irradiance metric
irradiance_cols = ['GHI', 'DNI', 'DHI']
stat_names = ['mean', 'median', 'std']
summary_stats = df_all.groupby('Country')[irradiance_cols].agg(stat_names)

# agg() yields two-level (metric, statistic) column labels; collapse each pair
# into a single flat name such as 'GHI_mean' so the table is easier to read.
summary_stats.columns = [
    f'{metric_name}_{stat_name}'.strip()
    for metric_name, stat_name in summary_stats.columns
]

# Show the table (display() is supplied by the notebook environment)
print("Summary Statistics (Mean, Median, Std) for GHI, DNI, DHI by Country:")
display(summary_stats)



# Statistical testing (One-way ANOVA and Kruskal-Wallis test)

In [None]:
# Extract GHI values per country, discarding missing readings first.
# Both tests below propagate NaN by default: a single missing value in any
# sample would make the reported statistic and p-value NaN.
ghi_benin = df_benin['GHI'].dropna()
ghi_togo = df_togo['GHI'].dropna()
ghi_sierra = df_sierra['GHI'].dropna()

# One-way ANOVA: tests whether the three country samples share the same mean
anova_result = f_oneway(ghi_benin, ghi_togo, ghi_sierra)

# Kruskal-Wallis: rank-based (non-parametric) alternative that does not
# assume the samples are normally distributed
kruskal_result = kruskal(ghi_benin, ghi_togo, ghi_sierra)

print("One-way ANOVA test result:")
print(f"F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4e}")

print("\nKruskal-Wallis test result:")
print(f"Statistic: {kruskal_result.statistic:.4f}, p-value: {kruskal_result.pvalue:.4e}")


### Key Observations
- The One-way ANOVA and Kruskal-Wallis tests indicate no statistically significant difference in GHI values among Benin, Togo, and Sierra Leone (p ≈ 1.0).
- This suggests that GHI levels are comparable across the three countries in the dataset.
- Further investigation into other metrics like DNI and DHI or additional contextual factors may be needed for nuanced insights.




# Visual Summary

In [None]:
# Rank the countries by mean GHI (highest first) and draw one bar per country
avg_ghi = df_all.groupby('Country')['GHI'].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=avg_ghi.index,
    y=avg_ghi.values,
    hue=avg_ghi.index,  # colour each bar by its own country
    palette='Set2',
    legend=False,  # hue repeats the x variable, so a legend would add nothing
    ax=ax,
)
ax.set_title('Average GHI by Country')
ax.set_ylabel('Average GHI')
ax.set_xlabel('Country')
fig.tight_layout()
plt.show()
