In [None]:
# Conceptual: git checkout -b compare-countries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# --- Load each country's cleaned CSV ---
# Display name -> cleaned CSV path. Relative paths are resolved against the
# notebook's working directory.
COUNTRY_FILES = {
    'Benin': '../../data/benin_clean.csv',
    'Sierra Leone': '../../data/sierra_leone_clean.csv',
    'Togo': '../../data/togo_clean.csv',
}

# The whole try/except lives in a single cell: a `try:` whose handlers sit in
# a different cell is a SyntaxError in both cells.
try:
    # Read every file and tag its rows with the country they belong to.
    country_frames = {
        country: pd.read_csv(path, parse_dates=['Timestamp']).assign(Country=country)
        for country, path in COUNTRY_FILES.items()
    }
except FileNotFoundError as e:
    # Re-raise rather than calling exit(): the failure should surface in this
    # cell with a traceback and stop "Run All", not take the session down.
    # Any other exception is left to propagate with its original traceback.
    raise FileNotFoundError(
        f"Error loading cleaned data: {e}. Please ensure the cleaned CSV files "
        "listed in COUNTRY_FILES exist (e.g., ../../data/benin_clean.csv)."
    ) from e

# Per-country names kept for any cell that refers to a single country's frame.
df_benin = country_frames['Benin']
df_sierra_leone = country_frames['Sierra Leone']
df_togo = country_frames['Togo']

# Concatenate all dataframes (dict order == insertion order of COUNTRY_FILES).
df_combined = pd.concat(list(country_frames.values()), ignore_index=True)
print("Cleaned data for all countries loaded successfully.")
print("Combined DataFrame head:")
print(df_combined.head())
print("\nCombined DataFrame info:")
df_combined.info()

In [None]:
# --- Metric Comparison ---
# Irradiance columns compared across countries; `metrics` is reused by the
# summary-table cell.
metrics = ['GHI', 'DNI', 'DHI']

print("\n--- Boxplots of GHI, DNI, DHI Across Countries ---")
for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 6))
    # seaborn >= 0.13 deprecates `palette` without `hue`, so colour by the same
    # variable that is on the x axis. dodge=False keeps one full-width box per
    # country on versions that would otherwise offset hue levels.
    sns.boxplot(
        x='Country',
        y=metric,
        hue='Country',
        data=df_combined,
        palette='viridis',
        dodge=False,
        ax=ax,
    )
    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(f'Distribution of {metric} Across Countries')
    ax.set_ylabel(f'{metric} ($W/m^2$)')
    ax.set_xlabel('Country')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

In [None]:
# Summary Table
# Mean, median and standard deviation of every metric, one row per country.
print("\n--- Summary Table (Mean, Median, Std Dev) of GHI, DNI, DHI Across Countries ---")
summary_table = (
    df_combined[['Country', *metrics]]
    .groupby('Country')
    .agg(['mean', 'median', 'std'])
)
print(summary_table)

In [None]:
# --- Statistical Testing (One-way ANOVA) ---
# Tests whether mean GHI differs between countries.
print("\n--- One-way ANOVA on GHI values ---")
if 'GHI' in df_combined.columns:
    # One GHI sample per country with missing readings removed. A single
    # groupby pass replaces one boolean mask per country; sort=False keeps the
    # countries in order of first appearance.
    ghi_data_per_country = [
        ghi.dropna()
        for _, ghi in df_combined.groupby('Country', sort=False)['GHI']
    ]

    # ANOVA needs at least two groups, each with more than one observation.
    if len(ghi_data_per_country) >= 2 and all(len(g) > 1 for g in ghi_data_per_country):
        f_statistic, p_value = stats.f_oneway(*ghi_data_per_country)
        print(f"F-statistic: {f_statistic:.2f}")
        # Significant-digit formatting keeps very small p-values readable
        # instead of rounding them to "0.000".
        print(f"P-value: {p_value:.3g}")

        if pd.isna(p_value):
            # A NaN result must not be reported as "no significant difference".
            print("Brief Note: The ANOVA result is undefined (NaN), so no conclusion can be drawn about differences in mean GHI across the countries.")
        elif p_value < 0.05:
            print("Brief Note: The p-value is less than 0.05, indicating a statistically significant difference in mean GHI across the countries.")
        else:
            print("Brief Note: The p-value is greater than 0.05, indicating no statistically significant difference in mean GHI across the countries.")
    else:
        print("Not enough data points per country to perform ANOVA on GHI.")
else:
    print("'GHI' column not found for statistical testing.")

In [None]:
# --- Key Observations (Markdown Cell Content) ---
# This section describes what you would put in a markdown cell in the Jupyter Notebook.
# The bracketed placeholders are to be filled in from the boxplots, the summary
# table and the ANOVA output. The whole print call is kept in one cell so the
# triple-quoted string is not split by a cell boundary.
print("\n--- Key Observations (Content for Markdown Cell) ---")
print("""
### Key Observations

Based on the cross-country comparison of solar radiation data:

* **Solar Potential Ranking:** Based on median GHI, [Country X] appears to have the highest solar potential, followed by [Country Y] and [Country Z]. The boxplots visually confirm this ranking, showing the general shift in GHI distributions.
* **Variability in Irradiance:** [Country A] exhibits the greatest variability in GHI, DNI, and DHI (as indicated by the spread of its boxplots and higher standard deviations), which could be due to more dynamic weather patterns or local environmental factors.
* **Statistical Significance:** The one-way ANOVA test on GHI indicates that there [is a/is no] statistically significant difference in the mean GHI values between the countries (p-value = [p_value_from_above]). This suggests that the observed differences are [unlikely/likely] due to random chance.
""")


# --- (Bonus) Visual Summary: Bar chart ranking countries by average GHI ---
print("\n--- Bonus: Visual Summary - Average GHI Ranking ---")
if 'GHI' not in df_combined.columns:
    print("'GHI' column not available for bonus visual summary.")
else:
    # Mean GHI per country, highest first, so the bars read as a ranking.
    avg_ghi_by_country = (
        df_combined.groupby('Country')['GHI']
        .mean()
        .sort_values(ascending=False)
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    avg_ghi_by_country.plot.bar(ax=ax, color='skyblue')
    ax.set_title('Average GHI by Country')
    ax.set_xlabel('Country')
    ax.set_ylabel('Average GHI ($W/m^2$)')
    plt.xticks(rotation=45)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()