In [1]:
# Task 3: Cross-Country Comparison
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

# Load cleaned data
# One CSV per country; paths are relative to the notebook's working directory.
benin = pd.read_csv("../data/benin_clean.csv")
togo = pd.read_csv("../data/togo_clean.csv")
sl = pd.read_csv("../data/sierra_leone_clean.csv")

# Add country column
# Tag every row with its origin so the frames stay distinguishable once stacked.
benin["Country"] = "Benin"
togo["Country"] = "Togo"
sl["Country"] = "Sierra Leone"

# Combine all
# ignore_index=True rebuilds a unique 0..n-1 index. Without it each frame keeps
# its own row labels, so the combined frame has duplicate index labels, which
# can break label-aligned operations and some plotting code paths.
df = pd.concat([benin, togo, sl], ignore_index=True)

# Boxplots
# Draw one figure per measurement column, with one box per country.
plot_columns = ("GHI", "DNI", "DHI")
for column_name in plot_columns:
    box_axes = sns.boxplot(x="Country", y=column_name, data=df)
    box_axes.set_title(f"Comparison of {column_name}")
    plt.show()

# Summary table
# Per-country mean, median and standard deviation of each measurement column.
summary_columns = ["GHI", "DNI", "DHI"]
summary_stats = ["mean", "median", "std"]
grouped_by_country = df.groupby("Country")
summary = grouped_by_country[summary_columns].agg(summary_stats)
print("Summary Table:\n", summary)

# ANOVA test
# One-way ANOVA across the three countries' GHI values; missing readings are
# dropped per country before the test.
print("ANOVA Test on GHI:")
ghi_samples = [frame["GHI"].dropna() for frame in (benin, togo, sl)]
f_stat, p_val = f_oneway(*ghi_samples)
print("F-statistic:", f_stat, "P-value:", p_val)

# Bar chart of average GHI
# Mean GHI per country, drawn as one bar per country.
ghi_by_country = df.groupby("Country")["GHI"]
avg_ghi = ghi_by_country.mean()
bar_axes = avg_ghi.plot(kind="bar", title="Average GHI by Country")
bar_axes.set_ylabel("Average GHI")
plt.show()

# Observations
from IPython.display import Markdown, display

# Derive the headline findings from the data instead of hard-coding country
# names, so the rendered text cannot contradict the statistics computed above.
ghi_stats = df.groupby("Country")["GHI"].agg(["mean", "std"])
highest_avg_country = ghi_stats["mean"].idxmax()
most_variable_country = ghi_stats["std"].idxmax()
most_consistent_country = ghi_stats["std"].idxmin()

# display() renders the Markdown even when this is not the last expression of
# the cell; a bare Markdown(...) expression is only shown in that position.
display(Markdown(f"""
### Key Observations
- {highest_avg_country} has the highest average GHI among the three.
- {most_variable_country} shows the most variability in GHI (highest std dev).
- {most_consistent_country} has the most consistent (lowest std dev) GHI readings.
"""))


ModuleNotFoundError: No module named 'pandas'