In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, kruskal

In [14]:
# Read the three per-country "*_clean.csv" files (paths are relative to the
# kernel's working directory) in Benin, Sierra Leone, Togo order.
benin_df, sierra_leone_df, togo_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/benin_clean.csv',
        'data/sierraleone_clean.csv',
        'data/togo_clean.csv',
    )
)

In [16]:
# List every frame's column names, one line per country, so the three
# schemas can be compared before the frames are stacked together.
for heading, frame in (
    ("Benin columns:", benin_df),
    ("Sierra Leone columns:", sierra_leone_df),
    ("Togo columns:", togo_df),
):
    print(heading, frame.columns.tolist())

Benin columns: ['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation', 'TModA', 'TModB', 'Comments']
Sierra Leone columns: ['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation', 'TModA', 'TModB', 'Comments']
Togo columns: ['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation', 'TModA', 'TModB', 'Comments']


In [18]:
# Label every row with its country of origin so the rows stay identifiable
# once the frames are concatenated. The column is written in place on each
# frame; re-running the cell simply overwrites it with the same constant.
for country_name, country_frame in (
    ('Benin', benin_df),
    ('Sierra Leone', sierra_leone_df),
    ('Togo', togo_df),
):
    country_frame['Country'] = country_name

In [20]:
# Stack the three labelled frames row-wise into one table; ignore_index
# discards the per-file row labels and renumbers the result from 0.
combined_df = pd.concat(
    (benin_df, sierra_leone_df, togo_df),
    ignore_index=True,
)

In [22]:
# GHI distribution per country as a box plot. The figure is written to disk
# and then closed, so nothing is rendered inline for this cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=combined_df,
    x='Country',
    y='GHI',
    hue='Country',  # same variable as x: one palette colour per country
    palette='Set3',
    ax=ax,
)
ax.set_title('GHI Comparison Across Countries')
fig.savefig('notebooks/ghi_boxplot.png')
plt.close(fig)

In [24]:
# DNI distribution per country as a box plot. The figure is written to disk
# and then closed, so nothing is rendered inline for this cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=combined_df,
    x='Country',
    y='DNI',
    hue='Country',  # same variable as x: one palette colour per country
    palette='Set3',
    ax=ax,
)
ax.set_title('DNI Comparison Across Countries')
fig.savefig('notebooks/dni_boxplot.png')
plt.close(fig)

In [26]:
# DHI distribution per country as a box plot. The figure is written to disk
# and then closed, so nothing is rendered inline for this cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=combined_df,
    x='Country',
    y='DHI',
    hue='Country',  # same variable as x: one palette colour per country
    palette='Set3',
    ax=ax,
)
ax.set_title('DHI Comparison Across Countries')
fig.savefig('notebooks/dhi_boxplot.png')
plt.close(fig)

In [28]:
# Per-country mean, median and standard deviation of GHI, DNI and DHI,
# rounded to two decimals. The result has two-level columns:
# (metric, statistic).
metric_columns = ['GHI', 'DNI', 'DHI']
statistics = ['mean', 'median', 'std']
summary_stats = (
    combined_df
    .groupby('Country')[metric_columns]
    .agg(statistics)
    .round(2)
)
# Heading (preceded by a blank line) followed by the table on the next line.
print("\nSummary Table (Mean, Median, Std Dev):", summary_stats, sep="\n")


Summary Table (Mean, Median, Std Dev):
                 GHI                    DNI                    DHI         \
                mean median     std    mean median     std    mean median   
Country                                                                     
Benin         240.56    1.8  331.13  167.19   -0.1  261.71  115.36    1.6   
Sierra Leone  201.96    0.3  298.50  116.38   -0.1  218.65  113.72   -0.1   
Togo          230.56    2.1  322.53  151.26    0.0  250.96  116.44    2.5   

                      
                 std  
Country               
Benin         158.69  
Sierra Leone  158.95  
Togo          156.52  


In [30]:
# One-way ANOVA on GHI across the three countries. Missing values are
# dropped from each country's series before testing. The three ghi_* series
# are kept as separate names because the Kruskal-Wallis cell reuses them.
ghi_benin, ghi_sierra_leone, ghi_togo = (
    country_frame['GHI'].dropna()
    for country_frame in (benin_df, sierra_leone_df, togo_df)
)
anova_result = f_oneway(ghi_benin, ghi_sierra_leone, ghi_togo)
# NOTE: the p-value is printed with four fixed decimals, so any value below
# 0.00005 is displayed as 0.0000 rather than its actual magnitude.
print(
    "\nANOVA Test for GHI:",
    f"F-statistic: {anova_result.statistic:.2f}, p-value: {anova_result.pvalue:.4f}",
    sep="\n",
)


ANOVA Test for GHI:
F-statistic: 2090.09, p-value: 0.0000


In [32]:
# Kruskal-Wallis H-test on the same three NaN-free GHI series prepared in
# the ANOVA cell (that cell must have been run first).
kruskal_result = kruskal(*(ghi_benin, ghi_sierra_leone, ghi_togo))
# NOTE: four fixed decimals means very small p-values display as 0.0000.
print(
    "Kruskal-Wallis Test for GHI:",
    f"H-statistic: {kruskal_result.statistic:.2f}, p-value: {kruskal_result.pvalue:.4f}",
    sep="\n",
)

Kruskal-Wallis Test for GHI:
H-statistic: 4524.88, p-value: 0.0000


In [34]:
# Mean GHI per country, sorted ascending so the bars run from the lowest
# to the highest average.
avg_ghi = combined_df.groupby('Country')['GHI'].mean().sort_values()

# Bar colours are assigned by bar position (after sorting), not by country.
# The chart is saved to disk and closed without being shown inline.
fig, ax = plt.subplots(figsize=(8, 6))
avg_ghi.plot(kind='bar', color=['#ff9999', '#66b3ff', '#99ff99'], ax=ax)
ax.set(
    title='Average GHI by Country',
    xlabel='Country',
    ylabel='Average GHI',
)
fig.savefig('notebooks/avg_ghi_bar_chart.png')
plt.close(fig)