In [None]:
import pandas as pd

# Location of the raw input; change the file name to your actual country CSV.
raw_csv_path = 'data/benin.csv'
df = pd.read_csv(raw_csv_path)


In [None]:
# Display summary statistics for the loaded frame (pandas' describe() defaults).
df.describe()


In [None]:
# Number of missing values in each column.
df.isnull().sum()


In [None]:
# Missing values per column, expressed as a percentage of all rows.
total_rows = len(df)
null_report = df.isna().sum().div(total_rows).mul(100)

# Show only the columns where more than 5% of the values are missing.
null_report[null_report.gt(5)]


In [None]:
from scipy.stats import zscore

# Columns screened for outliers.
columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# nan_policy='omit' ignores missing values when computing each column's mean
# and standard deviation. With scipy's default ('propagate'), a single NaN in
# a column turns every z-score of that column into NaN, and the filter below
# would then discard every row.
z_scores = df[columns].apply(zscore, nan_policy='omit')

# A row is an outlier only if some screened column has |z| >= 3. Missing
# z-scores compare False, so rows with gaps are kept rather than dropped here.
is_outlier = (z_scores.abs() >= 3).any(axis=1)

# .copy() makes df_clean an independent frame, so later modifications to it
# never refer back to df.
df_clean = df[~is_outlier].copy()


In [None]:
# Fill remaining gaps in the numeric columns with each column's median.
# The result is assigned back instead of using inplace=True: df_clean was
# obtained by filtering df, and in-place modification of such a frame can
# raise SettingWithCopyWarning in pandas versions without Copy-on-Write.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))


In [None]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
clean_csv_path = 'data/benin_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)


In [None]:
import matplotlib.pyplot as plt

# Convert the Timestamp column to datetime before plotting.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Line plot of GHI against the timestamp.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(df['Timestamp'], df['GHI'])
ax.set_title("GHI Over Time")
ax.set_xlabel("Time")
ax.set_ylabel("GHI")
plt.show()


In [None]:
# Flag rows whose largest absolute z-score across the screened columns is below 3.
largest_abs_z = z_scores.abs().max(axis=1)
df['clean_flag'] = largest_abs_z.lt(3)

# Mean ModA / ModB for flagged versus unflagged rows.
df.groupby('clean_flag')[['ModA', 'ModB']].mean()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the screened columns of the cleaned frame.
corr_matrix = df_clean[columns].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Histogram of WS from the cleaned frame, using 20 bins.
ws_ax = df_clean['WS'].hist(bins=20)
ws_ax.set_title("Wind Speed Distribution")
ws_ax.set_xlabel("WS")
plt.show()


In [None]:
# Scatter plot of Tamb (y-axis) against RH (x-axis) for the cleaned frame.
fig, ax = plt.subplots()
ax.scatter(df_clean['RH'], df_clean['Tamb'])
ax.set_xlabel("Relative Humidity")
ax.set_ylabel("Temperature")
ax.set_title("RH vs Tamb")
plt.show()


In [None]:
# Bubble chart: Tamb against GHI, with each marker sized by that row's RH value.
fig, ax = plt.subplots()
ax.scatter(df_clean['GHI'], df_clean['Tamb'], s=df_clean['RH'], alpha=0.5)
ax.set_xlabel("GHI")
ax.set_ylabel("Tamb")
ax.set_title("GHI vs Tamb (bubble size = RH)")
plt.show()
