In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import zscore
from scipy import stats
import os

In [2]:
# Read the raw Benin (Malanville) CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="../data/benin-malanville.csv")

In [None]:
# Preview the first ten rows of the loaded frame.
df.iloc[:10]

## Summary Statistics & Missing Values

In [None]:
# Overview of the loaded frame: schema, descriptive statistics and missing data.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

print("\nSummary Statistics:")
print(df.describe())

# Count NaNs per column once and derive the percentage from the same Series.
missing_per_column = df.isna().sum()
print("\nPercentage of Missing Values:")
print((missing_per_column / len(df) * 100).round(2))
print("\nMissing Values:")
print(missing_per_column)


In [None]:

# Outlier screening, median imputation and export of the cleaned dataset.
missing_counts = df.isnull().sum()
print("Missing values:\n", missing_counts)

# Columns screened for outliers; the same columns must be non-null in the export.
columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
key_columns = list(columns_to_check)

# scipy's zscore() defaults to nan_policy='propagate': a single NaN in a column
# turns that column's whole z-score vector into NaN, so no outlier would ever be
# flagged there. 'omit' computes mean/std over the non-NaN values instead.
z_scores = df[columns_to_check].apply(zscore, nan_policy='omit')

# Flag values more than 3 standard deviations from the column mean.
# NaN z-scores compare as False, so missing values are not counted as outliers.
outliers = (np.abs(z_scores) > 3)
print("Outliers detected per column:\n", outliers.sum())

# Work on a copy so the raw frame `df` stays untouched.
df_cleaned = df.copy()

# Blank out the flagged values, then fill every gap (original NaNs and masked
# outliers alike) with the column median computed after masking.
df_cleaned[columns_to_check] = df_cleaned[columns_to_check].mask(outliers)
column_medians = df_cleaned[columns_to_check].median()
df_cleaned[columns_to_check] = df_cleaned[columns_to_check].fillna(column_medians)

# After the median fill this only removes rows when a column's median is itself
# NaN (i.e. the column had no usable values at all).
df_cleaned = df_cleaned.dropna(subset=key_columns)

# NOTE(review): the input is read from "../data" but the export goes to "./data"
# relative to the notebook's working directory — confirm this is intended.
output_path = 'data/benin_clean.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_cleaned.to_csv(output_path, index=False)

print(f"Cleaned data exported to: {output_path}")


## Time Series Analysis

In [None]:
# Index the frame by time and plot irradiance and ambient temperature over time.

# Guarded so the cell can be re-run: after the first execution "Timestamp" is
# the index rather than a column, and converting/setting it again would raise
# KeyError.
if "Timestamp" in df.columns:
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    # setting the index to Timestamp
    df.set_index("Timestamp", inplace=True)

plot_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
# Plotting: one line per column against the datetime index.
ts_ax = df[plot_cols].plot(figsize=(15, 6))
ts_ax.set_title("Solar Irradiance and Ambient Temperature Over Time")
ts_ax.set_ylabel("Values")
plt.show()

## Cleaning Impact

In [None]:
# Mean ModA / ModB reading for each value of the 'Cleaning' flag (0 or 1).
module_cols = ['ModA', 'ModB']
cleaning_impact = df.groupby('Cleaning')[module_cols].mean()

# One bar group per flag value, one bar per module column.
impact_ax = cleaning_impact.plot.bar(figsize=(8, 5))
impact_ax.set_title('Average ModA and ModB Before vs After Cleaning')
impact_ax.set_ylabel('W/m²')

## Correlation & Relationship Analysis

In [None]:
# Pairwise correlation of the irradiance and module-temperature columns.
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df[corr_cols].corr()

# Annotated heatmap of the correlation matrix on a dedicated figure.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')

## Wind & Distribution Analysis

In [None]:
# Two independent relationships, each on its own panel. Drawing both with bare
# sns.scatterplot() calls would put them on the same implicit axes, overlaying
# unrelated point clouds and letting the second title overwrite the first.
fig, (ax_wind, ax_humidity) = plt.subplots(1, 2, figsize=(12, 5))

# Wind speed against global horizontal irradiance.
sns.scatterplot(data=df, x='WS', y='GHI', ax=ax_wind)
ax_wind.set_title('Wind Speed vs GHI')

# Relative humidity against ambient temperature.
sns.scatterplot(data=df, x='RH', y='Tamb', ax=ax_humidity)
ax_humidity.set_title('Humidity vs Ambient Temp')

fig.tight_layout()
plt.show()

In [None]:
# Distribution of GHI values in 50 bins; x-axis carries the irradiance unit.
ghi_hist_ax = df['GHI'].plot(kind='hist', bins=50, alpha=0.7, title='GHI Histogram')
ghi_hist_ax.set_xlabel('W/m²')

## Temperature Analysis

In [None]:
# Relative humidity against GHI and against ambient temperature, one panel
# each. Plotting both on a single axes overwrote the first title/y-label and
# forced two quantities with very different ranges onto one y-scale labelled
# only "Values".
fig, (ax_ghi, ax_tamb) = plt.subplots(1, 2, figsize=(12, 6))

# Humidity vs GHI
sns.scatterplot(data=df, x='RH', y='GHI', color='blue', alpha=0.6,
                label='Humidity vs GHI', ax=ax_ghi)
ax_ghi.set_title('Humidity vs GHI')
ax_ghi.set_xlabel('Relative Humidity (%)')
ax_ghi.set_ylabel('Global Horizontal Irradiance (W/m²)')
ax_ghi.legend()

# Humidity vs Ambient Temp
sns.scatterplot(data=df, x='RH', y='Tamb', color='orange', alpha=0.6,
                label='Humidity vs Ambient Temp', ax=ax_tamb)
ax_tamb.set_title('Humidity vs Ambient Temp')
ax_tamb.set_xlabel('Relative Humidity (%)')
ax_tamb.set_ylabel('Ambient Temperature')
ax_tamb.legend()

# Keep the original combined heading as the figure-level title.
fig.suptitle('Humidity vs GHI and Ambient Temp')
fig.tight_layout()
plt.show()

## Bubble Chart

In [None]:
# GHI against ambient temperature; marker size and colour both encode RH.
plt.figure(figsize=(10, 6))
bubble_ax = sns.scatterplot(
    x='Tamb',
    y='GHI',
    hue='RH',
    size='RH',
    sizes=(20, 200),
    alpha=0.6,
    data=df,
)
bubble_ax.set_title('GHI vs Temperature (bubble size = RH)')