In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

ModuleNotFoundError: No module named 'scipy'

In [None]:
# Load the raw Benin (Malanville) dataset.
# The path uses a forward slash on purpose: inside a normal string literal
# the sequence '\b' is the backspace escape, so a backslash separator in
# front of "benin" would corrupt the file name. Forward slashes are accepted
# by pandas on every platform.
df = pd.read_csv('data/benin-malanville.csv')

# Display first few rows
df.head()

In [None]:
# Descriptive statistics for the numeric columns
print("Summary Statistics:")
summary_table = df.describe()
print(summary_table)

# Number of missing entries in each column
print("\nMissing Values:")
missing = df.isna().sum()
print(missing)

# Columns whose missing count exceeds 5% of the row count
threshold = 0.05 * len(df)
print("\nColumns with >5% Missing:")
print(missing.loc[missing.gt(threshold)])

In [None]:
# Compute Z-scores for key columns (NaNs are ignored via nan_policy='omit')
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
df_zscores = df[key_columns].apply(zscore, nan_policy='omit')

# Flag rows where any key column has |Z| > 3.
# The rows are only counted here; they are not removed from df.
outliers = (df_zscores.abs() > 3).any(axis=1)
print(f"Number of rows with outliers: {outliers.sum()}")

# Impute missing values with the column median for key columns.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[col] operates on an intermediate object, which triggers a chained-
# assignment warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
for col in key_columns:
    df[col] = df[col].fillna(df[col].median())

# Drop rows with a missing Timestamp (the only column checked here)
df = df.dropna(subset=['Timestamp'])

# Export cleaned DataFrame
df.to_csv('data/benin_clean.csv', index=False)
print("Cleaned data saved to data/benin_clean.csv")

In [None]:
# Parse the Timestamp column into datetime values
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Draw GHI, DNI, DHI and Tamb against time on one set of axes
fig, ax = plt.subplots(figsize=(12, 8))
for series_name in ('GHI', 'DNI', 'DHI', 'Tamb'):
    ax.plot(df['Timestamp'], df[series_name], label=series_name)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.set_title('Time Series of GHI, DNI, DHI, and Tamb')
ax.legend()
fig.tight_layout()
fig.savefig('plots/ghi_dni_dhi_tamb_time_series.png')
plt.show()

In [None]:
# Mean ModA / ModB for each value of the Cleaning flag
cleaning_impact = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
print("Average ModA and ModB by Cleaning Flag:")
print(cleaning_impact)

# Bar chart of the grouped means, saved to disk and then displayed
ax = cleaning_impact.plot.bar(title='Average ModA and ModB Pre/Post Cleaning')
ax.set_ylabel('Value')
ax.get_figure().savefig('plots/cleaning_impact.png')
plt.show()

In [None]:
# Pairwise correlations between the irradiance and TModA/TModB columns
corr_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df[corr_columns].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
fig.savefig('plots/correlation_heatmap.png')
plt.show()

In [None]:
# WS on the x-axis against GHI on the y-axis, semi-transparent markers
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['WS'], df['GHI'], alpha=0.5)
ax.set_xlabel('Wind Speed (m/s)')
ax.set_ylabel('GHI (W/m^2)')
ax.set_title('Wind Speed vs GHI')
fig.savefig('plots/ws_vs_ghi.png')
plt.show()

In [None]:
# Distribution of the GHI column, 30 equal-width bins
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['GHI'], bins=30)
ax.set_xlabel('GHI (W/m^2)')
ax.set_ylabel('Frequency')
ax.set_title('GHI Distribution')
fig.savefig('plots/ghi_histogram.png')
plt.show()

# Simplified stand-in for a wind rose: count of each distinct WD value,
# ordered by the WD value itself
wd_counts = df['WD'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(wd_counts.index, wd_counts.values)
ax.set_xlabel('Wind Direction (°N)')
ax.set_ylabel('Count')
ax.set_title('Wind Direction Distribution')
fig.savefig('plots/wind_direction.png')
plt.show()

In [None]:
# RH on the x-axis against Tamb on the y-axis, semi-transparent markers
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['RH'], df['Tamb'], alpha=0.5)
ax.set_xlabel('Relative Humidity (%)')
ax.set_ylabel('Ambient Temperature (°C)')
ax.set_title('RH vs Tamb')
fig.savefig('plots/rh_vs_tamb.png')
plt.show()

In [None]:
# Bubble chart: Tamb on the x-axis, GHI on the y-axis, marker area
# proportional to RH (scaled by 10 so the bubbles are visible).
plt.figure(figsize=(10, 8))
plt.scatter(df['Tamb'], df['GHI'], s=df['RH']*10, alpha=0.5)
plt.xlabel('Ambient Temperature (°C)')
plt.ylabel('GHI (W/m^2)')
plt.title('GHI vs Tamb (Bubble Size: RH)')
# Save under a readable name with an explicit .png extension, matching the
# other figures written to plots/. The previous name was a garbled stem with
# no extension, so the format was left to matplotlib's default.
plt.savefig('plots/bubble_ghi_tamb_rh.png')
plt.show()