In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import zscore

# Set style
# The bare 'seaborn' style-sheet name was deprecated in matplotlib 3.6 and
# removed in later releases; the same sheet is now shipped as 'seaborn-v0_8'.
# Use the new name when this installation provides it and fall back to the
# legacy name on older matplotlib versions.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette('husl')

# 2.1 Load & Inspect
# Read the raw CSV, then replace the 'Timestamp' column with parsed datetimes.
df = pd.read_csv('../data/benin-malanville.csv')
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps

# df.info() writes its report straight to stdout, so it is not wrapped in print().
print("Dataset Info:")
df.info()

# Descriptive statistics of the frame as returned by describe().
print("\nSummary Statistics:")
summary_table = df.describe()
print(summary_table)

# Count of missing entries for every column.
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# 2.2 Missing & Outliers
# Columns that get median imputation and a z-score outlier check.
numeric_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Fill gaps in each column with that column's own median.
for col in numeric_cols:
    col_median = df[col].median()
    df[col] = df[col].fillna(col_median)

# Store a z-score column '<name>_z' next to each original column and report
# how many rows lie more than three standard deviations from the mean.
for col in numeric_cols:
    z_name = f'{col}_z'
    df[z_name] = zscore(df[col])
    beyond_three_sigma = df[z_name].abs() > 3
    outliers = df[beyond_three_sigma]
    print(f"\nOutliers in {col}:", len(outliers))

# 2.3 Time Series & Patterns
# Overlay the three irradiance columns on one time axis.
plt.figure(figsize=(15, 5))
for component in ('GHI', 'DNI', 'DHI'):
    plt.plot(df['Timestamp'], df[component], label=component)
plt.title('Solar Radiation Over Time')
plt.xlabel('Timestamp')
plt.ylabel('Radiation (W/m²)')
plt.legend()
plt.show()

# Daily patterns
# Average every irradiance column over all rows sharing the same hour of day.
hour_of_day = df['Timestamp'].dt.hour
daily_avg = df.groupby(hour_of_day)[['GHI', 'DNI', 'DHI']].mean()
daily_avg.plot(figsize=(10, 5))
plt.title('Average Daily Pattern')
plt.xlabel('Hour of Day')
plt.ylabel('Radiation (W/m²)')
plt.show()

# 2.4 Cleaning Impact
# Only drawn when the dataset carries a 'Cleaning' flag column: side-by-side
# boxplots of ModA/ModB for rows flagged 0 (left) and rows flagged 1 (right).
if 'Cleaning' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    ax1, ax2 = axes
    panels = (
        (0, ax1, 'Before Cleaning'),
        (1, ax2, 'After Cleaning'),
    )
    for flag, axis, heading in panels:
        module_readings = df.loc[df['Cleaning'] == flag, ['ModA', 'ModB']]
        module_readings.boxplot(ax=axis)
        axis.set_title(heading)
    plt.show()

# 2.5 Correlations
# Pairwise correlation of the irradiance, module, temperature and wind columns,
# shown as an annotated heatmap.
corr_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'WS']
corr_matrix = df[corr_columns].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Scatter plots
# GHI against wind speed (left panel) and against ambient temperature (right panel).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
scatter_panels = (
    (ax1, 'WS', 'Wind Speed vs GHI'),
    (ax2, 'Tamb', 'Temperature vs GHI'),
)
for axis, x_column, heading in scatter_panels:
    sns.scatterplot(data=df, x=x_column, y='GHI', ax=axis)
    axis.set_title(heading)
plt.show()

# 2.6 Wind & Temperature Analysis
# Wind rose plot
# Each observation is one point: angle = wind direction, radius = wind speed.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='polar')
# Matplotlib polar axes default to 0 degrees at east, increasing
# counter-clockwise. Re-orient to a compass layout (0 degrees at the top,
# angles increasing clockwise) so bearings are not drawn rotated and mirrored.
# NOTE(review): assumes 'WD' is in degrees measured clockwise from north -
# confirm against the dataset's data dictionary.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
ax.scatter(np.radians(df['WD']), df['WS'], alpha=0.5)
plt.title('Wind Rose')
plt.show()

# Bubble chart
# GHI against ambient temperature; marker area is twice the 'RH' value.
plt.figure(figsize=(10, 6))
plt.scatter(df['Tamb'], df['GHI'], s=df['RH']*2, alpha=0.5)
plt.xlabel('Temperature (°C)')
plt.ylabel('GHI (W/m²)')
plt.title('GHI vs Temperature (bubble size = RH)')
plt.show()

# 2.7 Save Cleaned Data
# Remove z-score columns
# Columns whose name ends in '_z' are left out of the exported file; the
# working frame `df` itself is not modified.
is_zscore_column = [name.endswith('_z') for name in df.columns]
cols_to_drop = list(df.columns[is_zscore_column])
df_clean = df.drop(columns=cols_to_drop)

# Write without the index so the CSV holds only the data columns.
df_clean.to_csv("../data/benin_clean.csv", index=False)
print("Cleaned data saved successfully")

ModuleNotFoundError: No module named 'matplotlib'