In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load raw data (adjust path as needed)
df = pd.read_csv("../data/benin-malanville.csv")  # Replace with your file

In [3]:
# Summary statistics
print(df.describe())

# Missing values report
missing_report = df.isna().sum() / len(df) * 100
print("Missing values (%):\n", missing_report[missing_report > 5])  # Columns with >5% nulls

                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      240.559452     167.187516     115.358961     236.589496   
std       331.131327     261.710501     158.691074     326.894859   
min       -12.900000      -7.800000     -12.600000       0.000000   
25%        -2.000000      -0.500000      -2.100000       0.000000   
50%         1.800000      -0.100000       1.600000       4.500000   
75%       483.400000     314.200000     216.300000     463.700000   
max      1413.000000     952.300000     759.200000    1342.300000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      228.883576      28.179683      54.487969       2.121113   
std       316.536515       5.924297      28.073069       1.603466   
min         0.000000      11.000000       2.100000       0.000000   
25%         0.000000      24.2000

In [None]:
# Calculate Z-scores for key columns
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
z_scores = (df[cols_to_check] - df[cols_to_check].mean()) / df[cols_to_check].std()
outliers = (np.abs(z_scores) > 3).any(axis=1)

# Impute missing values (median for numeric columns)
df_clean = df.copy()
for col in cols_to_check:
    df_clean[col].fillna(df_clean[col].median(), inplace=True)

# Export cleaned data (ensure `data/` is in .gitignore!)
df_clean.to_csv("../data/benin_clean.csv", index=False)  # Replace "benin"

In [None]:
# Convert timestamp if needed
df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])

# Plot solar irradiance and temperature
plt.figure(figsize=(12, 6))
df_clean.set_index('Timestamp')['GHI'].plot(title='GHI Over Time')
plt.show()

In [None]:
# Heatmap
corr_matrix = df_clean[['GHI', 'DNI', 'DHI', 'TModA', 'TModB']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Scatter plots
sns.scatterplot(data=df_clean, x='WS', y='GHI')
plt.title('Wind Speed vs. GHI')
plt.show()

In [5]:
# Wind rose (example using plotly)
import plotly.express as px
fig = px.bar_polar(df_clean, r='WS', theta='WD', title='Wind Direction vs. Speed')
fig.show()

# Temperature vs. Humidity
sns.scatterplot(data=df_clean, x='Tamb', y='RH')
plt.title('Ambient Temp vs. Relative Humidity')
plt.show()

ModuleNotFoundError: No module named 'plotly'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(df_clean[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(df_clean[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi