In [2]:
import pandas as pd
import numpy as np

In [6]:
# Read the benin-malanville CSV into the notebook's working frame `df`.
# No date parsing happens here: an earlier draft parsed a `date` column at
# this point, but the later cells work with `Timestamp` and convert it
# themselves in the Time Series Analysis section.
df = pd.read_csv(filepath_or_buffer="data/data/benin-malanville.csv")

Summary Statistics & Missing-Value Report

In [7]:
# --- Descriptive statistics ---------------------------------------------
# describe() yields count / mean / std / min / quartiles / max per column.
summary_statistics = df.describe()
print("Summary Statistics:", summary_statistics, sep="\n")

# --- Missing-value report -----------------------------------------------
# Null count per column, then the same figure as a percentage of all rows.
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Only columns where more than 5% of the rows are null make it into the report.
columns_with_nulls = missing_percentage.loc[missing_percentage.gt(5)]

print("\nMissing-Value Report:", columns_with_nulls, sep="\n")

Summary Statistics:
                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      240.559452     167.187516     115.358961     236.589496   
std       331.131327     261.710501     158.691074     326.894859   
min       -12.900000      -7.800000     -12.600000       0.000000   
25%        -2.000000      -0.500000      -2.100000       0.000000   
50%         1.800000      -0.100000       1.600000       4.500000   
75%       483.400000     314.200000     216.300000     463.700000   
max      1413.000000     952.300000     759.200000    1342.300000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      228.883576      28.179683      54.487969       2.121113   
std       316.536515       5.924297      28.073069       1.603466   
min         0.000000      11.000000       2.100000       0.000000   
25%         0

Outlier Detection & Basic Cleaning

In [None]:
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt
# Columns that are median-imputed and then screened for outliers.
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# A reading is flagged when its absolute Z-score exceeds this cut-off.
z_threshold = 3

# Step 2: Impute missing values in the key columns with each column's median.
df[key_columns] = df[key_columns].fillna(df[key_columns].median())

# Step 3: Compute Z-scores (scipy's default axis=0, i.e. per column).
z_scores = zscore(df[key_columns])
# Re-wrap the scores with df's own index. Without it the frame gets a fresh
# 0..n-1 index, and the assignment to df['outlier'] below would align on
# mismatched labels (yielding NaN) whenever df is not RangeIndex-indexed,
# e.g. if this cell is re-run after Timestamp has become the index.
z_scores_df = pd.DataFrame(np.asarray(z_scores), columns=key_columns, index=df.index)

# Step 4: Flag rows where any key column has |Z| above the threshold.
outlier_flags = z_scores_df.abs().gt(z_threshold).any(axis=1)
df['outlier'] = outlier_flags

# Step 5: Optionally drop outliers
# df = df[~df['outlier']]  # Uncomment this line to drop outliers

# Step 6: Export cleaned DataFrame to CSV.
# The template placeholder '<country>' was never filled in; '<' and '>' are
# not legal in Windows filenames. The source file is benin-malanville.csv.
output_path = 'data/benin_clean.csv'
df.to_csv(output_path, index=False)

print(f"Cleaned DataFrame exported to: {output_path}")

Time Series Analysis

In [None]:
# Make the parsed Timestamp column the index so the frame can be resampled
# by time. The guard keeps this cell re-runnable: once Timestamp has been
# moved into the index it is no longer a column, and an unconditional
# set_index would raise KeyError on a second execution.
if 'Timestamp' in df.columns:
    df = (
        df.assign(Timestamp=pd.to_datetime(df['Timestamp']))
          .set_index('Timestamp')
    )
elif df.index.name != 'Timestamp':
    # Neither a column nor the index: fail as the unguarded code would have.
    raise KeyError('Timestamp')

In [None]:
# Resample data to daily means.
# numeric_only=True pins the behaviour across pandas versions: without it,
# pandas >= 2.0 raises TypeError if the frame holds any non-numeric column,
# while older releases silently dropped such columns.
daily_data = df.resample('D').mean(numeric_only=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for all panels in this figure.
sns.set(style='whitegrid')

# One entry per stacked panel: (column, title, y-axis label, line colour).
panel_specs = [
    ('GHI', 'Global Horizontal Irradiance (GHI)', 'Irradiance (W/m²)', 'orange'),
    ('DNI', 'Direct Normal Irradiance (DNI)', 'Irradiance (W/m²)', 'blue'),
    ('DHI', 'Diffuse Horizontal Irradiance (DHI)', 'Irradiance (W/m²)', 'green'),
    ('Tamb', 'Ambient Temperature (Tamb)', 'Temperature (°C)', 'red'),
]
panel_count = len(panel_specs)

plt.figure(figsize=(15, 10))

# Draw each series in its own row, plotted against the frame's index.
for panel_number, (column, panel_title, y_label, line_colour) in enumerate(panel_specs, start=1):
    plt.subplot(panel_count, 1, panel_number)
    plt.plot(df.index, df[column], label=column, color=line_colour)
    plt.title(panel_title)
    plt.ylabel(y_label)
    if panel_number == panel_count:
        # Only the bottom panel carries the x-axis label.
        plt.xlabel('Timestamp')
    plt.legend()

# Tidy the spacing between panels, then render.
plt.tight_layout()
plt.show()

Cleaning Impact