In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import os

In [None]:

# Ensure the output directory for figures exists; exist_ok makes this a no-op
# when the directory is already present (and avoids a check-then-create race).
os.makedirs('plots', exist_ok=True)

# Load the Sierra Leone dataset into the notebook-wide frame `df`,
# which every later cell reads from.
df = pd.read_csv('data/sierraleone-bumbuna.csv')
print("First few rows of the Sierra Leone dataset:")
print(df.head())

In [None]:

# Descriptive statistics for the numeric columns
print("\nSummary Statistics:")
print(df.describe())

# Per-column count of missing entries
missing = df.isnull().sum()
print("\nMissing Values:")
print(missing)

# Columns whose missing count exceeds 5% of the number of rows
threshold = 0.05 * len(df)
print("\nColumns with >5% Missing:")
print(missing.loc[missing.gt(threshold)])

In [None]:

# Outlier detection and cleaning
key_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Column-wise z-scores; nan_policy='omit' lets scipy ignore NaNs when it
# computes each column's mean and standard deviation.
df_zscores = df[key_columns].apply(zscore, nan_policy='omit')

# A row is flagged when any key column has |z| > 3.
# NOTE: the flag is only counted and reported here; flagged rows stay in the data.
outliers = (df_zscores.abs() > 3).any(axis=1)
print(f"\nNumber of rows with outliers: {outliers.sum()}")

# Fill missing values in each key column with that column's median.
# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the in-place form operates on a column selection (chained assignment), which
# warns in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
for col in key_columns:
    df[col] = df[col].fillna(df[col].median())

# Rows without a timestamp cannot be placed on the time axis, so drop them.
df.dropna(subset=['Timestamp'], inplace=True)

# Persist the cleaned frame for downstream use.
df.to_csv('data/sierraleone_clean.csv', index=False)
print("Cleaned data saved to data/sierraleone_clean.csv")

In [None]:

# Time series of the irradiance components and ambient temperature.
# Timestamps are parsed to datetimes first so the x-axis is a real time axis.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

fig, ax = plt.subplots(figsize=(12, 8))
# One line per series, drawn in the same order as the legend entries
for series_name in ('GHI', 'DNI', 'DHI', 'Tamb'):
    ax.plot(df['Timestamp'], df[series_name], label=series_name)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.set_title('Sierra Leone: Time Series of GHI, DNI, DHI, and Tamb')
ax.legend()
fig.tight_layout()
fig.savefig('plots/sierraleone_ghi_dni_dhi_tamb_time_series.png')
plt.close(fig)

In [None]:

# Mean ModA / ModB reading for each value of the 'Cleaning' flag
cleaning_impact = (
    df.groupby('Cleaning')[['ModA', 'ModB']]
    .mean()
)
print("\nAverage ModA and ModB by Cleaning Flag:")
print(cleaning_impact)

# Grouped bar chart of the two means, saved to disk and then released
ax = cleaning_impact.plot.bar(title='Sierra Leone: Average ModA and ModB Pre/Post Cleaning')
ax.set_ylabel('Value')
ax.figure.savefig('plots/sierraleone_cleaning_impact.png')
plt.close(ax.figure)

In [None]:

# Pairwise correlations between the irradiance columns and the two
# TModA / TModB columns, rendered as an annotated heatmap
corr_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df.loc[:, corr_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Sierra Leone: Correlation Heatmap')
fig.savefig('plots/sierraleone_correlation_heatmap.png')
plt.close(fig)

In [None]:

# Wind speed against GHI; semi-transparent markers keep dense regions readable
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['WS'], df['GHI'], alpha=0.5)
ax.set(
    xlabel='Wind Speed (m/s)',
    ylabel='GHI (W/m^2)',
    title='Sierra Leone: Wind Speed vs GHI',
)
fig.savefig('plots/sierraleone_ws_vs_ghi.png')
plt.close(fig)

In [None]:

# Distribution of GHI values as a 30-bin histogram
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['GHI'], bins=30)
ax.set_xlabel('GHI (W/m^2)')
ax.set_ylabel('Frequency')
ax.set_title('Sierra Leone: GHI Distribution')
fig.savefig('plots/sierraleone_ghi_histogram.png')
plt.close(fig)

# Count of observations per distinct wind-direction value, ordered by direction
wd_counts = df['WD'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(wd_counts.index, wd_counts.to_numpy())
ax.set(
    xlabel='Wind Direction (°N)',
    ylabel='Count',
    title='Sierra Leone: Wind Direction Distribution',
)
fig.savefig('plots/sierraleone_wind_direction.png')
plt.close(fig)

In [None]:

# Relative humidity against ambient temperature
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['RH'], df['Tamb'], alpha=0.5)
ax.set_xlabel('Relative Humidity (%)')
ax.set_ylabel('Ambient Temperature (°C)')
ax.set_title('Sierra Leone: RH vs Tamb')
fig.savefig('plots/sierraleone_rh_vs_tamb.png')
plt.close(fig)

In [None]:

# GHI against ambient temperature, with marker area scaled by relative humidity
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df['Tamb'], df['GHI'], s=df['RH'].mul(10), alpha=0.5)
ax.set(
    xlabel='Ambient Temperature (°C)',
    ylabel='GHI (W/m^2)',
    title='Sierra Leone: GHI vs Tamb (Bubble Size: RH)',
)
fig.savefig('plots/sierraleone_bubble_ghi_tamb_rh.png')
plt.close(fig)

# Final status line for the whole notebook run
print("Sierra Leone EDA complete. Plots saved to plots/ directory.")