In [26]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
plt.ion()  

<contextlib.ExitStack at 0x1dada3e25d0>

In [5]:
# Set folder paths
base_dir = "Solar-Radiation-Measurement-Data"
data_dir = os.path.join(base_dir, "data")
raw_data_dir = os.path.join(base_dir, "data", "raw")
processed_data_dir = os.path.join(data_dir, "processed")
analysis_dir = os.path.join(base_dir, "analysis")
plots_dir = os.path.join(analysis_dir, "plots")
summary_dir = os.path.join(analysis_dir, "summary")
results_dir = os.path.join(base_dir, "results")

In [6]:
# Create directories if they don't exist
for folder in [data_dir, raw_data_dir, processed_data_dir, analysis_dir, plots_dir, summary_dir, results_dir]:
    os.makedirs(folder, exist_ok=True)

In [8]:
# Load datasets
files = ["benin-malanville.csv", "sierraleone-bumbuna.csv", "togo-dapaong_qc.csv"]
datasets = {}
for file in files:
    file_path = os.path.join(raw_data_dir, file)
    datasets[file.split(".")[0]] = pd.read_csv(file_path)

In [9]:
# Function to save plots

def save_plot(fig, filename):
    fig.savefig(os.path.join(plots_dir, filename), bbox_inches='tight')
    plt.close(fig)

In [10]:
# Function for summary statistics
def generate_summary_statistics(df, filename):
    summary = df.describe().T
    summary["missing"] = df.isnull().sum()
    summary.to_csv(os.path.join(summary_dir, filename))
    return summary

In [11]:
# Perform EDA on each dataset
for name, df in datasets.items():
    print(f"Processing dataset: {name}")

    # Summary statistics
    summary = generate_summary_statistics(df, f"summary_{name}.csv")
    print(summary)

Processing dataset: benin-malanville
                  count        mean         std    min    25%    50%    75%  \
GHI            525600.0  240.559452  331.131327  -12.9   -2.0    1.8  483.4   
DNI            525600.0  167.187516  261.710501   -7.8   -0.5   -0.1  314.2   
DHI            525600.0  115.358961  158.691074  -12.6   -2.1    1.6  216.3   
ModA           525600.0  236.589496  326.894859    0.0    0.0    4.5  463.7   
ModB           525600.0  228.883576  316.536515    0.0    0.0    4.3  447.9   
Tamb           525600.0   28.179683    5.924297   11.0   24.2   28.0   32.3   
RH             525600.0   54.487969   28.073069    2.1   28.8   55.1   80.1   
WS             525600.0    2.121113    1.603466    0.0    1.0    1.9    3.1   
WSgust         525600.0    2.809195    2.029120    0.0    1.3    2.6    4.1   
WSstdev        525600.0    0.473390    0.273395    0.0    0.4    0.5    0.6   
WD             525600.0  153.435172  102.332842    0.0   59.0  181.0  235.1   
WDstdev        

In [12]:
# Data quality check
print("Checking for missing values...")
missing_values = df.isnull().sum()
print(missing_values)
print("Checking for outliers...")
numeric_cols = df.select_dtypes(include=[np.number]).columns
z_scores = np.abs(zscore(df[numeric_cols].dropna()))
outliers = (z_scores > 3).sum(axis=0)
print(outliers)

Checking for missing values...
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64
Checking for outliers...
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Time Series Analysis
time_cols = ["GHI", "DNI", "DHI", "Tamb"]
time_cols = [col for col in time_cols if col in df.columns]
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"])
    df.set_index("Date", inplace=True)
    fig, ax = plt.subplots(figsize=(10, 6))
    df[time_cols].plot(ax=ax, title=f"Time Series Analysis: {name}")
    save_plot(fig, f"time_series_{name}.png")
    

In [14]:
# Correlation Analysis
if not df[numeric_cols].empty:
    corr = df[numeric_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
    ax.set_title(f"Correlation Matrix: {name}")
    save_plot(fig, f"correlation_matrix_{name}.png")

In [15]:
# Wind Analysis
if "WS" in df.columns and "WD" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.histplot(df["WS"], kde=True, ax=ax)
    ax.set_title(f"Wind Speed Distribution: {name}")
    save_plot(fig, f"wind_speed_distribution_{name}.png")

In [16]:
# Histograms
fig, axes = plt.subplots(len(numeric_cols) // 3 + 1, 3, figsize=(15, 10))
axes = axes.flatten()
for i, col in enumerate(numeric_cols):
    sns.histplot(df[col], kde=True, ax=axes[i])
    axes[i].set_title(f"Histogram: {col}")
save_plot(fig, f"histograms_{name}.png")

In [18]:
# Bubble Chart
bubble_vars = ["GHI", "Tamb", "WS", "RH"]
bubble_vars = [col for col in bubble_vars if col in df.columns]
if len(bubble_vars) >= 3:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        x=bubble_vars[0],
        y=bubble_vars[1],
        size=bubble_vars[2],
        hue=bubble_vars[2],
        sizes=(40, 400),
        data=df,
        ax=ax
    )
    ax.set_title(f"Bubble Chart: {name}")
    save_plot(fig, f"bubble_chart_{name}.png")

In [None]:
# Data Cleaning
if "Comments" in df.columns:
    df.drop(columns=["Comments"], inplace=True)

# Forward fill missing values
df.ffill(inplace=True)

# Backward fill missing values
df.bfill(inplace=True)

# Save the cleaned dataset
df.to_csv(os.path.join(processed_data_dir, f"cleaned_{name}.csv"), index=False)
print(f"Finished processing dataset: {name}\n")


Finished processing dataset: togo-dapaong_qc

