In [67]:
# ------------------------------
# Benin EDA Notebook (eda-benin branch)
# ------------------------------

# 1️⃣ Setup: fix Python path for Jupyter and imports
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Locate the repo root, i.e. the directory that contains the 'src' package.
# Walk upward from the current working directory so the notebook works from
# any checkout location (repo root, notebooks/, ...). If no ancestor contains
# src/data_loader.py, fall back to the original machine-specific path.
_fallback_root = Path(r"D:\Python\Week_01\Assignment\solar-challenge-week0")
_start_dir = Path.cwd().resolve()
repo_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "src" / "data_loader.py").is_file()
    ),
    _fallback_root,
)

# Make sure the repo root is the FIRST sys.path entry. Merely checking for
# membership would leave it behind any earlier entry that also provides a
# 'src' package, so drop existing occurrences and re-insert at index 0.
_repo_root_str = str(repo_root)
sys.path[:] = [entry for entry in sys.path if entry != _repo_root_str]
sys.path.insert(0, _repo_root_str)

# Now imports from src package work
from src.data_loader import load_country_data
from src.preprocess import preprocess_dataset
from src.analyze import run_full_analysis

# ------------------------------
# 2️⃣ Dataset configuration
# ------------------------------
# Country label passed to the preprocessing and analysis steps below.
COUNTRY_NAME = "benin"
# Name of the timestamp column handed to the analysis step; adjust if the
# CSV uses a different column name.
TIMESTAMP_COL = "timestamp"
# CSV file handed to the loader. Swap in togo-dapaong_qc.csv or
# sierraleone-bumbuna.csv to analyse the other datasets.
FILENAME = "benin-malanville.csv"

# ------------------------------
# 3️⃣ Load raw dataset
# ------------------------------
# Read the configured file through the project loader, then report its shape
# and render the first rows (display is the IPython/Jupyter rich-output helper).
df_raw = load_country_data(FILENAME)
raw_preview = df_raw.head()
print(f"Raw dataset shape: {df_raw.shape}")
display(raw_preview)

# ------------------------------
# 4️⃣ Preprocess dataset
# ------------------------------
# Run the project preprocessing step for the selected country, then report
# the resulting shape and render the first rows for a quick sanity check.
df_clean = preprocess_dataset(df_raw, country=COUNTRY_NAME)
clean_preview = df_clean.head()
print(f"Cleaned dataset shape: {df_clean.shape}")
display(clean_preview)

# ------------------------------
# 5️⃣ Run full EDA/Analysis
# ------------------------------
# Single entry point for the EDA on the cleaned frame. According to the
# original notes (not verifiable from this cell) it covers:
#   - summary statistics & missing values
#   - outlier detection
#   - time series plots
#   - cleaning impact plots
#   - correlations & scatter plots
#   - wind & distribution analysis
#   - histograms
#   - temperature analysis
#   - bubble charts
run_full_analysis(
    df_clean,
    country=COUNTRY_NAME,
    timestamp_col=TIMESTAMP_COL,
)

# ------------------------------
# 6️⃣ Optional: Bubble Chart
# ------------------------------
# Scatter of GHI (x) against Tamb (y) where each marker's size comes from RH.
# The chart is skipped silently unless all three columns are present.
bubble_cols = ("ghi", "tamb", "rh")
missing_cols = [name for name in bubble_cols if name not in df_clean.columns]
if not missing_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(
        df_clean["ghi"],
        df_clean["tamb"],
        s=df_clean["rh"],
        alpha=0.5,
        c="orange",
        edgecolors="k",
    )
    ax.set_xlabel("GHI")
    ax.set_ylabel("Tamb")
    ax.set_title(f"GHI vs Tamb with RH bubble size ({COUNTRY_NAME})")
    plt.show()


ModuleNotFoundError: No module named 'src.data_loader'