In [1]:
# EDA : Task 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# ======================
# Load datasets
# ======================
# Read the raw per-country measurement files (paths are relative to the
# notebook's working directory).
benin = pd.read_csv("../data/benin-malanville.csv")
togo = pd.read_csv("../data/togo-dapaong_qc.csv")
sl = pd.read_csv("../data/sierraleone-bumbuna.csv")

# Map a display name to each frame so the analysis below can iterate over
# all countries uniformly.
datasets = dict(
    zip(
        ("Benin", "Togo", "Sierra Leone"),
        (benin, togo, sl),
    )
)

# ======================
# Perform EDA per country
# ======================
# Columns that are coerced to numeric, screened for outliers, and required
# to be non-missing in the cleaned output. Defined once: it does not vary
# between countries.
cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# For every country: print structural and summary diagnostics, count
# z-score outliers, write a cleaned CSV, and draw three diagnostic plots.
for country, df in datasets.items():
    print(f"--- {country} ---")

    # 1. Shape and info
    print(df.shape)
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()

    # 2. Summary statistics
    print(df.describe())

    # 3. Missing values (only the columns that actually have some)
    missing = df.isna().sum()
    print("Missing values:\n", missing[missing > 0])

    # 4. Outlier detection using Z-score
    # Entries that cannot be parsed as numbers become NaN. Note that this
    # updates the frame held in `datasets` in place.
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
    # nan_policy="omit" computes mean/std from the non-NaN values only. With
    # the default ("propagate") a single NaN turns every z-score of that
    # column into NaN, so none of its outliers would ever be counted.
    z_scores = df[cols].apply(zscore, nan_policy="omit")
    # NaN z-scores compare as False, so missing values are not counted.
    outliers = z_scores.abs() > 3
    print(f"{country}: Outliers detected (any column):", outliers.any(axis=1).sum())

    # 5. Clean: Drop NA in key columns
    # .copy() gives an independent frame, so the Timestamp assignment below
    # does not raise SettingWithCopyWarning or depend on view semantics.
    df_clean = df.dropna(subset=cols).copy()
    df_clean.to_csv(f"../data/{country.lower().replace(' ', '_')}_clean.csv", index=False)

    # 6. Plot time series
    df_clean["Timestamp"] = pd.to_datetime(df_clean["Timestamp"])
    df_clean.set_index("Timestamp")[["GHI", "DNI", "Tamb"]].plot(
        figsize=(10, 5),
        title=f"{country} - Irradiance and Temperature",
    )
    plt.show()

    # 7. Cleaning impact on ModA/ModB: mean reading per value of "Cleaning"
    df_clean.groupby("Cleaning")[["ModA", "ModB"]].mean().plot(
        kind="bar",
        title=f"{country} - Cleaning Impact",
    )
    plt.show()

    # 8. Correlation heatmap
    sns.heatmap(df_clean[cols + ["TModA", "TModB"]].corr(), annot=True)
    plt.title(f"{country} - Correlation")
    plt.show()


ModuleNotFoundError: No module named 'pandas'