In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import shutil
import os
import warnings
warnings.filterwarnings('ignore')

# --- Data Configuration ---
# The dataset in this workspace is located under the 'data/' folder
DATA_PATH = os.path.join('data', 'nsrdb_mock_dataset.csv')
# Raw header of the column used as the sole time series.
# NOTE(review): the 'Â²' looks like a UTF-8 '²' that was decoded as Latin-1;
# confirm against the actual CSV header. The literal is kept unchanged and the
# loader below also accepts the correctly encoded spelling.
TARGET_COL_RAW = 'GHI (W/mÂ²)'
TIMESTAMP_COL = 'Timestamp'  # Raw column parsed into the DATE_TIME index
NOTEBOOK_FILENAME = "Solar_Prediction_Univariate_LGBM.ipynb"
GENERATION_PROXY = 'GENERATION_MW'  # Renamed target for project consistency


def _resolve_target_column(columns):
    """Pick the raw column that will become ``GENERATION_PROXY``.

    Tries, in order: the configured ``TARGET_COL_RAW``, the same header with a
    correctly encoded superscript two, an already-renamed ``GENERATION_PROXY``
    column, and finally the first column whose name contains 'ghi'
    (case-insensitive).

    Raises:
        KeyError: if none of the candidates is present in ``columns``.
    """
    for name in (TARGET_COL_RAW, 'GHI (W/m\u00b2)', GENERATION_PROXY):
        if name in columns:
            return name
    ghi_like = [c for c in columns if 'ghi' in str(c).lower()]
    if ghi_like:
        print(f"NOTICE: Using '{ghi_like[0]}' as the generation proxy column.")
        return ghi_like[0]
    raise KeyError(f"No generation column found. Available columns: {list(columns)}")


# --- Load and Prepare Data ---
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Error: Ensure '{DATA_PATH}' is present in the working directory.")

df_raw = pd.read_csv(DATA_PATH)

# Fail early with a readable message instead of an opaque KeyError further down.
if TIMESTAMP_COL not in df_raw.columns:
    raise KeyError(
        f"Timestamp column '{TIMESTAMP_COL}' not found. "
        f"Available columns: {df_raw.columns.tolist()}"
    )

# The target column must be resolved BEFORE the frame is reduced to a single
# column; after the projection no fallback candidates would be left to search.
target_col = _resolve_target_column(df_raw.columns)

# Keep ONLY the target column, index it by parsed timestamp and sort by time.
# Built as one chain from df_raw so re-running the cell yields the same frame.
df_solar = (
    df_raw[[target_col]]
    .rename(columns={target_col: GENERATION_PROXY})
    .assign(DATE_TIME=pd.to_datetime(df_raw[TIMESTAMP_COL]))
    .set_index('DATE_TIME')
    .sort_index()
)

print(f"Data Loaded. Total observations: {len(df_solar)}")
print("\nUnivariate Time-Series Data Head:")
print(df_solar.head())

KeyboardInterrupt: 

In [None]:
# ==============================================================================
# CELL 2: Data Cleaning and Filtering (robust)
# ==============================================================================
# Reads : df_solar, GENERATION_PROXY, TARGET_COL_RAW (defined by the loading cell)
# Writes: df_solar (target column cleaned), GEN_THRESHOLD, OUTLIER_CAP_QUANTILE,
#         df_solar_day (rows above the threshold, top tail capped)

# Safety checks: ensure df_solar exists
if 'df_solar' not in globals():
    raise NameError('df_solar is not defined. Run the data-loading cell first (Cell 1).')

# Ensure the generation column exists. If not, fall back to the raw target name,
# then to the first GHI-like column (case-insensitive).
if GENERATION_PROXY not in df_solar.columns:
    if TARGET_COL_RAW in df_solar.columns:
        source_col = TARGET_COL_RAW
    else:
        # str() guards against non-string column labels.
        ghi_like = [c for c in df_solar.columns if 'ghi' in str(c).lower()]
        if not ghi_like:
            raise KeyError(f"No generation column found. Available columns: {df_solar.columns.tolist()}")
        source_col = ghi_like[0]
        print(f"NOTICE: Using '{source_col}' as the generation proxy column.")
    df_solar = df_solar.rename(columns={source_col: GENERATION_PROXY})

# 1. Coerce to numeric FIRST, then handle missing values.
#    Coercion turns unparseable entries into NaN, so it has to run before the
#    fill; otherwise those NaNs would survive the cleaning step. The float64
#    cast keeps the later quantile cap from writing a float into an int column.
#    .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
generation_clean = (
    pd.to_numeric(df_solar[GENERATION_PROXY], errors='coerce')
    .astype('float64')
    .replace([np.inf, -np.inf], np.nan)
    .ffill()   # carry the last valid reading forward
    .bfill()   # only affects leading gaps that have nothing before them
)
df_solar[GENERATION_PROXY] = generation_clean

# 2. Filter for Daytime Data (Non-zero generation)
GEN_THRESHOLD = 0.001
df_solar_day = df_solar.loc[df_solar[GENERATION_PROXY] > GEN_THRESHOLD].copy()

# If filtering yields no rows, warn and fall back to original df_solar
if df_solar_day.empty:
    print('WARNING: Daytime filter returned 0 rows. Using full dataset for modeling.')
    df_solar_day = df_solar.copy()

# 3. Outlier Handling (Cap the top 0.1% to manage extreme values) - only if non-empty
OUTLIER_CAP_QUANTILE = 0.999
if not df_solar_day.empty:
    q_high = df_solar_day[GENERATION_PROXY].quantile(OUTLIER_CAP_QUANTILE)
    # Skip the cap when the quantile is undefined (all-NaN) or non-positive.
    if pd.notna(q_high) and q_high > 0:
        df_solar_day[GENERATION_PROXY] = df_solar_day[GENERATION_PROXY].clip(upper=q_high)

print(f"\nDaytime-only data filtered. Rows for modeling: {len(df_solar_day)}")
print(df_solar_day[GENERATION_PROXY].describe())