# Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error


# Configure plotting

In [4]:
sns.set_palette("colorblind")
plt.style.use('fivethirtyeight')


# Load solar production data

In [8]:
df = pd.read_csv(
    'C:/_Projects/home-energy-ai/data/raw/villamichelin_solar_power.csv', 
    parse_dates=['last_changed']
)

df = df.set_index('last_changed')


# Resample to 15Min data


In [9]:
# Find duplicates in the index
duplicates = df.index.duplicated(keep=False)
print(df[duplicates])


df_resampled = df.resample("15T").interpolate(method='linear')  # Resample to 15 minutes
df_resampled["state"] = df_resampled["state"].fillna(0)  # Fill missing production data

df = df_resampled

print(df.head())


  df_resampled = df.resample("15T").interpolate(method='linear')  # Resample to 15 minutes


ValueError: cannot reindex on an axis with duplicate labels

# Plot solar production


In [None]:
df.plot(style='.',
        figsize=(15, 3),
        title='Solar Production (kWh)')
plt.show()


# Train-Test Split

In [None]:
train = df.loc[df.index < '2025-01-01']
test = df.loc[df.index >= '2025-01-01']

fig, ax = plt.subplots(figsize=(15, 4))
train['solar_output'].plot(ax=ax, label='Training Set', title='Solar Production: Train/Test Split')
test['solar_output'].plot(ax=ax, label='Test Set', color='orange')
ax.axvline('2025-01-01', color='black', ls='--')
ax.legend(['Training Set', 'Test Set'])
plt.show()


# Feature Engineering

In [None]:

def create_features(df):
    """
    Create features for solar production prediction based on weather and time data.
    """
    df = df.copy()
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month
    df['year'] = df.index.year
    df['dayofyear'] = df.index.dayofyear
    df['prev_hour'] = df['solar_output'].shift(1)
    df['prev_day'] = df['solar_output'].shift(24)
    df['rolling_mean_6h'] = df['solar_output'].rolling(6).mean()
    df['rolling_max_6h'] = df['solar_output'].rolling(6).max()
    return df

train = create_features(train)
test = create_features(test)


# Define Features and Target

In [None]:

FEATURES = ['hour', 'dayofweek', 'month', 'dayofyear', 'prev_hour', 'prev_day', 'rolling_mean_6h']
TARGET = 'solar_output'

X_train = train[FEATURES].fillna(0)  # Fill NaN values
y_train = train[TARGET]

X_test = test[FEATURES].fillna(0)
y_test = test[TARGET]



# Train XGBoost Model

In [None]:

params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(xgb_model, params, cv=TimeSeriesSplit(n_splits=5), scoring='neg_mean_absolute_error', verbose=1)
grid_search.fit(X_train, y_train)

best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)



# Evaluation

In [None]:

mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAPE: {mape:.4f}")
print(f"RMSE: {rmse:.4f}")



# Plot Actual vs Predicted Solar Output

In [None]:

test['Predictions'] = y_pred
test[['solar_output', 'Predictions']].plot(figsize=(15, 5), title='Actual vs. Predicted Solar Production')
plt.show()


# Feature Importance

In [None]:


fi = pd.DataFrame(data=best_xgb.feature_importances_,
                    index=FEATURES,
                    columns=['importance'])
fi.sort_values('importance').plot(kind='barh', title='Feature Importance', figsize=(10, 6))
plt.show()
