# SolarSync Predictor

*   Using historical solar production and weather sensor data (e.g., irradiance, temperature) to optimize grid stability. It
leverages time-series Supervised machine learning models and a Dash dashboard for real-time visualization using XGBoost and LSTM



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor, plot_importance

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [2]:
# 🗂️ Data Cleaning & Merging

def load_data(gen_path, weather_path):
    gen_df = pd.read_csv(gen_path, parse_dates=['DATE_TIME'])
    weather_df = pd.read_csv(weather_path, parse_dates=['DATE_TIME'])
    return gen_df, weather_df

def clean_data(gen_df, weather_df):
    gen_df = gen_df[gen_df['PLANT_ID'] == 4135001]
    weather_df = weather_df[weather_df['PLANT_ID'] == 4135001]

    gen_df.fillna({'DC_POWER': 0, 'AC_POWER': 0, 'DAILY_YIELD': 0, 'TOTAL_YIELD': 0}, inplace=True)
    weather_df.fillna(method='ffill', inplace=True)

    gen_df['DATE_TIME'] = gen_df['DATE_TIME'].dt.floor('H')
    gen_df = gen_df.groupby(['DATE_TIME', 'PLANT_ID']).agg({
        'DC_POWER': 'mean',
        'AC_POWER': 'mean',
        'DAILY_YIELD': 'mean',
        'TOTAL_YIELD': 'mean'
    }).reset_index()

    weather_df['DATE_TIME'] = weather_df['DATE_TIME'].dt.floor('H')
    weather_df = weather_df.groupby(['DATE_TIME', 'PLANT_ID']).agg({
        'AMBIENT_TEMPERATURE': 'mean',
        'MODULE_TEMPERATURE': 'mean',
        'IRRADIATION': 'mean'
    }).reset_index()

    merged_df = pd.merge(gen_df, weather_df, on=['DATE_TIME', 'PLANT_ID'], how='inner')
    merged_df.drop_duplicates(subset=['DATE_TIME', 'PLANT_ID'], inplace=True)

    return merged_df

# Example execution
gen_df, weather_df = load_data('Generation_Data.csv', 'Weather_Sensor_Data.csv')
cleaned_df = clean_data(gen_df, weather_df)
cleaned_df.to_csv('cleaned_solar_data.csv', index=False)
print("Cleaned dataset saved.")


Cleaned dataset saved.


In [3]:
# 🛠️ Feature Engineering & Preprocessing

def engineer_features(df):
    df['hour'] = df.index.hour
    df['day'] = df.index.day
    df['month'] = df.index.month
    df['day_of_week'] = df.index.dayofweek

    df['ac_power_lag1'] = df['AC_POWER'].shift(1)
    df['irradiation_lag1'] = df['IRRADIATION'].shift(1)

    df['ac_power_rolling_mean'] = df['AC_POWER'].rolling(window=3).mean()
    df['irradiation_rolling_mean'] = df['IRRADIATION'].rolling(window=3).mean()

    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)

    return df

def normalize_features(df, feature_cols):
    scaler = MinMaxScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
    return df, scaler

def prepare_lstm_data(X, y, time_steps=10):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

def preprocess_data(file_path):
    df = pd.read_csv(file_path, parse_dates=['DATE_TIME'])
    df.set_index('DATE_TIME', inplace=True)
    df = engineer_features(df)

    feature_cols = [
        'IRRADIATION', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE',
        'hour', 'day', 'month', 'day_of_week',
        'ac_power_lag1', 'irradiation_lag1',
        'ac_power_rolling_mean', 'irradiation_rolling_mean'
    ]

    target_col = 'AC_POWER'
    df, scaler = normalize_features(df, feature_cols)

    X = df[feature_cols].values
    y = df[target_col].values

    X_lstm, y_lstm = prepare_lstm_data(X, y)

    pd.DataFrame(X, columns=feature_cols, index=df.index[:len(X)]).to_csv('X_tabular.csv')
    pd.DataFrame(y, columns=[target_col], index=df.index[:len(y)]).to_csv('y_tabular.csv')
    np.save('X_lstm.npy', X_lstm)
    np.save('y_lstm.npy', y_lstm)
    joblib.dump(scaler, 'scaler.pkl')

    return X, y, X_lstm, y_lstm, scaler, feature_cols

# Example execution
X, y, X_lstm, y_lstm, scaler, feature_cols = preprocess_data('cleaned_solar_data.csv')
print("Preprocessing completed.")


Preprocessing completed.


In [4]:
# 🤖 Model Training & Saving

def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{model_name} - MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")
    return mae, rmse, r2

def train_models(X_tabular, y_tabular, X_lstm, y_lstm):
    tscv = TimeSeriesSplit(n_splits=5)
    results = {'LinearRegression': [], 'XGBoost': [], 'LSTM': []}

    # Linear Regression
    print("\nTraining Linear Regression...")
    lr_model = LinearRegression()
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X_tabular)):
        X_train, X_test = X_tabular[train_idx], X_tabular[test_idx]
        y_train, y_test = y_tabular[train_idx], y_tabular[test_idx]
        lr_model.fit(X_train, y_train)
        evaluate_model(y_test, lr_model.predict(X_test), f"Linear Regression (Fold {fold+1})")
    joblib.dump(lr_model, 'lr_model.pkl')
    print("Linear Regression model saved as 'lr_model.pkl'")


    # XGBoost
    print("\nTraining XGBoost...")
    xgb_model = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X_tabular)):
        X_train, X_test = X_tabular[train_idx], X_tabular[test_idx]
        y_train, y_test = y_tabular[train_idx], y_tabular[test_idx]
        xgb_model.fit(X_train, y_train)
        evaluate_model(y_test, xgb_model.predict(X_test), f"XGBoost (Fold {fold+1})")
    joblib.dump(xgb_model, 'xgb_model.pkl')
    print("XGBoost model saved as 'xgb_model.pkl'")

    # LSTM
    print("\nTraining LSTM...")
    lstm_model = Sequential([
        LSTM(64, activation='relu', input_shape=(X_lstm.shape[1], X_lstm.shape[2]), return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='relu'),
        Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mse')
    for fold, (train_idx, test_idx) in enumerate(tscv.split(X_lstm)):
        X_train, X_test = X_lstm[train_idx], X_lstm[test_idx]
        y_train, y_test = y_lstm[train_idx], y_lstm[test_idx]
        print(f"LSTM (Fold {fold+1}) - Training...")
        history = lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
        print(f"LSTM (Fold {fold+1}) - Evaluating...")
        evaluate_model(y_test, lstm_model.predict(X_test, verbose=0), f"LSTM (Fold {fold+1})")
    lstm_model.save('lstm_model.h5')
    print("LSTM model saved as 'lstm_model.h5'")


# Example execution - Load preprocessed data and train models
# Assuming X, y, X_lstm, y_lstm are available from the preprocessing step
try:
    X_tabular = pd.read_csv('X_tabular.csv', index_col='DATE_TIME').values
    y_tabular = pd.read_csv('y_tabular.csv', index_col='DATE_TIME')['AC_POWER'].values
    X_lstm = np.load('X_lstm.npy')
    y_lstm = np.load('y_lstm.npy')

    print("Loaded preprocessed data.")
    train_models(X_tabular, y_tabular, X_lstm, y_lstm)
    print("\nModel training completed.")

except FileNotFoundError:
    print("Error: Preprocessed data files not found. Please run the preprocessing step first.")
except Exception as e:
    print(f"An error occurred during model training: {e}")

Loaded preprocessed data.

Training Linear Regression...
Linear Regression (Fold 1) - MAE: 9.0815, RMSE: 15.2792, R²: 0.9987
Linear Regression (Fold 2) - MAE: 9.6494, RMSE: 17.5943, R²: 0.9979
Linear Regression (Fold 3) - MAE: 9.1952, RMSE: 14.6706, R²: 0.9985
Linear Regression (Fold 4) - MAE: 12.1859, RMSE: 26.3501, R²: 0.9950
Linear Regression (Fold 5) - MAE: 9.5144, RMSE: 13.7458, R²: 0.9985
Linear Regression model saved as 'lr_model.pkl'

Training XGBoost...
XGBoost (Fold 1) - MAE: 12.9366, RMSE: 23.1294, R²: 0.9971
XGBoost (Fold 2) - MAE: 13.4014, RMSE: 25.2174, R²: 0.9957
XGBoost (Fold 3) - MAE: 9.9421, RMSE: 17.8425, R²: 0.9977
XGBoost (Fold 4) - MAE: 12.2497, RMSE: 30.0470, R²: 0.9935
XGBoost (Fold 5) - MAE: 8.2481, RMSE: 16.1258, R²: 0.9980
XGBoost model saved as 'xgb_model.pkl'

Training LSTM...
LSTM (Fold 1) - Training...
LSTM (Fold 1) - Evaluating...
LSTM (Fold 1) - MAE: 396.3432, RMSE: 502.6345, R²: -0.3725
LSTM (Fold 2) - Training...
LSTM (Fold 2) - Evaluating...
LSTM (Fo



LSTM (Fold 5) - Evaluating...
LSTM (Fold 5) - MAE: 80.3153, RMSE: 133.5490, R²: 0.8618
LSTM model saved as 'lstm_model.h5'

Model training completed.


In [8]:
# 📊 Model Evaluation & Visualization

def evaluate_and_visualize(X_tabular, y_tabular, X_lstm, y_lstm, lr_model, xgb_model, lstm_model):
    X_train, X_test, y_train, y_test = train_test_split(X_tabular, y_tabular, test_size=0.2, shuffle=False)
    X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test = train_test_split(X_lstm, y_lstm, test_size=0.2, shuffle=False)

    lr_pred = lr_model.predict(X_test)
    xgb_pred = xgb_model.predict(X_test)
    lstm_pred = lstm_model.predict(X_lstm_test).ravel()

    evaluate_model(y_test, lr_pred, "Linear Regression")
    evaluate_model(y_test, xgb_pred, "XGBoost")
    evaluate_model(y_lstm_test, lstm_pred, "LSTM")

    plt.figure(figsize=(14, 6))
    plt.plot(y_test, label='Actual')
    plt.plot(lr_pred, label='Linear Regression')
    plt.plot(xgb_pred, label='XGBoost')
    plt.plot(range(len(lstm_pred)), lstm_pred, label='LSTM')
    plt.legend()
    plt.title('Actual vs Predicted')
    plt.savefig('actual_vs_predicted.png')
    plt.close()

    plt.figure(figsize=(10, 6))
    plot_importance(xgb_model, max_num_features=10)
    plt.title('XGBoost Feature Importance')
    plt.savefig('xgb_feature_importance.png')
    plt.close()


