# Solar Power Generation Model Development and Training

This notebook implements multiple regression models for solar power generation prediction:
- XGBoost Regressor
- Random Forest Regressor
- Neural Network (MLP Regressor)

Target variable: generation(kWh)

In [None]:
import json
import os
import warnings
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Set random seed (legacy global NumPy RNG)
np.random.seed(42)

# Directories
# The defaults are the original locations; set SOLAR_DATA_DIR / SOLAR_MODEL_DIR
# in the environment to run the notebook on a machine with a different layout.
DATA_DIR = os.environ.get('SOLAR_DATA_DIR', '/home/ubuntu/processed_data/')
MODEL_DIR = os.environ.get('SOLAR_MODEL_DIR', '/home/ubuntu/models/')
# All artifacts written by later cells go here, so make sure it exists up front.
os.makedirs(MODEL_DIR, exist_ok=True)

# Plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Load Processed Data

In [None]:
# Read the processed dataset and convert its 'Time' column to datetimes
print("Loading processed data...")
processed_csv_path = os.path.join(DATA_DIR, 'processed_solar_data.csv')
data = pd.read_csv(processed_csv_path)
data['Time'] = pd.to_datetime(data['Time'])

# Feature metadata: names of the model input columns and of the target column
feature_info_path = os.path.join(DATA_DIR, 'feature_info.json')
with open(feature_info_path, 'r') as f:
    feature_info = json.load(f)

feature_cols = feature_info['feature_columns']
target_col = feature_info['target_column']

# Quick overview of what was loaded
first_timestamp = data['Time'].min()
last_timestamp = data['Time'].max()
station_count = data['station'].nunique()

print(f"Data shape: {data.shape}")
print(f"Features: {len(feature_cols)}")
print(f"Target: {target_col}")
print(f"Date range: {first_timestamp} to {last_timestamp}")
print(f"Stations: {station_count}")

# Summary statistics of the target column
print("\nTarget variable statistics:")
print(data[target_col].describe())

## 2. Prepare Data for Training

In [None]:
# Model inputs (X) and regression target (y), copied so `data` itself is not modified
X = data[feature_cols].copy()
y = data[target_col].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Report how many missing entries are still present
missing_in_features = X.isnull().sum().sum()
missing_in_target = y.isnull().sum()
print(f"\nMissing values in features: {missing_in_features}")
print(f"Missing values in target: {missing_in_target}")

# Keep only rows in which every feature AND the target are present.
# The same mask is applied to `data` so that `data_clean` stays row-aligned with X and y.
mask = X.notnull().all(axis=1) & y.notnull()
X, y, data_clean = [frame[mask].reset_index(drop=True) for frame in (X, y, data)]

print("\nAfter removing missing values:")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

## 3. Train-Test Split (Time-based)

In [None]:
# Time-based split to avoid data leakage
# Use 80% for training, 20% for testing: rows up to the 80th percentile of the
# timestamps form the training period, later rows form the test period.
split_date = data_clean['Time'].quantile(0.8)
print(f"Split date: {split_date}")

train_mask = data_clean['Time'] <= split_date
test_mask = data_clean['Time'] > split_date

# X, y and data_clean share the same (reset) index, so the masks apply to all of them.
X_train = X[train_mask].reset_index(drop=True)
X_test = X[test_mask].reset_index(drop=True)
y_train = y[train_mask].reset_index(drop=True)
y_test = y[test_mask].reset_index(drop=True)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Train ratio: {len(X_train) / len(X):.2%}")
print(f"Test ratio: {len(X_test) / len(X):.2%}")

# Further split training data for validation.
# This split is chronological as well: a random shuffle would place validation rows
# in between training rows in time, so the validation score (used for early stopping
# and for picking the best model) would be optimistically biased.
train_times = data_clean.loc[train_mask, 'Time'].reset_index(drop=True)
val_split_date = train_times.quantile(0.8)
val_mask = train_times > val_split_date
print(f"Validation split date: {val_split_date}")

X_train_split = X_train[~val_mask].reset_index(drop=True)
X_val = X_train[val_mask].reset_index(drop=True)
y_train_split = y_train[~val_mask].reset_index(drop=True)
y_val = y_train[val_mask].reset_index(drop=True)

print(f"\nAfter validation split:")
print(f"Training set: {X_train_split.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

## 4. Model Development

### 4.1 XGBoost Regressor

In [None]:
print("Training XGBoost Regressor...")

# XGBoost parameters
# This dict is also used to build fresh estimators for cross-validation and is written
# to the metadata file, so it only holds settings that are valid without an eval_set.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1
}

# Train XGBoost
# early_stopping_rounds is given to the constructor: passing it to fit() was deprecated
# in XGBoost 1.6 and is rejected by XGBoost 2.x. It is kept out of xgb_params because
# early stopping requires the eval_set supplied below.
xgb_model = xgb.XGBRegressor(**xgb_params, early_stopping_rounds=10)
xgb_model.fit(
    X_train_split, y_train_split,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predictions
y_train_pred_xgb = xgb_model.predict(X_train_split)
y_val_pred_xgb = xgb_model.predict(X_val)
y_test_pred_xgb = xgb_model.predict(X_test)

# Metrics (RMSE, MAE and R^2 on each of the train / validation / test splits)
xgb_metrics = {
    'train_rmse': np.sqrt(mean_squared_error(y_train_split, y_train_pred_xgb)),
    'train_mae': mean_absolute_error(y_train_split, y_train_pred_xgb),
    'train_r2': r2_score(y_train_split, y_train_pred_xgb),
    'val_rmse': np.sqrt(mean_squared_error(y_val, y_val_pred_xgb)),
    'val_mae': mean_absolute_error(y_val, y_val_pred_xgb),
    'val_r2': r2_score(y_val, y_val_pred_xgb),
    'test_rmse': np.sqrt(mean_squared_error(y_test, y_test_pred_xgb)),
    'test_mae': mean_absolute_error(y_test, y_test_pred_xgb),
    'test_r2': r2_score(y_test, y_test_pred_xgb)
}

print("XGBoost Results:")
for metric, value in xgb_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Save model
joblib.dump(xgb_model, os.path.join(MODEL_DIR, 'xgboost_model.pkl'))
print("\nXGBoost model saved.")

### 4.2 Random Forest Regressor

In [None]:
print("Training Random Forest Regressor...")

# Hyperparameters for the forest
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

# Fit on the training split (fit() returns the fitted estimator itself)
rf_model = RandomForestRegressor(**rf_params).fit(X_train_split, y_train_split)

# Predict on every split
y_train_pred_rf = rf_model.predict(X_train_split)
y_val_pred_rf = rf_model.predict(X_val)
y_test_pred_rf = rf_model.predict(X_test)

# RMSE / MAE / R^2 for each split, stored as '<split>_<metric>'
rf_metrics = {}
for rf_split, rf_actual, rf_predicted in (
    ('train', y_train_split, y_train_pred_rf),
    ('val', y_val, y_val_pred_rf),
    ('test', y_test, y_test_pred_rf),
):
    rf_metrics[f'{rf_split}_rmse'] = np.sqrt(mean_squared_error(rf_actual, rf_predicted))
    rf_metrics[f'{rf_split}_mae'] = mean_absolute_error(rf_actual, rf_predicted)
    rf_metrics[f'{rf_split}_r2'] = r2_score(rf_actual, rf_predicted)

print("Random Forest Results:")
for metric_name, metric_value in rf_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Persist the fitted model
rf_model_path = os.path.join(MODEL_DIR, 'random_forest_model.pkl')
joblib.dump(rf_model, rf_model_path)
print("\nRandom Forest model saved.")

### 4.3 Neural Network (MLP Regressor)

In [None]:
print("Training Neural Network (MLP Regressor)...")

# Standardize the inputs: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to the training, validation and test features.
nn_scaler = StandardScaler().fit(X_train_split)
X_train_scaled, X_val_scaled, X_test_scaled = [
    nn_scaler.transform(features) for features in (X_train_split, X_val, X_test)
]

# Hyperparameters for the MLP
# NOTE(review): scikit-learn documents 'learning_rate' as applying to solver='sgd';
# with 'adam' it presumably has no effect - confirm before tuning it.
nn_params = dict(
    hidden_layer_sizes=(100, 50, 25),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    max_iter=500,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
)

# Fit on the scaled training split (fit() returns the fitted estimator itself)
nn_model = MLPRegressor(**nn_params).fit(X_train_scaled, y_train_split)

# Predict on every split (always from the scaled features)
y_train_pred_nn = nn_model.predict(X_train_scaled)
y_val_pred_nn = nn_model.predict(X_val_scaled)
y_test_pred_nn = nn_model.predict(X_test_scaled)

# RMSE / MAE / R^2 for each split, stored as '<split>_<metric>'
nn_metrics = {}
for nn_split, nn_actual, nn_predicted in (
    ('train', y_train_split, y_train_pred_nn),
    ('val', y_val, y_val_pred_nn),
    ('test', y_test, y_test_pred_nn),
):
    nn_metrics[f'{nn_split}_rmse'] = np.sqrt(mean_squared_error(nn_actual, nn_predicted))
    nn_metrics[f'{nn_split}_mae'] = mean_absolute_error(nn_actual, nn_predicted)
    nn_metrics[f'{nn_split}_r2'] = r2_score(nn_actual, nn_predicted)

print("Neural Network Results:")
for metric_name, metric_value in nn_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

# Persist the model together with the scaler it was trained with
for artifact, artifact_file in ((nn_model, 'neural_network_model.pkl'), (nn_scaler, 'nn_scaler.pkl')):
    joblib.dump(artifact, os.path.join(MODEL_DIR, artifact_file))
print("\nNeural Network model and scaler saved.")

## 5. Model Comparison

In [None]:
# Assemble one row per model and one column per (split, metric) pair.
# Column order: RMSE for train/val/test, then MAE, then R2.
metrics_by_model = [
    ('XGBoost', xgb_metrics),
    ('Random Forest', rf_metrics),
    ('Neural Network', nn_metrics),
]

comparison_data = {'Model': [model_label for model_label, model_scores in metrics_by_model]}
for score_kind in ('rmse', 'mae', 'r2'):
    for split_label in ('train', 'val', 'test'):
        column_name = f'{split_label.capitalize()}_{score_kind.upper()}'
        comparison_data[column_name] = [
            model_scores[f'{split_label}_{score_kind}']
            for model_label, model_scores in metrics_by_model
        ]

comparison_df = pd.DataFrame(comparison_data)
print("=== MODEL COMPARISON ===")
print(comparison_df.round(4))

# Write the (unrounded) comparison table next to the models
comparison_csv_path = os.path.join(MODEL_DIR, 'model_comparison.csv')
comparison_df.to_csv(comparison_csv_path, index=False)

# The best model is the one with the lowest validation RMSE
best_model_idx = comparison_df['Val_RMSE'].idxmin()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
best_val_rmse = comparison_df.loc[best_model_idx, 'Val_RMSE']
best_test_rmse = comparison_df.loc[best_model_idx, 'Test_RMSE']
print(f"\nBest model based on validation RMSE: {best_model_name}")
print(f"Validation RMSE: {best_val_rmse:.4f}")
print(f"Test RMSE: {best_test_rmse:.4f}")

## 6. Feature Importance Analysis

In [None]:
# Importance tables for the two tree-based models, most important feature first
xgb_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': xgb_model.feature_importances_}
).sort_values('importance', ascending=False)

rf_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

# The 15 highest-ranked features of each model are plotted side by side
top_features_xgb = xgb_importance.head(15)
top_features_rf = rf_importance.head(15)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

importance_panels = (
    (axes[0], top_features_xgb, 'XGBoost Feature Importance (Top 15)'),
    (axes[1], top_features_rf, 'Random Forest Feature Importance (Top 15)'),
)
for panel_ax, panel_top, panel_title in importance_panels:
    bar_positions = range(len(panel_top))
    panel_ax.barh(bar_positions, panel_top['importance'])
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(panel_top['feature'])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Importance')

plt.tight_layout()
importance_png_path = os.path.join(MODEL_DIR, 'feature_importance.png')
plt.savefig(importance_png_path, dpi=300, bbox_inches='tight')
plt.show()

# Write the full (not only top-15) importance tables to CSV
for importance_table, importance_file in (
    (xgb_importance, 'xgb_feature_importance.csv'),
    (rf_importance, 'rf_feature_importance.csv'),
):
    importance_table.to_csv(os.path.join(MODEL_DIR, importance_file), index=False)

print("Top 10 features (XGBoost):")
print(xgb_importance.head(10))

print("\nTop 10 features (Random Forest):")
print(rf_importance.head(10))

## 7. Cross-Validation

In [None]:
# Time series cross-validation
print("Performing time series cross-validation...")

tscv = TimeSeriesSplit(n_splits=5)

# TimeSeriesSplit cuts folds purely by row position, so the rows must be in
# chronological order. The training rows keep the file's row order, which is not
# guaranteed to be sorted by time when several stations are present, so they are
# sorted by timestamp here (stable sort: already-sorted data is left unchanged).
train_times_cv = data_clean.loc[train_mask, 'Time'].reset_index(drop=True)
chrono_order = np.argsort(train_times_cv.values, kind='stable')
X_train_cv = X_train.iloc[chrono_order].reset_index(drop=True)
y_train_cv = y_train.iloc[chrono_order].reset_index(drop=True)

# XGBoost CV
xgb_cv_scores = cross_val_score(
    xgb.XGBRegressor(**xgb_params),
    X_train_cv, y_train_cv,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)
# Scores are negated MSE values, so flip the sign before taking the root.
xgb_cv_rmse = np.sqrt(-xgb_cv_scores)

# Random Forest CV
rf_cv_scores = cross_val_score(
    RandomForestRegressor(**rf_params),
    X_train_cv, y_train_cv,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)
rf_cv_rmse = np.sqrt(-rf_cv_scores)

# Neural Network CV
# Scaling happens inside a pipeline so that each fold fits its own scaler on that
# fold's training rows only (no statistics leak from the fold's validation rows).
# A fresh StandardScaler is used on purpose: refitting `nn_scaler` here would make
# the in-memory scaler differ from the one already saved alongside `nn_model`.
nn_cv_scores = cross_val_score(
    make_pipeline(StandardScaler(), MLPRegressor(**nn_params)),
    X_train_cv, y_train_cv,
    cv=tscv,
    scoring='neg_mean_squared_error'
)
nn_cv_rmse = np.sqrt(-nn_cv_scores)

# CV Results
cv_results = pd.DataFrame({
    'Model': ['XGBoost', 'Random Forest', 'Neural Network'],
    'CV_RMSE_Mean': [xgb_cv_rmse.mean(), rf_cv_rmse.mean(), nn_cv_rmse.mean()],
    'CV_RMSE_Std': [xgb_cv_rmse.std(), rf_cv_rmse.std(), nn_cv_rmse.std()]
})

print("\n=== CROSS-VALIDATION RESULTS ===")
print(cv_results.round(4))

# Save CV results
cv_results.to_csv(os.path.join(MODEL_DIR, 'cv_results.csv'), index=False)

## 8. Save Model Metadata

In [None]:
# Create model metadata
# Everything placed in this dict must be JSON-serializable. NumPy scalars are therefore
# converted to plain Python values: metrics via float(), station ids via .tolist()
# (list(series.unique()) would keep NumPy scalar types, which json.dump rejects for
# integer ids).
model_metadata = {
    'timestamp': datetime.now().isoformat(),
    'data_info': {
        'total_samples': len(data_clean),
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'features': len(feature_cols),
        'target': target_col,
        'date_range': [str(data_clean['Time'].min()), str(data_clean['Time'].max())],
        'stations': data_clean['station'].unique().tolist()
    },
    'models': {
        'xgboost': {
            'parameters': xgb_params,
            'metrics': {name: float(score) for name, score in xgb_metrics.items()},
            'cv_rmse_mean': float(xgb_cv_rmse.mean()),
            'cv_rmse_std': float(xgb_cv_rmse.std()),
            'model_file': 'xgboost_model.pkl'
        },
        'random_forest': {
            'parameters': rf_params,
            'metrics': {name: float(score) for name, score in rf_metrics.items()},
            'cv_rmse_mean': float(rf_cv_rmse.mean()),
            'cv_rmse_std': float(rf_cv_rmse.std()),
            'model_file': 'random_forest_model.pkl'
        },
        'neural_network': {
            # The tuple in 'hidden_layer_sizes' is written to JSON as a list.
            'parameters': nn_params,
            'metrics': {name: float(score) for name, score in nn_metrics.items()},
            'cv_rmse_mean': float(nn_cv_rmse.mean()),
            'cv_rmse_std': float(nn_cv_rmse.std()),
            'model_file': 'neural_network_model.pkl',
            'scaler_file': 'nn_scaler.pkl'
        }
    },
    'best_model': {
        'name': best_model_name,
        'val_rmse': float(comparison_df.loc[best_model_idx, 'Val_RMSE']),
        'test_rmse': float(comparison_df.loc[best_model_idx, 'Test_RMSE'])
    },
    'feature_columns': feature_cols
}

# Save metadata
with open(os.path.join(MODEL_DIR, 'model_metadata.json'), 'w') as f:
    json.dump(model_metadata, f, indent=2)

print("\n=== MODEL TRAINING COMPLETE ===")
print(f"Models saved to: {MODEL_DIR}")
print(f"Best model: {best_model_name}")
print(f"Best validation RMSE: {comparison_df.loc[best_model_idx, 'Val_RMSE']:.4f}")
print(f"Best test RMSE: {comparison_df.loc[best_model_idx, 'Test_RMSE']:.4f}")

# Artifacts written to MODEL_DIR over the course of the notebook
print("\nFiles saved:")
for saved_file in (
    'xgboost_model.pkl',
    'random_forest_model.pkl',
    'neural_network_model.pkl',
    'nn_scaler.pkl',
    'model_metadata.json',
    'model_comparison.csv',
    'cv_results.csv',
    'feature_importance.png',
    'xgb_feature_importance.csv',
    'rf_feature_importance.csv',
):
    print(f"- {saved_file}")