# Solar Power Prediction - Model Training

This notebook trains multiple machine learning models for solar power prediction.

## Objectives
- Train multiple ML algorithms
- Compare model performance
- Select the best-performing model
- Save trained models for deployment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

print("Libraries imported successfully!")

## 1. Load Preprocessed Data

In [None]:
def load_processed_data():
    """Load the preprocessed dataset together with its feature metadata.

    Reads ``../data/processed_solar_data.csv`` and
    ``../data/feature_info.json``. If either file is missing, a synthetic
    2000-row demonstration dataset is generated instead so the rest of the
    notebook can still be executed.

    Returns:
        tuple: ``(data, feature_info)`` where ``data`` is a DataFrame and
        ``feature_info`` is a dict. In the synthetic fallback the dict has the
        keys ``target``, ``features``, ``n_samples`` and ``n_features``.
    """
    try:
        # Only the file reads are guarded; a missing file triggers the fallback
        data = pd.read_csv('../data/processed_solar_data.csv')
        with open('../data/feature_info.json', 'r') as f:
            feature_info = json.load(f)
    except FileNotFoundError:
        print("Processed data not found. Please run the preprocessing notebook first.")
        print("Creating sample data for demonstration...")

        # A local legacy RandomState seeded with 42 yields the same draws as
        # np.random.seed(42) would, but leaves NumPy's global generator alone
        # so loading data has no hidden effect on later cells.
        rng = np.random.RandomState(42)
        n_samples = 2000

        # Column order matters: each draw advances the random stream.
        data = pd.DataFrame({
            # Placeholder; overwritten below. The draw is kept so the columns
            # that follow receive the same random values as before.
            'Power_W': rng.normal(2000, 800, n_samples),
            'Irradiance': rng.normal(400, 200, n_samples),
            'Temperature': rng.normal(25, 5, n_samples),
            'Hour': rng.randint(0, 24, n_samples),
            'Power_lag_1': rng.normal(2000, 800, n_samples),
            'Power_Density': rng.normal(5, 2, n_samples),
            'SolarElevation': rng.uniform(0, 1, n_samples)
        })

        # Derive the target from irradiance and lagged power plus noise,
        # clipped at zero so it is never negative
        data['Power_W'] = np.maximum(0,
            data['Irradiance'] * 3 +
            data['Power_lag_1'] * 0.5 +
            rng.normal(0, 200, n_samples)
        )

        feature_info = {
            'target': 'Power_W',
            'features': [col for col in data.columns if col != 'Power_W'],
            'n_samples': len(data),
            'n_features': len(data.columns) - 1
        }

        return data, feature_info
    else:
        print(f"Loaded processed data: {data.shape}")
        print(f"Features: {len(feature_info['features'])}")

        return data, feature_info

# Load the dataset and echo its key properties
data, feature_info = load_processed_data()

dataset_overview = (
    ("Dataset shape", data.shape),
    ("Target", feature_info['target']),
    ("Features", feature_info['features']),
)
for label, value in dataset_overview:
    print(f"{label}: {value}")

## 2. Prepare Data for Modeling

In [None]:
# Pull the model inputs (X) and the prediction target (y) out of the frame
X, y = data[feature_info['features']], data[feature_info['target']]

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Hold out 20% of the rows for testing (seeded for reproducibility)
# NOTE(review): this is a random split; if the rows are time-ordered and the
# features include lagged power, a chronological split may be more appropriate
# -- confirm against the preprocessing notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
# The scaled arrays are wrapped back into DataFrames so column names and row
# indices stay available downstream.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")

## 3. Model Training Functions

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit a model and report train, test and cross-validated error metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place on the training data.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training target values.
        y_test: Test target values.
        model_name: Label used in the printed report and in the result dict.

    Returns:
        tuple: ``(results, y_test_pred, model)`` where ``results`` is a dict
        with the keys ``model_name``, ``train_rmse``, ``test_rmse``,
        ``train_mae``, ``test_mae``, ``train_r2``, ``test_r2``, ``cv_rmse``
        and ``cv_std``; ``y_test_pred`` holds the test-set predictions; and
        ``model`` is the fitted estimator that was passed in.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on both splits
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Error metrics on the training and test sets
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # 5-fold cross-validation on the training data. The scorer returns
    # negated MSE, so flip the sign and take the root to get per-fold RMSE.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    fold_rmse = np.sqrt(-cv_scores)
    # Mean and spread are both taken over the same per-fold RMSE values so
    # that the "mean ± std" line below describes a single quantity.
    cv_rmse = fold_rmse.mean()
    cv_std = fold_rmse.std()

    # Store results
    results = {
        'model_name': model_name,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'cv_rmse': cv_rmse,
        'cv_std': cv_std
    }

    # Print results
    print(f"\n{model_name} Results:")
    print(f"  Train RMSE: {train_rmse:.2f}")
    print(f"  Test RMSE: {test_rmse:.2f}")
    print(f"  Train R²: {train_r2:.3f}")
    print(f"  Test R²: {test_r2:.3f}")
    print(f"  CV RMSE: {cv_rmse:.2f} ± {cv_std:.2f}")

    return results, y_test_pred, model

## 4. Train Baseline Models

In [None]:
print("TRAINING BASELINE MODELS")
print("=" * 50)

# Linear baseline estimators, keyed by the label used in reports
baseline_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=1.0),
}

# Fresh containers: metrics per model, test predictions per model, and the
# registry of fitted estimators
baseline_results, baseline_predictions, trained_models = [], {}, {}

for model_label, estimator in baseline_models.items():
    metrics, test_preds, fitted_estimator = evaluate_model(
        estimator,
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        model_label,
    )
    baseline_results.append(metrics)
    baseline_predictions[model_label] = test_preds
    trained_models[model_label] = fitted_estimator

print("\nBaseline models training completed!")

## 5. Train Advanced Models

In [None]:
print("\nTRAINING ADVANCED MODELS")
print("=" * 50)

# Non-linear estimators, keyed by the label used in reports
advanced_models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Support Vector Regression': SVR(kernel='rbf', C=1.0),
    'Neural Network': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
}

# Metrics and test predictions for this group; fitted estimators are added
# to the existing trained_models registry
advanced_results, advanced_predictions = [], {}

for model_label, estimator in advanced_models.items():
    print(f"\nTraining {model_label}...")
    metrics, test_preds, fitted_estimator = evaluate_model(
        estimator,
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        model_label,
    )
    advanced_results.append(metrics)
    advanced_predictions[model_label] = test_preds
    trained_models[model_label] = fitted_estimator

print("\nAdvanced models training completed!")

## 6. Model Comparison

In [None]:
# Merge the baseline and advanced outcomes into single collections
all_results = [*baseline_results, *advanced_results]
all_predictions = dict(baseline_predictions)
all_predictions.update(advanced_predictions)

# One row per model, ordered from lowest to highest test RMSE
results_df = pd.DataFrame(all_results).sort_values('test_rmse')

table_rule = "=" * 60
print("MODEL COMPARISON RESULTS")
print(table_rule)
print(f"{'Model':<25} {'Test RMSE':<12} {'Test R²':<10} {'CV RMSE':<12}")
print(table_rule)

for entry in results_df.itertuples(index=False):
    print(f"{entry.model_name:<25} {entry.test_rmse:<12.2f} {entry.test_r2:<10.3f} {entry.cv_rmse:<12.2f}")

# The first row after sorting is the model with the lowest test RMSE
# NOTE(review): the winner is picked on the test set; consider ranking by
# cv_rmse so the test score remains an unbiased estimate -- confirm intent.
best_row = results_df.iloc[0]
best_model_name = best_row['model_name']
best_rmse = best_row['test_rmse']
best_r2 = best_row['test_r2']

print(f"\nBest Model: {best_model_name}")
print(f"Best RMSE: {best_rmse:.2f}")
print(f"Best R²: {best_r2:.3f}")

## 7. Visualize Results

In [None]:
# Bar charts comparing every model on four metrics (2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (results_df column, panel title, y-axis label), listed in row-major order
# to match the order in which axes.flat walks the grid
metric_panels = [
    ('test_rmse', 'Test RMSE Comparison', 'RMSE'),
    ('test_r2', 'Test R² Comparison', 'R²'),
    ('test_mae', 'Test MAE Comparison', 'MAE'),
    ('cv_rmse', 'Cross-Validation RMSE', 'CV RMSE'),
]

for ax, (metric_col, panel_title, y_label) in zip(axes.flat, metric_panels):
    ax.bar(results_df['model_name'], results_df[metric_col])
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    # Model names are long; tilt them so they stay readable
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Predicted-vs-actual scatter plots for the four lowest-RMSE models
top_models = results_df.head(4)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

# End points of the y = x reference line (perfect predictions)
diagonal = [y_test.min(), y_test.max()]

for ax, entry in zip(axes, top_models.itertuples(index=False)):
    model_name = entry.model_name
    y_pred = all_predictions[model_name]

    ax.scatter(y_test, y_pred, alpha=0.6)
    ax.plot(diagonal, diagonal, 'r--', lw=2)
    ax.set_xlabel('Actual Power (W)')
    ax.set_ylabel('Predicted Power (W)')
    ax.set_title(f'{model_name}\nR² = {entry.test_r2:.3f}, RMSE = {entry.test_rmse:.1f}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Report and plot feature importances for the tree-based estimators
tree_models = ['Random Forest', 'Gradient Boosting']

for model_name in tree_models:
    model = trained_models.get(model_name)
    # Skip models that were not trained or do not expose importances
    if model is None or not hasattr(model, 'feature_importances_'):
        continue

    # Pair each training column with its importance, largest first
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_,
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 Features - {model_name}:")
    print(feature_importance.head(10))

    # Horizontal bar chart of (at most) the 15 most important features
    top_features = feature_importance.head(15)
    bar_positions = range(len(top_features))

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Top 15 Feature Importance - {model_name}')
    # Flip the axis so the most important feature appears at the top
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

## 9. Hyperparameter Tuning (Optional)

In [None]:
# Hyperparameter tuning for the best model
#
# Each tunable model maps to (unfitted estimator, parameter grid). Keeping the
# search spaces in one table means the grid-search logic below exists only
# once; supporting another model is a matter of adding an entry here.
tuning_configs = {
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
        },
    ),
    'Gradient Boosting': (
        GradientBoostingRegressor(random_state=42),
        {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    ),
}

if best_model_name in tuning_configs:
    print(f"Tuning {best_model_name} hyperparameters...")

    base_estimator, param_grid = tuning_configs[best_model_name]

    # Exhaustive search over the grid with 3-fold CV on the training data
    grid_search = GridSearchCV(
        base_estimator,
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X_train_scaled, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    # best_score_ is a negated MSE; convert it to RMSE for display
    print(f"Best CV score: {np.sqrt(-grid_search.best_score_):.2f}")

    # Evaluate the tuned model with the same protocol as the other models.
    # NOTE: tuned_results is not merged into results_df in this cell.
    tuned_name = f"{best_model_name} (Tuned)"
    tuned_results, tuned_predictions, tuned_model = evaluate_model(
        grid_search.best_estimator_, X_train_scaled, X_test_scaled, y_train, y_test, tuned_name
    )

    trained_models[tuned_name] = tuned_model

else:
    print(f"Hyperparameter tuning not implemented for {best_model_name}")

## 10. Save Models and Results

In [None]:
# Persist every trained model, the fitted scaler and the comparison table
import os

# Create both output directories up front. '../data' may not exist when the
# notebook ran on the synthetic fallback dataset, in which case writing the
# results CSV would otherwise fail.
for output_dir in ('../models', '../data'):
    os.makedirs(output_dir, exist_ok=True)

# Save all trained models
for name, model in trained_models.items():
    # Build a file-friendly slug: spaces -> underscores, drop parentheses,
    # lower-case (e.g. 'Random Forest (Tuned)' -> 'random_forest_tuned')
    slug = name.replace(' ', '_').replace('(', '').replace(')', '').lower()
    filename = f"../models/model_{slug}.pkl"
    joblib.dump(model, filename)
    print(f"Saved: {filename}")

# Save the scaler so the same transformation can be applied at inference time
joblib.dump(scaler, '../models/scaler.pkl')
print("Saved: ../models/scaler.pkl")

# Save results
results_df.to_csv('../data/model_results.csv', index=False)
print("Saved: ../data/model_results.csv")

print("\nAll models and results saved successfully!")

## 11. Training Summary

In [None]:
# Headline numbers for the training run
print("MODEL TRAINING SUMMARY")
print("=" * 50)
print(f"Total models trained: {len(trained_models)}")
print(f"Best model: {best_model_name}")
print(f"Best RMSE: {best_rmse:.2f}")
print(f"Best R²: {best_r2:.3f}")

# Podium: the three rows with the lowest test RMSE
print("\nTop 3 models:")
for rank, entry in enumerate(results_df.head(3).itertuples(index=False), start=1):
    print(f"{rank}. {entry.model_name}: RMSE = {entry.test_rmse:.2f}, R² = {entry.test_r2:.3f}")

# Artifacts written by the save cell
print("\nFiles saved:")
for saved_item in (
    "- Model files: ../models/model_*.pkl",
    "- Scaler: ../models/scaler.pkl",
    "- Results: ../data/model_results.csv",
):
    print(saved_item)

# Follow-up work
print("\nNext steps:")
for next_step in (
    "- Model evaluation and diagnostics",
    "- Residual analysis",
    "- Prediction intervals",
    "- Model deployment preparation",
):
    print(next_step)

## Next Steps

The models have been successfully trained and saved. The next steps are:

1. **Model Evaluation**: Detailed analysis of model performance
2. **Residual Analysis**: Check model assumptions and identify patterns
3. **Prediction Intervals**: Quantify uncertainty in predictions
4. **Model Deployment**: Prepare models for production use

Continue to the next notebook: `04_model_evaluation.ipynb`