In [42]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load and prepare data
df = pd.read_csv("solar data.csv")  
df.columns = ['date', 'from_time', 'to_time', 'power_mw', 'drop_col']
df = df.drop(columns=['drop_col'])

# Create datetime column and extract time-based features
df['datetime'] = pd.to_datetime(
    df['date'] + ' ' + df['from_time'],
    format='%d.%m.%Y %H:%M',
    errors='coerce'
)

# Remove any rows that couldn't be parsed
df = df.dropna(subset=['datetime'])

df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day_of_month'] = df['datetime'].dt.day

# Create lag features
df['power_prev_1h'] = df['power_mw'].shift(4)   # 4 * 15min = 1 hour
df['power_prev_2h'] = df['power_mw'].shift(8)   # 2 hours
df['power_prev_4h'] = df['power_mw'].shift(16)  # 4 hours

# Calculate rolling statistics
df['power_mean_24h'] = df['power_mw'].rolling(window=96).mean()  # 96 * 15min = 24 hours
df['power_std_24h'] = df['power_mw'].rolling(window=96).std()

# Drop rows with NaN from feature engineering
df = df.dropna()

print(f"Dataset shape after feature engineering: {df.shape}")
print(f"Feature columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

Dataset shape after feature engineering: (280350, 14)
Feature columns: ['date', 'from_time', 'to_time', 'power_mw', 'datetime', 'hour', 'day_of_week', 'month', 'day_of_month', 'power_prev_1h', 'power_prev_2h', 'power_prev_4h', 'power_mean_24h', 'power_std_24h']

First few rows:
          date from_time to_time  power_mw            datetime  hour  \
95  01.01.2010     23:45   00:00       0.0 2010-01-01 23:45:00    23   
96  02.01.2010     00:00   00:15       0.0 2010-01-02 00:00:00     0   
97  02.01.2010     00:15   00:30       0.0 2010-01-02 00:15:00     0   
98  02.01.2010     00:30   00:45       0.0 2010-01-02 00:30:00     0   
99  02.01.2010     00:45   01:00       0.0 2010-01-02 00:45:00     0   

    day_of_week  month  day_of_month  power_prev_1h  power_prev_2h  \
95            4      1             1            0.0            0.0   
96            5      1             2            0.0            0.0   
97            5      1             2            0.0            0.0   
98      

In [43]:
# Prepare features and target
feature_cols = ['hour', 'day_of_week', 'month', 'day_of_month', 
                'power_prev_1h', 'power_prev_2h', 'power_prev_4h',
                'power_mean_24h', 'power_std_24h']

X = df[feature_cols]
y = df['power_mw']

# Split data into train, validation, and test sets (60%, 20%, 20%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"\nData Split Summary:")
print(f"  Training set: {X_train.shape[0]} samples")
print(f"  Validation set: {X_val.shape[0]} samples")
print(f"  Test set: {X_test.shape[0]} samples")
print(f"\nTarget Variable Statistics:")
print(f"  Mean: {y.mean():.2f} MW")
print(f"  Std: {y.std():.2f} MW")
print(f"  Min: {y.min():.2f} MW")
print(f"  Max: {y.max():.2f} MW")


Data Split Summary:
  Training set: 168210 samples
  Validation set: 56070 samples
  Test set: 56070 samples

Target Variable Statistics:
  Mean: 96.37 MW
  Std: 213.88 MW
  Min: 0.00 MW
  Max: 999.00 MW


In [44]:
# ============================================================================
# TRAIN RANDOM FOREST MODEL
# ============================================================================


rf_model = RandomForestRegressor(
    n_estimators=100, 
    max_depth=20, 
    random_state=42, 
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

print("✓ Model trained successfully")

# Feature importance
print(f"\nTop 5 Feature Importance:")
feature_importance = sorted(zip(feature_cols, rf_model.feature_importances_), 
                            key=lambda x: x[1], reverse=True)
for i, (feature, importance) in enumerate(feature_importance[:5], 1):
    print(f"  {i}. {feature}: {importance:.4f}")

# Make predictions
y_train_pred = rf_model.predict(X_train)
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

✓ Model trained successfully

Top 5 Feature Importance:
  1. power_prev_1h: 0.5248
  2. hour: 0.1285
  3. power_prev_2h: 0.1155
  4. power_mean_24h: 0.0938
  5. power_std_24h: 0.0556


In [45]:
# ============================================================================
# METRICS EVALUATION FUNCTION
# ============================================================================
def evaluate_metrics(y_true, y_pred):
    """Calculate comprehensive regression metrics"""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1))) * 100
    
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape
    }

In [46]:
# ============================================================================
# EVALUATE RANDOM FOREST ON ALL SETS
# ============================================================================


train_metrics = evaluate_metrics(y_train, y_train_pred)
val_metrics = evaluate_metrics(y_val, y_val_pred)
test_metrics = evaluate_metrics(y_test, y_test_pred)

print("\n" + "-"*70)
print("TRAINING SET METRICS")
print("-"*70)
for metric, value in train_metrics.items():
    print(f"  {metric:6s}: {value:10.4f}")

print("\n" + "-"*70)
print("VALIDATION SET METRICS")
print("-"*70)
for metric, value in val_metrics.items():
    print(f"  {metric:6s}: {value:10.4f}")

print("\n" + "-"*70)
print("TEST SET METRICS (Final Model Performance)")
print("-"*70)
for metric, value in test_metrics.items():
    print(f"  {metric:6s}: {value:10.4f}")


----------------------------------------------------------------------
TRAINING SET METRICS
----------------------------------------------------------------------
  MSE   :  1556.0132
  RMSE  :    39.4463
  MAE   :    12.5497
  R2    :     0.9659
  MAPE  :   164.9361

----------------------------------------------------------------------
VALIDATION SET METRICS
----------------------------------------------------------------------
  MSE   :  8006.6354
  RMSE  :    89.4798
  MAE   :    28.3573
  R2    :     0.8256
  MAPE  :   378.7402

----------------------------------------------------------------------
TEST SET METRICS (Final Model Performance)
----------------------------------------------------------------------
  MSE   :  8373.7759
  RMSE  :    91.5083
  MAE   :    28.6335
  R2    :     0.8171
  MAPE  :   393.6378


In [None]:
# ============================================================================
# MODEL SUMMARY
# ============================================================================

print(f"\nRandom Forest Regressor Configuration:")
print(f"  Number of Trees: {rf_model.n_estimators}")
print(f"  Max Depth: {rf_model.max_depth}")
print(f"  Number of Features: {len(feature_cols)}")
print(f"\nTest Set Performance:")
print(f"  ✓ RMSE (Root Mean Squared Error): {test_metrics['RMSE']:.4f} MW")
print(f"  ✓ MAE (Mean Absolute Error): {test_metrics['MAE']:.4f} MW")
print(f"  ✓ R² Score: {test_metrics['R2']:.4f}")
print(f"  ✓ MAPE (Mean Absolute Percentage Error): {test_metrics['MAPE']:.4f}%")

# Save the model
with open('best_solar_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)
print(f"\n✓ Best model saved as 'best_solar_model.pkl'")


Random Forest Regressor Configuration:
  Number of Trees: 100
  Max Depth: 20
  Number of Features: 9

Test Set Performance:
  ✓ RMSE (Root Mean Squared Error): 91.5083 MW
  ✓ MAE (Mean Absolute Error): 28.6335 MW
  ✓ R² Score: 0.8171
  ✓ MAPE (Mean Absolute Percentage Error): 393.6378%



✓ Best model saved as 'best_solar_model.pkl'


: 