In [24]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [25]:
df = pd.read_csv("solar data.csv")  

df.columns = ['date', 'from_time', 'to_time', 'power_mw', 'drop_col']
df = df.drop(columns=['drop_col'])

df.head(70)



Unnamed: 0,date,from_time,to_time,power_mw
0,01.01.2010,00:00,00:15,0.0
1,01.01.2010,00:15,00:30,0.0
2,01.01.2010,00:30,00:45,0.0
3,01.01.2010,00:45,01:00,0.0
4,01.01.2010,01:00,01:15,0.0
...,...,...,...,...
65,01.01.2010,16:15,16:30,0.0
66,01.01.2010,16:30,16:45,0.0
67,01.01.2010,16:45,17:00,0.0
68,01.01.2010,17:00,17:15,0.0


In [26]:
# Create datetime column and extract time-based features for prediction
df['datetime'] = pd.to_datetime(
    df['date'] + ' ' + df['from_time'],
    format='%d.%m.%Y %H:%M',
    errors='coerce'
)

# Remove any rows that couldn't be parsed
df = df.dropna(subset=['datetime'])

df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day_of_month'] = df['datetime'].dt.day

# Create lag features (previous hour power)
df['power_prev_1h'] = df['power_mw'].shift(4)  # 4 * 15min = 1 hour
df['power_prev_2h'] = df['power_mw'].shift(8)  # 2 hours
df['power_prev_4h'] = df['power_mw'].shift(16)  # 4 hours

# Calculate rolling statistics
df['power_mean_24h'] = df['power_mw'].rolling(window=96).mean()  # 96 * 15min = 24 hours
df['power_std_24h'] = df['power_mw'].rolling(window=96).std()

# Drop rows with NaN from feature engineering
df = df.dropna()

print(f"Dataset shape after feature engineering: {df.shape}")
print(f"\nFeature columns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()

Dataset shape after feature engineering: (280350, 14)

Feature columns: ['date', 'from_time', 'to_time', 'power_mw', 'datetime', 'hour', 'day_of_week', 'month', 'day_of_month', 'power_prev_1h', 'power_prev_2h', 'power_prev_4h', 'power_mean_24h', 'power_std_24h']

First few rows:


Unnamed: 0,date,from_time,to_time,power_mw,datetime,hour,day_of_week,month,day_of_month,power_prev_1h,power_prev_2h,power_prev_4h,power_mean_24h,power_std_24h
95,01.01.2010,23:45,00:00,0.0,2010-01-01 23:45:00,23,4,1,1,0.0,0.0,0.0,3.458333,6.500067
96,02.01.2010,00:00,00:15,0.0,2010-01-02 00:00:00,0,5,1,2,0.0,0.0,0.0,3.458333,6.500067
97,02.01.2010,00:15,00:30,0.0,2010-01-02 00:15:00,0,5,1,2,0.0,0.0,0.0,3.458333,6.500067
98,02.01.2010,00:30,00:45,0.0,2010-01-02 00:30:00,0,5,1,2,0.0,0.0,0.0,3.458333,6.500067
99,02.01.2010,00:45,01:00,0.0,2010-01-02 00:45:00,0,5,1,2,0.0,0.0,0.0,3.458333,6.500067


In [27]:
# Prepare features and target
# Feature columns for the model
feature_cols = ['hour', 'day_of_week', 'month', 'day_of_month', 
                'power_prev_1h', 'power_prev_2h', 'power_prev_4h',
                'power_mean_24h', 'power_std_24h']

X = df[feature_cols]
y = df['power_mw']

# Split data into train, validation, and test sets
# 60% train, 20% validation, 20% test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"\nTarget variable (power_mw) statistics:")
print(f"  Mean: {y.mean():.2f} MW")
print(f"  Std: {y.std():.2f} MW")
print(f"  Min: {y.min():.2f} MW")
print(f"  Max: {y.max():.2f} MW")

Training set size: 168210
Validation set size: 56070
Test set size: 56070

Target variable (power_mw) statistics:
  Mean: 96.37 MW
  Std: 213.88 MW
  Min: 0.00 MW
  Max: 999.00 MW


In [28]:
# Train Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on all sets
y_train_pred_lr = lr_model.predict(X_train)
y_val_pred_lr = lr_model.predict(X_val)
y_test_pred_lr = lr_model.predict(X_test)

print("="*60)
print("LINEAR REGRESSION MODEL")
print("="*60)
print(f"\nModel coefficients:")
for feature, coef in zip(feature_cols, lr_model.coef_):
    print(f"  {feature}: {coef:.4f}")
print(f"  Intercept: {lr_model.intercept_:.4f}")

LINEAR REGRESSION MODEL

Model coefficients:
  hour: 0.5970
  day_of_week: -0.0018
  month: -0.0597
  day_of_month: -0.0232
  power_prev_1h: 0.4830
  power_prev_2h: -0.0205
  power_prev_4h: -0.0376
  power_mean_24h: 0.4337
  power_std_24h: 0.0094
  Intercept: 5.5390


In [29]:
# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, max_depth=20, 
                                  random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Make predictions on all sets
y_train_pred_rf = rf_model.predict(X_train)
y_val_pred_rf = rf_model.predict(X_val)
y_test_pred_rf = rf_model.predict(X_test)

# Feature importance
print("="*60)
print("RANDOM FOREST MODEL")
print("="*60)
print(f"\nFeature importance (top 5):")
feature_importance = sorted(zip(feature_cols, rf_model.feature_importances_), 
                            key=lambda x: x[1], reverse=True)
for feature, importance in feature_importance[:5]:
    print(f"  {feature}: {importance:.4f}")

RANDOM FOREST MODEL

Feature importance (top 5):
  power_prev_1h: 0.5248
  hour: 0.1285
  power_prev_2h: 0.1155
  power_mean_24h: 0.0938
  power_std_24h: 0.0556


In [30]:
# Define metric evaluation function
def evaluate_metrics(y_true, y_pred, set_name=""):
    """Calculate regression metrics"""
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1))) * 100  # +1 to avoid division by zero
    
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape
    }

# Evaluate Linear Regression
print("\n" + "="*60)
print("LINEAR REGRESSION - METRICS EVALUATION")
print("="*60)

lr_train_metrics = evaluate_metrics(y_train, y_train_pred_lr, "Training")
lr_val_metrics = evaluate_metrics(y_val, y_val_pred_lr, "Validation")
lr_test_metrics = evaluate_metrics(y_test, y_test_pred_lr, "Test")

print("\nTraining Set Metrics:")
for metric, value in lr_train_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nValidation Set Metrics:")
for metric, value in lr_val_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nTest Set Metrics:")
for metric, value in lr_test_metrics.items():
    print(f"  {metric}: {value:.4f}")


LINEAR REGRESSION - METRICS EVALUATION

Training Set Metrics:
  MSE: 33732.4536
  RMSE: 183.6640
  MAE: 106.2546
  R2: 0.2615
  MAPE: 3202.0057

Validation Set Metrics:
  MSE: 33971.8106
  RMSE: 184.3144
  MAE: 106.3503
  R2: 0.2600
  MAPE: 3195.5502

Test Set Metrics:
  MSE: 33605.9207
  RMSE: 183.3192
  MAE: 106.0468
  R2: 0.2661
  MAPE: 3197.8609


In [31]:
# Evaluate Random Forest
print("\n" + "="*60)
print("RANDOM FOREST - METRICS EVALUATION")
print("="*60)

rf_train_metrics = evaluate_metrics(y_train, y_train_pred_rf, "Training")
rf_val_metrics = evaluate_metrics(y_val, y_val_pred_rf, "Validation")
rf_test_metrics = evaluate_metrics(y_test, y_test_pred_rf, "Test")

print("\nTraining Set Metrics:")
for metric, value in rf_train_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nValidation Set Metrics:")
for metric, value in rf_val_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nTest Set Metrics:")
for metric, value in rf_test_metrics.items():
    print(f"  {metric}: {value:.4f}")


RANDOM FOREST - METRICS EVALUATION

Training Set Metrics:
  MSE: 1556.0132
  RMSE: 39.4463
  MAE: 12.5497
  R2: 0.9659
  MAPE: 164.9361

Validation Set Metrics:
  MSE: 8006.6354
  RMSE: 89.4798
  MAE: 28.3573
  R2: 0.8256
  MAPE: 378.7402

Test Set Metrics:
  MSE: 8373.7759
  RMSE: 91.5083
  MAE: 28.6335
  R2: 0.8171
  MAPE: 393.6378


In [32]:
# Model Comparison and Selection
print("\n" + "="*60)
print("MODEL COMPARISON - TEST SET PERFORMANCE")
print("="*60)

comparison_data = {
    'Model': ['Linear Regression', 'Random Forest'],
    'RMSE': [lr_test_metrics['RMSE'], rf_test_metrics['RMSE']],
    'MAE': [lr_test_metrics['MAE'], rf_test_metrics['MAE']],
    'R2': [lr_test_metrics['R2'], rf_test_metrics['R2']],
    'MAPE': [lr_test_metrics['MAPE'], rf_test_metrics['MAPE']]
}

comparison_df = pd.DataFrame(comparison_data)
print("\n", comparison_df.to_string(index=False))

# Select best model based on test RMSE
best_model = rf_model if rf_test_metrics['RMSE'] < lr_test_metrics['RMSE'] else lr_model
best_model_name = 'Random Forest' if rf_test_metrics['RMSE'] < lr_test_metrics['RMSE'] else 'Linear Regression'
print(f"\n✓ Best model: {best_model_name} (Lower RMSE on test set)")

# Save the best model
with open('best_solar_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print(f"✓ Model saved as 'best_solar_model.pkl'")


MODEL COMPARISON - TEST SET PERFORMANCE

             Model       RMSE        MAE       R2        MAPE
Linear Regression 183.319177 106.046809 0.266077 3197.860853
    Random Forest  91.508338  28.633535 0.817124  393.637829

✓ Best model: Random Forest (Lower RMSE on test set)
✓ Model saved as 'best_solar_model.pkl'
