# Appliances Energy Prediction: Comparing Random Forest, Linear Regression, and SVR Models

This notebook compares three machine learning approaches for predicting appliance energy consumption:
1. **Random Forest Regressor** - Ensemble method using multiple decision trees
2. **Linear Regression** - Simple linear model
3. **Support Vector Regression (SVR)** - Support vector machine for regression

## Importing the relevant libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')


## Loading the dataset


In [None]:
# Read the CSV, using its first column as the index and asking pandas to
# parse that index as dates (the time-based features later rely on it).
df = pd.read_csv(
    'energydata_complete.csv',
    parse_dates=True,
    index_col=0,
)
df.head()


## Preparing Data for Traditional ML Models

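Traditional ML models have no built-in notion of time, so we turn the time series into a tabular feature set: lagged values of the target, rolling-window statistics of the target (mean, standard deviation, max, min), and calendar features (hour, day of week, day of month, month) together with cyclical sine/cosine encodings of the hour and the day of week.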


In [None]:
def create_ml_features(data, target_col='Appliances', lag_periods=(1, 2, 3, 6, 12, 24),
                       rolling_windows=(6, 12, 24)):
    """
    Create features for traditional ML models from time series data.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing ``target_col``. Its index must expose
        ``hour``, ``dayofweek``, ``day`` and ``month`` (e.g. a DatetimeIndex).
        The frame is copied; the caller's object is not modified.
    target_col : str
        Name of the column to derive lag and rolling features from.
    lag_periods : iterable of int
        Row offsets; one ``{target_col}_lag_{k}`` column is added per offset.
    rolling_windows : iterable of int
        Window lengths (in rows); for each one, ``mean``, ``std``, ``max`` and
        ``min`` columns named ``{target_col}_{stat}_{window}`` are added.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra feature columns. Leading rows contain
        NaN wherever a lag or rolling window reaches before the first row.

    Notes
    -----
    Rolling statistics are computed on the target shifted by one row, so the
    window for row ``t`` covers rows ``t-window .. t-1`` only. Rolling over the
    unshifted target would include the value at ``t`` itself, i.e. the label
    being predicted, and leak it into the features.
    """
    df_ml = data.copy()
    target = df_ml[target_col]

    # Create lagged features for the target variable
    for lag in lag_periods:
        df_ml[f'{target_col}_lag_{lag}'] = target.shift(lag)

    # Create rolling statistics from strictly past target values (see Notes)
    past_target = target.shift(1)
    for window in rolling_windows:
        rolling = past_target.rolling(window=window)
        df_ml[f'{target_col}_mean_{window}'] = rolling.mean()
        df_ml[f'{target_col}_std_{window}'] = rolling.std()
        df_ml[f'{target_col}_max_{window}'] = rolling.max()
        df_ml[f'{target_col}_min_{window}'] = rolling.min()

    # Create time-based features
    df_ml['hour'] = df_ml.index.hour
    df_ml['day_of_week'] = df_ml.index.dayofweek
    df_ml['day_of_month'] = df_ml.index.day
    df_ml['month'] = df_ml.index.month

    # Create cyclical features so that hour 23 / hour 0 and day 6 / day 0
    # end up adjacent on the unit circle
    df_ml['hour_sin'] = np.sin(2 * np.pi * df_ml['hour'] / 24)
    df_ml['hour_cos'] = np.cos(2 * np.pi * df_ml['hour'] / 24)
    df_ml['day_sin'] = np.sin(2 * np.pi * df_ml['day_of_week'] / 7)
    df_ml['day_cos'] = np.cos(2 * np.pi * df_ml['day_of_week'] / 7)

    return df_ml

# Build the engineered feature table and report how much wider it became
df_ml = create_ml_features(df)
print(f"Original dataset shape: {df.shape}")
print(f"ML dataset shape: {df_ml.shape}")
print(f"New features created: {df_ml.shape[1] - df.shape[1]}")

# dropna() discards every row that contains a NaN (the shifted / rolling
# columns are NaN at the start of the series). The target column is then
# separated from all remaining columns, which serve as model inputs.
df_ml_clean = df_ml.dropna()
y_ml = df_ml_clean['Appliances']
feature_cols = [name for name in df_ml_clean.columns if name != 'Appliances']
X_ml = df_ml_clean.loc[:, feature_cols]

print(f"Final dataset shape: {df_ml_clean.shape}")
print(f"Number of features: {len(feature_cols)}")


## Data Splitting and Model Training


In [None]:
# Temporal split: the final 30 days are held out for testing.
# 144 = number of 10-minute samples in one day.
test_size = 30 * 144

# No shuffling: everything before the cut trains, everything after tests.
X_train, X_test = X_ml.iloc[:-test_size], X_ml.iloc[-test_size:]
y_train, y_test = y_ml.iloc[:-test_size], y_ml.iloc[-test_size:]

# Standardize features for Linear Regression and SVR. The scaler is fitted on
# the training rows only and then applied to both partitions.
scaler_standard = StandardScaler()
scaler_standard.fit(X_train)
X_train_scaled = scaler_standard.transform(X_train)
X_test_scaled = scaler_standard.transform(X_test)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Model registry, keyed by the display name used in reports and plots
models = {
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    ),
    'Linear Regression': LinearRegression(),
    'SVR': SVR(kernel='rbf', C=1.0, gamma='scale'),
}

# Per-model metric dicts and test-set prediction arrays, filled during training
results, predictions = {}, {}


In [None]:
# Train all models and calculate metrics on the held-out test set.
# Results are written into `results` (metric dict per model) and
# `predictions` (test-set prediction array per model).
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    # Choose appropriate data (scaled for Linear Regression and SVR)
    if model_name in ['Linear Regression', 'SVR']:
        model.fit(X_train_scaled, y_train)
        pred = model.predict(X_test_scaled)
    else:  # Random Forest
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, pred)
    # NOTE: scikit-learn returns MAPE as a fraction (0.25 means 25%), not as a
    # percentage. It is stored unchanged and only converted when displayed.
    mape = mean_absolute_percentage_error(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, pred)

    results[model_name] = {'MAE': mae, 'MAPE': mape, 'MSE': mse, 'RMSE': rmse, 'R2': r2}
    predictions[model_name] = pred

    print(f"{model_name} Results:")
    # `.2%` multiplies by 100 and appends the percent sign, so the fraction is
    # shown as a real percentage instead of e.g. "0.25%".
    print(f"MAE: {mae:.2f}, MAPE: {mape:.2%}, RMSE: {rmse:.2f}, R²: {r2:.4f}")


## Model Comparison and Visualization


In [None]:
# Comparison table: one row per model, one column per metric, 4 decimals
results_df = pd.DataFrame(results).T.round(4)

print("MODEL PERFORMANCE COMPARISON")
print("=" * 50)
print(results_df)

print("\nBEST PERFORMING MODELS:")
print("=" * 30)
for metric in ['MAE', 'MAPE', 'RMSE', 'R2']:
    scores = results_df[metric]
    # R2 is the only metric here where a larger value is better;
    # the error metrics are minimized.
    higher_is_better = metric == 'R2'
    best_model = scores.idxmax() if higher_is_better else scores.idxmin()
    best_score = scores.max() if higher_is_better else scores.min()
    print(f"{metric}: {best_model} ({best_score:.4f})")


In [None]:
# Visualize model performance on a 2x2 grid, one Axes object per panel
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_metrics, ax_r2, ax_pred, ax_importance = axes.ravel()

# 1. Metrics comparison: grouped bars, one group per metric, one bar per model
metrics = ['MAE', 'MAPE', 'RMSE']
x = np.arange(len(metrics))
width = 0.25

for i, (model_name, model_results) in enumerate(results.items()):
    values = [model_results[metric] for metric in metrics]
    ax_metrics.bar(x + i*width, values, width, label=model_name, alpha=0.8)

ax_metrics.set_xlabel('Metrics')
ax_metrics.set_ylabel('Values')
ax_metrics.set_title('Model Performance Comparison')
ax_metrics.set_xticks(x + width)
ax_metrics.set_xticklabels(metrics)
ax_metrics.legend()
# Log scale because the metrics live on very different magnitudes
ax_metrics.set_yscale('log')

# 2. R² comparison
model_names = list(results.keys())
r2_values = [results[name]['R2'] for name in model_names]
ax_r2.bar(model_names, r2_values, alpha=0.8, color=['skyblue', 'lightgreen', 'salmon'])
ax_r2.set_ylabel('R² Score')
ax_r2.set_title('R² Comparison (Higher is Better)')
for tick_label in ax_r2.get_xticklabels():
    tick_label.set_rotation(45)

# 3. Predictions vs Actual (first 200 points)
n_points = 200
x_range = range(n_points)
ax_pred.plot(x_range, y_test.iloc[:n_points], label='Actual', alpha=0.7)
for model_name, pred in predictions.items():
    ax_pred.plot(x_range, pred[:n_points], label=f'{model_name} Predicted', alpha=0.7)
ax_pred.set_xlabel('Time Steps')
ax_pred.set_ylabel('Energy Consumption')
ax_pred.set_title('Predictions vs Actual (First 200 points)')
ax_pred.legend()

# 4. Feature importance (Random Forest)
rf_model = models['Random Forest']
feature_importance = rf_model.feature_importances_
feature_names = X_train.columns
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values('importance', ascending=False)
top_features = importance_df.head(10)
bar_positions = range(len(top_features))
ax_importance.barh(bar_positions, top_features['importance'], alpha=0.8)
ax_importance.set_yticks(bar_positions)
ax_importance.set_yticklabels(top_features['feature'])
ax_importance.set_xlabel('Feature Importance')
ax_importance.set_title('Top 10 Most Important Features (Random Forest)')
# Flip the axis so the most important feature is drawn at the top
ax_importance.invert_yaxis()

fig.tight_layout()
plt.show()


In [None]:
# Save the best model and create final summary
import joblib

# Select the model by measured test RMSE instead of assuming a winner, so the
# file named "best_..." really holds the best-scoring model of this run.
best_model_name = min(results, key=lambda name: results[name]['RMSE'])
selected_model = models[best_model_name]

# Only Linear Regression and SVR were trained on standardized features; the
# Random Forest saw raw features, so the scaler must NOT be applied to it.
scaler_required = best_model_name in ('Linear Regression', 'SVR')

joblib.dump(selected_model, 'best_energy_predictor_model.pkl')
joblib.dump(scaler_standard, 'feature_scaler.pkl')

print("="*60)
print("FINAL SUMMARY")
print("="*60)
print("This analysis compared three machine learning approaches:")
print("• Random Forest Regressor")
print("• Linear Regression")
print("• Support Vector Regression (SVR)")
print("\nKey findings:")
print("• Feature engineering with lagged variables and rolling statistics")
print("  significantly improved model performance")
print("• Random Forest typically performs well due to its ability to capture")
print("  non-linear relationships and feature interactions")
print("• The models can be used for energy consumption forecasting and")
print("  optimization in smart homes/buildings")
print(f"\nBest model ({best_model_name}, lowest test RMSE) and scaler saved as:")
print("• best_energy_predictor_model.pkl")
print("• feature_scaler.pkl")
if scaler_required:
    print("  (apply feature_scaler.pkl to the inputs before calling predict)")
else:
    print("  (the saved model was trained on unscaled features; do not apply the scaler)")
print("="*60)
