In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [8]:
x = pd.read_csv("D:/Project/PowerPulse/Energy_consumption_predection/notebooks/x.csv")
y = pd.read_csv("D:/Project/PowerPulse/Energy_consumption_predection/notebooks/y.csv")

In [9]:
y = y.values.ravel()

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.3, random_state = 1)

In [11]:
print(len(x_train), len(y_train))

1452681 1452681


In [12]:
print(len(x_test), len(y_test))  

622578 622578


In [7]:
#Linear regression

In [7]:
regression_model = LinearRegression()

In [8]:
regression_model.fit(x_train,y_train)

In [9]:
predection_train = regression_model.predict(x_train)

In [10]:
predection_test = regression_model.predict(x_test)

In [11]:
#Evaluation Metrics

In [18]:
def evaluation_metrics(y_train, y_pred_train, y_test, y_pred_test):
    r2_train = metrics.r2_score(y_train, y_pred_train)
    r2_test = metrics.r2_score(y_test, y_pred_test)
    MAE_train = mean_absolute_error(y_train, y_pred_train)
    MAE_test = mean_absolute_error(y_test, y_pred_test)
    MSE_train = mean_squared_error(y_train, y_pred_train)
    MSE_test = mean_squared_error(y_test, y_pred_test)
    RMSE_train = np.sqrt(MSE_train)
    RMSE_test = np.sqrt(MSE_test)

    print(f"The r2_score for train: {r2_train}") 
    print(f"The r2_score for test: {r2_test}")
    print(f"The mean_absolute_error for train: {MAE_train}")
    print(f"The mean_absolute_error for test: {MAE_test}")
    print(f"The mean_squared_error for train: {MSE_train}")
    print(f"The mean_Squared_error for test: {MSE_test}")
    print(f"The root_mean_squared_error for train: {RMSE_train}")
    print(f"The root_mean_Squared_error for test: {RMSE_test}")


In [13]:
evaluation_metrics(y_train, predection_train, y_test, predection_test)

The r2_score for train: 0.6976144847443242
The r2_score for test: 0.696791236569051
The mean_absolute_error for train: 0.1629123480377553
The mean_absolute_error for test: 0.1631147698275407
The mean_squared_error for train: 0.04983111970077048
The mean_Squared_error for test: 0.04995744747719872
The root_mean_squared_error for train: 0.22322885051169009
The root_mean_Squared_error for test: 0.2235116271633284


In [15]:
#Ridge

In [16]:
ridge = Ridge()
param_grid = {'alpha':[0.1,1,3,5,25,12,60,45,85,74,100,55]}
        
ridge_regressor = GridSearchCV(estimator = ridge,
    param_grid = param_grid,
    scoring= 'neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=2)
ridge_regressor.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [17]:
ridge_regressor.best_params_

{'alpha': 0.1}

In [18]:
ridge_regressor = Ridge(alpha = 0.1)
ridge_regressor.fit(x_train,y_train)

In [19]:
yprec_train = ridge_regressor.predict(x_train)

In [20]:
yprec_test = ridge_regressor.predict(x_test)

In [21]:
evaluation_metrics(y_train, yprec_train, y_test, yprec_test)

The r2_score for train: 0.697614484744322
The r2_score for test: 0.6967912366291347
The mean_absolute_error for train: 0.1629123488360519
The mean_absolute_error for test: 0.16311477059604676
The mean_squared_error for train: 0.04983111970077083
The mean_Squared_error for test: 0.04995744746729919
The root_mean_squared_error for train: 0.22322885051169086
The root_mean_Squared_error for test: 0.22351162714118294


In [None]:
#lasso

In [None]:
lasso = Lasso()
param_grid = {'alpha':[0,0.2,0.1,3,5,15,18,42,78,19,0.5,0.3,6]}
        
lasso_regressor = GridSearchCV(estimator = lasso,
    param_grid = param_grid,
    scoring= 'neg_mean_absolute_error',
    n_jobs=6,
    cv=5,
    verbose=2)
lasso_regressor.fit(x_train, y_train)

In [None]:
lasso_regressor.best_params_

In [None]:
lasso_regressor = Lasso(alpha = 0.1)
lasso_regressor.fit(x_train,y_train)

In [None]:
yprec_train = lasso_regressor.predict(x_train)

In [None]:
yprec_test = lasso_regressor.predict(x_test)

In [None]:
evaluation_metrics(y_train, yprec_train, y_test, yprec_test)

In [22]:
#RandomForest Regressor

In [25]:
rfr = RandomForestRegressor(n_estimators=150, random_state = 1, n_jobs=4)
rfr.fit(x_train,y_train)

In [26]:
ypre_train = rfr.predict(x_train)

In [27]:
ypre_test = rfr.predict(x_test)

In [28]:
evaluation_metrics(y_train, ypre_train, y_test, ypre_test)

The r2_score for train: 0.942471221909291
The r2_score for test: 0.8940898463754015
The mean_absolute_error for train: 0.04721777111854599
The mean_absolute_error for test: 0.06999275811258678
The mean_squared_error for train: 0.009480359615946834
The mean_Squared_error for test: 0.017450026434370765
The root_mean_squared_error for train: 0.09736713827543066
The root_mean_Squared_error for test: 0.1320985481917601


In [29]:
print (pd.DataFrame(rfr.feature_importances_, columns = ["Imp"], index = x_train.columns))

                            Imp
Global_reactive_power  0.070775
Sub_metering_1         0.091731
Sub_metering_2         0.079043
Sub_metering_3         0.587008
Hour                   0.084479
year                   0.019785
Month                  0.048191
Peak_hours             0.004877
Weekday_1              0.006468
Weekday_6              0.007643


In [None]:
# Hyper parameter Tuning1

In [30]:
parameters = {'max_depth' : [10,20],
                   'min_samples_leaf' : [5,10],
                   'min_samples_split' : [10,20],
                   'max_features' : ['sqrt','log2']}
rfr = RandomForestRegressor(random_state = 1)

random_search = RandomizedSearchCV(estimator = rfr, param_distributions = parameters, 
                                   n_iter=20, cv = 3, verbose=2, random_state=1, scoring= 'neg_mean_absolute_error', n_jobs=6)

random_search.fit(x_train,y_train)



Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [34]:
random_search.best_params_

{'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 20}

In [32]:
best_params = random_search.best_params_

In [33]:
rfr_tune = RandomForestRegressor(**best_params, random_state = 1, n_jobs=5)
rfr_tune.fit(x_train,y_train)

In [36]:
ypre_train = rfr_tune.predict(x_train)

In [37]:
ypre_test = rfr_tune.predict(x_test)

In [38]:
evaluation_metrics(y_train, ypre_train, y_test, ypre_test)

The r2_score for train: 0.8888196729718426
The r2_score for test: 0.8804280069042797
The mean_absolute_error for train: 0.08507064797651429
The mean_absolute_error for test: 0.08828826920419922
The mean_squared_error for train: 0.01832177768113821
The mean_Squared_error for test: 0.019700985872671814
The root_mean_squared_error for train: 0.13535796127726735
The root_mean_Squared_error for test: 0.1403602004582204


In [None]:
# 2nd tuning

In [7]:
parameters = {'max_depth' : [10,20,50],
                   'min_samples_leaf' : [5,10,25],
                   'min_samples_split' : [10,20,15],
                   'max_features' : ['sqrt','log2']}
rfr = RandomForestRegressor(random_state = 1)

random_search = RandomizedSearchCV(estimator = rfr, param_distributions = parameters, 
                                   n_iter=40, cv = 3, verbose=2, random_state=1, scoring= 'neg_mean_absolute_error', n_jobs=6)

random_search.fit(x_train,y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


In [8]:
random_search.best_params_

{'min_samples_split': 10,
 'min_samples_leaf': 5,
 'max_features': 'log2',
 'max_depth': 50}

In [9]:
best_params_1 = random_search.best_params_

In [10]:
rfr_tune = RandomForestRegressor(**best_params_1, random_state = 1, n_jobs=5)
rfr_tune.fit(x_train,y_train)

In [11]:
ypre_train = rfr_tune.predict(x_train)

In [12]:
ypre_test = rfr_tune.predict(x_test)

In [17]:
evaluation_metrics(y_train, ypre_train, y_test, ypre_test)

The r2_score for train: 0.9093346537628368
The r2_score for test: 0.8927188412848999
The mean_absolute_error for train: 0.07239471566792156
The mean_absolute_error for test: 0.07934449492022175
The mean_squared_error for train: 0.014941045430816393
The mean_Squared_error for test: 0.017675916722054687
The root_mean_squared_error for train: 0.12223356916500636
The root_mean_Squared_error for test: 0.13295080564650477


In [None]:
#random_forest tuning results

In [22]:
tuning_result = [
    {
        'n_estimators': 150,
        'random_state': 1,
        'r2_score_train': 0.9424,
        'r2_score_test': 0.8940,
        'MAE_train': 0.04721,
        'MAE_test': 0.0699,
        'MSE_train': 0.0094,
        'MSE_test': 0.0174,
        'RMSE_train': 0.0973,
        'RMSE_test': 0.1320
    },
    {
        'max_depth': 20,
        'max_features': 'sqrt',
        'min_samples_leaf': 5,
        'min_samples_split': 10,
        'random_state': 1,
        'r2_score_train': 0.8888,
        'r2_score_test': 0.8804,
        'MAE_train': 0.0850,
        'MAE_test': 0.0882,
        'MSE_train': 0.0183,
        'MSE_test': 0.0197,
        'RMSE_train': 0.1353,
        'RMSE_test': 0.1403
    },
    {
        'max_depth': 50,
        'max_features': 'log2',
        'min_samples_leaf': 5,
        'min_samples_split': 10,
        'random_state': 1,
        'r2_score_train': 0.9093,
        'r2_score_test': 0.8927,
        'MAE_train': 0.0723,
        'MAE_test': 0.0793,
        'MSE_train': 0.0149,
        'MSE_test': 0.0176,
        'RMSE_train': 0.1222,
        'RMSE_test': 0.1329
    }
]


In [23]:
df = pd.DataFrame(tuning_result)

In [24]:
df.to_csv("random_forest_tuning_result.csv", index = False)

In [None]:
#gradientBoosting

In [28]:
gbr = GradientBoostingRegressor(n_estimators=200,random_state=1,verbose=2)
gbr.fit(x_train,y_train)

      Iter       Train Loss   Remaining Time 
         1           0.1432            7.69m
         2           0.1258            7.94m
         3           0.1114            7.97m
         4           0.0996            8.02m
         5           0.0896            8.02m
         6           0.0815            8.02m
         7           0.0749            7.99m
         8           0.0693            7.95m
         9           0.0647            7.92m
        10           0.0609            7.88m
        11           0.0576            7.84m
        12           0.0549            7.80m
        13           0.0526            7.78m
        14           0.0506            7.75m
        15           0.0489            7.70m
        16           0.0475            7.64m
        17           0.0462            7.60m
        18           0.0451            7.56m
        19           0.0441            7.53m
        20           0.0433            7.48m
        21           0.0422            7.44m
        2

In [29]:
joblib.dump(gbr, 'gradient_boosting.pkl')

['gradient_boosting.pkl']

In [30]:
ypre_train = gbr.predict(x_train)

In [31]:
ypre_test = gbr.predict(x_test)

In [32]:
evaluation_metrics(y_train, ypre_train, y_test, ypre_test)

The r2_score for train: 0.8325050820378348
The r2_score for test: 0.8323393168553659
The mean_absolute_error for train: 0.11113334192619698
The mean_absolute_error for test: 0.1110898714818241
The mean_squared_error for train: 0.027602047337440115
The mean_Squared_error for test: 0.027624200822601867
The root_mean_squared_error for train: 0.16613863890570463
The root_mean_Squared_error for test: 0.16620529721582844


In [7]:
from lightgbm import LGBMRegressor

In [13]:
model_lgbm = LGBMRegressor(n_jobs=-1, random_state=1)
model_lgbm.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 420
[LightGBM] [Info] Number of data points in the train set: 1452681, number of used features: 10
[LightGBM] [Info] Start training from score 0.626190


In [4]:
import joblib

In [14]:
joblib.dump(model_lgbm, 'light_gradient_boosting.pkl')

['light_gradient_boosting.pkl']

In [15]:
ypre_train = model_lgbm.predict(x_train)

In [16]:
ypre_test = model_lgbm.predict(x_test)

In [19]:
evaluation_metrics(y_train, ypre_train, y_test, ypre_test)

The r2_score for train: 0.8509702769681072
The r2_score for test: 0.850520227130498
The mean_absolute_error for train: 0.1016765298318534
The mean_absolute_error for test: 0.1017650253739561
The mean_squared_error for train: 0.024559106149961422
The mean_Squared_error for test: 0.02462866777837169
The root_mean_squared_error for train: 0.15671345235799453
The root_mean_Squared_error for test: 0.15693523434325285


In [None]:
#Hyper parameter tuning

In [20]:
# Define the hyperparameters to tune
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1],  # Focus on common learning rates
    'n_estimators': [100, 200, 300],  # Reduce the range of estimators
    'max_depth': [3, 5, 7],  # Remove -1 for unlimited depth
    'min_data_in_leaf': [20, 50],  # Fewer options for minimum data in leaf
    'feature_fraction': [0.8, 0.9, 1.0],  # Slightly narrower range
    'bagging_fraction': [0.8, 0.9],  # Narrow down the bagging fraction
    'lambda_l1': [0, 0.1],  # Fewer regularization parameters
    'lambda_l2': [0, 0.1],  # Fewer regularization parameters
    'boosting_type': ['gbdt', 'dart']  # Drop 'goss' if not needed
}

# Use RandomizedSearchCV to search over the hyperparameter space
random_search_lgbm = RandomizedSearchCV(estimator=model_lgbm, param_distributions=param_dist,
                                   n_iter=50, cv=3, random_state=1, n_jobs=-1, verbose=2)

In [21]:
random_search_lgbm.fit(x_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 420
[LightGBM] [Info] Number of data points in the train set: 1452681, number of used features: 10
[LightGBM] [Info] Start training from score 0.626190


In [23]:
random_search_lgbm.best_params_

{'n_estimators': 300,
 'min_data_in_leaf': 20,
 'max_depth': 7,
 'learning_rate': 0.05,
 'lambda_l2': 0.1,
 'lambda_l1': 0,
 'feature_fraction': 0.9,
 'boosting_type': 'gbdt',
 'bagging_fraction': 0.8}

In [25]:
best_params = random_search_lgbm.best_params_

In [26]:
model_lgbm = LGBMRegressor(**best_params, n_jobs=-1, random_state=1)
model_lgbm.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 420
[LightGBM] [Info] Number of data points in the train set: 1452681, number of used features: 10
[LightGBM] [Info] Start training from score 0.626190


In [27]:
joblib.dump(model_lgbm, 'light_gradient_boosting_tuned1.pkl')

['light_gradient_boosting_tuned1.pkl']

In [28]:
ypre_train = model_lgbm.predict(x_train)



In [29]:
ypre_test = model_lgbm.predict(x_test)



In [30]:
evaluation_metrics(y_train, ypre_train, y_test, ypre_test)

The r2_score for train: 0.8525770100787196
The r2_score for test: 0.8520832090332489
The mean_absolute_error for train: 0.10072013245321444
The mean_absolute_error for test: 0.10083695292141784
The mean_squared_error for train: 0.02429432723059282
The mean_Squared_error for test: 0.0243711469025535
The root_mean_squared_error for train: 0.15586637620280014
The root_mean_Squared_error for test: 0.1561126096846552
