In [1]:
import os
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import mlflow
from extended_modules import assistant
%matplotlib inline

df = pd.read_csv(os.path.join("..", "data", "test_energy_data.csv"))
df.head(5)

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,24563,15,4,28.52,Weekday,2865.57
1,Commercial,27583,56,23,23.07,Weekend,4283.8
2,Commercial,45313,4,44,33.56,Weekday,5067.83
3,Residential,41625,84,17,27.39,Weekend,4624.3
4,Residential,36720,58,47,17.08,Weekday,4820.59


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        100 non-null    object 
 1   Square Footage       100 non-null    int64  
 2   Number of Occupants  100 non-null    int64  
 3   Appliances Used      100 non-null    int64  
 4   Average Temperature  100 non-null    float64
 5   Day of Week          100 non-null    object 
 6   Energy Consumption   100 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 5.6+ KB


In [3]:
report = assistant.report(df)
report.show()

number of columns: 7
number of rows: 100
number of duplicates: 0

number of numerical columns: 5
number of categorical columns: 2


In [6]:
# report.export(os.path.join("..", "report", "test_energy_data"))

In [3]:
X_train, X_test, y_train, y_test = assistant.split_data(df, "Energy Consumption", 0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80, 6), (20, 6), (80, 1), (20, 1))

In [2]:
# assistant.export_train_test(X_train, X_test, y_train, y_test, os.path.join("..", "data"))
X_train, X_test, y_train, y_test = assistant.import_train_test(os.path.join("..", "data"))
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80, 6), (20, 6), (80, 1), (20, 1))

In [8]:
from sklearn.compose import ColumnTransformer
from extended_modules.sklearnext.preprocessing import LabelEncoderTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[
        ("label-encoder-transformer", LabelEncoderTransformer(), "Day of Week"),
        ("one-hot-encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ["Building Type"])
    ],
    remainder="passthrough"
)

ct_md = ct.fit(X_train)
ct_md

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [3]:
# pkl.dump(ct_md, open(os.path.join("..", "model", "preprocessor.pkl"), "wb"))
ppc_md = pkl.load(open(os.path.join("..", "model", "preprocessor.pkl"), "rb"))
ppc_md

In [4]:
X_train = pd.DataFrame(ppc_md.transform(X_train))
df_train = pd.concat([X_train, y_train], axis=1)
df_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,Energy Consumption
0,1.0,0.0,0.0,1.0,17982.0,4.0,37.0,13.29,3112.64
1,0.0,0.0,1.0,0.0,27165.0,73.0,25.0,30.15,4987.52
2,1.0,0.0,0.0,1.0,7924.0,63.0,36.0,34.71,3072.63
3,1.0,0.0,1.0,0.0,42767.0,40.0,28.0,17.94,5508.64
4,1.0,0.0,1.0,0.0,2145.0,56.0,12.0,11.77,3348.39


In [5]:
X_train_train, X_validate, y_train_train, y_validate = assistant.split_data(df_train, "Energy Consumption", 0.2)
X_train_train.shape, X_validate.shape, y_train_train.shape, y_validate.shape

((64, 8), (16, 8), (64, 1), (16, 1))

In [None]:
# assistant.export_train_test(X_train_train, X_validate, y_train_train, y_validate, os.path.join("..", "data"), True)
X_train_train, X_validate, y_train_train, y_validate = assistant.import_train_test(os.path.join("..", "data"), True)
X_train_train.shape, X_validate.shape, y_train_train.shape, y_validate.shape

((64, 8), (16, 8), (64, 1), (16, 1))

In [15]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import root_mean_squared_error

mlflow.set_tracking_uri("http://127.0.0.1:8080/")
mlflow.set_experiment("energy-consumption-prediction")

model_instances = [
    LinearRegression(), Ridge(), Lasso(), ElasticNet(), SVR(), RandomForestRegressor(), GradientBoostingRegressor(),
    XGBRegressor(), LGBMRegressor()
]

for md in model_instances:
    with mlflow.start_run():
        mlflow.log_params({"model_algorithm": type(md).__name__})
        md_md = md.fit(X_train_train, y_train_train)
        y_train_pred = md_md.predict(X_train_train)
        y_validate_pred = md_md.predict(X_validate)
        rmse_train = root_mean_squared_error(y_train_train, y_train_pred)
        rmse_validate = root_mean_squared_error(y_validate, y_validate_pred)
        if rmse_train <= 50 and rmse_validate <= 100:
            mlflow.sklearn.log_model(md_md, artifact_path="forecasting_model")
        mlflow.log_metrics({"train_rmse": rmse_train, "validate_rmse": rmse_validate})
    mlflow.end_run()



🏃 View run bemused-worm-774 at: http://127.0.0.1:8080/#/experiments/0/runs/2dc30cdccd1a442eb6169d7317bda837
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0




🏃 View run awesome-sponge-379 at: http://127.0.0.1:8080/#/experiments/0/runs/9beaf5878c5946cfa435b997b9546f0a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0




🏃 View run valuable-carp-675 at: http://127.0.0.1:8080/#/experiments/0/runs/29175a0305fe4f53b1aed89c0a58495f
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0
🏃 View run glamorous-horse-880 at: http://127.0.0.1:8080/#/experiments/0/runs/66a7187887bf488db9ead3c925e228ff
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


🏃 View run dapper-robin-67 at: http://127.0.0.1:8080/#/experiments/0/runs/5866350b8e4f4d658b64165024ba7a30
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0
🏃 View run nervous-shad-960 at: http://127.0.0.1:8080/#/experiments/0/runs/b4eff4fd13154b47b3a918f038d9a547
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


🏃 View run classy-deer-210 at: http://127.0.0.1:8080/#/experiments/0/runs/8fbd63c51f494796abb05b72ceff1c08
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0
🏃 View run legendary-bat-228 at: http://127.0.0.1:8080/#/experiments/0/runs/dfcd2930ce3b472e81717a27cf1ee7db
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 7
[LightGBM] [Info] Start training from score 4018.297356
🏃 View run spiffy-fox-715 at: http://127.0.0.1:8080/#/experiments/0/runs/bedba81b2f9e454ca3c5400a82a0b2c8
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


In [35]:
import mlflow
logged_model = 'runs:/2dc30cdccd1a442eb6169d7317bda837/forecasting_model'
loaded_model = mlflow.sklearn.load_model(logged_model)
loaded_model

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [37]:
# pkl.dump(loaded_model, open(os.path.join("..", "model", "forecaster.pkl"), "wb"))
fcs_md = pkl.load(open(os.path.join("..", "model", "forecaster.pkl"), "rb"))
fcs_md