In [1]:
import copy

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn import linear_model
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Seed handed to every estimator that draws random numbers, so that a
# Restart & Run All reproduces the same scores.
SEED = 42

# Metric optimised by every grid search below.
SCORING = "neg_mean_absolute_percentage_error"

# Regularisation strengths tried for ridge, lasso and elastic net.
ALPHA_GRID = [{"alpha": [1.0, 0.5, 0.25]}]

# Hyper-parameter grid explored for the support vector regressor.
svr_parameters = [
    {
        "kernel": ["rbf", "poly"],
        "gamma": ["scale", "auto"],
        "C": [1, 100],
    }
]

# Registry of candidate regressors, keyed by the label printed in the reports.
MODELS = {
    "DummyRegressor": DummyRegressor(),
    "linear_regression": linear_model.LinearRegression(),
    "ridge regression": GridSearchCV(
        linear_model.Ridge(), ALPHA_GRID, scoring=SCORING
    ),
    "lasso": GridSearchCV(linear_model.Lasso(), ALPHA_GRID, scoring=SCORING),
    "elasticNet": GridSearchCV(
        linear_model.ElasticNet(), ALPHA_GRID, scoring=SCORING
    ),
    "svr": GridSearchCV(SVR(), svr_parameters, scoring=SCORING, n_jobs=-1),
    # Tree-based models permute features / subsample internally; without a fixed
    # random_state two runs on identical data can give different scores.
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=SEED),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=SEED),
}

# Fitted pipelines keyed by model name; filled in by the benchmark cells.
trained_models = {}

In [3]:
def execute_model(kf, clf, model_name, X, y):
    """Cross-validate ``clf`` behind a ``StandardScaler`` and return a pipeline refit on all data.

    Every fit runs on a private deep copy of ``clf``. The estimator passed in
    (for example a shared entry of a model registry) is therefore never fitted
    or mutated, and pipelines returned by separate calls never share a fitted
    model that a later call could silently overwrite.

    Parameters
    ----------
    kf : object
        Splitter whose ``split(X)`` yields ``(train_index, test_index)`` pairs
        of positional indices.
    clf : object
        Estimator exposing ``fit``/``predict``; used only as a template.
    model_name : str
        Label printed in the report header.
    X, y : objects supporting ``.iloc`` positional indexing
        Features and target.

    Returns
    -------
    Pipeline
        ``scaler`` + ``model`` pipeline fitted on the full ``X`` and ``y``.

    Notes
    -----
    Prints, per fold, the ``best_params_`` of the fitted model when it exposes
    that attribute, followed by the mean and standard deviation of MAPE and MAE
    over all folds.
    """

    def build_pipeline():
        # Deep-copy the template so fitting never touches the caller's estimator.
        return Pipeline(
            [("scaler", StandardScaler()), ("model", copy.deepcopy(clf))]
        )

    mape = []
    mae = []
    print(f"Model: {model_name}.")
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh pipeline per fold keeps folds independent of each other.
        fold_pipeline = build_pipeline()
        fold_pipeline.fit(X_train, y_train)

        fold_model = fold_pipeline.named_steps["model"]
        if hasattr(fold_model, "best_params_"):
            print(f"    best: {fold_model.best_params_}")

        y_pred = fold_pipeline.predict(X_test)
        mape.append(metrics.mean_absolute_percentage_error(y_test, y_pred))
        mae.append(metrics.mean_absolute_error(y_test, y_pred))
    print(f"    mean MAPE: {np.mean(mape)} (+-{np.std(mape)})")
    print(f"    mean MAE: {np.mean(mae)} (+-{np.std(mae)})")

    # Final model: an independent copy trained on every available sample.
    return build_pipeline().fit(X, y)

## Compare models across datasets V1, V2 and V3

In [4]:
# Read dataset V1 and split it into the target column and the feature matrix
# (every column except the target).
df = pd.read_csv("../data/dataset.csv")
y = df["value_energy"]
X = df[df.columns.difference(["value_energy"])]

In [5]:
# Benchmark every registered model with 4-fold cross-validation. Models that
# already have an entry in trained_models are skipped, so re-running this cell
# does not refit them.
kf = KFold(n_splits=4)
for model_name, clf in MODELS.items():
    if trained_models.get(model_name) is not None:
        continue
    trained_models[model_name] = execute_model(kf, clf, model_name, X, y)

Model: DummyRegressor.
    mean MAPE: 0.8381422928015594 (+-0.16979693083928302)
    mean MAE: 0.4116483612560396 (+-0.1120917643437213)
Model: linear_regression.
    mean MAPE: 0.5240695987409194 (+-0.07738194217598399)
    mean MAE: 0.29804551594094486 (+-0.08331850087457779)
Model: ridge regression.
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    mean MAPE: 0.5218349019149321 (+-0.07672052532773556)
    mean MAE: 0.2972931106657398 (+-0.0836929364305104)
Model: lasso.
    best: {'alpha': 0.25}
    best: {'alpha': 1.0}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    mean MAPE: 0.6973099034486389 (+-0.10298777378012956)
    mean MAE: 0.3648133570874502 (+-0.1395114603439251)
Model: elasticNet.
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    mean MAPE: 0.5813119758491976 (+-0.09421475538564333)
    mean MAE: 0.3219532368269912 (+-0.12218579304009239)
Model: svr.
 

In [6]:
# Repeat the 4-fold benchmark on dataset_v2.csv.
# NOTE: this rebinds df, X, y and kf and replaces every entry of trained_models.
df = pd.read_csv("../data/dataset_v2.csv")
y = df["value_energy"]
X = df[df.columns.difference(["value_energy"])]

kf = KFold(n_splits=4)
for model_name, clf in MODELS.items():
    fitted_pipeline = execute_model(kf, clf, model_name, X, y)
    trained_models[model_name] = fitted_pipeline

Model: DummyRegressor.
    mean MAPE: 0.8381422928015594 (+-0.16979693083928302)
    mean MAE: 0.4116483612560396 (+-0.1120917643437213)
Model: linear_regression.
    mean MAPE: 0.5227238644071519 (+-0.08084609088947085)
    mean MAE: 0.2974519586302165 (+-0.08187198687394248)
Model: ridge regression.
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    mean MAPE: 0.5204100229109603 (+-0.08005847299273498)
    mean MAE: 0.296691205150446 (+-0.08222846955255225)
Model: lasso.
    best: {'alpha': 0.25}
    best: {'alpha': 1.0}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    mean MAPE: 0.6973099034486389 (+-0.10298777378012956)
    mean MAE: 0.3648133570874502 (+-0.1395114603439251)
Model: elasticNet.
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    mean MAPE: 0.5813120249058453 (+-0.09421485402481654)
    mean MAE: 0.3219532489903678 (+-0.12218578636775637)
Model: svr.
  

Hay poca o ninguna diferencia entre los resultados de V1 y V2. No parece que el ritmo cardiaco aporte mucha información.

In [7]:
# Repeat the 4-fold benchmark on dataset_v3.csv.
# NOTE: this rebinds df, X, y and kf and replaces every entry of trained_models.
df = pd.read_csv("../data/dataset_v3.csv")
y = df["value_energy"]
X = df[df.columns.difference(["value_energy"])]

kf = KFold(n_splits=4)
for model_name, clf in MODELS.items():
    fitted_pipeline = execute_model(kf, clf, model_name, X, y)
    trained_models[model_name] = fitted_pipeline

Model: DummyRegressor.
    mean MAPE: 0.8381422928015594 (+-0.16979693083928302)
    mean MAE: 0.4116483612560396 (+-0.1120917643437213)
Model: linear_regression.
    mean MAPE: 0.4668159221899545 (+-0.051215037388312804)
    mean MAE: 0.26757277292862813 (+-0.07332858085564337)
Model: ridge regression.
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    best: {'alpha': 1.0}
    mean MAPE: 0.46525823744118616 (+-0.05127231207834393)
    mean MAE: 0.2670795430410799 (+-0.07356631172467933)
Model: lasso.
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    mean MAPE: 0.6076948782029093 (+-0.056552187018287986)
    mean MAE: 0.34007439958870683 (+-0.15214299931973713)
Model: elasticNet.
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    best: {'alpha': 0.25}
    mean MAPE: 0.48674754400634246 (+-0.05409403397533029)
    mean MAE: 0.2889743971558123 (+-0.12054559924988888)
Model

Parece que la inclusión de los gradientes mejora los modelos por norma general (por ejemplo, el MAPE medio de la regresión lineal baja de 0.52 con V2 a 0.47 con V3).