In [None]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import lightgbm
import pickle
import warnings
from sklearn.metrics import r2_score
from pyDOE import lhs
from tpot import TPOTRegressor
import matplotlib.pyplot as plt

import plotly
import plotly.graph_objs as go

from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.preprocessing import StandardScaler
import uuid
import os
from sklearn.metrics import r2_score

In [3]:
def normalize(input_array):
    mean = np.mean(input_array, axis=0)
    std = np.std(input_array, axis=0)

    # scikit-learn measure to handle zeros in scale: def _handle_zeros_in_scale(scale, copy=True)
    # https://github.com/scikit-learn/scikit-learn/blob/7389dbac82d362f296dc2746f10e43ffa1615660/sklearn/preprocessing/data.py#L70
    if np.isscalar(std):
        if std == .0:
            std = 1.
    elif isinstance(std, np.ndarray):
        std = std.copy()
        std[std == 0.0] = 1.0

    data_norm = (input_array - mean) / std
    return mean, std, data_norm

In [4]:
# READ DATA
data_path_ = r"..\..\01_Data\El_motor_Temp\pmsm_temperature_data.csv"
data = pd.read_csv(data_path_)
data_filt = data[data.profile_id==10]

# NORMALIZE DATA
mean_data, std_data, data_all_norm = normalize(data_filt)
X_train_norm, y_train_norm = data_all_norm.iloc[::1 ,:8], data_all_norm.iloc[::1 , 8].ravel()

In [13]:
regressor_dx = TPOTRegressor(generations=10, population_size=60, 
                             #config_dict=regressor_config_dict, 
                             cv=40, random_state=42,
                             verbosity=2, scoring='r2',
                            n_jobs = -1) # max_time_mins=5

regressor_dx.fit(X_train_norm, y_train_norm)

print(regressor_dx.score(X_train_norm, y_train_norm))

regressor_dx.export('tpot_reg_dx_result_long_run.py')
# see which pipelines were evaluated:
print(regressor_dx.evaluated_individuals_)




HBox(children=(IntProgress(value=0, description='Optimization Progress', max=660, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: -16.83113603108779
Generation 2 - Current best internal CV score: -16.133172499461182
Generation 3 - Current best internal CV score: -15.619924967059074
Generation 4 - Current best internal CV score: -15.353456891091554
Generation 5 - Current best internal CV score: -14.695731998466247
Generation 6 - Current best internal CV score: -12.053004300810972
Generation 7 - Current best internal CV score: -12.053004300810972
Generation 8 - Current best internal CV score: -12.053004300810972
Generation 9 - Current best internal CV score: -12.053004300810972
Generation 10 - Current best internal CV score: -12.053004300810972

Best pipeline: GradientBoostingRegressor(MinMaxScaler(RidgeCV(input_matrix)), alpha=0.75, learning_rate=0.1, loss=ls, max_depth=8, max_features=0.6500000000000001, min_samples_leaf=15, min_samples_split=16, n_estimators=100, subsample=0.5)
0.9997313168179677


{'RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.7500000000000001, RandomForestRegressor__min_samples_leaf=11, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=100)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 1, 'internal_cv_score': -20.360598718692323}, 'RandomForestRegressor(ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.75, ElasticNetCV__tol=0.01), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.4, RandomForestRegressor__min_samples_leaf=16, RandomForestRegressor__min_samples_split=14, RandomForestRegressor__n_estimators=100)': {'generation': 0, 'mutation_count': 0, 'crossover_count': 0, 'predecessor': ('ROOT',), 'operator_count': 2, 'internal_cv_score': -19.343180129009887}, 'ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.5, ExtraTr

In [14]:
regressor_dx.fitted_pipeline_

Pipeline(memory=None,
         steps=[('stackingestimator',
                 StackingEstimator(estimator=RidgeCV(alphas=array([ 0.1,  1. , 10. ]),
                                                     cv=None,
                                                     fit_intercept=True,
                                                     gcv_mode=None,
                                                     normalize=False,
                                                     scoring=None,
                                                     store_cv_values=False))),
                ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.75, criterion='fried...
                                           loss='ls', max_depth=8,
                                           max_features=0.6500000000000001,
                                           max_leaf_nodes=None,
                         

# Validation

In [15]:
###help functions###
def PlotCVRes_plotly(X, y, model, cv, scoring, titel):
    dir = os.path.dirname(os.path.abspath("__file__"))
    id = uuid.uuid4().hex
    if titel == '': 
        titel = str(model).replace('\n', '').replace('  ', '')
        titel_new = ''
        pos=0
        com_positions = pd.DataFrame([i for i, ltr in enumerate(titel) if (ltr == ',')])
        split_positions = [com_pos[0] for ind, com_pos in com_positions.iterrows() if ((ind+1)%5==0)] + [len(titel)]
        for ind in split_positions:
            titel_new = titel_new + ' \n ' + titel[pos:ind]
            pos = ind
        titel = titel_new[3:]

    cv_scores = cross_validate(model, np.array(X), np.array(y), cv=cv, scoring=scoring)
    print('MAE_test_mean: ' + str(round(abs(cv_scores['test_MAE']).mean(), 2)) + ', MAE_test_std: ' + str(
        round(abs(cv_scores['test_MAE']).std(), 2)))
    print('RMSE_test_mean: ' + str(round(np.sqrt(abs(cv_scores['test_RMSE'])).mean(), 2)) + ', RMSE_test_std: ' + str(
        round(np.sqrt(abs(cv_scores['test_RMSE'])).std(), 2)))
    print('R2_test_mean: ' + str(round(cv_scores['test_R2'].mean(), 3)) + ', R2_test_std: ' + str(
        round(cv_scores['test_R2'].std(), 3)))
    
    predicted = cross_val_predict(model, X, y, cv=cv)
    #predicted/real plot
    trace1 = go.Scatter(
        x = [y.min(), y.max()],
        y = [y.min(), y.max()],
        name = 'Ideal prediction line',
        line = dict(
            color = ('rgb(0, 0, 0)'),
            width = 3,
            dash = 'dash')
    )
    trace2 = go.Scattergl(
            x = y,
            y = predicted,
            name = 'Model Predictions',
            mode = 'markers',
            marker= dict(size= 3,
                opacity= 0.7,
                color = ('rgb(0, 102, 204)')
               ),
            #text=item_pairs,
            #name=orders,
            #hoverinfo = 'all', #"text + name",
            )

    data = [trace2, trace1]
    
    
    layout = go.Layout(
        xaxis=dict(
            title= 'Real Electrical Power Output',
            #showgrid=True,
            #zeroline=True,
            #showline=True,
        ),
        yaxis=dict(
            title= 'Predicted Electrical Power Output',
            #showgrid=True,
            #zeroline=True,
            #showline=True,
        ),
        title="ML Model Cross-validation Results",
        hovermode = "closest",
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, 'pred_real_scatter')

def Model_validation(X, y, model, scoring, data_stand=False, cv=5, plotCV=True, titel=''):
    if data_stand:
        Xs = StandardScaler().fit_transform(X)
    else:
        Xs = X
    # Plotting Cross-Validated Predictions
    if plotCV:
        PlotCVRes_plotly(Xs, y, model, cv, scoring, titel)
    else:
        cv_scores = cross_validate(model, Xs, y, cv=cv, scoring=scoring)
        print('MAE_test_mean: ' + str(abs(cv_scores['test_MAE']).mean()) + ', MAE_test_std: ' + str(
            abs(cv_scores['test_MAE']).std()))
        print('RMSE_test_mean: ' + str(np.sqrt(abs(cv_scores['test_RMSE'])).mean()) + ', RMSE_test_std: ' + str(
            np.sqrt(abs(cv_scores['test_RMSE'])).std()))
        print('R2_test_mean: ' + str(cv_scores['test_R2'].mean()) + ', R2_test_std: ' + str(
            cv_scores['test_R2'].std()))

In [23]:
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

model = make_pipeline(
    StackingEstimator(estimator=RidgeCV()),
    MinMaxScaler(),
    GradientBoostingRegressor(alpha=0.75, learning_rate=0.1, loss="ls", max_depth=8, max_features=0.6500000000000001, min_samples_leaf=15, min_samples_split=16, n_estimators=100, subsample=0.5)
)

model = make_pipeline(
    make_union(
        StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.75, learning_rate=0.1, loss="quantile", max_depth=3, max_features=0.05, min_samples_leaf=5, min_samples_split=12, n_estimators=100, subsample=1.0)),
        StackingEstimator(estimator=make_pipeline(
            SelectFromModel(estimator=ExtraTreesRegressor(max_features=0.55, n_estimators=100), threshold=0.05),
            RidgeCV()
        ))
    ),
    RandomForestRegressor(bootstrap=False, max_features=0.6500000000000001, min_samples_leaf=10, min_samples_split=16, n_estimators=100)
)

In [24]:
scoring = {'R2': 'r2', 'RMSE': 'neg_mean_squared_error', 'MAE': 'neg_mean_absolute_error'}
Model_validation(X_train_norm, y_train_norm, model, scoring=scoring,
                        cv=3, data_stand=False, 
                        plotCV=True, titel='')

MAE_test_mean: 0.82, MAE_test_std: 0.42
RMSE_test_mean: 0.95, RMSE_test_std: 0.47
R2_test_mean: -3.236, R2_test_std: 2.91


NameError: name 'go' is not defined