In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import lightgbm
import pickle
import warnings
from sklearn.metrics import r2_score
from pyDOE import lhs
from tpot import TPOTRegressor



In [2]:
# Load the pickled reference model from disk.
# The file is read inside a `with` block so the handle is closed even if the
# read fails (the previous version left the file object open).
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load 'pumadyn_32nm_ML.txt' from a trusted source.
with open('pumadyn_32nm_ML.txt', 'rb') as file_object:
    str_mdl = file_object.read()
model = pickle.loads(str_mdl)



In [3]:
# Sampling ranges used to validate on random predictions: for every input
# variable, `actual_lows` holds the lower bound and `actual_highs` the upper
# bound of the interval that the generators below draw from.
actual_lows = dict(
    theta1=-2.3555153, theta2=-2.3559486, theta3=-2.3556861000000002,
    theta4=-2.3554262, theta5=-2.355844, theta6=-2.3558267999999996,
    thetad1=-2.3556139, thetad2=-2.3550772999999996, thetad3=-2.3556497000000003,
    thetad4=-2.3553389, thetad5=-2.3550476000000002, thetad6=-2.3561332999999998,
    tau1=-74.990634, tau2=-74.937538, tau3=-74.978278, tau4=-74.999533, tau5=-74.984873,
    dm1=0.25026488, dm2=0.25041709, dm3=0.25022193, dm4=0.25007437, dm5=0.25014939,
    da1=0.25043561, da2=0.25057928, da3=0.25008744, da4=0.2501664, da5=0.25061589,
    db1=0.25005398, db2=0.25001159, db3=0.25024083, db4=0.25010881, db5=0.2514393,
)
actual_highs = dict(
    theta1=2.35, theta2=2.355, theta3=2.355,
    theta4=2.355, theta5=2.35, theta6=2.35,
    thetad1=2.35, thetad2=2.3540761000000003, thetad3=2.3547369,
    thetad4=2.3557772, thetad5=2.3557997999999998, thetad6=2.3558617,
    tau1=74.985591, tau2=74.967958, tau3=74.986797, tau4=74.99699100000001, tau5=74.995852,
    dm1=2.4999799, dm2=2.4994377999999995, dm3=2.4999333999999998, dm4=2.4999981, dm5=2.499663,
    da1=2.4991584, da2=2.4996680000000002, da3=2.4999561000000003, da4=2.4999662999999996,
    da5=2.4997887999999997,
    db1=2.4999776000000002, db2=2.4996115000000003, db3=2.4998112999999997,
    db4=2.4999968999999997, db5=2.4996541000000003,
)

# Column order of the generated frames: the 32 inputs in the insertion order
# of the bounds dictionaries, followed by the predicted column 'thetadd6'.
variables_ = [*actual_lows, 'thetadd6']
samples_ = 10_000


def random_generator(samples_, actual_lows, actual_highs, variables_, model):
    """Draw uniformly distributed inputs and label them with ``model``.

    Parameters
    ----------
    samples_ : int
        Number of rows to generate.
    actual_lows, actual_highs : mapping
        Lower / upper sampling bound for each input variable name.
    variables_ : sequence of str
        Input column names followed by the name of the predicted column
        (the last entry).
    model : object
        Anything exposing ``predict(frame)``; its result must support
        ``.round``.

    Returns
    -------
    pandas.DataFrame
        One column per input (values rounded to 3 decimals) plus the
        prediction column (rounded to 2 decimals).

    Notes
    -----
    Sampling uses NumPy's global random state, one ``np.random.uniform``
    call per input in the order given by ``variables_``.
    """
    input_names = variables_[:-1]
    draws = {
        name: np.random.uniform(actual_lows[name], actual_highs[name], samples_).round(3)
        for name in input_names
    }
    df_doe = pd.DataFrame(draws, columns=input_names)
    df_doe[variables_[-1]] = model.predict(df_doe).round(2)
    return df_doe

def latin_hypercube_generator(samples_, actual_lows, actual_highs, variables, model):
    """Build a Latin-hypercube design over the inputs and label it with ``model``.

    Parameters
    ----------
    samples_ : int
        Number of design points requested from ``lhs``.
    actual_lows, actual_highs : mapping
        Lower / upper bound for each input variable name.
    variables : sequence of str
        Input column names followed by the name of the predicted column
        (the last entry).
    model : object
        Anything exposing ``predict(frame)``; its result must support
        ``.round``.

    Returns
    -------
    pandas.DataFrame
        The design rescaled as ``low + value * (high - low)`` per column and
        rounded to 3 decimals, plus the prediction column (rounded to 3
        decimals).
    """
    input_names = variables[:-1]
    raw_design = lhs(len(variables) - 1, samples=samples_, criterion='maximin')
    df_doe = pd.DataFrame(raw_design)
    df_doe.columns = input_names

    for name in input_names:
        low = actual_lows[name]
        span = actual_highs[name] - low
        rescaled = low + df_doe[name] * span
        # Element-wise Python round() is kept on purpose (not Series.round).
        df_doe[name] = rescaled.apply(lambda value: round(value, 3))

    df_doe[variables[-1]] = model.predict(df_doe).round(3)
    return df_doe

def normalize(input_array):
    """Standardise ``input_array`` along axis 0.

    Returns
    -------
    tuple
        ``(mean, std, data_norm)`` where ``data_norm`` is
        ``(input_array - mean) / std``. Any zero standard deviation is
        replaced by 1 before dividing, so constant columns map to zeros
        instead of producing a division by zero.
    """
    center = np.mean(input_array, axis=0)
    spread = np.std(input_array, axis=0)

    # Same guard as scikit-learn's _handle_zeros_in_scale(scale, copy=True):
    # https://github.com/scikit-learn/scikit-learn/blob/7389dbac82d362f296dc2746f10e43ffa1615660/sklearn/preprocessing/data.py#L70
    if np.isscalar(spread):
        spread = 1. if spread == .0 else spread
    elif isinstance(spread, np.ndarray):
        # Work on a copy so the array returned by np.std is not modified.
        spread = spread.copy()
        np.putmask(spread, spread == 0.0, 1.0)

    return center, spread, (input_array - center) / spread

# Sanity check: draw a random sample labelled by the loaded model and print
# the observed minimum and maximum of the 'thetad3' column.
df = random_generator(samples_, actual_lows, actual_highs, variables_, model)
thetad3_column = df['thetad3']
print(thetad3_column.min(), thetad3_column.max())

-2.355 2.355


In [4]:
# Dataset sizes and the model that labels the generated samples.
size_test_set = 1000
size_train_set = 300
reference_regr = model

# Test set: Latin-hypercube design, seeded for repeatability.
np.random.seed(12345)
test_data = np.array(
    latin_hypercube_generator(size_test_set, actual_lows, actual_highs, variables_, reference_regr)
)

# Training set: uniform random draws with a different seed.
np.random.seed(2)
train_data = np.array(
    random_generator(size_train_set, actual_lows, actual_highs, variables_, reference_regr)
)

# NOTE(review): mean/std are computed over train and test rows together, so
# the test set influences the scaling applied to the training data.
stacked = np.concatenate((train_data, test_data), axis=0)
mean_data, std_data, data_all_norm = normalize(stacked)

# Raw data: every column but the last is a feature, the last is the target
# (kept as an (n, 1) column vector).
X_train, y_train = train_data[:, :-1], train_data[:, -1].reshape(-1, 1)
X_test, y_test = test_data[:, :-1], test_data[:, -1].reshape(-1, 1)

# NORMALIZED DATA
# The first n_train rows of the stacked array are the training rows, the
# remainder are the test rows.
n_train = train_data.shape[0]
X_train_norm = data_all_norm[:n_train, :-1]
y_train_norm = data_all_norm[:n_train, -1].reshape(-1, 1)
X_test_norm = data_all_norm[n_train:, :-1]
y_test_norm = data_all_norm[n_train:, -1].reshape(-1, 1)


In [6]:
# Search space handed to TPOTRegressor(config_dict=...): each key is the
# import path of an operator, each value maps a hyper-parameter name to the
# candidate values TPOT may pick from. An empty dict means "use defaults".
regressor_config_dict = {

    # Regressors
    'sklearn.linear_model.ElasticNetCV': {
        'l1_ratio': np.arange(0.0, 1.01, 0.05),
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    },

    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },

    'sklearn.linear_model.LassoLarsCV': {
        'normalize': [True, False]
    },

    'sklearn.svm.LinearSVR': {
        'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
        'dual': [True, False],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        'epsilon': [1e-4, 1e-3, 1e-2, 1e-1, 1.]
    },

    'sklearn.linear_model.RidgeCV': {
    },

    # Preprocessors
    'sklearn.preprocessing.Binarizer': {
        'threshold': np.arange(0.0, 1.01, 0.05)
    },

    'sklearn.decomposition.FastICA': {
        'tol': np.arange(0.0, 1.01, 0.05)
    },

    'sklearn.cluster.FeatureAgglomeration': {
        'linkage': ['ward', 'complete', 'average'],
        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
    },

    'sklearn.preprocessing.MaxAbsScaler': {
    },

    'sklearn.preprocessing.MinMaxScaler': {
    },

    'sklearn.preprocessing.Normalizer': {
        'norm': ['l1', 'l2', 'max']
    },

    'sklearn.kernel_approximation.Nystroem': {
        'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'],
        'gamma': np.arange(0.0, 1.01, 0.05),
        'n_components': range(1, 11)
    },

    'sklearn.decomposition.PCA': {
        'svd_solver': ['randomized'],
        'iterated_power': range(1, 11)
    },

    'sklearn.preprocessing.PolynomialFeatures': {
        'degree': [2],
        'include_bias': [False],
        'interaction_only': [False]
    },

    'sklearn.kernel_approximation.RBFSampler': {
        'gamma': np.arange(0.0, 1.01, 0.05)
    },

    'sklearn.preprocessing.RobustScaler': {
    },

    'sklearn.preprocessing.StandardScaler': {
    },

    'tpot.builtins.ZeroCount': {
    },

    'tpot.builtins.OneHotEncoder': {
        'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
        'sparse': [False],
        'threshold': [10]
    },

    # Selectors
    'sklearn.feature_selection.SelectFwe': {
        'alpha': np.arange(0, 0.05, 0.001),
        'score_func': {
            'sklearn.feature_selection.f_regression': None
        }
    },

    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': {
            'sklearn.feature_selection.f_regression': None
        }
    },

    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
    },

    # NOTE(review): this entry had been cut down to three orphaned closing
    # braces, which made the whole literal a syntax error. It is restored
    # here with the values from TPOT's stock regressor configuration --
    # confirm these are the values the original search used.
    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesRegressor': {
                'n_estimators': [100],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    }

}

In [None]:
# Evolutionary pipeline search restricted to the operators listed in
# regressor_config_dict, scored by 5-fold cross-validated R^2.
regressor_dx = TPOTRegressor(generations=30, population_size=30,
                             config_dict=regressor_config_dict,
                             cv=5, random_state=42,
                             verbosity=2, scoring='r2',
                             n_jobs=-1)  # max_time_mins=5

# The targets are stored as (n, 1) column vectors; they are flattened here
# because a 1-D target is expected for single-output regression (passing the
# column vector triggers scikit-learn's DataConversionWarning).
regressor_dx.fit(X_train_norm, y_train_norm.ravel())

print(regressor_dx.score(X_test_norm, y_test_norm.ravel()))

regressor_dx.export('tpot_reg_dx_result_long_run.py')
# see which pipelines were evaluated:
print(regressor_dx.evaluated_individuals_)


# Validation

In [None]:
# Build the validation set: a seeded Latin-hypercube design labelled by the
# reference model, converted to a plain NumPy array.
size_test_set = 1000
reference_regr = model
np.random.seed(12345)
test_data = np.array(
    latin_hypercube_generator(size_test_set, actual_lows, actual_highs, variables_, reference_regr)
)

In [40]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, RBF

size_train_set = 400

# Training set: uniform random draws labelled by the reference model.
np.random.seed(1)
train_data = np.array(random_generator(size_train_set, actual_lows, actual_highs, variables_, reference_regr))
# NOTE(review): mean/std are computed over train and test rows together, so
# the test set influences the scaling applied to the training data.
mean_data, std_data, data_all_norm = normalize(np.concatenate((train_data, test_data), axis=0))

X_train = train_data[:, :-1]
X_test = test_data[:, :-1]
y_train = train_data[:, -1].reshape((-1, 1))
y_test = test_data[:, -1].reshape((-1, 1))

# NORMALIZED DATA
X_train_norm = data_all_norm[:train_data.shape[0], :-1]
y_train_norm = data_all_norm[:train_data.shape[0], -1].reshape((-1, 1))
X_test_norm = data_all_norm[train_data.shape[0]:, :-1]
y_test_norm = data_all_norm[train_data.shape[0]:, -1].reshape((-1, 1))

# Candidate surrogates. Each one gets its own name: previously all three were
# assigned to `model`, which made the first two dead code and also replaced
# the pickled reference model that other cells read through `model`
# (e.g. `reference_regr = model`). `model` is deliberately left untouched.

# Average CV score on the training set was:0.8623827025013939
tpot_pipeline = make_pipeline(
    SelectFromModel(estimator=ExtraTreesRegressor(max_features=1.0, n_estimators=100), threshold=0.1),
    KNeighborsRegressor(n_neighbors=10, p=2, weights="uniform")
)

kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
gp_rbf_model = GaussianProcessRegressor(kernel=kernel)

# The surrogate that is actually fitted and scored below.
surrogate_model = GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
    kernel=1**2 + Matern(length_scale=2, nu=1.5) + WhiteKernel(noise_level=1),
    n_restarts_optimizer=0, normalize_y=False,
    optimizer='fmin_l_bfgs_b', random_state=None)

surrogate_model.fit(X_train_norm, y_train_norm)
print(surrogate_model.score(X_test_norm, y_test_norm))

0.17850507050197362
