# Installing TPOT

In [1]:
!pip install tpot



# Importing libraries and dependencies

In [2]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from tpot import TPOTRegressor

# Dataset Loading

In [3]:
data = pd.read_csv("../data/auto_insurance.csv")

In [4]:
data.drop("Unnamed: 0",axis=1,inplace=True)

In [5]:
data.dtypes

0      int64
1    float64
dtype: object

In [6]:
# split into input and output elements
data = data.values
data = data.astype('float32')

In [7]:
X = data[:, :-1]

In [8]:
y = data[:, :-1]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Evaluation Procedure

In [10]:
# define evaluation procedure
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Initializing Regressor

In [11]:

model = TPOTRegressor(generations=5, population_size=50, scoring='neg_mean_absolute_error', cv=cv, verbosity=2, random_state=1, n_jobs=-1)
# perform the search

# Search for the best model

In [12]:
# Feed the structured data regressor with training data.
model.fit(
    X_train, 
    y_train
)

print(model.score(X_test, y_test))

  return f(*args, **kwargs)


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -2.7544759101753395e-08

Generation 2 - Current best internal CV score: -6.922225755564189e-13

Generation 3 - Current best internal CV score: -6.922225755564189e-13

Generation 4 - Current best internal CV score: -6.922225755564189e-13

Generation 5 - Current best internal CV score: -4.1892415462522575e-15

Best pipeline: LassoLarsCV(VarianceThreshold(SGDRegressor(input_matrix, alpha=0.0, eta0=1.0, fit_intercept=False, l1_ratio=0.5, learning_rate=constant, loss=epsilon_insensitive, penalty=elasticnet, power_t=1.0), threshold=0.0005), normalize=True)
-6.695498856200944e-15


  return f(*args, **kwargs)


# Exporting Model

In [13]:
model.export('tpot_auto_insurance_pipeline.py')

In [14]:
!cat tpot_auto_insurance_pipeline.py

import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LassoLarsCV, SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=1)

# Average CV score on the training set was: -4.1892415462522575e-15
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=SGDRegressor(alpha=0.0, eta0=1.0, fit_intercept=False, l1_ratio=0.5, learning_rate="constant", loss="epsilon_insensitive", penalty=

In [16]:
model.evaluated_individuals_

#Get details on algorithm TPOT searched for and how the performed on the dataset

{'XGBRegressor(RBFSampler(input_matrix, RBFSampler__gamma=0.4), XGBRegressor__learning_rate=0.01, XGBRegressor__max_depth=6, XGBRegressor__min_child_weight=16, XGBRegressor__n_estimators=100, XGBRegressor__n_jobs=1, XGBRegressor__objective=reg:squarederror, XGBRegressor__subsample=0.05, XGBRegressor__verbosity=0)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': -19.4199999332428},
 'XGBRegressor(input_matrix, XGBRegressor__learning_rate=0.01, XGBRegressor__max_depth=5, XGBRegressor__min_child_weight=17, XGBRegressor__n_estimators=100, XGBRegressor__n_jobs=1, XGBRegressor__objective=reg:squarederror, XGBRegressor__subsample=0.8500000000000001, XGBRegressor__verbosity=0)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 1,
  'internal_cv_score': -8.557179053624472},
 'RandomForestRegressor(input_matrix, RandomForestRegressor__bootstr

In [17]:
model.fitted_pipeline_

Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=SGDRegressor(alpha=0.0, eta0=1.0,
                                                          fit_intercept=False,
                                                          l1_ratio=0.5,
                                                          learning_rate='constant',
                                                          loss='epsilon_insensitive',
                                                          penalty='elasticnet',
                                                          power_t=1.0,
                                                          random_state=1))),
                ('variancethreshold', VarianceThreshold(threshold=0.0005)),
                ('lassolarscv', LassoLarsCV())])