### Libraries

In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gplearn.fitness import make_fitness
from gplearn.genetic import SymbolicTransformer
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from sklearn.utils import check_random_state

### Load

In [366]:
# Fetch the Boston housing bunch and wrap its feature matrix in a DataFrame,
# labelling the columns with the dataset's own feature names in one step.
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)

# Preview the first rows
data.head()

In [None]:
# Append the regression target as a new PRICE column
data = data.assign(PRICE=boston.target)

In [None]:
# Separate the PRICE target from the predictor columns
y = data['PRICE']
X = data.drop(columns=['PRICE'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split, so the test rows do
# not influence the scaling statistics.
# StandardScaler must be imported from sklearn.preprocessing in the import cell.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Functions

In [7]:
# custom metric
def _mape(y, y_pred, w):
    diffs = np.abs(np.divide((np.maximum(0.001, y) - np.maximum(0.001, y_pred)),np.maximum(0.001, y)))
    
    return 100. * np.average(diffs, weights=w)

# Wrap the MAPE function as a gplearn fitness object; lower values are better.
# Both arguments are passed by keyword because recent gplearn releases declare
# make_fitness's parameters as keyword-only, and the keyword form also works on
# older releases.
mape = make_fitness(function=_mape, greater_is_better=False)

### Model

In [8]:
# Primitive operations the evolved programs may be built from
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'max', 'min',
]

# Genetic-programming feature generator; the seed is fixed for reproducibility
model = SymbolicTransformer(
    population_size=2000,
    generations=20,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    parsimony_coefficient=0.0005,
    max_samples=0.9,
    random_state=0,
    n_jobs=3,
    verbose=1,
)

In [9]:
# Fit the transformer on the training split. Because the model was built with
# verbose=1, a per-generation progress table is printed; the fitted estimator
# returned by fit() is echoed as the cell's result.
model.fit(X_train, y_train)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    22.07          0.28548        5         0.834319              N/A     39.68s
   1    17.70         0.558128       15         0.858646              N/A     28.58s
   2    25.97         0.661194       10         0.863079              N/A     20.19s
   3    12.13         0.694029       17         0.868805              N/A      7.68s
   4     7.27          0.72526       17         0.874665              N/A      0.00s


SymbolicTransformer(const_range=None, feature_names=None,
                    function_set=['sub', 'add', 'inv', 'mul', 'div', 'abs',
                                  'log', 'sqrt', 'max', 'min'],
                    generations=5, hall_of_fame=20, init_depth=(2, 8),
                    init_method='half and half', low_memory=False,
                    max_samples=1.0, metric='pearson', n_components=10,
                    n_jobs=1, p_crossover=0.9, p_hoist_mutation=0.01,
                    p_point_mutation=0.01, p_point_replace=0.05,
                    p_subtree_mutation=0.01, parsimony_coefficient='auto',
                    population_size=5000, random_state=123,
                    stopping_criteria=1.0, tournament_size=20, verbose=True,
                    warm_start=True)

In [10]:
# Show the first program stored in the fitted transformer's best-programs list.
# NOTE(review): `_best_programs` is a private attribute of the estimator.
print(model._best_programs[0])

mul(X5, log(div(inv(X12), max(mul(X9, add(X0, sqrt(X12))), sqrt(log(abs(X5)))))))


In [11]:
# The transformer was fitted on standardized features, so the full feature
# matrix must go through the same fitted scaler before the evolved programs are
# evaluated; feeding raw values would produce features that differ from the
# ones selected during training.
gp_features = model.transform(sc.transform(X))

# Append the generated features to the original (unscaled) predictor columns.
# NOTE(review): confirm whether the exported base columns should be scaled too.
new_boston = np.hstack((X, gp_features))

In [12]:
# Persist the augmented feature matrix as CSV. With the default options the row
# index and the positional integer column labels are written as well.
augmented = pd.DataFrame(new_boston)
augmented.to_csv("..//Data//newBostonStandard.csv")