- Symbolic transformation with the gplearn library.
    Parameters are taken from the gplearn examples: https://gplearn.readthedocs.io/en/stable/examples.html

### Libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_diabetes
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
from gplearn.genetic import SymbolicTransformer
from gplearn.fitness import make_fitness

### Load

In [3]:
# Fetch the diabetes dataset bundled with scikit-learn.
diabetes = load_diabetes()

# Build the feature frame and label its columns in a single step.
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Preview the first rows.
data.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [4]:
# Append the target values as a new 'diabetes' column.
data = data.assign(diabetes=diabetes.target)

In [5]:
# Separate the target column from the independent variables.
y = data['diabetes']
X = data.drop(columns=['diabetes'])

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [7]:
# Learn the scaling statistics from the training split only,
# then apply that same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Metric Function

In [8]:
# custom metric
def _mape(y, y_pred, w):
    """Weighted mean absolute percentage error, expressed in percent.

    Both `y` and `y_pred` are floored at 0.001 before the error is taken,
    so the denominator is always strictly positive. `w` is handed to
    `np.average` as the per-sample weights.
    """
    floor = 0.001
    actual = np.maximum(floor, y)
    predicted = np.maximum(floor, y_pred)
    relative_error = np.abs((actual - predicted) / actual)

    return 100. * np.average(relative_error, weights=w)

# Wrap the custom metric as a gplearn fitness object; a lower MAPE is better.
# The function is passed by keyword so the call also works on gplearn
# releases where make_fitness only accepts keyword arguments.
mape = make_fitness(function=_mape, greater_is_better=False)

### Model

In [9]:
# Primitive operations the evolved programs may be composed of.
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'max', 'min',
]

# Symbolic transformer scored with the custom MAPE fitness defined above.
model = SymbolicTransformer(
    generations=20,
    population_size=2000,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    metric=mape,
    parsimony_coefficient=0.0005,
    max_samples=0.9,
    verbose=1,
    random_state=0,
    n_jobs=3,
)

In [10]:
# Evolve the programs on the standardized training split.
model.fit(X=X_train, y=y_train)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    11.37          99.5075       14          87.0549          80.3186      2.88m
   1    16.29          98.1514       20          65.1227          65.8657      1.27m
   2    24.17          98.7732        2          47.3592          44.5268      1.35m
   3    18.82           104.98       10          45.5899          49.1598      1.27m
   4    16.84          118.861       21          44.4126          49.4385      1.16m
   5    15.07          127.335       16          40.9997          55.8249      1.15m
   6    17.73          116.651       16          41.2445          53.6296      1.09m
   7    24.94          127.854       26          40.0961          46.3938      1.00m
   8    31.73          105.633       23          38.4384          60.0186  

SymbolicTransformer(function_set=['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                                  'abs', 'neg', 'inv', 'max', 'min'],
                    max_samples=0.9,
                    metric=<gplearn.fitness._Fitness object at 0x000001B1248EFC70>,
                    n_jobs=3, parsimony_coefficient=0.0005,
                    population_size=2000, random_state=0, verbose=1)

In [11]:
# Concatenate old and transformed matrix.
# The transformer was fitted on features standardized by `sc`, so the full
# feature set must pass through the same fitted scaler before the evolved
# programs are evaluated; feeding raw values would evaluate them on a
# different scale from the one they were trained on.
X_scaled = sc.transform(X)
gp_features = model.transform(X_scaled)

# Keep the original (unscaled) features and append the generated ones.
new_diabetes = np.hstack((X, gp_features))

In [12]:
# Persist the augmented matrix (original + generated features) as CSV.
augmented_frame = pd.DataFrame(new_diabetes)
augmented_frame.to_csv("newDiabetes.csv")