- Symbolic transformation with the gplearn library.
    Parameters taken from the examples at https://gplearn.readthedocs.io/en/stable/examples.html

### Libraries

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing

import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
from gplearn.genetic import SymbolicTransformer
from gplearn.fitness import make_fitness

### Load

In [4]:
# Fetch the California housing dataset through scikit-learn
housing = fetch_california_housing()

# Wrap the feature matrix in a DataFrame labelled with the feature names
data = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
)
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
# Append the target as a new 'median_house_value' column
data = data.assign(median_house_value=housing.target)

In [6]:
# Split into the dependent variable (y) and the independent variables (X)
y = data['median_house_value']
X = data.drop(columns='median_house_value')

In [7]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [8]:
# Standardize: learn the scaling statistics from the training split only,
# then apply that same fitted scaler to both splits
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Metric Function

In [9]:
# custom metric
def _mape(y, y_pred, w):
    diffs = np.abs(np.divide((np.maximum(0.001, y) - np.maximum(0.001, y_pred)),np.maximum(0.001, y)))
    
    return 100. * np.average(diffs, weights=w)

# Wrap the metric as a gplearn fitness object. Lower MAPE is better, hence
# greater_is_better=False. Arguments are passed by keyword because recent
# gplearn releases declare make_fitness's parameters keyword-only; the keyword
# form also works on older releases.
mape = make_fitness(function=_mape, greater_is_better=False)

### Model

In [24]:
# Operators made available to the evolved programs
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'max', 'min',
]

# Symbolic transformer scored with the custom MAPE fitness; the fixed
# random_state keeps the evolutionary run reproducible
model = SymbolicTransformer(
    generations=20,
    population_size=2000,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    metric=mape,
    parsimony_coefficient=0.0005,
    max_samples=0.9,
    verbose=1,
    random_state=0,
    n_jobs=3,
)

In [25]:
# Fit the transformer on the standardized training split; with verbose=1 set on
# the model, a per-generation progress table is printed while it runs.
model.fit(X_train, y_train)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


   0    11.88          1900.39        5          44.0957          43.3009      2.64m
   1     6.52          75.9258       16          43.7218          44.0279      1.35m
   2     5.89          69.4985       11          36.0978          37.1984      1.39m
   3     7.31          71.4146       14          35.8433          35.9279      1.42m
   4    11.37          66.5714        9          32.1465          31.6921      1.66m
   5    14.59          51.8998        9          32.0587          32.4825      1.53m
   6    16.30          43.5922       19          31.4732          31.8147      1.09m
   7    18.08          45.3769       22          31.3241           30.695     58.40s
   8    18.37           45.723       31          30.5719          30.9186     53.80s
   9    20.90          47.1222       21           30.144          29.9362      1.05m
  10    25.80          42.4549       21          30.0939          30.3871     52.26s
  11    29.95          40.3256       24          29.7009         

In [26]:
# Concatenate old and transformed matrix.
# The transformer was fitted on standardized features (X_train went through
# `sc` before `model.fit`), so the full feature matrix must pass through the
# same fitted scaler before `model.transform`. Feeding raw values would
# evaluate the evolved programs on a different scale from the one they were
# trained on.
gp_features = model.transform(sc.transform(X))
# Keep the original (unscaled) columns and append the generated features.
new_housing = np.hstack((X, gp_features))

In [27]:
# Write the augmented matrix (original + generated features) to CSV without
# the row index
pd.DataFrame(new_housing).to_csv(
    path_or_buf="GpTrans_CalHousing.csv",
    index=False,
)