In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from datetime import datetime

# pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# models
from sklearn.linear_model import Ridge
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

# nn
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

#load and save
import pickle
import os

Since we are working in Jupyter notebooks, it's hard to import modules from other notebooks. For simplicity, we'll just copy the functions developed in our 'etl' notebook into this one.

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import TransformedTargetRegressor

class FeatureCreation(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that appends engineered columns to the input frame.

    The derived columns are built from ``area``, ``rooms``, ``bathroom``,
    ``hoa (R$)`` and ``property tax (R$)``. The column names are hard-coded
    because this step is specific to this dataset; a reusable version would
    take them as constructor parameters.
    """

    def __init__(self):
        """Create the transformer; there are no hyperparameters to store."""

    def fit(self, x, y=None):
        """Learn nothing from ``x`` and return the transformer itself.

        Called when the enclosing pipeline is fitted.
        """
        return self

    def transform(self, x, y=None):
        """Return a copy of ``x`` with seven engineered columns appended.

        Called by the pipeline both when fitting and when predicting, so the
        same features exist at training and at inference time.
        """
        # Work on a copy so the caller's dataset is never modified.
        enriched = x.copy()

        area = enriched['area']
        rooms = enriched['rooms']
        bathrooms = enriched['bathroom']

        # Squared area.
        enriched['area2'] = area ** 2
        # Area per room and area per bathroom.
        enriched['area/room'] = area / rooms
        enriched['area/bathroom'] = area / bathrooms
        # Sum, product and ratio of rooms and bathrooms.
        enriched['rooms+bathroom'] = rooms + bathrooms
        enriched['rooms*bathroom'] = rooms * bathrooms
        enriched['rooms/bathroom'] = rooms / bathrooms
        # Interaction between the HOA fee and the property tax.
        enriched['hoa*property tax'] = enriched['hoa (R$)'] * enriched['property tax (R$)']

        return enriched

In [3]:
# Load the feature-engineered dataset from the sibling ``data`` directory.
df = pd.read_csv(
    os.path.join(os.path.abspath('../data'), 'houses_to_rent_v2_fteng.csv')
)

In [4]:
# Show the first rows to confirm the file was read as expected.
df.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,animal,furniture,hoa (R$),rent amount (R$),property tax (R$),fire insurance (R$)
0,Belo Horizonte,42,1,1,1,not acept,furnished,470,2690,172,36
1,Belo Horizonte,64,2,2,1,acept,not furnished,352,1500,80,20
2,Belo Horizonte,80,3,2,1,acept,not furnished,0,11000,425,181
3,Belo Horizonte,200,4,2,1,not acept,not furnished,850,2550,9,34
4,Belo Horizonte,45,1,1,1,acept,not furnished,500,1631,192,12


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(8995, 11)

## Model Pipeline

In order to standardize the model definition, we will create a Pipeline. That ensures that all the data goes through the same transformation process.

In [6]:
# Separate the target column from the predictors, then hold out 25% of the
# rows as a test set. The fixed seed keeps the split reproducible.
y = df['rent amount (R$)']
x = df.drop(columns=['rent amount (R$)'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

Now, let's categorize our variables as numerical or categorical. This is required since each type is transformed differently.

Now, we will create the pipelines for every type of variable.

In [7]:
# Categorical columns: one-hot encode them. handle_unknown='ignore' keeps the
# encoder from raising on categories that were not seen during fit.
catTransformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Numerical columns: standardize them. StandardScaler is used because the
# dataset has already been treated for outliers.
numTransformer = Pipeline([('scaler', StandardScaler())])

Finally, let's integrate those pipelines with a ColumnTransformer and create our preprocessor. Every time we want to predict, this preprocessor will be applied.

In [8]:
# Route each column to its transformer by dtype: "category"/"object" columns go
# to the categorical pipeline, every other dtype goes to the numeric pipeline.
preprocessor = ColumnTransformer([
    ('numeric', numTransformer, selector(dtype_exclude=["category", "object"])),
    ('categoric', catTransformer, selector(dtype_include=["category", "object"])),
])

## Metric and Validation

Our target variable is rent amount (R$). Since the target is known, we need a supervised machine learning model, and since it is a continuous variable, this is a regression problem.

I chose RMSE as our metric because it is more sensitive to outliers than MAE, so it gives us a broader view of whether outliers are affecting our model.

For validation, we will use K-Fold Cross Validation. That means that the data will be divided into K groups of samples, called folds. Then, in each of the K iterations, the model is trained on K-1 folds and tested on the remaining one. Below we have an example of how K-Fold works, according to the scikit-learn documentation.

![kfold](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_002.png)

Let's then define our cross validation function with our validation metric.

In [9]:
# We will use 5 folds
n_folds = 5


def rmsle_cv(model):
    """Cross-validate ``model`` on the training split and return the per-fold RMSE.

    Despite its name, this function computes the plain root mean squared error
    (not the logarithmic variant); the name is kept so existing callers work.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor or pipeline. It is fitted on
        ``x_train`` / ``y_train`` once per fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (``n_folds`` values).
    """
    # Pass the splitter itself. ``KFold(...).get_n_splits(...)`` only returns the
    # integer number of folds, which silently drops ``shuffle=True``.
    # The fixed seed makes the folds identical for every model being compared.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    # scikit-learn maximizes scores, so MSE comes back negated; flip the sign
    # before taking the square root.
    neg_mse = cross_val_score(model, x_train, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

## Model Validation

Now, we have to define a few base models to validate our metric.

We will also do a Random Search to tune our models. We chose to use random search instead of grid search because [research](https://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf) shows that random search does a better job.

For the baseline we will use these models:
- RandomForest
- XGBoost
- LGBM
- Ridge
- Neural Network

Let's create the topology of the neural network.

The number of hidden layers and neurons is an important factor in making the model generalize well.

From Introduction to Neural Networks for Java (second edition) by Jeff Heaton, there are two decisions to be made regarding the hidden layers:

1 - How many hidden layers to actually have in the neural network .

> Problems that require two or more hidden layers are rarely encountered.

2 - How many neurons will be in each of these layers.

> - The number of hidden neurons should be between the size of the input layer and the size of the output layer.
> - The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.
> - The number of hidden neurons should be less than twice the size of the input layer.

In [10]:
def create_model(optimizer='adam', dropout=0.2, activation='relu', kernel_initializer='normal', input_dim=15):
    """Build and compile the feed-forward regression network wrapped by ``KerasRegressor``.

    Every argument is a hyperparameter that the random search may tune.

    Parameters
    ----------
    optimizer : str
        Name of the Keras optimizer used to compile the model.
    dropout : float
        Dropout rate applied after the first and the hidden layer.
    activation : str
        Activation of the first and the hidden layer. It is NOT applied to the
        output layer (see below).
    kernel_initializer : str
        Weight initializer of the first layer.
    input_dim : int, default 15
        Number of input features. It has to match the number of columns that
        reach the network (the ``k`` of the ``SelectKBest`` step).

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    # Rule of thumb for the hidden layer size: 2/3 * Ni + No, with Ni input
    # neurons and No = 1 output neuron. This gives 11 for the default Ni = 15.
    hidden_units = (2 * input_dim) // 3 + 1

    model = Sequential()
    # First layer: as many neurons as input features.
    model.add(Dense(units=input_dim, activation=activation, input_dim=input_dim,
                    kernel_initializer=kernel_initializer))
    # Dropout after each layer helps prevent overfitting and is itself tuned.
    model.add(Dropout(dropout))
    # Single hidden layer.
    model.add(Dense(units=hidden_units, activation=activation))
    model.add(Dropout(dropout))
    # Regression output: one neuron with a LINEAR activation. Reusing the tuned
    # hidden activation here would bound the predictions (tanh -> [-1, 1],
    # relu -> [0, inf)) and make the target unreachable.
    model.add(Dense(units=1, activation='linear'))

    # Mean squared error loss; MAE is tracked as a regression-appropriate
    # metric ('accuracy' is only meaningful for classification).
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

    return model

Now, let's create a function to apply the Random Search through the model.

In [17]:
from tempfile import mkdtemp
from shutil import rmtree

def get_best_param(model, custom_features=False, custom_target=False, random_state=0):
    """Build an unfitted ``RandomizedSearchCV`` for one of the baseline models.

    Parameters
    ----------
    model : str
        One of ``'RandomForest'``, ``'XGB'``, ``'LGBM'``, ``'Ridge'`` or
        ``'NeuralNetwork'``.
    custom_features : bool, default False
        When True, a ``FeatureCreation`` step is placed in front of the
        preprocessor.
    custom_target : bool, default False
        When True, the pipeline is wrapped in a ``TransformedTargetRegressor``
        that trains on ``log(y)`` and predicts ``exp(...)``.
    random_state : int, default 0
        Seed for the fold shuffling and for the sampling of parameter
        combinations, so repeated runs try the same candidates.

    Returns
    -------
    RandomizedSearchCV
        The search object; call ``.fit(x, y)`` on it to run the search.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # The regressor is the pipeline step named 'model', so each hyperparameter
    # is addressed as 'model__<param>' in the search space.
    if model == 'RandomForest':
        random_grid = {
            'model__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 800, num = 4)],
            # 1.0 (all features) is what 'auto' meant for regressors; the
            # 'auto' alias was removed in newer scikit-learn releases.
            'model__max_features': [1.0, 'sqrt'],
            'model__max_depth': [i for i in np.arange(1, 10)],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            'model__bootstrap': [True, False]
        }

        regressor_model = RandomForestRegressor()

    elif model == 'XGB':
        random_grid = {
            "model__n_estimators":[int(x) for x in np.linspace(start = 200, stop = 800, num = 4)],
            "model__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],
            "model__max_depth"        : [i for i in np.arange(1, 10)],
            "model__min_child_weight" : [1e-3, 1, 3, 5, 7 ],
            "model__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
            "model__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7, 1 ]
        }

        regressor_model = xgb.XGBRegressor()

    elif model == 'LGBM':
        random_grid = {
            "model__n_estimators": [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
            "model__boosting_type": ['dart', 'goss'],
            "model__max_depth": [i for i in np.arange(1, 51)],
            "model__num_leaves": [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)],
            "model__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],
            "model__min_child_weight" : [1e-3, 1, 3, 5, 7 ],
            "model__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7, 1 ],
        }

        regressor_model = lgb.LGBMRegressor()

    elif model == 'Ridge':
        random_grid = {
            "model__alpha": np.linspace(start=0.001, stop=1,  num=101),
            "model__fit_intercept": [True, False]
        }

        regressor_model = Ridge()

    elif model == 'NeuralNetwork':
        random_grid = {
            "model__batch_size": (32, 64, 128, 256),
            "model__epochs": (50, 100, 200, 300),
            "model__activation": ('relu', 'tanh', 'linear'),
            "model__dropout": (0.0, 0.1, 0.2, 0.3),
            "model__kernel_initializer": ('glorot_uniform','normal','uniform'),
            "model__optimizer": ('SGD', 'Adam', 'RMSprop', 'Adadelta', 'Adagrad', 'Adamax', 'Nadam', 'Ftrl')
        }

        # The Keras model must be wrapped to be usable as a scikit-learn pipeline step.
        regressor_model = KerasRegressor(build_fn=create_model, verbose=0)

    else:
        # Fail early with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown model {model!r}; expected one of "
            "'RandomForest', 'XGB', 'LGBM', 'Ridge', 'NeuralNetwork'."
        )

    # Assemble the pipeline steps in order. Only the optional steps differ
    # between the four feature/target combinations.
    steps = []
    if custom_features:
        steps.append(('featurecreation', FeatureCreation()))
    steps.append(('preprocessor', preprocessor))
    if model == 'NeuralNetwork':
        # The first layer of the network needs a fixed input dimension, but
        # one-hot encoding makes the number of columns data-dependent.
        # SelectKBest keeps the k columns with the best f_regression score so
        # the network always receives exactly 15 inputs.
        steps.append(('select_k_best', SelectKBest(k=15, score_func=f_regression)))
    steps.append(('model', regressor_model))
    model_pipeline = Pipeline(steps=steps)

    # Pass the splitter itself. ``KFold(...).get_n_splits(...)`` only returns an
    # int, which would silently drop the shuffling.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    if custom_target:
        estimator = TransformedTargetRegressor(
            regressor=model_pipeline,
            func=np.log,
            inverse_func=np.exp
            )
        # The pipeline now lives under the 'regressor' attribute, so every
        # parameter name needs that extra prefix.
        random_grid = {'regressor__' + key: values for key, values in random_grid.items()}
    else:
        estimator = model_pipeline

    # Rank candidates by RMSE, the metric used everywhere else in this
    # notebook, rather than by each estimator's default ``score``.
    rzsearch = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=random_grid,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1,
        random_state=random_state,
    )

    return rzsearch

In [19]:
# Now, let's iterate through every model, run the random search on each one and return a dataframe with the results.
def result_cv_models(custom_features=False, custom_target=False):
    """Tune, cross-validate, test and persist every baseline model.

    For each model name the function:

    1. runs the random search from ``get_best_param`` on the training split,
    2. rebuilds a pipeline around the best model and scores it with ``rmsle_cv``,
    3. refits it on the whole training split and evaluates it on the test split,
    4. pickles the fitted pipeline into ``../models``. The Keras network is
       stored separately as an ``.h5`` file.

    Parameters
    ----------
    custom_features : bool, default False
        Add the ``FeatureCreation`` step in front of the preprocessor.
    custom_target : bool, default False
        Train on ``log(y)`` through a ``TransformedTargetRegressor``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by model name. The ``pipe_file_name``
        column holds the name of the pickle needed to load the pipeline later.
    """
    today = datetime.now().strftime('%Y%m%d_%H%M%S')
    models = ['RandomForest', 'XGB', 'LGBM', 'Ridge', 'NeuralNetwork']

    # Create the output directory up front, so a missing folder cannot crash
    # the run after the (long) hyperparameter search has already finished.
    models_dir = os.path.abspath('../models')
    os.makedirs(models_dir, exist_ok=True)

    best_models = dict()
    for model in models:
        rzsearch = get_best_param(model, custom_features=custom_features, custom_target=custom_target).fit(x_train, y_train)
        # With a transformed target the fitted pipeline sits one level deeper,
        # inside the TransformedTargetRegressor.
        if custom_target:
            best_pipeline = rzsearch.best_estimator_.regressor_
        else:
            best_pipeline = rzsearch.best_estimator_
        best_models[model] = (best_pipeline['model'], [rzsearch.best_params_])

    results_dict = dict()
    for name, model in best_models.items():
        # Rebuild the same pipeline layout that was used during the search.
        steps = []
        if custom_features:
            steps.append(('featurecreation', FeatureCreation()))
        steps.append(('preprocessor', preprocessor))
        if name == 'NeuralNetwork':
            steps.append(('select_k_best', SelectKBest(k=15, score_func=f_regression)))
        steps.append(('model', model[0]))
        model_pipeline = Pipeline(steps=steps)

        # If we are performing a target transformation, the pipeline has to be
        # wrapped in a TransformedTargetRegressor object.
        if custom_target:
            final_estimator = TransformedTargetRegressor(
                regressor=model_pipeline,
                func=np.log,
                inverse_func=np.exp
                )
        else:
            final_estimator = model_pipeline

        scores = rmsle_cv(final_estimator)

        # Fit on the full training split; this is the estimator that gets saved.
        final_estimator.fit(x_train, y_train)
        predict_test = final_estimator.predict(x_test)

        # sqrt(MSE) instead of ``squared=False``, which is deprecated and later
        # removed in scikit-learn.
        rmse_testset = np.sqrt(mean_squared_error(y_test, predict_test))
        mae_testset = mean_absolute_error(y_test, predict_test)

        # Compute the file names once so the saved artefacts and the
        # 'pipe_file_name' column can never drift apart.
        file_tag = f'{name}_{round(scores.mean(), 3)}_{today}'
        pipe_file_name = f"v1_pipe_{file_tag}.pickle"

        # The neural network needs additional steps.
        if name == 'NeuralNetwork':
            # The Keras model cannot be serialized by pickle, so it is saved
            # with the Keras ``save`` method and then detached from the
            # pipeline. Later the pipeline and the model are loaded separately
            # and the model is put back into the pipeline.
            if custom_target:
                fitted_pipeline = final_estimator.regressor_
            else:
                fitted_pipeline = final_estimator
            keras_step = fitted_pipeline.named_steps['model']
            keras_step.model.save(os.path.join(models_dir, f'v1_model_{file_tag}.h5'))
            keras_step.model = None

            if custom_target:
                # The unfitted template pipeline kept by the
                # TransformedTargetRegressor still references the Keras
                # wrapper; drop that step too so the object can be pickled.
                final_estimator.regressor.set_params(model = None)

        # Now, serialize and save the pipeline.
        with open(os.path.join(models_dir, pipe_file_name), 'wb') as f:
            pickle.dump(final_estimator, f, pickle.HIGHEST_PROTOCOL)

        # Here we save our results. One important column is 'pipe_file_name',
        # which will be used to load our model later.
        results_dict[name] = {
            'name': name,
            'model': model[0],
            'params': model[1],
            'rmse_cv': round(np.mean(scores), 3),
            'std_cv': round(np.std(scores), 3),
            'rmse_testset': rmse_testset,
            'mae_testset': mae_testset,
            'custom_features': custom_features,
            'custom_target': custom_target,
            'all_scores_cv': scores,
            'pipe_file_name': pipe_file_name,
        }

    results_df = pd.DataFrame(results_dict).T
    return results_df

In [20]:
all_results_df = list()

import time
start = time.time()

# The four on/off pairings of engineered features and log-transformed target,
# in the order (True, True), (True, False), (False, True), (False, False).
combinations = tuple(
    {'custom_feature': use_features, 'custom_target': use_target}
    for use_features in (True, False)
    for use_target in (True, False)
)

# Evaluate every pairing and collect one results frame per pairing.
for combination in combinations:
    print(combination)
    results_df = result_cv_models(
        custom_features=combination['custom_feature'],
        custom_target=combination['custom_target'],
    )
    all_results_df.append(results_df)

end = time.time()
time_run = end - start

{'custom_feature': True, 'custom_target': True}
{'custom_feature': True, 'custom_target': False}
{'custom_feature': False, 'custom_target': True}
{'custom_feature': False, 'custom_target': False}


In [21]:
# Wall-clock duration of the full evaluation loop, in seconds.
time_run

2496.265648126602

In [22]:
# Stack the per-combination result frames into a single table.
final_df = pd.concat(all_results_df)

In [23]:
# Persist the evaluation table next to the input data. The index is not written to the file.
final_df.to_csv(os.path.join(os.path.abspath('../data'), 'model_evaluation.csv'), index=False)