In [1]:
#Import packages

from time import time
import re
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

#Import custom class and functions

import sys
sys.path.append("../app/customized_class")
from input_data import InputData
from dummy_estimator import DummyEstimator

In [4]:
def load_data(database_filepath):
    
    '''
    
    Load the table of cleaned properties and keep only the columns and rows used by this notebook.
    
    The following columns are dropped:
    
    - 'l2', 'l3', 'l4', 'l5', 'l6', 'Region'
    - 'missing_l2', 'missing_l3', 'missing_l4', 'missing_l5', 'missing_l6', 'missing_price'
    
    Rationale recorded by the original author:
    
    - 'l2' is removed because it is redundant with 'l2shp'.
    - 'Region' is removed because a department (l2shp) belongs to a single region, therefore the department
       defines the region, and keeping both can lead to collinearity problems.
    - 'missing_l2' and 'missing_price' are removed because they are constant (no missing values in those columns).
    - 'lat' and 'lon' stay in the dataframe; according to the original notes they are meant for visualizations.
    
    In addition, rows whose 'property_type' is neither 'Casa' (house) nor 'Apartamento' (apartment) are removed,
    because the model only covers these categories.
    
    Params:
        database_filepath (string): Path to the SQLite database
    Returns:
        df(pandas DataFrame): Remaining feature columns (including lat and lon) plus the
                              target column ('price'), restricted to houses and apartments
        
    '''
    engine = create_engine('sqlite:///'+database_filepath)
    try:
        df = pd.read_sql_table("Cleaned_prices",con=engine)
    finally:
        # Release the pooled connection (and the SQLite file handle) even when the read fails.
        engine.dispose()
    
    columns_to_drop = ['l2', 'l3', 'l4', 'l5', 'l6','Region','missing_l2','missing_l3', 'missing_l4',
                       'missing_l5', 'missing_l6', 'missing_price']
    df = df.drop(columns_to_drop, axis=1)
    
    # .copy() hands the caller an independent frame instead of a filtered selection of the
    # intermediate one, so later column assignments cannot trigger SettingWithCopyWarning.
    df = df[df['property_type'].isin(['Casa','Apartamento'])].copy()
    
    return df

def adjust_data_for_model(df, max_missing=2, min_rows_per_department=100):
    
    '''
    Put the data in a convenient format for the modelling stage. The operations are:
    
        1. Remove incomplete rows, that is, rows with more than `max_missing` flags set among
           [missing_rooms, missing_surface_total, missing_surface_covered, missing_bathrooms].
        2. Exclude departments ('l2shp') with fewer than `min_rows_per_department` remaining rows.
        3. Add dummy variables (One-Hot Encoding, first level dropped) for every object-typed column.
           The original categorical column is dropped afterwards, except for 'l2shp' and 'property_type',
           which are kept next to their dummies (the original notes say 'l2shp' is needed later to
           impute medians by department).
        4. Use log10(price) as the target.
        5. Split the dataframe into covariates and target variable (X, y).
        
    The input dataframe is not modified.
        
    Parameters:
    -----------
        df(pandas DataFrame): DataFrame with relevant columns and rows for modelling stage.
                              Must contain 'price', 'l2shp' and the four missing_* flags listed above.
        max_missing(int): Maximum number of missing flags a row may have to be kept (default 2).
        min_rows_per_department(int): Minimum number of rows a department needs to be kept (default 100).
    
    Returns:
    -----------
        
        X(pandas DataFrame): Features adjusted for the modelling stage (every column except 'price')
        y(pandas Series): log10 of 'price', aligned with X by index
        
    '''
    
    # Step 1: Remove incomplete rows.
    # The flags are summed row-wise with the vectorised DataFrame.sum; this assumes they are
    # numeric 0/1 indicators, as in the data displayed in this notebook.
    
    missing_flags = ['missing_rooms', 'missing_surface_total', 'missing_surface_covered','missing_bathrooms']
    missing_count = df[missing_flags].sum(axis=1)
    df = df[missing_count<=max_missing]

    # Step 2: Exclude departments with too few rows (counted after step 1).
    
    rows_by_departments = df['l2shp'].value_counts()
    departments_to_exclude = rows_by_departments[rows_by_departments<min_rows_per_department].index
    df = df[~df['l2shp'].isin(departments_to_exclude)]
    
    # Step 3: Include dummy variables.
    # Dummies are collected first and concatenated once, so the frame is not rebuilt per column.
    
    keep_original = ('l2shp', 'property_type')
    dummy_frames = []
    encoded_to_drop = []
    for col in df.select_dtypes(include=['object']).columns:
        try:
            dummy_frames.append(pd.get_dummies(df[col], prefix = col, prefix_sep = "_", drop_first = True,
                                               dummy_na = False, dtype=int))
        except Exception as e:
            # Best effort, as in the original: report the column and leave it untouched.
            print(col, "processing error")
            print(e)
            continue
        if col not in keep_original:
            encoded_to_drop.append(col)
    df = pd.concat([df.drop(encoded_to_drop, axis=1)] + dummy_frames, axis=1)
        
    # Step 4 and 5: log10 target and covariates, computed without assigning into `df`.
    
    y = np.log10(df['price'])
    X = df.drop('price', axis=1)
    
    return X,y

In [5]:
# Load the filtered listings from the SQLite database and show them.
df = load_data("../data/PropertiesPrices.db")
display(df)
# From here on `df` is rebound to the feature matrix returned by adjust_data_for_model
# (the frame loaded above is no longer reachable) and `y` holds the log10(price) target.
df, y = adjust_data_for_model(df)
display(df)

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_covered,property_type,surface_total,price,missing_lat,missing_lon,missing_rooms,missing_bedrooms,missing_bathrooms,missing_surface_total,missing_surface_covered,l2shp
5,6.338954,-75.541284,,,,,Apartamento,,162000000.0,0,0,1,1,1,1,1,ANTIOQUIA
14,6.172401,-75.609512,,,1.0,,Apartamento,,150000000.0,0,0,1,1,0,1,1,ANTIOQUIA
15,6.313522,-75.559738,,,2.0,,Apartamento,,320000000.0,0,0,1,1,0,1,1,ANTIOQUIA
16,6.156883,-75.628126,,,2.0,,Apartamento,,375000000.0,0,0,1,1,0,1,1,ANTIOQUIA
17,6.192737,-75.593727,,,2.0,,Apartamento,,280000000.0,0,0,1,1,0,1,1,ANTIOQUIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595378,1.149300,-76.646600,4.0,,2.0,,Casa,243.0,250000000.0,0,0,0,1,0,0,1,PUTUMAYO
595379,1.178700,-76.878500,,,,,Casa,,81000000.0,0,0,1,1,1,1,1,PUTUMAYO
595389,,,2.0,,1.0,78.0,Apartamento,,113000000.0,1,1,0,1,0,1,0,PUTUMAYO
595390,,,2.0,,,86.0,Apartamento,,123000000.0,1,1,0,1,1,1,0,PUTUMAYO


Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_covered,property_type,surface_total,missing_lat,missing_lon,...,l2shp_HUILA,l2shp_MAGDALENA,l2shp_META,l2shp_NARIÑO,l2shp_NORTE DE SANTANDER,l2shp_QUINDIO,l2shp_RISARALDA,l2shp_SANTANDER,l2shp_TOLIMA,l2shp_VALLE DEL CAUCA
166,6.205000,-75.549004,,3.0,4.0,,Apartamento,259.0,0,0,...,0,0,0,0,0,0,0,0,0,0
167,6.216000,-75.608002,,2.0,2.0,,Apartamento,76.0,0,0,...,0,0,0,0,0,0,0,0,0,0
168,6.228000,-75.565002,,2.0,2.0,,Apartamento,68.0,0,0,...,0,0,0,0,0,0,0,0,0,0
169,5.617000,-75.623001,,4.0,2.0,,Casa,196.0,0,0,...,0,0,0,0,0,0,0,0,0,0
170,6.342000,-75.557999,,5.0,2.0,,Casa,80.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594703,10.474775,-73.248602,3.0,,2.0,56.0,Casa,,0,0,...,0,0,0,0,0,0,0,0,0,0
594704,10.476000,-73.250000,,2.0,2.0,,Apartamento,63.0,0,0,...,0,0,0,0,0,0,0,0,0,0
594705,10.475000,-73.250999,,2.0,2.0,,Apartamento,60.0,0,0,...,0,0,0,0,0,0,0,0,0,0
594706,10.485000,-73.277000,,4.0,4.0,,Casa,120.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Rows per department in the adjusted feature matrix, followed by its column names.
print(df['l2shp'].value_counts())
print(df.columns)

CUNDINAMARCA          35718
ANTIOQUIA             25778
VALLE DEL CAUCA       16270
ATLANTICO              8702
CALDAS                 4475
BOLIVAR                4231
RISARALDA              3689
SANTANDER              3454
QUINDIO                3123
NORTE DE SANTANDER     2894
MAGDALENA              1315
CORDOBA                1198
TOLIMA                 1118
META                    800
CAUCA                   644
HUILA                   573
NARIÑO                  400
BOYACA                  236
CESAR                   117
Name: l2shp, dtype: int64
Index(['lat', 'lon', 'rooms', 'bedrooms', 'bathrooms', 'surface_covered',
       'property_type', 'surface_total', 'missing_lat', 'missing_lon',
       'missing_rooms', 'missing_bedrooms', 'missing_bathrooms',
       'missing_surface_total', 'missing_surface_covered', 'l2shp',
       'property_type_Casa', 'l2shp_ATLANTICO', 'l2shp_BOLIVAR',
       'l2shp_BOYACA', 'l2shp_CALDAS', 'l2shp_CAUCA', 'l2shp_CESAR',
       'l2shp_CORDOBA', 'l2shp

In [7]:
# Fit the custom InputData transformer on the whole feature matrix and keep the transformed
# result for inspection in the next cell.
# NOTE(review): the dict shown in the recorded output is presumably printed inside InputData — confirm.
transformer = InputData()
df_mod = transformer.fit_transform(df)

{'rooms': 7.0, 'surface_total': 1501.0, 'surface_covered': 525.0, 'bathrooms': 6.0}


In [8]:
# Inspect the transformed features, their dtypes, and the statistics stored on the fitted transformer.
display(df_mod)
print(df_mod.dtypes)
display(transformer.medians_by_department)
display(transformer.quantile98_by_feature)

# Print the distinct values of every transformed column as a quick sanity check.
# NOTE(review): in the recorded output the missing_*/dummy columns come back with `object` dtype and
# one numeric column (apparently `bedrooms`) still contains values such as 2018 — worth checking.
for col in df_mod.columns:
    print(df_mod[col].unique())

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_covered,surface_total,missing_lat,missing_lon,missing_rooms,...,l2shp_HUILA,l2shp_MAGDALENA,l2shp_META,l2shp_NARIÑO,l2shp_NORTE DE SANTANDER,l2shp_QUINDIO,l2shp_RISARALDA,l2shp_SANTANDER,l2shp_TOLIMA,l2shp_VALLE DEL CAUCA
166,6.205000,-75.549004,3.0,3.0,4.0,1.949390,2.413300,0,0,1,...,0,0,0,0,0,0,0,0,0,0
167,6.216000,-75.608002,3.0,2.0,2.0,1.949390,1.880814,0,0,1,...,0,0,0,0,0,0,0,0,0,0
168,6.228000,-75.565002,3.0,2.0,2.0,1.949390,1.832509,0,0,1,...,0,0,0,0,0,0,0,0,0,0
169,5.617000,-75.623001,3.0,4.0,2.0,1.949390,2.292256,0,0,1,...,0,0,0,0,0,0,0,0,0,0
170,6.342000,-75.557999,3.0,5.0,2.0,1.949390,1.903090,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594703,10.474775,-73.248602,3.0,3.0,2.0,1.748188,2.212188,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594704,10.476000,-73.250000,3.0,2.0,2.0,2.204120,1.799341,0,0,1,...,0,0,0,0,0,0,0,0,0,0
594705,10.475000,-73.250999,3.0,2.0,2.0,2.204120,1.778151,0,0,1,...,0,0,0,0,0,0,0,0,0,0
594706,10.485000,-73.277000,3.0,4.0,4.0,2.204120,2.079181,0,0,1,...,0,0,0,0,0,0,0,0,0,0


lat                         float64
lon                         float64
rooms                       float64
bedrooms                    float64
bathrooms                   float64
surface_covered             float64
surface_total               float64
missing_lat                  object
missing_lon                  object
missing_rooms                object
missing_bedrooms             object
missing_bathrooms            object
missing_surface_total        object
missing_surface_covered      object
property_type_Casa           object
l2shp_ATLANTICO              object
l2shp_BOLIVAR                object
l2shp_BOYACA                 object
l2shp_CALDAS                 object
l2shp_CAUCA                  object
l2shp_CESAR                  object
l2shp_CORDOBA                object
l2shp_CUNDINAMARCA           object
l2shp_HUILA                  object
l2shp_MAGDALENA              object
l2shp_META                   object
l2shp_NARIÑO                 object
l2shp_NORTE DE SANTANDER    

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_covered,surface_total
ANTIOQUIA,6.205563,-75.582001,3.0,3.0,2.0,89.0,90.0
VALLE DEL CAUCA,3.41,-76.527086,3.0,3.0,3.0,110.0,101.0
CUNDINAMARCA,4.702386,-74.055361,3.0,3.0,3.0,115.0,120.0
HUILA,2.933352,-75.282756,3.0,3.0,2.0,129.0,120.0
ATLANTICO,11.001,-74.815002,3.0,3.0,2.0,112.0,105.0
BOLIVAR,10.404722,-75.518689,3.0,3.0,2.0,110.0,104.0
QUINDIO,4.542,-75.6725,3.0,3.0,2.0,85.0,80.0
CALDAS,5.058764,-75.496089,3.0,3.0,2.0,80.0,90.0
NORTE DE SANTANDER,7.894,-72.495,3.0,3.0,3.0,112.0,105.0
RISARALDA,4.808644,-75.698,3.0,3.0,2.0,94.0,90.0


{'rooms': 7.0,
 'surface_total': 1501.0,
 'surface_covered': 525.0,
 'bathrooms': 6.0}

[ 6.20499992  6.21600008  6.22800016 ... 10.4747749  10.47500038
 10.48499966]
[-75.5490036  -75.60800171 -75.56500244 ... -73.2486015  -73.25099945
 -73.27700043]
[3. 2. 5. 4. 7. 1. 6.]
[3.000e+00 2.000e+00 4.000e+00 5.000e+00 1.000e+00 6.000e+00 1.000e+01
 2.018e+03 7.000e+00 8.000e+00 1.100e+01 1.200e+01 3.200e+01 9.000e+00
 1.300e+01 1.800e+01 1.400e+01 4.300e+01 1.500e+01 2.000e+01 1.900e+01
 1.700e+01 3.000e+01 1.600e+01 3.400e+01 2.200e+01 3.800e+01 2.300e+01
 3.500e+01 2.100e+01 4.100e+01 3.600e+01 6.000e+01]
[4. 2. 1. 3. 5. 6.]
[1.94939001 1.91381385 2.19589965 2.33041377 1.8573325  2.02938378
 2.24551267 2.47712125 2.54654266 2.38021124 1.90308999 1.79239169
 1.80617997 1.68124124 1.86923172 1.79934055 2.50514998 1.8260748
 2.05307844 1.84509804 1.97312785 2.17609126 1.81291336 2.1271048
 1.89762709 2.06069784 1.87506126 1.83250891 1.93449845 1.88649073
 2.13672057 1.74036269 2.1172713  2.13353891 1.86332286 1.77815125
 1.95424251 2.2787536  2.06445799 1.59106461 2.37106786 1

### Pipeline

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.base import BaseEstimator

In [10]:
def build_model(random_state=None):
    
    '''
    Build the grid search that selects a regressor for the preprocessing pipeline.
    
    The pipeline chains the custom InputData transformer, a StandardScaler and a final 'clf' step.
    DummyEstimator only fills the 'clf' slot: every entry of the search space replaces 'clf' with a
    concrete regressor (LinearRegression, SGDRegressor or LinearSVR) and lists its hyperparameters.
    
    Params:
        random_state (int or None): Seed passed to the candidates that accept one (SGDRegressor and
                                    LinearSVR). The default None leaves them unseeded, exactly as when
                                    build_model() is called without arguments.
    Returns:
        cv(GridSearchCV object): Unfitted grid search scored with R2. After calling fit, its attribute
                                 "best_estimator_" contains the best pipeline found.
    
    '''
    
    pipeline = Pipeline([
             ('input', InputData()),
             ('scaler', StandardScaler()),
             ('clf', DummyEstimator())])

    # Show the parameter names that the keys of the search space below must match.
    print(pipeline.get_params())
    
    # Estimator 1: LinearRegression (classic model):
    
    fit_intercept = [False, True] 
    
    # Estimator 2: Stochastic Gradient Descent:

    # The gradient of the loss is estimated each sample at a time and the model is updated along the way with
    # a decreasing strength schedule (aka learning rate). 
    
    learning_rate = ["invscaling", "adaptive"]
    
    # Epsilon candidates depend on the loss function, so each loss gets its own grid entry.
    # (The plain squared loss has no epsilon and is listed separately below.)
    
    sgd_epsilon_by_loss = [
        ("huber", [0.4,0.7,1]),
        ("epsilon_insensitive", [0.01,0.1,0.2]),
        ("squared_epsilon_insensitive", [0.01,0.1,0.2]),
    ]
    
    # Estimator 3: Support Vector Regression with Linear Kernel
    
    # Analogously to SVM for classification problem, the model produced by Support Vector Regression depends only
    # on a subset of the training data, because the cost function ignores samples whose prediction is close to their target.
        
    loss_functions_SVR = ["epsilon_insensitive", "squared_epsilon_insensitive"]
    
    # Candidate learning algorithms and their hyperparameters
    
    # NOTE(review): 'squared_loss' is the name used by the scikit-learn version this notebook was run
    # with; newer releases call it 'squared_error' — adjust when upgrading.

    search_space = [{'clf': [LinearRegression()],
                     'clf__fit_intercept': fit_intercept},
                    {'clf': [SGDRegressor(random_state=random_state)],
                     'clf__loss': ['squared_loss']}]
    
    search_space += [{'clf': [SGDRegressor(random_state=random_state)],
                      'clf__loss': [loss],
                      'clf__epsilon': epsilons,
                      'clf__learning_rate': learning_rate}
                     for loss, epsilons in sgd_epsilon_by_loss]
    
    search_space.append({'clf': [LinearSVR(random_state=random_state)],
                         'clf__loss': loss_functions_SVR})

    #Create grid search

    cv = GridSearchCV(pipeline, search_space, n_jobs=-1, scoring = "r2")
    
    return cv

In [11]:
# `df` (features) and `y` (log10 price) were produced by the adjust_data_for_model cell above.
X = df

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10, test_size=0.2)

print('Building model...')
model = build_model()

# Time the whole grid search, since it is the expensive step of the notebook.
print('Training model...')
start_time = time()
model.fit(X_train, y_train)
end_time = time()
print("The time for training was: {}".format(end_time-start_time))

Building model...
{'memory': None, 'steps': [('input', InputData(include=['rooms', 'surface_total', 'surface_covered', 'bathrooms'],
          include_log=['surface_total', 'surface_covered'],
          segmentation_col='l2shp')), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', DummyEstimator())], 'verbose': False, 'input': InputData(include=['rooms', 'surface_total', 'surface_covered', 'bathrooms'],
          include_log=['surface_total', 'surface_covered'],
          segmentation_col='l2shp'), 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'clf': DummyEstimator(), 'input__include': ['rooms', 'surface_total', 'surface_covered', 'bathrooms'], 'input__include_log': ['surface_total', 'surface_covered'], 'input__segmentation_col': 'l2shp', 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True}
Training model...
{'rooms': 7.0, 'surface_total': 1520.0, 'surface_covered': 520.0, 'bathrooms': 6.0}
The time for training was

In [12]:
# Keep a handle on the winning pipeline and show its configuration.
best_model = model.best_estimator_
print(best_model)

# Persist the fitted grid-search object (the whole `model`, not only `best_model`).
# The context manager guarantees the file is flushed and closed after the dump.
model_filepath = "regressor.pkl"
with open(model_filepath, 'wb') as model_file:
    pickle.dump(model, model_file)

Pipeline(memory=None,
         steps=[('input',
                 InputData(include=['rooms', 'surface_total', 'surface_covered',
                                    'bathrooms'],
                           include_log=['surface_total', 'surface_covered'],
                           segmentation_col='l2shp')),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
                              epsilon=1, eta0=0.01, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='adaptive',
                              loss='huber', max_iter=1000, n_iter_no_change=5,
                              penalty='l2', power_t=0.25, random_state=None,
                              shuffle=True, tol=0.001, validation_fraction=0.1,
                              verbose=0, warm_start=False))],
         verbose=False)


In [13]:
# Tabulate the cross-validation results of every candidate tried by the grid search.
results_df = pd.DataFrame(model.cv_results_)
display(results_df)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__fit_intercept,param_clf__loss,param_clf__epsilon,param_clf__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,8.146642,0.158621,2.492959,0.130722,"LinearRegression(copy_X=True, fit_intercept=Tr...",False,,,,"{'clf': LinearRegression(copy_X=True, fit_inte...",-581.8217,-572.3473,-572.7149,-576.0649,-561.3109,-572.852,6.696988,16
1,7.957658,0.133773,2.238873,0.042619,"LinearRegression(copy_X=True, fit_intercept=Tr...",True,,,,"{'clf': LinearRegression(copy_X=True, fit_inte...",-0.001171258,-0.0008077191,-0.0008994866,-0.001326965,-0.0007091593,-0.0009829176,0.0002308416,2
2,8.034463,0.135808,2.273747,0.044422,"SGDRegressor(alpha=0.0001, average=False, earl...",,squared_loss,,,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-68213990000000.0,-1.421421e+20,-512305900000000.0,-5.886576e+18,-5.598659e+19,-4.080317e+19,5.48617e+19,21
3,7.819234,0.059853,2.22452,0.054377,"SGDRegressor(alpha=0.0001, average=False, earl...",,huber,0.4,invscaling,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.02763692,-0.009444435,-0.04462897,-0.009715182,-0.01172156,-0.02062941,0.01377431,9
4,9.884716,0.981038,2.753248,0.12043,"SGDRegressor(alpha=0.0001, average=False, earl...",,huber,0.4,adaptive,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.00466975,-0.005553448,-0.004955721,-0.00585957,-0.006523887,-0.005512475,0.0006580683,6
5,9.498563,0.699917,2.888273,0.092408,"SGDRegressor(alpha=0.0001, average=False, earl...",,huber,0.7,invscaling,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.1035519,-0.008831735,-0.06150573,-0.005921638,-0.00699874,-0.03736195,0.03921333,11
6,10.638551,0.421311,3.374867,0.171221,"SGDRegressor(alpha=0.0001, average=False, earl...",,huber,0.7,adaptive,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.001217978,-0.001265715,-0.001118565,-0.001699419,-0.00149848,-0.001360031,0.000210586,3
7,9.333066,0.437381,2.610675,0.420673,"SGDRegressor(alpha=0.0001, average=False, earl...",,huber,1.0,invscaling,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.1374914,-0.009300072,-0.1426771,-0.005617765,-0.008290823,-0.06067541,0.0648689,13
8,9.2932,0.202961,2.135466,0.103721,"SGDRegressor(alpha=0.0001, average=False, earl...",,huber,1.0,adaptive,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.001087757,-0.000793875,-0.0008634909,-0.001316774,-0.0007634466,-0.0009650689,0.000209297,1
9,8.036199,0.168938,2.196068,0.132171,"SGDRegressor(alpha=0.0001, average=False, earl...",,epsilon_insensitive,0.01,invscaling,"{'clf': SGDRegressor(alpha=0.0001, average=Fal...",-0.1301844,-0.02613576,-0.1133275,-0.02765947,-0.03673185,-0.0668078,0.04532559,15


### Using the best model to predict prices on the test data set

In [14]:
# Predict the target (log10 of price) for the held-out rows with the best pipeline found.
y_pred = best_model.predict(X_test)

In [15]:
# Wrap the predictions in a one-column frame that shares the test-set index,
# so it can be joined back to the features and the true target.
df_pred = pd.Series(list(y_pred), index=X_test.index, name='y_pred').to_frame()
display(df_pred)
# Count of missing predictions.
print(df_pred['y_pred'].isna().sum())

Unnamed: 0,y_pred
313930,8.549942
185004,8.552374
329527,8.554822
547964,8.555583
454003,8.574761
...,...
313630,8.585186
238179,8.588566
413085,8.594052
502853,8.589993


0


In [27]:
# Join features, predictions and true target (all indexed by the test-set rows), then derive
# the signed and squared errors; department and property type are re-attached from `df` by index.
df_results = (
    pd.concat([X_test,df_pred,y_test],axis=1)
    .assign(errors=lambda frame: frame['y_pred']-frame['price'])
    .assign(l2shp=df['l2shp'], property_type=df['property_type'])
    .assign(squared_errors=lambda frame: frame['errors']**2)
)

# Keep only the columns needed by the app and write them next to its other sources.
df_to_save = df_results.loc[:, ['lat','lon','l2shp','errors','squared_errors',
                                'missing_lon','missing_lat','property_type']]
df_to_save.to_csv("../app/source/test_errors.csv")

In [28]:
# Show the per-row error table that was written to CSV in the previous cell.
display(df_to_save)

Unnamed: 0,lat,lon,l2shp,errors,squared_errors,missing_lon,missing_lat,property_type
313930,4.680539,-74.047640,CUNDINAMARCA,0.282771,0.079959,0,0,Apartamento
185004,3.370000,-76.518000,VALLE DEL CAUCA,0.089976,0.008096,0,0,Apartamento
329527,4.695075,-74.091758,CUNDINAMARCA,0.174611,0.030489,0,0,Apartamento
547964,,,SANTANDER,-0.046477,0.002160,1,1,Casa
454003,4.534566,-75.670600,QUINDIO,0.553572,0.306441,0,0,Apartamento
...,...,...,...,...,...,...,...,...
313630,,,CUNDINAMARCA,0.028883,0.000834,1,1,Apartamento
238179,4.690000,-74.060000,CUNDINAMARCA,-0.027912,0.000779,0,0,Apartamento
413085,10.992546,-74.821085,ATLANTICO,-0.335367,0.112471,0,0,Casa
502853,5.041843,-75.510268,CALDAS,0.548600,0.300962,0,0,Apartamento


In [19]:
# Coefficient of determination of the best pipeline on the held-out test set.
# NOTE(review): the recorded value is about -0.0006, i.e. no better than always predicting the
# mean target — the fitted model is not capturing signal and should be investigated before use.
from sklearn.metrics import r2_score
r2_model = r2_score(y_test, y_pred)
print(r2_model)

-0.0005610461795817479


In [20]:
# Render the selected pipeline together with its hyperparameters.
display(best_model)

Pipeline(memory=None,
         steps=[('input',
                 InputData(include=['rooms', 'surface_total', 'surface_covered',
                                    'bathrooms'],
                           include_log=['surface_total', 'surface_covered'],
                           segmentation_col='l2shp')),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
                              epsilon=1, eta0=0.01, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='adaptive',
                              loss='huber', max_iter=1000, n_iter_no_change=5,
                              penalty='l2', power_t=0.25, random_state=None,
                              shuffle=True, tol=0.001, validation_fraction=0.1,
                              verbose=0, warm_start=False))],
         verbose=False)

In [None]:
# Plain linear-regression pipeline (no grid search) for comparison.
# The final step must be an estimator instance; passing the LinearRegression class itself
# would fail as soon as the pipeline is fitted.
pipeline2 = Pipeline([
             ('input', InputData()),
             ('scaler', StandardScaler()),
             ('clf', LinearRegression())])