In [13]:
%load_ext watermark
%watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Last updated: 2021-06-09T11:27:45.202377-05:00

Python implementation: CPython
Python version       : 3.7.6
IPython version      : 7.12.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [14]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = [10, 10]

In [15]:
np.random.seed(42)

In [5]:
ames = pd.read_csv("data/ames.csv").drop("id_parcela", axis=1)

In [6]:
ames.shape

(2930, 80)

In [9]:
ames.sample(2).T

Unnamed: 0,1663,93
tipo_construccion,20,120
tipo_zona,RL,FV
perimetro_conectado_calle,,25
area_parcela,10530,4835
tipo_acceso,Pave,Pave
...,...,...
mes_venta,3,3
ano_venta,2007,2010
tipo_venta,WD,WD
condicion_venta,Normal,Normal


### Elegimos las variables independientes y dependientes

In [10]:
variables_independientes = ames.drop("precio_venta", axis=1).columns
variable_objetivo = ["precio_venta"]

In [11]:
datos_numericos = ames[variables_independientes].select_dtypes([int, float])
col_no_numericas = ames[variables_independientes].select_dtypes([object]).columns

col_numericas = datos_numericos.columns

# para crear este diccionario he ido a la descripcion del dataset
dict_var_ordinales = {
    "calidad_cocinas": ["Po", "Fa", "TA", "Gd", "Ex"],
    "funcionalidad":["Sal", "Sev", "Maj2", "Maj1", "Min2", "Min1","Typ"],
    "calidad_chimeneas":["NA","Po","Fa","TA","Gd","Ex"],
    "acabado_garaje":["NA","Unf","RFn","Fin"],
    "calidad_garaje":["NA","Po","Fa","TA","Gd","Ex"],
    "condicion_garaje":["NA","Po","Fa","TA","Gd","Ex"],
    "acceso_garaje_pavimentado":["N", "P", "Y"],
    "calidad_piscina":["NA","Fa","TA","Gd","Ex"],
    "calidad_valla":["NA","MnWw","GdWo","MnPrv","GdPrv"],
    "forma_parcela":["IR3", "IR2", "IR1","Reg"],
    "tipo_instalaciones":["ELO","NoSeWa","NoSewr","AllPub"],
    "pendiente_parcela":["Sev", "Mod", "Gtl"],
    "calidad_material_exterior":["Po","Fa","TA","Gd","Ex"],
    "condicion_material_exterior":["Po","Fa","TA","Gd","Ex"],
    "altura_sotano":["NA","Po","Fa","TA","Gd","Ex"],
    "condicion_sotano":["NA","Po","Fa","TA","Gd","Ex"],
    "sotano_exterior":["NA","No","Mn","Av","Gd"],
    "calidad_sotano_habitable1":["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "calidad_sotano_habitable2":["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
     "calidad_calefaccion":["Po","Fa","TA","Gd","Ex"],
}
col_ordinales = list(dict_var_ordinales.keys())
datos_ordinales = ames[col_ordinales]
col_categoricas = list(set(col_no_numericas) - set(col_ordinales))
datos_categoricos = ames[col_categoricas]

#### Nos aseguramos de que todas las columnas esten guardadas como categoricas o númericas 

In [12]:
[col for col in ames.columns if col not in datos_numericos.columns and\
 col not in datos_categoricos.columns and \
 col not in datos_ordinales.columns]

['tipo_construccion',
 'area_parcela',
 'calidad_general',
 'condicion_general',
 'fecha_construccion',
 'ano_remodelado',
 'area_piso1',
 'area_piso2',
 'area_suelos_baja_calidad',
 'area_habitable_sobre_suelo',
 'n_aseos_sobre_suelo',
 'n_medios_aseos_sobre_suelo',
 'n_dormitorios_sobre_suelo',
 'n_cocinas',
 'n_habitaciones_sobre_suelo',
 'n_chimeneas',
 'area_terraza_madera',
 'area_porche_abierto',
 'area_porche_cerrado',
 'area_porche_3estaciones',
 'area_porche_tapado',
 'area_piscina',
 'valor_atributo_miscelaneo',
 'mes_venta',
 'ano_venta',
 'precio_venta']

#### A las variables númericas les eliminamos los valores inexistentes 

In [17]:
from sklearn.preprocessing import normalize

In [18]:
from sklearn.impute import SimpleImputer

In [21]:
# Recordatorio: los transformadores de sklearn devuelven numpy arrays, no dataframes
datos_numericos_imputados_normalizados = pd.DataFrame(
    normalize(SimpleImputer(strategy="median").fit_transform(datos_numericos)),
    columns=datos_numericos.columns
)

### Variables categoricas

Con las variables categóricas tenemos dos opciones principalmente:

Usar sklearn.preprocessing.LabelBinarizer para hacer un 1 hot encoding y codificarlas como vectores
Usar la función get_dummies de pandas.
En general la opción recomendada es la de usar LabelBinarizer ya que esto nos crea un transformador de scikit learn que podemos usar en pipelines y para transformar nuevas observaciones. Sin embargo dado que esta sección es sobre selección de variables y para no hacerla demasiado larga, directamente voy a usar pd.get_dummies.

In [22]:
datos_categoricos_dummy = pd.get_dummies(datos_categoricos, drop_first=True)

In [23]:
datos_categoricos_dummy.shape

(2930, 153)

### Variables ordinales 

In [24]:
dict_var_ordinales = {
    "calidad_cocinas": ["Po", "Fa", "TA", "Gd", "Ex"],
    "funcionalidad":["Sal", "Sev", "Maj2", "Maj1", "Min2", "Min1","Typ"],
    "calidad_chimeneas":["NA","Po","Fa","TA","Gd","Ex"],
    "acabado_garaje":["NA","Unf","RFn","Fin"],
    "calidad_garaje":["NA","Po","Fa","TA","Gd","Ex"],
    "condicion_garaje":["NA","Po","Fa","TA","Gd","Ex"],
    "acceso_garaje_pavimentado":["N", "P", "Y"],
    "calidad_piscina":["NA","Fa","TA","Gd","Ex"],
    "calidad_valla":["NA","MnWw","GdWo","MnPrv","GdPrv"],
    "forma_parcela":["IR3", "IR2", "IR1","Reg"],
    "tipo_instalaciones":["ELO","NoSeWa","NoSewr","AllPub"],
    "pendiente_parcela":["Sev", "Mod", "Gtl"],
    "calidad_material_exterior":["Po","Fa","TA","Gd","Ex"],
    "condicion_material_exterior":["Po","Fa","TA","Gd","Ex"],
    "altura_sotano":["NA","Po","Fa","TA","Gd","Ex"],
    "condicion_sotano":["NA","Po","Fa","TA","Gd","Ex"],
    "sotano_exterior":["NA","No","Mn","Av","Gd"],
    "calidad_sotano_habitable1":["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "calidad_sotano_habitable2":["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "calidad_calefaccion":["Po","Fa","TA","Gd","Ex"],
}

In [25]:
for columna_ordinal, valores in dict_var_ordinales.items():
    datos_ordinales[columna_ordinal] = (
    datos_ordinales[columna_ordinal]
    .astype("category")
    .cat.set_categories(valores)
    .cat.codes
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [26]:
datos_ordinales.head()

Unnamed: 0,calidad_cocinas,funcionalidad,calidad_chimeneas,acabado_garaje,calidad_garaje,condicion_garaje,acceso_garaje_pavimentado,calidad_piscina,calidad_valla,forma_parcela,tipo_instalaciones,pendiente_parcela,calidad_material_exterior,condicion_material_exterior,altura_sotano,condicion_sotano,sotano_exterior,calidad_sotano_habitable1,calidad_sotano_habitable2,calidad_calefaccion
0,2,6,4,3,3,3,1,-1,-1,2,3,2,2,2,3,4,4,4,1,1
1,2,6,-1,1,3,3,2,-1,3,3,3,2,2,2,3,3,1,3,2,2
2,3,6,-1,1,3,3,2,-1,-1,2,3,2,2,2,3,3,1,5,1,2
3,4,6,3,3,3,3,2,-1,-1,3,3,2,3,2,3,3,1,5,1,4
4,2,6,3,3,3,3,2,-1,3,2,3,2,2,2,4,3,1,6,1,3


In [27]:
### Unimos los tres tipos de datos

ames_procesado = pd.concat([
    datos_numericos_imputados_normalizados,
    datos_categoricos_dummy,
    datos_ordinales
], axis=1)

In [28]:
ames_procesado.shape

(2930, 184)

### Ahora el dataset esta preparado para usarse 

#### Crear un selector RFE y entrenar con un Random Forest con las 20 mejores variables ¿que son las variables que han elegido?

In [29]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


ames_X = ames_procesado
ames_y = ames[variable_objetivo]

Vamos a usar la función cross_validate que es una versión más flexible que cross_val_score. Evaluaremos usando la raíz del error cuadrático medio (RMSE)

In [30]:
def rmse(y_real, y_pred):
    return np.sqrt(mean_squared_error(y_real, y_pred))

def rmse_cv(estimator, X, y):
    y_pred = estimator.predict(X)
    return rmse(y, y_pred)

res = cross_validate(LinearRegression(), ames_procesado, ames[variable_objetivo],
                     scoring=rmse_cv, n_jobs=-1, cv=10)

In [31]:
res

{'fit_time': array([0.15957141, 0.16954565, 0.07878757, 0.10272408, 0.10870862,
        0.08875775, 0.13165116, 0.08377504, 0.02892113, 0.02792406]),
 'score_time': array([0.00498676, 0.00498676, 0.0059855 , 0.00498676, 0.00498676,
        0.00498676, 0.00298786, 0.00498676, 0.00199485, 0.0009973 ]),
 'test_score': array([29869.59361161, 34889.42346779, 25569.84433073, 30687.24890532,
        41765.03174894, 43664.68634708, 41549.41534577, 39705.84994347,
        31162.38691242, 37269.65210705])}

In [32]:
def evaluar_modelo(estimador, X, y):
    resultados_estimador = cross_validate(estimador, X, y,
                     scoring=rmse_cv, n_jobs=-1, cv=10, return_train_score=True)
    return resultados_estimador

In [33]:
resultados = {}

def ver_resultados():
    resultados_df  = pd.DataFrame(resultados).T
    resultados_cols = resultados_df.columns
    for col in resultados_df:
        resultados_df[col] = resultados_df[col].apply(np.mean)
        resultados_df[col+"_idx"] = resultados_df[col] / resultados_df[col].min()
    return resultados_df

In [34]:
resultados["reg_lineal_sin_seleccion"] = evaluar_modelo(LinearRegression(), ames_X, ames_y)
resultados["svr_sin_seleccion"] = evaluar_modelo(SVR(), ames_X, ames_y)
resultados["rf_sin_seleccion"] = evaluar_modelo(RandomForestRegressor(), ames_X, ames_y)

In [35]:
ver_resultados()

Unnamed: 0,fit_time,score_time,test_score,train_score,fit_time_idx,score_time_idx,test_score_idx,train_score_idx
reg_lineal_sin_seleccion,0.074502,0.00389,35613.313272,27932.67734,1.0,1.0,1.115189,2.383071
svr_sin_seleccion,3.261576,0.329124,81031.639474,82149.04483,43.778573,84.603216,2.537411,7.008531
rf_sin_seleccion,7.725937,0.023538,31934.774256,11721.293054,103.70155,6.050617,1.0,1.0


#### Crear un selector Kbest con el criterio f_regression que seleccione 5 variables, transformar los datos, y comprobar las dimensiones del nuevo dataset

In [36]:
from sklearn.feature_selection import SelectKBest, f_regression

In [40]:
selector_kbest5 = SelectKBest(f_regression, k=5)
ames_X_kbest5 = selector_kbest5.fit_transform(ames_X, ames_y)

  y = column_or_1d(y, warn=True)


In [41]:
ames_X_kbest5.shape

(2930, 5)

In [43]:
puntuaciones_seleccion_kbest5 = zip(ames_X.columns, selector_kbest5.scores_, 
                                   selector_kbest5.get_support())

evaluacion_kbest5 = sorted(filter(lambda c: c[2], puntuaciones_seleccion_kbest5),
                          key=lambda c: c[1], reverse= True)

In [44]:
list(evaluacion_kbest5)


[('calidad_material_exterior', 2781.4056293393337, True),
 ('ano_construccion_garaje', 2542.1834185458356, True),
 ('calidad_cocinas', 2423.00029861259, True),
 ('altura_sotano', 1372.464274306123, True),
 ('area_sotano_total', 1170.3575348657419, True)]

In [45]:
resultados["reg_lineal_kbest_5"] = evaluar_modelo(LinearRegression(), ames_X_kbest5, ames_y)
resultados["rf_kbest_5"] = evaluar_modelo(RandomForestRegressor(), ames_X_kbest5, ames_y)
resultados["svr_kbest_5"] = evaluar_modelo(SVR(), ames_X_kbest5, ames_y)

In [46]:
ver_resultados()

Unnamed: 0,fit_time,score_time,test_score,train_score,fit_time_idx,score_time_idx,test_score_idx,train_score_idx
reg_lineal_sin_seleccion,0.074502,0.00389,35613.313272,27932.67734,6.853425,2.166605,1.115189,2.383071
svr_sin_seleccion,3.261576,0.329124,81031.639474,82149.04483,300.033168,183.301726,2.537411,7.008531
rf_sin_seleccion,7.725937,0.023538,31934.774256,11721.293054,710.710798,13.109295,1.0,1.0
reg_lineal_kbest_5,0.010871,0.001796,44581.994088,44735.293652,1.0,1.0,1.396033,3.816583
rf_kbest_5,0.953828,0.019253,42611.737854,15735.550041,87.742841,10.722786,1.334337,1.342476
svr_kbest_5,0.493588,0.028919,80919.325886,82045.292068,45.405254,16.106161,2.533894,6.999679


### Crear un Selector RFE y entrenar con un Random Forest con las 20 mejores variables. ¿Que variables son las que se han elegido?

In [48]:
from sklearn.feature_selection import RFE
estimador_selector = RandomForestRegressor()
selector_rfe20_rf = RFE(estimador_selector, n_features_to_select=20)
ames_X_rfe20_rf = selector_rfe20_rf.fit_transform(ames_X, ames_y)

  y = column_or_1d(y, warn=True)


In [49]:
ames_X_rfe20_rf.shape

(2930, 20)

In [50]:
evaluacion_rfe20_rf = sorted(
    filter(lambda c: c[2], 
        zip(
            ames_X.columns,
            selector_rfe20_rf.ranking_,
            selector_rfe20_rf.get_support()
        )
    ), key=lambda c: c[1],reverse=True
)

In [51]:
evaluacion_rfe20_rf


[('perimetro_conectado_calle', 1, True),
 ('area_revestimiento', 1, True),
 ('area_sotano_habitable1', 1, True),
 ('area_sotano_inhabitable', 1, True),
 ('area_sotano_total', 1, True),
 ('n_aseos_sotanos', 1, True),
 ('ano_construccion_garaje', 1, True),
 ('n_coches_garaje', 1, True),
 ('area_garage', 1, True),
 ('barrio_Crawfor', 1, True),
 ('barrio_NoRidge', 1, True),
 ('tipo_garaje_Detchd', 1, True),
 ('tipo_casa_1Story', 1, True),
 ('tipo_casa_2Story', 1, True),
 ('calidad_cocinas', 1, True),
 ('calidad_chimeneas', 1, True),
 ('acabado_garaje', 1, True),
 ('calidad_material_exterior', 1, True),
 ('altura_sotano', 1, True),
 ('sotano_exterior', 1, True)]

In [53]:
resultados["reg_lineal_rfe20_rf"] = evaluar_modelo(LinearRegression(), ames_X_rfe20_rf, ames_y)
resultados["rf_rfe20_rf"] = evaluar_modelo(RandomForestRegressor(), ames_X_rfe20_rf, ames_y)
resultados["svr_rfe20_rf"] = evaluar_modelo(SVR(), ames_X_rfe20_rf, ames_y)

In [54]:
estimador_selector = LinearRegression()
selector_rfe20_lineal = RFE(estimador_selector, n_features_to_select=20)
ames_X_rfe20_lineal = selector_rfe20_lineal.fit_transform(ames_X, ames_y)

  y = column_or_1d(y, warn=True)


In [55]:
evaluacion_rfe20_lineal = sorted(
    filter(lambda c: c[2], 
        zip(
            ames_X.columns,
            selector_rfe20_lineal.ranking_,
            selector_rfe20_lineal.get_support()
        )
    ), key=lambda c: c[1],reverse=True
)
evaluacion_rfe20_lineal

[('perimetro_conectado_calle', 1, True),
 ('area_revestimiento', 1, True),
 ('area_sotano_habitable1', 1, True),
 ('area_sotano_habitable2', 1, True),
 ('area_sotano_inhabitable', 1, True),
 ('area_sotano_total', 1, True),
 ('n_aseos_sotanos', 1, True),
 ('n_medios_aseos_sotanos', 1, True),
 ('ano_construccion_garaje', 1, True),
 ('n_coches_garaje', 1, True),
 ('tipo_revestimiento_CBlock', 1, True),
 ('material_tejado_CompShg', 1, True),
 ('material_tejado_Membran', 1, True),
 ('material_tejado_Metal', 1, True),
 ('material_tejado_Roll', 1, True),
 ('material_tejado_Tar&Grv', 1, True),
 ('material_tejado_WdShake', 1, True),
 ('material_tejado_WdShngl', 1, True),
 ('barrio_GrnHill', 1, True),
 ('cercania_carretera2_PosA', 1, True)]

In [56]:
resultados["reg_lineal_rfe20_lineal"] = evaluar_modelo(LinearRegression(), ames_X_rfe20_lineal, ames_y)
resultados["rf_rfe20_lineal"] = evaluar_modelo(RandomForestRegressor(), ames_X_rfe20_lineal, ames_y)
resultados["svr_rfe20_lineal"] = evaluar_modelo(SVR(), ames_X_rfe20_lineal, ames_y)

In [57]:
ver_resultados()

Unnamed: 0,fit_time,score_time,test_score,train_score,fit_time_idx,score_time_idx,test_score_idx,train_score_idx
reg_lineal_sin_seleccion,0.074502,0.00389,35613.31,27932.67734,19.155904,3.537419,1.115189,2.383071
svr_sin_seleccion,3.261576,0.329124,81031.64,82149.04483,838.618142,299.277045,2.537411,7.008531
rf_sin_seleccion,7.725937,0.023538,31934.77,11721.293054,1986.496935,21.403568,1.0,1.0
reg_lineal_kbest_5,0.010871,0.001796,44581.99,44735.293652,2.795085,1.632702,1.396033,3.816583
rf_kbest_5,0.953828,0.019253,42611.74,15735.550041,245.248679,17.507111,1.334337,1.342476
svr_kbest_5,0.493588,0.028919,80919.33,82045.292068,126.911535,26.296557,2.533894,6.999679
reg_lineal_rfe20_rf,0.013066,0.002094,36084.32,35811.511319,3.359495,1.904349,1.129938,3.055253
rf_rfe20_rf,2.430299,0.017753,32276.96,11973.784534,624.879817,16.143303,1.010715,1.021541
svr_rfe20_rf,0.540956,0.034509,80909.77,82044.768637,139.090801,31.379092,2.533595,6.999635
reg_lineal_rfe20_lineal,0.003889,0.0011,718675400000000.0,49151.146028,1.0,1.0,22504480000.0,4.193321
