## Importando Librerías

In [20]:
from wikiframe import Say, Extractor #Extrae los csv files
import numpy as np  #Libreria para trabajar con arrays
import pandas as pd #Libreria para trabajar con dataframes

import warnings
warnings.simplefilter("ignore")

#Regresores
from sklearn.linear_model import ElasticNet 
from sklearn.linear_model import HuberRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import RandomizedSearchCV,cross_val_score,KFold, GridSearchCV

from sklearn.pipeline import make_pipeline 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler,StandardScaler, OneHotEncoder, LabelEncoder,PowerTransformer, Normalizer
from sklearn.compose import ColumnTransformer
from mlxtend.regressor import StackingCVRegressor

In [21]:
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_log_error,  make_scorer
scoring=make_scorer(mean_squared_log_error, greater_is_better=False, squared=False)

## Variables Principales

Para el desarrollo del modelo , se tomó todas la columnas de la tabla de datos.

In [22]:
#Instanciar el objeto Extractor
extractor = Extractor('data')

#Extraer en df de ./data
df_dict = extractor.extract_from_csv()

#Crear dataframe con los datos
train = df_dict['house_train_raw'].drop(['Id'],axis=1)
test = df_dict['houses_test_raw'].drop(['Id'],axis=1)

In [23]:
#Separamos en train y test
X_train = train.drop(['SalePrice'], axis=1)
y_train = train['SalePrice']

In [24]:
#Sepramos las varaiables categoricas y las numericas
categorical_cols = [x for x in X_train if X_train[x].dtype == "object"]
numerical_cols = [x for x in X_train if X_train[x].dtype == "int64" or train[x].dtype == "float64"]

In [25]:
#Elegimos los mejores Regresores
huber_regressor = HuberRegressor()
elastic_net = ElasticNet()
randon_forest = RandomForestRegressor()
ada_boost = AdaBoostRegressor()
gradient_boost = GradientBoostingRegressor()
xg_boost = XGBRegressor()

In [26]:
#Pipelines

#Pipeline para la variable categorica
categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), #Llena los NaN con el valor mas frecuentes
    ("oneHot", OneHotEncoder(handle_unknown="ignore")) #Codifica las variables categoricas
])

#Pipeline para la variable numerica
numerical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")), #Llena los NaN con la mediana de la columna
    ("scaler", PowerTransformer()) #Transforma los valores de la columna, se eligió PowerTransformer porque trata de hacer gausiana la distribucion de los valores
])

#Pipeline para las variables categoricas y numericas
preproces = ColumnTransformer(
    transformers=[
        ('cat', categorical, categorical_cols),
        ('num', numerical, numerical_cols)
])

In [28]:
#Pipeline para el modelo
model = StackingCVRegressor(
    regressors=[huber_regressor, elastic_net, randon_forest,ada_boost,xg_boost,gradient_boost],
    meta_regressor=XGBRegressor()
)

#Pipeline para el modelo general
pipe = Pipeline(steps=[
    ('preproces', preproces),
    ('model', model)
])

In [36]:
#Establecemos los parametros del modelo para que sean optimizados
grid =RandomizedSearchCV(
    pipe, 
    param_distributions={
        'model__elastcnet__alpha':[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007],
        'model__elasticnet__l1_ratio':[0.8, 0.85, 0.9, 0.95, 0.99, 1],
        'model__huberregressor__epsilon':np.linspace(1, 1.9, num=9),
        'model__huberregressor__max_iter':[100,200,300,400,500,600,700,800,900,1000],
        'model__randomforestregressor__n_estimators': [300,400,500,600],
        'model__randomforestregressor__max_features': ['sqrt', 'log2', None],
        'model__randomforestregressor__max_depth': [ 60, 70, 80, 90, 100,],
        'model__randomforestregressor__min_samples_split':  [2, 5, 10],
        'model__randomforestregressor__min_samples_leaf': [1, 2, 4],
    },
    cv=KFold(n_splits=5,shuffle=True), 
    n_jobs=-1, 
    verbose=1,
    scoring=scoring,
    n_iter=10
    )

In [35]:
#Poewr Transformer
grid.fit(X_train, y_train)
print('RMLS: ',np.abs(grid.best_score_))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: Invalid parameter elasticnt for estimator StackingCVRegressor(meta_regressor=XGBRegressor(base_score=None, booster=None,
                                                callbacks=None,
                                                colsample_bylevel=None,
                                                colsample_bynode=None,
                                                colsample_bytree=None,
                                                early_stopping_rounds=None,
                                                enable_categorical=False,
                                                eval_metric=None, gamma=None,
                                                gpu_id=None, grow_policy=None,
                                                importance_type=None,
                                                interaction_constraints=None,
                                                learning_rate=None,
                                                max_bin=None,...
                                             importance_type=None,
                                             interaction_constraints=None,
                                             learning_rate=None, max_bin=None,
                                             max_cat_to_onehot=None,
                                             max_delta_step=None,
                                             max_depth=None, max_leaves=None,
                                             min_child_weight=None, missing=nan,
                                             monotone_constraints=None,
                                             n_estimators=100, n_jobs=None,
                                             num_parallel_tree=None,
                                             predictor=None, random_state=None,
                                             reg_alpha=None, reg_lambda=None, ...),
                                GradientBoostingRegressor()]). Check the list of available parameters with `estimator.get_params().keys()`.

## Resultados

Se obtuvo una métrica del 0.0165 gracias a elegir correctamente los hyperparámetros del Pipeline. 

Se procederá a predecir el dataset de Test

In [19]:
y_pred = grid.predict(test)
test['SalePrice'] = y_pred
test.to_csv('./out/predictions.csv',index=False)