## Importando Librerías

In [9]:
from wikiframe import Say, Extractor #Extrae los csv files
import numpy as np  #Libreria para trabajar con arrays
import pandas as pd #Libreria para trabajar con dataframes

import warnings
warnings.simplefilter("ignore")

#Regresores
from sklearn.linear_model import ElasticNet 
from sklearn.linear_model import HuberRegressor 
from sklearn.ensemble import RandomForestRegressor


from sklearn.model_selection import RandomizedSearchCV,cross_val_score,KFold

from sklearn.pipeline import make_pipeline 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler,StandardScaler, OneHotEncoder, LabelEncoder,PowerTransformer, Normalizer
from sklearn.compose import ColumnTransformer
from mlxtend.regressor import StackingCVRegressor

## Variables Principales

Para el desarrollo del modelo , se tomó todas la columnas de la tabla de datos.

In [10]:
#Instanciar el objeto Extractor
extractor = Extractor('data')

#Extraer en df de ./data
df_dict = extractor.extract_from_csv()

#Crear dataframe con los datos
train = df_dict['house_train_raw'].drop(['Id'],axis=1)
test = df_dict['houses_test_raw'].drop(['Id'],axis=1)

In [11]:
#Separamos en train y test
X_train = train.drop(['SalePrice'], axis=1)
y_train = train['SalePrice']

In [12]:
#Sepramos las varaiables categoricas y las numericas
categorical_cols = [x for x in X_train if X_train[x].dtype == "object"]
numerical_cols = [x for x in X_train if X_train[x].dtype == "int64" or train[x].dtype == "float64"]

In [13]:
#Elegimos los mejores Regresores
huber_regressor = HuberRegressor()
elastic_net = ElasticNet()
randon_forest = RandomForestRegressor()

In [14]:
#Pipelines

#Pipeline para la variable categorica
categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), #Llena los NaN con el valor mas frecuentes
    ("oneHot", OneHotEncoder(handle_unknown="ignore")) #Codifica las variables categoricas
])

#Pipeline para la variable numerica
numerical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")), #Llena los NaN con la mediana de la columna
    ("scaler", PowerTransformer()) #Transforma los valores de la columna, se eligió PowerTransformer porque trata de hacer gausiana la distribucion de los valores
])

#Pipeline para las variables categoricas y numericas
preproces = ColumnTransformer(
    transformers=[
        ('cat', categorical, categorical_cols),
        ('num', numerical, numerical_cols)
])

In [15]:
#Pipeline para el modelo
model = StackingCVRegressor(
    regressors=[huber_regressor, elastic_net, randon_forest],
    meta_regressor=RandomForestRegressor()
)

#Pipeline para el modelo general
pipe = Pipeline(steps=[
    ('preproces', preproces),
    ('model', model)
])

In [16]:
#Establecemos los parametros del modelo para que sean optimizados
grid =RandomizedSearchCV(
    pipe, 
    param_distributions={
        'model__elasticnet__alpha':[0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007],
        'model__elasticnet__l1_ratio':[0.8, 0.85, 0.9, 0.95, 0.99, 1],
        'model__huberregressor__epsilon':np.linspace(1, 1.9, num=9),
        'model__huberregressor__max_iter':[100,200,300,400,500,600,700,800,900,1000],
        'model__randomforestregressor__n_estimators': [300,400,500,600],
        'model__randomforestregressor__max_features': ['sqrt', 'log2', None],
        'model__randomforestregressor__max_depth': [ 60, 70, 80, 90, 100,],
        'model__randomforestregressor__min_samples_split':  [2, 5, 10],
        'model__randomforestregressor__min_samples_leaf': [1, 2, 4],
    },
    cv=KFold(n_splits=5,shuffle=True), 
    n_jobs=-1, 
    verbose=1,
    scoring='neg_mean_squared_log_error'
    )

In [17]:
#Poewr Transformer
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
                   estimator=Pipeline(steps=[('preproces',
                                              ColumnTransformer(transformers=[('cat',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='most_frequent')),
                                                                                               ('oneHot',
                                                                                                OneHotEncoder(handle_unknown='ignore'))]),
                                                                               ['MSZoning',
                                                                                'Street',
                                                                                'Alley',
                    

In [18]:
print('RMLS: ',np.abs(grid.best_score_))

RMLS:  0.01832928691620198


## Resultados

Se obtuvo una métrica del 0.0175 gracias a elegir correctamente los hyperparámetros del Pipeline. 

Se procederá a predecir el dataset de Test

In [19]:
y_pred = grid.predict(test)
test['SalePrice'] = y_pred
test.to_csv('./out/predictions.csv',index=False)