In [1]:
import pandas as pd
# Name of the column to predict; every other column is used as a feature.
target_name = "SalePrice"

# Read the CSV, treating "?" cells as missing values.
ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")

# Split the frame into the prediction target and the feature columns.
target = ames_housing[target_name]
data = ames_housing.drop(columns=[target_name])

In [2]:
# Hand-picked columns used as the numerical feature set for the first models.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

# Keep only those columns, in the order listed above.
data_numerical = data.loc[:, numerical_features]

## Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Baseline model: mean-impute missing values, standardize the features, then
# fit an ordinary least-squares linear regression.
# NOTE(review): despite its name, `logreg` holds a LinearRegression pipeline,
# not a logistic regression; the name is kept in case other cells refer to it.
logreg = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
    LinearRegression(),
)

# 10-fold cross-validation; the cell displays the mean of the per-fold test
# scores (the estimator's default score, since no `scoring` is passed).
cv_logreg = cross_validate(
    estimator=logreg,
    X=data_numerical,
    y=target,
    cv=10,
)
cv_logreg["test_score"].mean()

0.7178828308359911

## Decision Tree Regressor

In [9]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree on the numerical features: mean-impute missing values,
# standardize, then fit the tree.
# random_state is fixed so repeated runs give the same cross-validation score
# (the tree builder permutes features at each split, so ties between equally
# good splits can otherwise be resolved differently from run to run). This
# also matches the seeded tree used later in this notebook.
tree = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
    DecisionTreeRegressor(random_state=0),
)

# 10-fold cross-validation; the cell displays the mean per-fold test score.
cv_tree = cross_validate(tree, data_numerical, target, cv=10)
cv_tree["test_score"].mean()

0.6126201846714042

## Grid Search

In [18]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate tree depths 1..15 as integers. `max_depth` must be an int (or
# None); float values such as those produced by np.linspace are rejected by
# scikit-learn's parameter validation in recent releases.
param_tree = [{"decisiontreeregressor__max_depth": np.arange(1, 16)}]

# Nested cross-validation: the inner 10-fold grid search selects max_depth,
# and the outer 10-fold cross_validate scores the whole search procedure.
# return_estimator=True keeps the fitted search object of each outer fold.
search = GridSearchCV(tree, param_tree, cv=10)
cv_results_tree_optimal_depth = cross_validate(
    search, data_numerical, target, cv=10, return_estimator=True, n_jobs=2,
)
cv_results_tree_optimal_depth["test_score"].mean()

0.6928343107842425

## Using the complete dataset

In [23]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Non-object columns: fill missing values with SimpleImputer's default strategy.
numerical_processor = SimpleImputer()

# Object-dtype columns: replace missing values with the literal "missing",
# then map each category to an integer code; categories not seen during fit
# are encoded as -1 at transform time.
categorical_processor = make_pipeline(
    SimpleImputer(fill_value="missing", strategy="constant"),
    OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
)

# Route columns by dtype: categorical branch first, then the numerical one.
select_categorical = selector(dtype_include=object)
select_numerical = selector(dtype_exclude=object)
preprocessor = make_column_transformer(
    (categorical_processor, select_categorical),
    (numerical_processor, select_numerical),
)

# Seeded decision tree trained on every column of `data`.
tree = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(random_state=0),
)

# 10-fold cross-validation on the full feature set, keeping the fitted
# pipeline of each fold; the cell displays the mean per-fold test score.
cv_results = cross_validate(
    estimator=tree,
    X=data,
    y=target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
cv_results["test_score"].mean()

0.7422751120581628

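This model gives the best results: in the recorded run, the decision tree trained on the complete dataset (numerical and categorical columns) reaches a mean test score of about 0.74, ahead of the linear regression (about 0.72), the depth-tuned tree (about 0.69) and the untuned tree (about 0.61), all three of which use only the numerical features.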