In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [6]:
# Load the training and held-out datasets from CSV files in the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# The original name `text_data` looks like a typo for `test_data` (the frame is
# read from 'test.csv'). Keep it as an alias so any cell that still refers to
# the old name keeps working.
text_data = test_data

# Show the first rows of the training dataset
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
# Display the column labels of the training DataFrame.
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [11]:
# Columns used as predictors by the model.
features = [
    "BedroomAbvGr", "FullBath",
    "LotArea",
    "YearBuilt", "Neighborhood", "YearRemodAdd",
    "PavedDrive",
]

# Keep only the rows that have a value for every predictor and for the target.
train_data = train_data.dropna(subset=[*features, "SalePrice"])

In [12]:
# --- Build the design matrix and the target ---------------------------------
# pd.get_dummies one-hot encodes the non-numeric predictors; in this feature
# set that yields the Neighborhood_* and PavedDrive_* indicator columns.
X = pd.get_dummies(train_data[features])
y = train_data['SalePrice']

# --- Hold out 20% of the rows for evaluation (fixed seed for repeatability) --
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Baseline: Random Forest with default hyperparameters --------------------
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out rows using mean absolute error (MAE).
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# --- Hyperparameter search ---------------------------------------------------
# Candidate values explored by the grid search (3 * 4 * 3 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
}

# 3-fold cross-validated search on the training split only; the scorer is the
# negated MAE, so a higher score means a lower error.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Take the estimator GridSearchCV exposes for the best combination (with the
# default refit behaviour it has already been fitted, so no extra fit call).
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the same held-out rows as the baseline.
y_pred_best = best_rf_model.predict(X_test)
best_mae = mean_absolute_error(y_test, y_pred_best)
print(f"Best Mean Absolute Error: {best_mae}")

# --- Feature importances of the tuned model, largest first -------------------
importances = best_rf_model.feature_importances_
feature_importance = pd.Series(
    importances,
    index=X.columns,
).sort_values(ascending=False)
print("Feature Importances:")
print(feature_importance)

Mean Absolute Error: 27406.291725619696
Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 300}
Best Mean Absolute Error: 27473.57787439544
Feature Importances:
YearBuilt               0.393680
LotArea                 0.265073
FullBath                0.074580
YearRemodAdd            0.062613
Neighborhood_NridgHt    0.047739
BedroomAbvGr            0.040487
Neighborhood_NoRidge    0.026612
Neighborhood_StoneBr    0.023844
Neighborhood_Crawfor    0.017414
Neighborhood_Edwards    0.007421
Neighborhood_NWAmes     0.005318
Neighborhood_Gilbert    0.004822
Neighborhood_Somerst    0.004470
Neighborhood_Veenker    0.002998
Neighborhood_Mitchel    0.002852
Neighborhood_OldTown    0.002730
PavedDrive_N            0.002718
Neighborhood_Sawyer     0.002044
Neighborhood_NAmes      0.001926
Neighborhood_ClearCr    0.001812
PavedDrive_Y            0.001737
Neighborhood_CollgCr    0.001629
Neighborhood_Timber     0.001448
Neighborhood_Blmngtn    0.001108
Neighborhood_BrkSide   