In [96]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [229]:
# Load the train and test splits from the local ./data folder.
df_train = pd.read_csv("./data/train.csv")
X_test = pd.read_csv("./data/test.csv")
# Show the test frame as the cell output (pandas abbreviates the rendering).
X_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [230]:
# Summarize the raw training frame: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [231]:
# Summarize the raw test frame: column names, non-null counts and dtypes.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [172]:
# Share of missing values per column of the training frame, in percent.
nan_percentage = df_train.isna().mean() * 100

# Tabulate it as (Column, Missing_Percentage) rows, highest share first.
nan_summary = (
    nan_percentage
    .rename_axis('Column')
    .reset_index(name='Missing_Percentage')
    .sort_values(by='Missing_Percentage', ascending=False)
)

# The five columns with the largest share of missing values are the ones
# selected for removal; display the list as the cell output.
c_to_delete = nan_summary['Column'].head(5).tolist()
c_to_delete

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType']

In [232]:
# Remove the mostly-empty columns listed in c_to_delete from both splits.
df_train = df_train.drop(c_to_delete, axis="columns")
X_test = X_test.drop(c_to_delete, axis="columns")

In [233]:
# Re-inspect the training frame after dropping the c_to_delete columns.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [234]:
# Re-inspect the test frame after dropping the c_to_delete columns.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuil

In [235]:
# Fireplace quality as an integer rank; 'None' is the placeholder written
# below for rows where FireplaceQu is missing.
fireplace_mapping = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical / average
    'Fa': 2,  # Fair
    'Po': 1,  # Poor
    'None': 0  # No fireplace
}

# Apply the same two steps to both splits: replace missing values with the
# 'None' label, then derive the numeric column FireplaceQu_num from the labels.
for frame in (df_train, X_test):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('None')
    frame['FireplaceQu_num'] = frame['FireplaceQu'].map(fireplace_mapping)


In [236]:
# Re-inspect the training frame after adding FireplaceQu_num.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               1460 non-null   int64  
 1   MSSubClass       1460 non-null   int64  
 2   MSZoning         1460 non-null   object 
 3   LotFrontage      1201 non-null   float64
 4   LotArea          1460 non-null   int64  
 5   Street           1460 non-null   object 
 6   LotShape         1460 non-null   object 
 7   LandContour      1460 non-null   object 
 8   Utilities        1460 non-null   object 
 9   LotConfig        1460 non-null   object 
 10  LandSlope        1460 non-null   object 
 11  Neighborhood     1460 non-null   object 
 12  Condition1       1460 non-null   object 
 13  Condition2       1460 non-null   object 
 14  BldgType         1460 non-null   object 
 15  HouseStyle       1460 non-null   object 
 16  OverallQual      1460 non-null   int64  
 17  OverallCond   

In [237]:
# Re-inspect the test frame after adding FireplaceQu_num.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 76 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               1459 non-null   int64  
 1   MSSubClass       1459 non-null   int64  
 2   MSZoning         1455 non-null   object 
 3   LotFrontage      1232 non-null   float64
 4   LotArea          1459 non-null   int64  
 5   Street           1459 non-null   object 
 6   LotShape         1459 non-null   object 
 7   LandContour      1459 non-null   object 
 8   Utilities        1457 non-null   object 
 9   LotConfig        1459 non-null   object 
 10  LandSlope        1459 non-null   object 
 11  Neighborhood     1459 non-null   object 
 12  Condition1       1459 non-null   object 
 13  Condition2       1459 non-null   object 
 14  BldgType         1459 non-null   object 
 15  HouseStyle       1459 non-null   object 
 16  OverallQual      1459 non-null   int64  
 17  OverallCond   

In [176]:
# Text-valued (object dtype) columns are treated as the categorical features.
categorical_columns = df_train.select_dtypes(include=['object']).columns

# Correlate each categorical column with SalePrice using temporary integer
# category codes, so df_train itself is left unmodified.
# NOTE(review): the codes follow pandas' category ordering rather than any
# quality ranking, so treat these numbers as a rough screen only.
correlations = {
    column: df_train[column].astype('category').cat.codes.corr(df_train['SalePrice'])
    for column in categorical_columns
}

# Rank the columns by absolute correlation, strongest first.
correlations_sorted = sorted(
    correlations.items(),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

# Report the ranking.
print("Potencial de las variables categóricas:")
for col, corr in correlations_sorted:
    print(f"{col}: Correlation with SalePrice = {corr}")


Potencial de las variables categóricas:
ExterQual: Correlation with SalePrice = -0.6368836943991136
KitchenQual: Correlation with SalePrice = -0.5891887782994203
BsmtQual: Correlation with SalePrice = -0.4388809626489313
HeatingQC: Correlation with SalePrice = -0.4001775431629006
Foundation: Correlation with SalePrice = 0.3824789911901015
GarageFinish: Correlation with SalePrice = -0.29248334862954245
GarageCond: Correlation with SalePrice = 0.2757814575618802
GarageQual: Correlation with SalePrice = 0.2613470178693258
LotShape: Correlation with SalePrice = -0.2555798704871221
CentralAir: Correlation with SalePrice = 0.25132816384015444
Electrical: Correlation with SalePrice = 0.23391943085163172
PavedDrive: Correlation with SalePrice = 0.23135695225722633
GarageType: Correlation with SalePrice = -0.22381853020674997
RoofStyle: Correlation with SalePrice = 0.2224052924071381
SaleCondition: Correlation with SalePrice = 0.2130920296778053
Neighborhood: Correlation with SalePrice = 0.2108

In [177]:
# Categorical columns retained for modelling, in the order they appear in the
# correlation ranking printed by the screening cell.
columns_to_keep = [
    'ExterQual', 'KitchenQual', 'BsmtQual', 'HeatingQC',
    'Foundation', 'GarageFinish', 'GarageCond', 'GarageQual',
    'LotShape', 'CentralAir', 'Electrical', 'PavedDrive',
    'GarageType', 'RoofStyle', 'SaleCondition', 'Neighborhood',
]


In [238]:
# Categorical columns discarded from both splits.
columns_to_drop = [
    'BsmtExposure', 'HouseStyle', 'MSZoning', 'BsmtCond', 'RoofMatl',
    'BsmtFinType2', 'ExterCond', 'Functional', 'Exterior2nd', 'Exterior1st',
    'Heating', 'FireplaceQu', 'Condition1', 'BldgType', 'LotConfig',
    'SaleType', 'LandSlope', 'Street', 'LandContour', 'Utilities',
    'BsmtFinType1', 'Condition2',
]

# Drop them from train and test alike, then display the reduced training frame.
df_train = df_train.drop(columns_to_drop, axis="columns")
X_test = X_test.drop(columns_to_drop, axis="columns")
df_train

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,LotShape,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition,SalePrice,FireplaceQu_num
0,1,60,65.0,8450,Reg,CollgCr,7,5,2003,2003,...,0,0,0,0,0,2,2008,Normal,208500,0
1,2,20,80.0,9600,Reg,Veenker,6,8,1976,1976,...,0,0,0,0,0,5,2007,Normal,181500,3
2,3,60,68.0,11250,IR1,CollgCr,7,5,2001,2002,...,0,0,0,0,0,9,2008,Normal,223500,3
3,4,70,60.0,9550,IR1,Crawfor,7,5,1915,1970,...,272,0,0,0,0,2,2006,Abnorml,140000,4
4,5,60,84.0,14260,IR1,NoRidge,8,5,2000,2000,...,0,0,0,0,0,12,2008,Normal,250000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,Reg,Gilbert,6,5,1999,2000,...,0,0,0,0,0,8,2007,Normal,175000,3
1456,1457,20,85.0,13175,Reg,NWAmes,6,6,1978,1988,...,0,0,0,0,0,2,2010,Normal,210000,3
1457,1458,70,66.0,9042,Reg,Crawfor,7,9,1941,2006,...,0,0,0,0,2500,5,2010,Normal,266500,4
1458,1459,20,68.0,9717,Reg,NAmes,5,6,1950,1996,...,112,0,0,0,0,4,2010,Normal,142125,0


In [239]:
# Re-inspect the test frame after dropping the columns_to_drop columns.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 54 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               1459 non-null   int64  
 1   MSSubClass       1459 non-null   int64  
 2   LotFrontage      1232 non-null   float64
 3   LotArea          1459 non-null   int64  
 4   LotShape         1459 non-null   object 
 5   Neighborhood     1459 non-null   object 
 6   OverallQual      1459 non-null   int64  
 7   OverallCond      1459 non-null   int64  
 8   YearBuilt        1459 non-null   int64  
 9   YearRemodAdd     1459 non-null   int64  
 10  RoofStyle        1459 non-null   object 
 11  MasVnrArea       1444 non-null   float64
 12  ExterQual        1459 non-null   object 
 13  Foundation       1459 non-null   object 
 14  BsmtQual         1415 non-null   object 
 15  BsmtFinSF1       1458 non-null   float64
 16  BsmtFinSF2       1458 non-null   float64
 17  BsmtUnfSF     

In [240]:
# Re-inspect the training frame after dropping the columns_to_drop columns.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 55 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               1460 non-null   int64  
 1   MSSubClass       1460 non-null   int64  
 2   LotFrontage      1201 non-null   float64
 3   LotArea          1460 non-null   int64  
 4   LotShape         1460 non-null   object 
 5   Neighborhood     1460 non-null   object 
 6   OverallQual      1460 non-null   int64  
 7   OverallCond      1460 non-null   int64  
 8   YearBuilt        1460 non-null   int64  
 9   YearRemodAdd     1460 non-null   int64  
 10  RoofStyle        1460 non-null   object 
 11  MasVnrArea       1452 non-null   float64
 12  ExterQual        1460 non-null   object 
 13  Foundation       1460 non-null   object 
 14  BsmtQual         1423 non-null   object 
 15  BsmtFinSF1       1460 non-null   int64  
 16  BsmtFinSF2       1460 non-null   int64  
 17  BsmtUnfSF     

In [143]:
# List the object-dtype (text) columns still present in the training frame.
df_train.select_dtypes(include=['object']).columns


Index(['LotShape', 'Neighborhood', 'RoofStyle', 'ExterQual', 'Foundation',
       'BsmtQual', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleCondition'],
      dtype='object')

In [243]:
# Ordinal encoding: ordered category labels become integer ranks.
# 'None' (rank 0) is the placeholder substituted for missing values below, so
# only the mappings that list it can absorb a missing entry; in the other
# columns a missing value remains NaN after the map.
ordinal_mapping = {
    'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
    'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'LotShape': {'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4}
}

for col, mapping in ordinal_mapping.items():
    if col in df_train.columns:
        df_train[col] = df_train[col].fillna('None').map(mapping)

# Binary encoding.
# A plain Y/N mapping turns any other label into NaN. PavedDrive has a third
# level, 'P' (partial pavement, per the Ames data dictionary), which the
# notebook's imputation step ends up treating as paved (1). Encoding it here
# makes that choice explicit and keeps the column free of artificial NaN.
binary_columns = ['CentralAir', 'PavedDrive']
binary_mapping = {'Y': 1, 'P': 1, 'N': 0}

for col in binary_columns:
    if col in df_train.columns:
        df_train[col] = df_train[col].map(binary_mapping)

# One-hot encoding for the unordered (nominal) columns; drop_first removes the
# first level of each column to avoid a redundant indicator.
nominal_columns = ['Foundation', 'GarageType', 'RoofStyle', 'SaleCondition', 'Neighborhood', 'Electrical']
df_train = pd.get_dummies(df_train, columns=nominal_columns, drop_first=True)

# Check the final result.
print("Columnas transformadas y dummies creados:")
print(df_train.head())


Columnas transformadas y dummies creados:
   Id  MSSubClass  LotFrontage  LotArea  LotShape  OverallQual  OverallCond  \
0   1          60         65.0     8450         1            7            5   
1   2          20         80.0     9600         1            6            8   
2   3          60         68.0    11250         2            7            5   
3   4          70         60.0     9550         2            7            5   
4   5          60         84.0    14260         2            8            5   

   YearBuilt  YearRemodAdd  MasVnrArea  ...  Neighborhood_Sawyer  \
0       2003          2003       196.0  ...                False   
1       1976          1976         0.0  ...                False   
2       2001          2002       162.0  ...                False   
3       1915          1970         0.0  ...                False   
4       2000          2000       350.0  ...                False   

   Neighborhood_SawyerW  Neighborhood_Somerst  Neighborhood_StoneBr  \
0  

In [241]:
# Ordinal encoding: ordered category labels become integer ranks.
# 'None' (rank 0) is the placeholder substituted for missing values below, so
# only the mappings that list it can absorb a missing entry; in the other
# columns a missing value remains NaN after the map.
ordinal_mapping = {
    'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'HeatingQC': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'GarageFinish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0},
    'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
    'LotShape': {'Reg': 1, 'IR1': 2, 'IR2': 3, 'IR3': 4}
}

for col, mapping in ordinal_mapping.items():
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna('None').map(mapping)

# Binary encoding.
# A plain Y/N mapping turns any other label into NaN. PavedDrive has a third
# level, 'P' (partial pavement, per the Ames data dictionary), which the
# notebook's imputation step ends up treating as paved (1). Encoding it here
# makes that choice explicit and keeps the column free of artificial NaN.
binary_columns = ['CentralAir', 'PavedDrive']
binary_mapping = {'Y': 1, 'P': 1, 'N': 0}

for col in binary_columns:
    if col in X_test.columns:
        X_test[col] = X_test[col].map(binary_mapping)

# One-hot encoding for the unordered (nominal) columns; drop_first removes the
# first level of each column to avoid a redundant indicator.
# NOTE(review): dummies are derived from the levels present in X_test only, so
# the resulting columns can differ from those of the training frame.
nominal_columns = ['Foundation', 'GarageType', 'RoofStyle', 'SaleCondition', 'Neighborhood', 'Electrical']
X_test = pd.get_dummies(X_test, columns=nominal_columns, drop_first=True)

# Check the final result.
print("Columnas transformadas y dummies creados:")
print(X_test.head())


Columnas transformadas y dummies creados:
     Id  MSSubClass  LotFrontage  LotArea  LotShape  OverallQual  OverallCond  \
0  1461          20         80.0    11622         1            5            6   
1  1462          20         81.0    14267         2            6            6   
2  1463          60         74.0    13830         2            5            5   
3  1464          60         78.0     9978         2            6            6   
4  1465         120         43.0     5005         2            8            5   

   YearBuilt  YearRemodAdd  MasVnrArea  ...  Neighborhood_SWISU  \
0       1961          1961         0.0  ...               False   
1       1958          1958       108.0  ...               False   
2       1997          1998         0.0  ...               False   
3       1998          1998        20.0  ...               False   
4       1992          1992         0.0  ...               False   

   Neighborhood_Sawyer  Neighborhood_SawyerW  Neighborhood_Somerst  

In [244]:
# Re-inspect the training frame after the categorical encoding step.
df_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 97 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     1460 non-null   int64  
 1   MSSubClass             1460 non-null   int64  
 2   LotFrontage            1201 non-null   float64
 3   LotArea                1460 non-null   int64  
 4   LotShape               1460 non-null   int64  
 5   OverallQual            1460 non-null   int64  
 6   OverallCond            1460 non-null   int64  
 7   YearBuilt              1460 non-null   int64  
 8   YearRemodAdd           1460 non-null   int64  
 9   MasVnrArea             1452 non-null   float64
 10  ExterQual              1460 non-null   int64  
 11  BsmtQual               1460 non-null   int64  
 12  BsmtFinSF1             1460 non-null   int64  
 13  BsmtFinSF2             1460 non-null   int64  
 14  BsmtUnfSF              1460 non-null   int64  
 15  Tota

In [245]:
# Re-inspect the test frame after the categorical encoding step.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 95 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     1459 non-null   int64  
 1   MSSubClass             1459 non-null   int64  
 2   LotFrontage            1232 non-null   float64
 3   LotArea                1459 non-null   int64  
 4   LotShape               1459 non-null   int64  
 5   OverallQual            1459 non-null   int64  
 6   OverallCond            1459 non-null   int64  
 7   YearBuilt              1459 non-null   int64  
 8   YearRemodAdd           1459 non-null   int64  
 9   MasVnrArea             1444 non-null   float64
 10  ExterQual              1459 non-null   int64  
 11  BsmtQual               1459 non-null   int64  
 12  BsmtFinSF1             1458 non-null   float64
 13  BsmtFinSF2             1458 non-null   float64
 14  BsmtUnfSF              1458 non-null   float64
 15  Tota

In [246]:
# Impute missing values.
# The fill statistics are learned from the training frame only and then applied
# to both splits, so test rows are preprocessed exactly like training rows
# instead of with statistics computed from the test set itself.
lot_frontage_median = df_train['LotFrontage'].median()  # median for robustness to outliers
garage_year_median = df_train['GarageYrBlt'].median()   # median garage construction year

for frame in (df_train, X_test):
    frame['LotFrontage'] = frame['LotFrontage'].fillna(lot_frontage_median)
    # A missing veneer area is taken to mean there is no masonry veneer.
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)
    frame['GarageYrBlt'] = frame['GarageYrBlt'].fillna(garage_year_median)
    # Any value still missing in PavedDrive is assumed to mean paved (1).
    frame['PavedDrive'] = frame['PavedDrive'].fillna(1)


In [249]:
# Columns where a missing value is read as "feature absent": fill with 0.
columns_to_impute_zero = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea'
]
X_test[columns_to_impute_zero] = X_test[columns_to_impute_zero].fillna(0)

# KitchenQual: fill with the most frequent value found in the test frame.
kitchen_mode = X_test['KitchenQual'].mode()[0]
X_test['KitchenQual'] = X_test['KitchenQual'].fillna(kitchen_mode)

# Every column that still has missing values: fill with its own median.
columns_with_nulls = X_test.columns[X_test.isnull().any()]
for col in columns_with_nulls:
    col_median = X_test[col].median()
    X_test[col] = X_test[col].fillna(col_median)

# Report how many missing values remain per column.
print("Valores nulos restantes por columna:")
print(X_test.isnull().sum())


Valores nulos restantes por columna:
Id                      0
MSSubClass              0
LotFrontage             0
LotArea                 0
LotShape                0
                       ..
Neighborhood_Timber     0
Neighborhood_Veenker    0
Electrical_FuseF        0
Electrical_FuseP        0
Electrical_SBrkr        0
Length: 95, dtype: int64


In [250]:
# Re-inspect the training frame after the imputation step.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 97 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     1460 non-null   int64  
 1   MSSubClass             1460 non-null   int64  
 2   LotFrontage            1460 non-null   float64
 3   LotArea                1460 non-null   int64  
 4   LotShape               1460 non-null   int64  
 5   OverallQual            1460 non-null   int64  
 6   OverallCond            1460 non-null   int64  
 7   YearBuilt              1460 non-null   int64  
 8   YearRemodAdd           1460 non-null   int64  
 9   MasVnrArea             1460 non-null   float64
 10  ExterQual              1460 non-null   int64  
 11  BsmtQual               1460 non-null   int64  
 12  BsmtFinSF1             1460 non-null   int64  
 13  BsmtFinSF2             1460 non-null   int64  
 14  BsmtUnfSF              1460 non-null   int64  
 15  Tota

In [254]:
# Re-inspect the test frame after the imputation step.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 96 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     1459 non-null   int64  
 1   MSSubClass             1459 non-null   int64  
 2   LotFrontage            1459 non-null   float64
 3   LotArea                1459 non-null   int64  
 4   LotShape               1459 non-null   int64  
 5   OverallQual            1459 non-null   int64  
 6   OverallCond            1459 non-null   int64  
 7   YearBuilt              1459 non-null   int64  
 8   YearRemodAdd           1459 non-null   int64  
 9   MasVnrArea             1459 non-null   float64
 10  ExterQual              1459 non-null   int64  
 11  BsmtQual               1459 non-null   int64  
 12  BsmtFinSF1             1459 non-null   float64
 13  BsmtFinSF2             1459 non-null   float64
 14  BsmtUnfSF              1459 non-null   float64
 15  Tota

In [252]:
# Display the Electrical_Mix dummy column of the training frame.
df_train["Electrical_Mix"]

0       False
1       False
2       False
3       False
4       False
        ...  
1455    False
1456    False
1457    False
1458    False
1459    False
Name: Electrical_Mix, Length: 1460, dtype: bool

In [253]:
# Final adjustment: one-hot encoding was applied to train and test separately,
# so a category level that occurs in only one split yields a dummy column the
# other split lacks (e.g. Electrical_Mix). Add every training feature column
# that X_test is missing as an all-False indicator, rather than hard-coding a
# single column name.
# Only dummy columns are expected to be missing here; every other feature comes
# from columns that both CSV files share.
expected_features = df_train.columns.drop('SalePrice', errors='ignore')
for missing_col in expected_features.difference(X_test.columns):
    X_test[missing_col] = False

In [152]:
# Preview the first ten rows of the fully preprocessed training frame.
df_train.head(10)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr
0,1,60,65.0,8450,1,7,5,2003,2003,196.0,...,False,False,False,False,False,False,False,False,False,True
1,2,20,80.0,9600,1,6,8,1976,1976,0.0,...,False,False,False,False,False,True,False,False,False,True
2,3,60,68.0,11250,2,7,5,2001,2002,162.0,...,False,False,False,False,False,False,False,False,False,True
3,4,70,60.0,9550,2,7,5,1915,1970,0.0,...,False,False,False,False,False,False,False,False,False,True
4,5,60,84.0,14260,2,8,5,2000,2000,350.0,...,False,False,False,False,False,False,False,False,False,True
5,6,50,85.0,14115,2,5,5,1993,1995,0.0,...,False,False,False,False,False,False,False,False,False,True
6,7,20,75.0,10084,1,8,5,2004,2005,186.0,...,False,False,True,False,False,False,False,False,False,True
7,8,60,69.0,10382,2,7,6,1973,1973,240.0,...,False,False,False,False,False,False,False,False,False,True
8,9,50,51.0,6120,1,7,5,1931,1950,0.0,...,False,False,False,False,False,False,True,False,False,False
9,10,190,50.0,7420,1,5,6,1939,1950,0.0,...,False,False,False,False,False,False,False,False,False,True


In [150]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Target is SalePrice; the features are every other column of df_train.
# NOTE(review): the row identifier 'Id' is still among the features here.
y = df_train['SalePrice']
X = df_train.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: a random forest with default hyperparameters.
model_base = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the baseline on the hold-out rows with RMSE.
y_pred_base = model_base.predict(X_valid)
rmse_base = np.sqrt(mean_squared_error(y_valid, y_pred_base))
print(f"RMSE del modelo base: {rmse_base}")


RMSE del modelo base: 28097.434815318833


In [151]:
# Hyperparameter values to explore (3 * 4 * 3 * 3 = 108 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 3-fold cross-validation, scored by (negated) RMSE and
# run on all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

# Score the refitted best estimator on the hold-out rows.
# NOTE(review): in the recorded run this RMSE is slightly worse than the
# baseline's, yet best_model is the estimator used for the final predictions.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_valid)
rmse_best = np.sqrt(mean_squared_error(y_valid, y_pred_best))
print(f"RMSE después de Grid Search: {rmse_best}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Mejores hiperparámetros encontrados:
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
RMSE después de Grid Search: 28469.21206834142


In [None]:
# Display the estimator selected by the grid search.
best_model

In [256]:
# Put the X_test columns in the same order as the training features, since the
# model is given the test frame with the layout it was fitted on.
X_test = X_test.loc[:, X_train.columns]

# Confirm both frames now list their columns identically.
same_order = X_test.columns.tolist() == X_train.columns.tolist()
print("Orden de columnas de X_test coincide con X_train:", same_order)


Orden de columnas de X_test coincide con X_train: True


In [258]:
# Predict sale prices for the test features with the tuned model, then display
# the resulting array as the cell output.
y_pred_test = best_model.predict(X_test)
y_pred_test

array([131298.95454545, 153298.25      , 178319.43666667, ...,
       160624.        , 114733.5       , 219419.15151515])

In [259]:
# Print the test predictions (NumPy abbreviates long arrays).
print(y_pred_test)

[131298.95454545 153298.25       178319.43666667 ... 160624.
 114733.5        219419.15151515]


In [260]:
# Predict sale prices for the test features.
y_pred_test = best_model.predict(X_test)

# Two-column submission table: the test 'Id' values next to their predicted
# SalePrice (X_test must still contain 'Id').
submission = X_test[['Id']].assign(SalePrice=y_pred_test)

# Write it to CSV without the DataFrame index.
submission.to_csv('submission.csv', index=False)

print("Archivo 'submission.csv' generado exitosamente.")


Archivo 'submission.csv' generado exitosamente.
