In [17]:
# pip install lightgbm

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Cargar datos
train = pd.read_csv("housing_data/train.csv")
test = pd.read_csv("housing_data/test.csv")
submission = pd.read_csv("housing_data/sample_submission.csv")

# Separar variables independientes y dependientes

In [8]:

y = train['SalePrice']
X = train.drop(['SalePrice', 'Id'], axis=1)
test_ids = test['Id']
test = test.drop(['Id'], axis=1)
# X

In [9]:

# Identificar tipos de columnas
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocesamiento
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


In [19]:

# Crear modelos
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}


In [21]:

# Función para evaluación
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')
    return np.mean(-scores)


In [22]:

# Crear pipeline y entrenar modelos
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)])
    results[name] = evaluate_model(pipeline, X, y)
    print(f"{name}: RMSE = {results[name]:.4f}")


RandomForest: RMSE = 29854.4002
XGBoost: RMSE = 29359.9204
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3168
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 181
[LightGBM] [Info] Start training from score 180717.091610
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3199
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 177
[LightGBM] [Info] Start training from score 180407.575342
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3189
[LightGBM] [Info] Number of data points in the trai

In [13]:

# Optimización con GridSearchCV (ejemplo para LightGBM)
lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', LGBMRegressor(random_state=42))])

param_grid = {
    'model__n_estimators': [100, 500, 1000],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(lgbm_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3372
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 187
[LightGBM] [Info] Start training from score 180921.195890


In [14]:

print(f"Best parameters for LightGBM: {grid_search.best_params_}")
print(f"Best RMSE: {-grid_search.best_score_:.4f}")


Best parameters for LightGBM: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 1000}
Best RMSE: 26948.8072


In [15]:

# Entrenar modelo final con mejores parámetros
final_model = grid_search.best_estimator_
final_model.fit(X, y)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3372
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 187
[LightGBM] [Info] Start training from score 180921.195890


In [16]:

# Predecir en conjunto de test
test_predictions = final_model.predict(test)

# Crear archivo de envío
submission['SalePrice'] = test_predictions
submission.to_csv('submission.csv', index=False)


In [24]:
fm  = final_model["model"]

In [26]:
fm.feature_name_

['Column_0',
 'Column_1',
 'Column_2',
 'Column_3',
 'Column_4',
 'Column_5',
 'Column_6',
 'Column_7',
 'Column_8',
 'Column_9',
 'Column_10',
 'Column_11',
 'Column_12',
 'Column_13',
 'Column_14',
 'Column_15',
 'Column_16',
 'Column_17',
 'Column_18',
 'Column_19',
 'Column_20',
 'Column_21',
 'Column_22',
 'Column_23',
 'Column_24',
 'Column_25',
 'Column_26',
 'Column_27',
 'Column_28',
 'Column_29',
 'Column_30',
 'Column_31',
 'Column_32',
 'Column_33',
 'Column_34',
 'Column_35',
 'Column_36',
 'Column_37',
 'Column_38',
 'Column_39',
 'Column_40',
 'Column_41',
 'Column_42',
 'Column_43',
 'Column_44',
 'Column_45',
 'Column_46',
 'Column_47',
 'Column_48',
 'Column_49',
 'Column_50',
 'Column_51',
 'Column_52',
 'Column_53',
 'Column_54',
 'Column_55',
 'Column_56',
 'Column_57',
 'Column_58',
 'Column_59',
 'Column_60',
 'Column_61',
 'Column_62',
 'Column_63',
 'Column_64',
 'Column_65',
 'Column_66',
 'Column_67',
 'Column_68',
 'Column_69',
 'Column_70',
 'Column_71',
 '

In [25]:
fm.feature_importances_

array([ 50, 204, 343, 278, 124, 159, 101, 207, 230,  20, 140, 257, 183,
       115,  12, 354,  26,   7,  19,  15,  63,  12,  55,  56, 193,  51,
       215, 121, 212,  35,  19,  21,   0,   0,  77,  76,   0,  15,   0,
         5,   4,   0,   0,   1,   0,  14,   6,   0,   1,  32,  19,   9,
        12,   0,   0,   3,  23,  29,   0,   9,   1,   8,   0,   0,   0,
         0,  31,   2,   0,  28,  19,   0,   5,   0,  21,   7,   0,   0,
        30,  23,   2,   0,   1,   0,  25,  67,   1,   0,  16,   0,  33,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   3,   0,   1,   0,   0,   0,   0,   1,   0,   0,   1,   0,
        11,   0,   4,   0,   0,   0,   0,   0,   3,   0,   0,   0,   0,
         0,   0,   0,   0,   0,  34,   0,   2,   4,   0,   4,   5,   0,
         5,   9,   8,   0,   0,   0,   0,   1,   0,   2,   0,   0,   0,
         0,   4,   0,   7,   2,   0,   0,   0,  10,   8,   2,   0,   9,
         7,   0,   0,  11,   0,   2,  13,   2,   3,   0,   0,   