In [17]:
# pip install lightgbm

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Cargar datos
train = pd.read_csv("housing_data/train.csv")
test = pd.read_csv("housing_data/test.csv")
submission = pd.read_csv("housing_data/sample_submission.csv")

# Separar variables independientes y dependientes

In [8]:

y = train['SalePrice']
X = train.drop(['SalePrice', 'Id'], axis=1)
test_ids = test['Id']
test = test.drop(['Id'], axis=1)
# X

In [9]:

# Identificar tipos de columnas
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocesamiento
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)


In [10]:

# Crear modelos
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42)
}


In [18]:

# Función para evaluación
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')
    return np.mean(-scores)


In [12]:

# Crear pipeline y entrenar modelos
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)])
    results[name] = evaluate_model(pipeline, X, y)
    print(f"{name}: RMSE = {results[name]:.4f}")


RandomForest: RMSE = 29854.4002
GradientBoosting: RMSE = 26500.3688
XGBoost: RMSE = 29359.9204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3168
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 181
[LightGBM] [Info] Start training from score 180717.091610
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3199
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 177
[LightGBM] [Info] Start training from score 180407.575342
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001465 seconds.
You can set `force_col_wise=true` to remove

In [13]:

# Optimización con GridSearchCV (ejemplo para LightGBM)
lgbm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', LGBMRegressor(random_state=42))])

param_grid = {
    'model__n_estimators': [100, 500, 1000],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(lgbm_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3372
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 187
[LightGBM] [Info] Start training from score 180921.195890


In [14]:

print(f"Best parameters for LightGBM: {grid_search.best_params_}")
print(f"Best RMSE: {-grid_search.best_score_:.4f}")


Best parameters for LightGBM: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 1000}
Best RMSE: 26948.8072


In [15]:

# Entrenar modelo final con mejores parámetros
final_model = grid_search.best_estimator_
final_model.fit(X, y)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3372
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 187
[LightGBM] [Info] Start training from score 180921.195890


In [16]:

# Predecir en conjunto de test
test_predictions = final_model.predict(test)

# Crear archivo de envío
submission['SalePrice'] = test_predictions
submission.to_csv('submission.csv', index=False)
