In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

In [4]:
train_df = pd.read_csv('S:\\Codigos\\CATIJR\\train.csv')
test_df = pd.read_csv('S:\\Codigos\\CATIJR\\test.csv')

In [5]:
train_df['SalePrice'] = np.log(train_df['SalePrice'])
train_df = train_df.drop(columns=['Id'])

In [6]:
def treat_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """
    PolÃ­tica:
      - log1p se skew > 1 e valores >= 0
      - Z-score (apenas em valores != 0) com |z| >= 3.5 define outlier
      - Linhas com >= 3 outliers -> REMOVER
      - Linhas com 1 ou 2 outliers -> SUBSTITUIR por mediana (calculada nos nÃ£o-outliers)
    """
    outlier_cols = [
        "GrLivArea","GarageArea","TotalBsmtSF","1stFlrSF","TotRmsAbvGrd",
        "YearBuilt","MasVnrArea","BsmtFinSF1","LotFrontage","WoodDeckSF",
        "2ndFlrSF","OpenPorchSF","LotArea","BsmtUnfSF","BedroomAbvGr",
        "ScreenPorch","OverallCond","3SsnPorch"
    ]

    df = df.copy()

    # --- 1) DetecÃ§Ã£o coluna a coluna -> matriz booleana de outliers ---
    mask_df = pd.DataFrame(False, index=df.index, columns=[c for c in outlier_cols if c in df.columns])
    info_cols = {}
    aux_cols = []  # rastrear auxiliares para remover ao final

    for col in mask_df.columns:
        s = df[col]

        # log1p condicional (idÃªntico ao seu detector)
        use_log = (stats.skew(s.dropna()) > 1) and ((s >= 0).all())
        rep = np.log1p(s) if use_log else s

        # vÃ¡lidos para z-score: nÃ£o nulos e col != 0
        mask_valid = rep.notna() & s.ne(0)

        z = pd.Series(np.nan, index=df.index, dtype="float64")
        if mask_valid.any() and rep[mask_valid].std(ddof=0) > 0:
            z_vals = stats.zscore(rep[mask_valid], nan_policy="omit")
            z.loc[mask_valid] = z_vals
            mask_out = z.abs() >= 3.5
        else:
            mask_out = pd.Series(False, index=df.index)

        # colunas auxiliares (para auditoria) â€” serÃ£o removidas ao final
        z_col = f"{col}_z"
        flag_col = f"{col}_outlier"
        df[z_col] = z
        df[flag_col] = mask_out.astype("int8")
        aux_cols += [z_col, flag_col]

        mask_df[col] = mask_out
        info_cols[col] = {
            "use_log": bool(use_log),
            "valid_n": int(mask_valid.sum()),
            "std_valid": float(rep[mask_valid].std(ddof=0)) if mask_valid.any() else np.nan,
            "n_outliers": int(mask_out.sum())
        }

    # --- 2) Contagem por linha e aplicaÃ§Ã£o da polÃ­tica ---
    row_counts = mask_df.sum(axis=1)  # nÂº de ocorrÃªncias por linha (across cols)

    # Remover (>=3)
    to_remove_idx = row_counts[row_counts >= 3].index
    df.drop(index=to_remove_idx, inplace=True)

    # Substituir (1â€“2)
    to_replace_idx = row_counts[(row_counts > 0) & (row_counts < 3)].index

    # SubstituiÃ§Ã£o por mediana calculada nos NÃƒO-outliers (jÃ¡ sem as linhas removidas)
    for col in mask_df.columns:
        if col not in df.columns:
            continue
        col_flag = f"{col}_outlier"
        if col_flag not in df.columns:
            continue

        mask_replace_here = df.index.isin(to_replace_idx) & (df[col_flag] == 1)
        if not mask_replace_here.any():
            continue

        mask_non_out = df[col_flag] != 1
        median_val = df.loc[mask_non_out, col].median()
        if pd.isna(median_val):
            median_val = df[col].median()

        df.loc[mask_replace_here, col] = median_val

    # --- 3) RelatÃ³rio de diagnÃ³stico rÃ¡pido (usa row_counts/info_cols; independe das auxiliares no df) ---
    dist_counts = row_counts.value_counts().sort_index()
    print("DistribuiÃ§Ã£o de ocorrÃªncias por linha (antes da remoÃ§Ã£o):")
    print(dist_counts.to_string())

    print("\nOutliers detectados por coluna:")
    for c, meta in info_cols.items():
        print(f"- {c}: use_log={meta['use_log']}, valid_n={meta['valid_n']}, "
              f"std_valid={meta['std_valid']:.6g} | n_outliers={meta['n_outliers']}")

    print(f"\nLinhas removidas (>=3 ocorrÃªncias): {len(to_remove_idx)}")
    if len(to_remove_idx) > 0:
        print(f"Exemplos removidos: {list(to_remove_idx[:10])}")

    print(f"Linhas tratadas (1â€“2 ocorrÃªncias): {len(to_replace_idx)}")
    if len(to_replace_idx) > 0:
        print(f"Exemplos tratadas: {list(to_replace_idx[:])}")

    # --- 4) Limpeza final: remover auxiliares para nÃ£o interferir na imputaÃ§Ã£o de nulos ---
    if aux_cols:
        df.drop(columns=[c for c in aux_cols if c in df.columns], inplace=True, errors="ignore")

    return df

In [7]:
train_df = treat_outliers(train_df)

DistribuiÃ§Ã£o de ocorrÃªncias por linha (antes da remoÃ§Ã£o):
0    1390
1      64
2       5
6       1

Outliers detectados por coluna:
- GrLivArea: use_log=True, valid_n=1460, std_valid=0.333189 | n_outliers=4
- GarageArea: use_log=False, valid_n=1379, std_valid=185.613 | n_outliers=6
- TotalBsmtSF: use_log=True, valid_n=1423, std_valid=0.360793 | n_outliers=7
- 1stFlrSF: use_log=True, valid_n=1460, std_valid=0.317322 | n_outliers=2
- TotRmsAbvGrd: use_log=False, valid_n=1460, std_valid=1.62484 | n_outliers=1
- YearBuilt: use_log=False, valid_n=1460, std_valid=30.1926 | n_outliers=0
- MasVnrArea: use_log=False, valid_n=591, std_valid=204.971 | n_outliers=8
- BsmtFinSF1: use_log=True, valid_n=993, std_valid=0.884773 | n_outliers=15
- LotFrontage: use_log=False, valid_n=1201, std_valid=24.2746 | n_outliers=7
- WoodDeckSF: use_log=True, valid_n=699, std_valid=0.584282 | n_outliers=2
- 2ndFlrSF: use_log=False, valid_n=631, std_valid=273.13 | n_outliers=4
- OpenPorchSF: use_log=True, valid

In [8]:
train_df["HouseAge"] = train_df["YrSold"] - train_df["YearBuilt"]

train_df["IsRemodeled"] = (train_df["YearRemodAdd"] != train_df["YearBuilt"]).astype("int64")

train_df["RemodeledAge"] = train_df["YrSold"] - train_df["YearRemodAdd"]

train_df["TotalSF"] = (train_df["TotalBsmtSF"] + train_df["1stFlrSF"] +
                       train_df["2ndFlrSF"] + train_df['BsmtFinSF1'] +
                       train_df['BsmtFinSF2']
                      )

train_df["TotalBath"] = (train_df["FullBath"] + 0.5*train_df["HalfBath"] +
                   train_df["BsmtFullBath"] + 0.5*train_df["BsmtHalfBath"]
                   )

train_df["has_2nd_flr"]   = (train_df["2ndFlrSF"] > 0).astype("int64")

train_df["total_porch_sf"] = train_df[["OpenPorchSF", "EnclosedPorch", "3SsnPorch",
                                       "ScreenPorch", "WoodDeckSF"]].sum(axis=1)

train_df["rooms_per_area"] = train_df["TotRmsAbvGrd"] / train_df["GrLivArea"].clip(lower=1)

train_df["has_garage"]   = ((train_df["GarageCars"].fillna(0) > 0) | (train_df["GarageArea"].fillna(0) > 0)).astype("int64")

train_df["has_bsmt"]     = (train_df["TotalBsmtSF"] > 0).astype("int64")

train_df["has_ext_area"]    = (train_df["total_porch_sf"] > 0).astype("int64")

train_df["has_pool"]     = (train_df["PoolArea"] > 0).astype("int64")

train_df["has_mas_vnr"]  = (train_df["MasVnrArea"] > 0).astype("int64")

In [9]:
train_df["MSSubClass"] = train_df["MSSubClass"].astype(str)

In [10]:
train_df = train_df.drop(columns=['YrSold', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
                      'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'YearRemodAdd',
                      'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'TotRmsAbvGrd',
                      'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'WoodDeckSF', "Alley",
                      "PoolQC", "Fence", "MiscFeature", 'Utilities',
                      'RoofMatl', 'Condition2', 'BsmtFinType2', 'GarageYrBlt', 'WoodDeckSF'
                      ])

In [11]:
num_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = train_df.select_dtypes(include="object").columns

In [12]:
print(num_cols)
print(cat_cols)

Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea',
       'BsmtUnfSF', 'LowQualFinSF', 'GrLivArea', 'BedroomAbvGr',
       'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'PoolArea', 'MiscVal',
       'MoSold', 'SalePrice', 'HouseAge', 'IsRemodeled', 'RemodeledAge',
       'TotalSF', 'TotalBath', 'has_2nd_flr', 'total_porch_sf',
       'rooms_per_area', 'has_garage', 'has_bsmt', 'has_ext_area', 'has_pool',
       'has_mas_vnr'],
      dtype='object')
Index(['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType',
       'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleTyp

In [13]:
#numericas
num_median = ["LotArea","LowQualFinSF","GrLivArea","BedroomAbvGr",
              "KitchenAbvGr", "Fireplaces","MiscVal","MoSold","HouseAge",
              "LotFrontage", "TotalSF", "TotalBath","RemodeledAge",
              'OverallQual' , 'OverallCond'
              ]

num_zero = ["BsmtUnfSF","GarageCars","MasVnrArea","PoolArea",
            "has_2nd_flr","IsRemodeled","total_porch_sf","rooms_per_area",
            "has_garage","has_bsmt","has_ext_area","has_pool","has_mas_vnr"]

#categoricas one-hot
onehot_none = ["GarageType", "MasVnrType"]

onehot_moda = ["MSSubClass","MSZoning","Street","LandContour","LotConfig",
               "Neighborhood","Condition1","BldgType","HouseStyle",
               "RoofStyle","Exterior1st","Exterior2nd","Foundation",
               "Heating","CentralAir","Electrical","SaleType","SaleCondition"]

#categoricas ordinais

ord_none = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
            "GarageFinish", "GarageQual", "GarageCond", "FireplaceQu"]

ord_moda = ["LotShape","LandSlope","HeatingQC","KitchenQual",
            "Functional","PavedDrive", 'ExterQual', 'ExterCond']

In [14]:
ord_none_maps = [
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],       # BsmtQual
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],       # BsmtCond
    ['None', 'No', 'Mn', 'Av', 'Gd'],             # BsmtExposure
    ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # BsmtFinType1
    ['None', 'Unf', 'RFn', 'Fin'],                # GarageFinish
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],       # GarageQual
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],       # GarageCond
    ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']        # FireplaceQu
]

ord_moda_maps = [
    ['IR3', 'IR2', 'IR1', 'Reg'],                 # LotShape
    ['Sev', 'Mod', 'Gtl'],                        # LandSlope
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],              # HeatingQC
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],              # KitchenQual
    ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],  # Functional
    ['N', 'P', 'Y'],                           # PavedDrive
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],              # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex']               # ExterCond
]

In [15]:
num_median_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

num_zero_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

onehot_moda_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

onehot_none_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

ord_none_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('ordinal', OrdinalEncoder(categories=ord_none_maps))
])

ord_moda_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ord_moda_maps))
])

In [16]:
col_trans = ColumnTransformer(transformers=[
    ('num_median', num_median_pipeline, num_median),
    ('num_zero', num_zero_pipeline, num_zero),
    ('onehot_moda', onehot_moda_pipeline, onehot_moda),
    ('onehot_none', onehot_none_pipeline, onehot_none),
    ('ord_none', ord_none_pipeline, ord_none),
    ('ord_moda', ord_moda_pipeline, ord_moda)
    ],
    remainder='passthrough',
    n_jobs=-1
)

In [17]:
pipeline = Pipeline(steps=[
    ('preprocessing', col_trans)
    ])


In [18]:
X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

In [19]:
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "XGBoost": XGBRegressor(eval_metric="rmse"),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0)
}

print("\n=== RESULTADOS DOS MODELOS ===")

for name, model in models.items():
    full_model = Pipeline(steps=[
        ("preprocessing", col_trans),
        ("model", model)
    ])

    full_model.fit(X_train, y_train)

    preds = full_model.predict(X_test)

    rmse = root_mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print(f"\n===== {name} =====")
    print("RMSE:", rmse)
    print("RÂ²:", r2)


=== RESULTADOS DOS MODELOS ===

===== Linear Regression =====
RMSE: 0.1300659274270467
RÂ²: 0.9037975803367921

===== Random Forest =====
RMSE: 0.14677765816083985
RÂ²: 0.87748794634543

===== XGBoost =====
RMSE: 0.15037625507162145
RÂ²: 0.8714069664422044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2288
[LightGBM] [Info] Number of data points in the train set: 1167, number of used features: 135
[LightGBM] [Info] Start training from score 12.023999

===== LightGBM =====
RMSE: 0.13421347646783693
RÂ²: 0.8975643416157865





===== CatBoost =====
RMSE: 0.12080144890880716
RÂ²: 0.9170143121594385


In [22]:
#Melhor resultado foi o CatBoost
import optuna
from sklearn.model_selection import cross_val_score

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
def rmse_cv(model, X, y):
    scores = cross_val_score(
        model, X, y,
        scoring="neg_root_mean_squared_error",
        cv=5,
        n_jobs=-1
    )
    return scores.mean()


# FunÃ§Ã£o objetivo usada pela busca Bayesiana
def objective(trial):
    params = {
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "n_estimators": trial.suggest_int("n_estimators", 300, 2000),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 5.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5.0),
        "leaf_estimation_iterations": trial.suggest_int("leaf_estimation_iterations", 1, 10)
    }

    model = Pipeline(steps=[
        ("preprocessing", col_trans),
        ("model", CatBoostRegressor(
            verbose=0,
            random_seed=42,
            **params
        ))
    ])

    score = rmse_cv(model, X, y)
    return score


# Criar estudo (minimizar RMSE)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("\nðŸŽ¯ Melhores hiperparÃ¢metros encontrados:")
print(study.best_params)

[I 2025-11-24 12:28:04,043] A new study created in memory with name: no-name-a4294a75-906a-4c83-833d-673596f53e26
[I 2025-11-24 12:28:12,171] Trial 0 finished with value: -0.12980159160509638 and parameters: {'depth': 8, 'learning_rate': 0.25062611210522184, 'l2_leaf_reg': 6.782077812184848, 'n_estimators': 349, 'bagging_temperature': 0.04083200136939524, 'random_strength': 4.721831698827279, 'leaf_estimation_iterations': 6}. Best is trial 0 with value: -0.12980159160509638.
[I 2025-11-24 12:28:36,864] Trial 1 finished with value: -0.1249779144391758 and parameters: {'depth': 7, 'learning_rate': 0.1842131365020978, 'l2_leaf_reg': 5.480340393242398, 'n_estimators': 1898, 'bagging_temperature': 0.12184038547429221, 'random_strength': 2.317888578665655, 'leaf_estimation_iterations': 3}. Best is trial 1 with value: -0.1249779144391758.
[I 2025-11-24 12:28:46,094] Trial 2 finished with value: -0.11985409212804501 and parameters: {'depth': 8, 'learning_rate': 0.029779410166264154, 'l2_leaf_r


ðŸŽ¯ Melhores hiperparÃ¢metros encontrados:
{'depth': 5, 'learning_rate': 0.023777482911700433, 'l2_leaf_reg': 4.006022085082244, 'n_estimators': 1125, 'bagging_temperature': 4.200325460163603, 'random_strength': 2.061661636703463, 'leaf_estimation_iterations': 2}


In [24]:
best_params = study.best_params
best_params

final_model = Pipeline(steps=[
    ("preprocessing", col_trans),
    ("model", CatBoostRegressor(
        verbose=0,
        random_seed=42,
        **best_params
    ))
])

final_model.fit(X, y)
print("Modelo final treinado com sucesso!")


Modelo final treinado com sucesso!


In [25]:
# Prever usando o modelo final treinado
final_preds = final_model.predict(X_test)

# Calcular mÃ©tricas
rmse = root_mean_squared_error(y_test, final_preds)
r2 = r2_score(y_test, final_preds)

print("\n===== DESEMPENHO DO MODELO FINAL =====")
print("RMSE:", rmse)
print("RÂ²:", r2)



===== DESEMPENHO DO MODELO FINAL =====
RMSE: 0.059127231127986436
RÂ²: 0.9801191936431017


In [26]:
#preparando o teste

# Copiar o test para nÃ£o alterar o original
test_processed = test_df.copy()

# Criar features novas (mesmo cÃ³digo do train)
test_processed["HouseAge"] = test_processed["YrSold"] - test_processed["YearBuilt"]
test_processed["IsRemodeled"] = (test_processed["YearRemodAdd"] != test_processed["YearBuilt"]).astype("int64")
test_processed["RemodeledAge"] = test_processed["YrSold"] - test_processed["YearRemodAdd"]
test_processed["TotalSF"] = (test_processed["TotalBsmtSF"] + test_processed["1stFlrSF"] +
                             test_processed["2ndFlrSF"] + test_processed['BsmtFinSF1'] +
                             test_processed['BsmtFinSF2'])
test_processed["TotalBath"] = (test_processed["FullBath"] + 0.5*test_processed["HalfBath"] +
                               test_processed["BsmtFullBath"] + 0.5*test_processed["BsmtHalfBath"])
test_processed["has_2nd_flr"] = (test_processed["2ndFlrSF"] > 0).astype("int64")
test_processed["total_porch_sf"] = test_processed[["OpenPorchSF", "EnclosedPorch", "3SsnPorch",
                                                   "ScreenPorch", "WoodDeckSF"]].sum(axis=1)
test_processed["rooms_per_area"] = test_processed["TotRmsAbvGrd"] / test_processed["GrLivArea"].clip(lower=1)
test_processed["has_garage"] = ((test_processed["GarageCars"].fillna(0) > 0) |
                                (test_processed["GarageArea"].fillna(0) > 0)).astype("int64")
test_processed["has_bsmt"] = (test_processed["TotalBsmtSF"] > 0).astype("int64")
test_processed["has_ext_area"] = (test_processed["total_porch_sf"] > 0).astype("int64")
test_processed["has_pool"] = (test_processed["PoolArea"] > 0).astype("int64")
test_processed["has_mas_vnr"] = (test_processed["MasVnrArea"] > 0).astype("int64")

# Ajustes de tipos
test_processed["MSSubClass"] = test_processed["MSSubClass"].astype(str)

# Remover as mesmas colunas
test_processed = test_processed.drop(columns=[
    'YrSold', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath', 'YearRemodAdd',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'TotRmsAbvGrd',
    'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'WoodDeckSF', "Alley",
    "PoolQC", "Fence", "MiscFeature", 'Utilities',
    'RoofMatl', 'Condition2', 'BsmtFinType2', 'GarageYrBlt', 'WoodDeckSF'
], errors="ignore")


In [27]:
test_pred_log = final_model.predict(test_processed)


In [28]:
test_pred = np.expm1(test_pred_log)


In [29]:
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": test_pred
})

submission.to_csv("submission.csv", index=False)


In [30]:
import joblib

joblib.dump(final_model, "modelo_catboost_pipeline.pkl")
print("Modelo salvo!")


Modelo salvo!
