<a href="https://colab.research.google.com/github/Kalana-Lakshan/Kaggle_Competitions_House-Prices---Advanced-Regression-Techniques/blob/main/House_Prices_Advanced_Regression_Techniques_with_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
!pip install catboost scikit-learn
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

y = np.log1p(train_data["SalePrice"])
X = train_data.drop("SalePrice", axis=1)

# Convert categorical to string
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = X[col].astype(str)
        test_data[col] = test_data[col].astype(str)

cat_features = [i for i, col in enumerate(X.columns) if X[col].dtype == "object"]

# Proper validation split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
model = CatBoostRegressor(
    iterations=6000,
    learning_rate=0.025,
    depth=8,
    l2_leaf_reg=3,
    bagging_temperature=0.6,
    random_strength=1.2,
    border_count=128,
    loss_function="RMSE",
    eval_metric="RMSE",
    od_type="Iter",
    od_wait=300,
    verbose=200
)


In [3]:
model.fit(
    train_pool,
    eval_set=valid_pool,
    use_best_model=True
)

# Validation score
val_preds = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, val_preds))
print("Validation RMSE:", rmse)


0:	learn: 0.3844074	test: 0.4266672	best: 0.4266672 (0)	total: 113ms	remaining: 11m 19s
200:	learn: 0.1056054	test: 0.1506098	best: 0.1506098 (200)	total: 26.5s	remaining: 12m 43s
400:	learn: 0.0788104	test: 0.1372340	best: 0.1372340 (400)	total: 44.5s	remaining: 10m 21s
600:	learn: 0.0643950	test: 0.1344249	best: 0.1344121 (594)	total: 1m 5s	remaining: 9m 51s
800:	learn: 0.0525935	test: 0.1332433	best: 0.1332383 (784)	total: 1m 26s	remaining: 9m 19s
1000:	learn: 0.0453265	test: 0.1333278	best: 0.1331748 (816)	total: 1m 45s	remaining: 8m 45s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.1331748121
bestIteration = 816

Shrink model to first 817 iterations.
Validation RMSE: 0.13317480398312326


In [4]:
full_pool = Pool(X, y, cat_features=cat_features)

final_model = CatBoostRegressor(
    iterations=model.best_iteration_,
    learning_rate=0.025,
    depth=8,
    l2_leaf_reg=3,
    bagging_temperature=0.6,
    random_strength=1.2,
    border_count=128,
    loss_function="RMSE",
    verbose=200
)

final_model.fit(full_pool)


0:	learn: 0.3926478	total: 78.2ms	remaining: 1m 3s
200:	learn: 0.1101477	total: 16.1s	remaining: 49.2s
400:	learn: 0.0856438	total: 36.9s	remaining: 38.1s
600:	learn: 0.0725424	total: 55.5s	remaining: 19.8s
800:	learn: 0.0625780	total: 1m 15s	remaining: 1.42s
815:	learn: 0.0617556	total: 1m 16s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7f1cbf6f8c20>

In [5]:
test_preds = np.expm1(final_model.predict(test_data))

submission = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission_catboost_optimized.csv", index=False)


Improve more

In [6]:
train_pool = Pool(X, y, cat_features=cat_features)

model = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.02,
    depth=6,
    l2_leaf_reg=5,
    bagging_temperature=0.7,
    random_strength=1.5,
    border_count=254,
    loss_function="RMSE",
    eval_metric="RMSE",
    subsample=0.8,
    verbose=250
)

model.fit(train_pool)


0:	learn: 0.3940997	total: 89.4ms	remaining: 7m 26s
250:	learn: 0.1256629	total: 10.6s	remaining: 3m 19s
500:	learn: 0.1045951	total: 19.5s	remaining: 2m 54s
750:	learn: 0.0953067	total: 28.8s	remaining: 2m 43s
1000:	learn: 0.0869309	total: 37.9s	remaining: 2m 31s
1250:	learn: 0.0801028	total: 46.8s	remaining: 2m 20s
1500:	learn: 0.0744863	total: 56.3s	remaining: 2m 11s
1750:	learn: 0.0695528	total: 1m 5s	remaining: 2m 2s
2000:	learn: 0.0652896	total: 1m 14s	remaining: 1m 52s
2250:	learn: 0.0612655	total: 1m 23s	remaining: 1m 42s
2500:	learn: 0.0574769	total: 1m 33s	remaining: 1m 33s
2750:	learn: 0.0539548	total: 1m 42s	remaining: 1m 24s
3000:	learn: 0.0507943	total: 1m 51s	remaining: 1m 14s
3250:	learn: 0.0479967	total: 2m	remaining: 1m 5s
3500:	learn: 0.0455579	total: 2m 10s	remaining: 55.9s
3750:	learn: 0.0431274	total: 2m 19s	remaining: 46.6s
4000:	learn: 0.0410222	total: 2m 28s	remaining: 37.2s
4250:	learn: 0.0392806	total: 2m 38s	remaining: 27.8s
4500:	learn: 0.0375405	total: 2m 

<catboost.core.CatBoostRegressor at 0x7f1c97341010>

In [7]:
test_preds = np.expm1(model.predict(test_data))

submission = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission_catboost_final.csv", index=False)


With optimization

In [8]:
# =========================
# 1. Imports
# =========================
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from scipy.stats import skew

# =========================
# 2. Load Data
# =========================
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# =========================
# 3. Target & Features
# =========================
y = np.log1p(train_data["SalePrice"])
full_data = pd.concat([train_data.drop("SalePrice", axis=1), test_data], axis=0)

# =========================
# 4. Feature Engineering
# =========================
full_data["TotalSF"] = (
    full_data["TotalBsmtSF"] +
    full_data["1stFlrSF"] +
    full_data["2ndFlrSF"]
)

full_data["HouseAge"] = full_data["YrSold"] - full_data["YearBuilt"]
full_data["RemodAge"] = full_data["YrSold"] - full_data["YearRemodAdd"]

full_data["TotalBathrooms"] = (
    full_data["FullBath"] +
    0.5 * full_data["HalfBath"] +
    full_data["BsmtFullBath"] +
    0.5 * full_data["BsmtHalfBath"]
)

# =========================
# 5. Handle Categories
# =========================
for col in full_data.columns:
    if full_data[col].dtype == "object":
        full_data[col] = full_data[col].astype(str)

# =========================
# 6. Skew Correction
# =========================
numeric_feats = full_data.select_dtypes(exclude="object").columns
skewed_feats = full_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75].index

full_data[skewed_feats] = np.log1p(full_data[skewed_feats])

# =========================
# 7. Split Back
# =========================
X = full_data.iloc[:len(train_data), :]
X_test = full_data.iloc[len(train_data):, :]

cat_features = [i for i, col in enumerate(X.columns) if X[col].dtype == "object"]

train_pool = Pool(X, y, cat_features=cat_features)

# =========================
# 8. Final Optimized CatBoost
# =========================
model = CatBoostRegressor(
    iterations=7000,
    learning_rate=0.018,
    depth=7,
    l2_leaf_reg=4,
    bagging_temperature=0.65,
    random_strength=1.3,
    border_count=254,
    loss_function="RMSE",
    subsample=0.85,
    verbose=300
)

model.fit(train_pool)

# =========================
# 9. Predict & Submission
# =========================
test_preds = np.expm1(model.predict(X_test))

submission = pd.DataFrame({
    "Id": test_data["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission_catboost_final2.csv", index=False)

print("Submission file created: submission_catboost_final2.csv")


0:	learn: 0.3946929	total: 76.8ms	remaining: 8m 57s
300:	learn: 0.1109430	total: 21s	remaining: 7m 47s
600:	learn: 0.0920313	total: 36.9s	remaining: 6m 33s
900:	learn: 0.0821651	total: 55.6s	remaining: 6m 16s
1200:	learn: 0.0744222	total: 1m 13s	remaining: 5m 55s
1500:	learn: 0.0670755	total: 1m 31s	remaining: 5m 36s
1800:	learn: 0.0610676	total: 1m 50s	remaining: 5m 19s
2100:	learn: 0.0543543	total: 2m 8s	remaining: 4m 58s
2400:	learn: 0.0487411	total: 2m 27s	remaining: 4m 41s
2700:	learn: 0.0441778	total: 2m 44s	remaining: 4m 22s
3000:	learn: 0.0402402	total: 3m 3s	remaining: 4m 5s
3300:	learn: 0.0369338	total: 3m 21s	remaining: 3m 45s
3600:	learn: 0.0340839	total: 3m 40s	remaining: 3m 28s
3900:	learn: 0.0315124	total: 3m 58s	remaining: 3m 9s
4200:	learn: 0.0291577	total: 4m 15s	remaining: 2m 50s
4500:	learn: 0.0271407	total: 4m 34s	remaining: 2m 32s
4800:	learn: 0.0252720	total: 4m 51s	remaining: 2m 13s
5100:	learn: 0.0233386	total: 5m 10s	remaining: 1m 55s
5400:	learn: 0.0218055	to