In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

from scipy.stats import uniform, randint
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool

from DataTransformer import DataTransformer
from TrainUtils import *

In [3]:
data_folder = "data"
train_path = os.path.join(data_folder, "train.csv")

train_df: pd.DataFrame = pd.read_csv(train_path).drop("Id", axis=1)
y = np.log1p(train_df['SalePrice'])
X: pd.DataFrame = train_df.drop(columns='SalePrice', axis=1)

transformer = DataTransformer()
X = transformer.fit_transform(X)
X = X.astype(str)

print(f"Is there nan: {np.any(X.isnull())}")

Is there nan: False


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)
cat_features = np.where(X_train.loc[:, X_train.columns.values].dtypes == "object")[0]

In [5]:
train_pool = Pool(X_train, y_train, cat_features=cat_features)
dev_pool = Pool(X_test, y_test, cat_features=cat_features)

param_grid = {
    "n_estimators": randint(1, 400),
    "learning_rate": uniform(1e-3,  1e-1),
    "depth": randint(1, 15)}


cat_model = CatBoostRegressor(task_type="CPU",  
                              devices='0:1', 
                              logging_level='Silent', 
                              random_seed=0,
                              cat_features=cat_features)

randomized_search_result = cat_model.randomized_search(param_grid,
                                                        X=X,
                                                        y=y,
                                                        plot=False,
                                                        verbose=True)

0:	loss: 0.4657897	best: 0.4657897 (0)	total: 1.04s	remaining: 9.38s
1:	loss: 0.6007477	best: 0.4657897 (0)	total: 2.72s	remaining: 10.9s
2:	loss: 0.2705522	best: 0.2705522 (2)	total: 10.4s	remaining: 24.3s
3:	loss: 2.6601132	best: 0.2705522 (2)	total: 13.2s	remaining: 19.8s
4:	loss: 0.1366078	best: 0.1366078 (4)	total: 17.2s	remaining: 17.2s
5:	loss: 0.1551062	best: 0.1366078 (4)	total: 30.6s	remaining: 20.4s
6:	loss: 0.1317126	best: 0.1317126 (6)	total: 40.4s	remaining: 17.3s
7:	loss: 0.2664496	best: 0.1317126 (6)	total: 41.4s	remaining: 10.4s
8:	loss: 0.2665391	best: 0.1317126 (6)	total: 2m 9s	remaining: 14.4s
9:	loss: 4.8633964	best: 0.1317126 (6)	total: 2m 16s	remaining: 0us
Estimating final quality...


In [6]:
evaluate(cat_model, X_train, y_train)
evaluate(cat_model, X_test, y_test)

RMSLE: 0.11276031721525373
RMSLE: 0.126782685725767
