In [1]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('/home/subaru/ml_project/data/house_price/train.csv')
test = pd.read_csv('/home/subaru/ml_project/data/house_price/test.csv')

In [3]:
y = train['SalePrice']
train.drop(columns=['SalePrice', 'Id'], inplace=True)
test_ids = test['Id']
test.drop(columns=['Id'], inplace=True)

In [4]:
cols_to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'FireplaceQu']
train.drop(columns=cols_to_drop, inplace=True)
test.drop(columns=cols_to_drop, inplace=True)

In [5]:
cat_cols = train.select_dtypes(include=['object']).columns
num_cols = train.select_dtypes(include=['int64', 'float64']).columns

In [6]:
train[num_cols] = train[num_cols].fillna(train[num_cols].median())
test[num_cols] = test[num_cols].fillna(train[num_cols].median())

In [7]:
train[cat_cols] = train[cat_cols].fillna('Missing')
test[cat_cols] = test[cat_cols].fillna('Missing')

In [8]:
X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.2, random_state=42)

In [31]:
model = CatBoostRegressor(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=3,
    eval_metric='RMSE',
    loss_function='RMSE',
    cat_features=list(cat_cols),
    random_seed=42,
    logging_level='Silent'
)

In [13]:
param_grid = {
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'l2_leaf_reg': [3, 5]
}

In [14]:
model.grid_search(
    param_grid=param_grid,
    X=X_train,
    y=y_train,
    cv=3,
    partition_random_seed=42,
    shuffle=True,
    refit=True,
    plot=False
)

0:	loss: 23234.8170494	best: 23234.8170494 (0)	total: 1.3s	remaining: 9.1s
1:	loss: 22648.7870516	best: 22648.7870516 (1)	total: 2.58s	remaining: 7.75s
2:	loss: 23549.7573095	best: 22648.7870516 (1)	total: 3.85s	remaining: 6.41s
3:	loss: 22877.8933727	best: 22648.7870516 (1)	total: 5.11s	remaining: 5.11s
4:	loss: 23082.6221286	best: 22648.7870516 (1)	total: 7.69s	remaining: 4.61s
5:	loss: 21919.6428079	best: 21919.6428079 (5)	total: 10.3s	remaining: 3.43s
6:	loss: 23005.6133425	best: 21919.6428079 (5)	total: 12.8s	remaining: 1.83s
7:	loss: 22853.2564004	best: 21919.6428079 (5)	total: 15.4s	remaining: 0us
Estimating final quality...


{'params': {'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 3},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
     

In [28]:
model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=list(cat_cols), early_stopping_rounds=50)

<catboost.core.CatBoostRegressor at 0x742c1bdaede0>

In [29]:
preds = model.predict(test)

In [30]:
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': preds
})

submission.to_csv('submission5.csv', index=False)
