In [1]:
import pandas as pd
from catboost import CatBoostRegressor
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("features/dataset_train_1.csv")

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 1460 non-null   int64  
 1   MSSubClass         1460 non-null   int64  
 2   LotFrontage        1201 non-null   float64
 3   LotArea            1460 non-null   int64  
 4   OverallQual        1460 non-null   int64  
 5   OverallCond        1460 non-null   int64  
 6   YearBuilt          1460 non-null   int64  
 7   YearRemodAdd       1460 non-null   int64  
 8   MasVnrArea         1452 non-null   float64
 9   BsmtFinSF1         1460 non-null   int64  
 10  BsmtFinSF2         1460 non-null   int64  
 11  BsmtUnfSF          1460 non-null   int64  
 12  TotalBsmtSF        1460 non-null   int64  
 13  1stFlrSF           1460 non-null   int64  
 14  2ndFlrSF           1460 non-null   int64  
 15  LowQualFinSF       1460 non-null   int64  
 16  GrLivArea          1460 

In [29]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(["SalePrice", "Id"], axis=1), df['SalePrice'])

In [6]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

In [31]:
model1 = CatBoostRegressor(iterations=1000)

In [32]:
model1.fit(x_train, y_train)

Learning rate set to 0.041534
0:	learn: 73094.4825618	total: 1.21ms	remaining: 1.21s
1:	learn: 71258.3990921	total: 2.21ms	remaining: 1.1s
2:	learn: 69438.7261979	total: 3.97ms	remaining: 1.32s
3:	learn: 67573.2547632	total: 5.2ms	remaining: 1.29s
4:	learn: 65839.0235535	total: 6.48ms	remaining: 1.29s
5:	learn: 64264.5834327	total: 7.85ms	remaining: 1.3s
6:	learn: 62644.5541264	total: 9.38ms	remaining: 1.33s
7:	learn: 61174.8689256	total: 10.5ms	remaining: 1.31s
8:	learn: 59658.3779424	total: 12.5ms	remaining: 1.38s
9:	learn: 58213.0129413	total: 13.8ms	remaining: 1.36s
10:	learn: 56702.2123821	total: 14.9ms	remaining: 1.34s
11:	learn: 55418.2264657	total: 16.3ms	remaining: 1.34s
12:	learn: 54117.0252471	total: 17.6ms	remaining: 1.34s
13:	learn: 52991.3678137	total: 18.8ms	remaining: 1.32s
14:	learn: 51822.6046535	total: 20ms	remaining: 1.31s
15:	learn: 50666.2028478	total: 21.4ms	remaining: 1.31s
16:	learn: 49481.2838568	total: 22.9ms	remaining: 1.32s
17:	learn: 48453.4680540	total: 2

<catboost.core.CatBoostRegressor at 0x1794303d0>

In [33]:
estimate_model(model1)

Testing performance
RMSE: 43199.77
R2: 0.78
Score: 0.7750
Local Score: 0.9957
Best params:  {'iterations': 1000, 'loss_function': 'RMSE'}


In [11]:
model2 = CatBoostRegressor()

In [18]:
param_grid = {
    'learning_rate': [0.1, 0.15],
    'depth': [5,4],
    'iterations': [600],
    'l2_leaf_reg': [6, 5,  4],
}

In [19]:
grid_search = GridSearchCV(estimator=model2, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)
model2 = grid_search.best_estimator_

0:	learn: 73767.7885616	total: 816us	remaining: 489ms
1:	learn: 69365.2331347	total: 1.62ms	remaining: 483ms
2:	learn: 65501.6583834	total: 2.43ms	remaining: 483ms
3:	learn: 62405.4076931	total: 3.27ms	remaining: 487ms
4:	learn: 59311.9943569	total: 4.21ms	remaining: 501ms
5:	learn: 56339.5018809	total: 5.89ms	remaining: 584ms
6:	learn: 53436.5691056	total: 6.77ms	remaining: 573ms
7:	learn: 51141.0441787	total: 7.79ms	remaining: 577ms
8:	learn: 48822.0233702	total: 9.48ms	remaining: 623ms
9:	learn: 46612.3128474	total: 10.9ms	remaining: 646ms
10:	learn: 44824.2002710	total: 11.7ms	remaining: 627ms
11:	learn: 43191.0961343	total: 12.6ms	remaining: 615ms
12:	learn: 41585.2859408	total: 13.4ms	remaining: 605ms
13:	learn: 40073.6851396	total: 14.2ms	remaining: 594ms
14:	learn: 38865.9011251	total: 15.1ms	remaining: 589ms
15:	learn: 37720.8681740	total: 16ms	remaining: 583ms
16:	learn: 36793.7806717	total: 17.1ms	remaining: 586ms
17:	learn: 35779.8134411	total: 17.9ms	remaining: 577ms
18:	l

In [20]:
estimate_model(model2)

Testing performance
RMSE: 33748.41
R2: 0.83
Score: 0.8304
Local Score: 0.9941
Best params:  {'iterations': 600, 'learning_rate': 0.1, 'depth': 5, 'l2_leaf_reg': 5, 'loss_function': 'RMSE'}


In [21]:
import pickle
with open("models/Catboost_1_simple.pickle", "wb") as file:
    pickle.dump(model1, file)

In [34]:
test = pd.read_csv("features/dataset_test_1.csv")

In [35]:
pred = model1.predict(test.drop("Id", axis=1))

In [36]:
test['SalePrice'] = pred

In [37]:
test[['Id', "SalePrice"]].to_csv("results/result_catboost_2.csv", index=False)