In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [5]:
import xgboost
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import catboost

In [4]:
ds = pd.read_csv("train.csv")

In [7]:
def prepare_dataset(dataset):
    ds_new = dataset.drop(columns=["Id"])

    category_features = ds_new.select_dtypes(include = ["object"]).columns
    number_features = ds_new.select_dtypes(exclude = ["object"]).columns

    ds_new[category_features] = ds_new[category_features].fillna('None')

    for feature in number_features:
        ds_new[feature] = ds_new[feature].fillna(ds_new[feature].mean())

    return ds_new

In [6]:
ds = prepare_dataset(ds)

In [7]:
from sklearn.preprocessing import LabelEncoder

ds_nocat = ds.copy()
category_features = ds.select_dtypes(include = ["object"]).columns

for feature in category_features:
    encoder = LabelEncoder()
    encoded_feature = encoder.fit_transform(ds[feature])
    ds_nocat[feature] = encoded_feature

In [8]:
y = np.log1p(ds.SalePrice.to_numpy())
x = ds_nocat.drop(columns=["SalePrice"]).to_numpy()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=98987)

In [9]:
x.shape,y.shape

((1460, 79), (1460,))

In [10]:
from sklearn.ensemble import RandomForestRegressor

parameters = {
    'criterion':('squared_error',), 
    'max_depth': (1000,),
    'max_features':(1/3, ),
    'n_estimators': (100, 1000),
    'min_samples_leaf': (1, 2, 8)
}

rforest = RandomForestRegressor()
rforest_gs = GridSearchCV(rforest, parameters, verbose=2)
rforest_gs.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   2.4s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   2.2s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   2.2s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   2.4s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=100; total time=   2.2s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf=1, n_estimators=1000; total time=  29.0s
[CV] END criterion=squared_error, max_depth=1000, max_features=0.3333333333333333, min_samples_leaf

In [11]:
best_scores = {}

In [15]:
from sklearn.metrics import mean_squared_error

def print_error(preds, gt):
    err =  np.sqrt(mean_squared_error(preds, gt))
    print('Root mean square error:', err)
    return err

In [13]:
print(rforest_gs.best_params_)
best_scores["RandomForest"] = print_error(rforest_gs.predict(x_test), y_test)

{'criterion': 'squared_error', 'max_depth': 1000, 'max_features': 0.3333333333333333, 'min_samples_leaf': 1, 'n_estimators': 1000}
Root mean square error: 0.15443623632758616


In [14]:
parameters = {
    "learning_rate": (0.001, 0.01, ),
    "max_depth": [ 2, 4],
    "min_child_weight": [ 1, 10],
    "gamma":[ 0.0,],
    "n_estimators": [1000, 5000]
}
xgb = xgboost.XGBRegressor()
xgb_gs = GridSearchCV(xgb, parameters, verbose=2, cv=3)
xgb_gs.fit(x_train, y_train);

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   0.8s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   0.8s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=1000; total time=   0.8s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   4.3s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   4.3s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=1, n_estimators=5000; total time=   4.3s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=10, n_estimators=1000; total time=   0.8s
[CV] END gamma=0.0, learning_rate=0.001, max_depth=2, min_child_weight=10, n_estimators=1000; total time=   0.8s
[CV] END gamma=0.0, learning_rate=0.001, 

In [15]:
print(xgb_gs.best_params_)
best_scores["XGBoost"] = print_error(xgb_gs.predict(x_test), y_test)

{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 5000}
Root mean square error: 0.1315894507589594


In [16]:
parameters = {
    'num_leaves': (40, 20, 10,),
    'learning_rate': (0.1, 0.01, 0.05),
    'max_depth': (-1,),
    'n_estimators': (10**3, 10**4),}

lgbmr = GridSearchCV(LGBMRegressor(), parameters, verbose=2)
lgbmr.fit(x_train, y_train);

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021537
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.3s
[

[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.01379

[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.1s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   1.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.02153

[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=10; total time=   0.6s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=1000, num_leaves=10; total time=   0.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000366 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.02635









[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.5s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000894 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021537






[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  24.8s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3018
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.020510




















[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.7s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171












[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  24.1s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796








[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.2s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  13.6s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021537
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  14.2s
[LightGBM] [Info] Auto-choos

[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  14.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796
[CV] END learning_rate=0.1, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  13.1s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026

[CV] END learning_rate=0.01, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021537


[CV] END learning_rate=0.01, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3018
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.020510


[CV] END learning_rate=0.01, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.2s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171




[CV] END learning_rate=0.01, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796


[CV] END learning_rate=0.01, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.01, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   1.4s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021









[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.7s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021537








[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  25.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3018
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.020510












[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.9s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171








[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  25.7s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796














[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  24.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  13.1s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.0

[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=10; total time=   7.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171
[CV] END learning_rate=0.01, max_depth=-1, n_estimators=10000, num_leaves=10; total time=   7.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.0

[CV] END learning_rate=0.05, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.1s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3018
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.020510


[CV] END learning_rate=0.05, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.4s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171


[CV] END learning_rate=0.05, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796


[CV] END learning_rate=0.05, max_depth=-1, n_estimators=1000, num_leaves=40; total time=   2.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=1000, num_leaves=20; total time=   1.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021









[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  24.5s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.021537






[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  24.8s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3018
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.020510
















[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.9s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000843 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.026171










[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  25.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1052, number of used features: 72
[LightGBM] [Info] Start training from score 12.013796








[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=40; total time=  23.8s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3017
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.026352
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  13.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3027
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 73
[LightGBM] [Info] Start training from score 12.0

[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  12.9s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3018
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 71
[LightGBM] [Info] Start training from score 12.020510
[CV] END learning_rate=0.05, max_depth=-1, n_estimators=10000, num_leaves=20; total time=  14.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3039
[LightGBM] [Info] Number of data points in the train set: 1051, number of used features: 72
[LightGBM] [Info] Start training from score 12.0

In [17]:
print(lgbmr.best_params_)
best_scores["LightGBM"] = print_error(lgbmr.predict(x_test), y_test)

{'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 10}
Root mean square error: 0.14171984107096894


In [17]:
ds = pd.read_csv("train.csv")

ds.drop(columns=["Id"], inplace=True)
x = ds.drop(columns=['SalePrice'])
y = np.log1p(ds['SalePrice'])

cat_features = ds.select_dtypes(include = ["object"]).columns

x[cat_features] = x[cat_features].fillna('None')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=98987)

In [18]:
train_pool = catboost.Pool(x_train, y_train, cat_features=cat_features.tolist())
test_pool = catboost.Pool(x_test, y_test, cat_features=cat_features.tolist())

In [19]:
p_grid = {
        'learning_rate': [0.07, 0.9],
        'depth': [1, 2, 3],
        'l2_leaf_reg': [0.7, 1, 1.3],
}

catboost_cls = CatBoostRegressor(
        loss_function='RMSE',
        verbose=0
)
grid_search_results = catboost_cls.grid_search(p_grid, train_pool, shuffle=False, verbose=1, search_by_train_test_split=False)

Training on fold [0/3]

bestTest = 0.1424761025
bestIteration = 918

Training on fold [1/3]

bestTest = 0.1225894164
bestIteration = 999

Training on fold [2/3]

bestTest = 0.1422752513
bestIteration = 392

0:	loss: 0.1363115	best: 0.1363115 (0)	total: 13.5s	remaining: 3m 48s
Training on fold [0/3]

bestTest = 0.1452043185
bestIteration = 890

Training on fold [1/3]

bestTest = 0.1288349456
bestIteration = 420

Training on fold [2/3]

bestTest = 0.1584301759
bestIteration = 804

1:	loss: 0.1455148	best: 0.1363115 (0)	total: 27.7s	remaining: 3m 41s
Training on fold [0/3]

bestTest = 0.1422212522
bestIteration = 955

Training on fold [1/3]

bestTest = 0.1221175133
bestIteration = 999

Training on fold [2/3]

bestTest = 0.1427051403
bestIteration = 468

2:	loss: 0.1360531	best: 0.1360531 (2)	total: 44s	remaining: 3m 39s
Training on fold [0/3]

bestTest = 0.1485506428
bestIteration = 979

Training on fold [1/3]

bestTest = 0.1267461794
bestIteration = 469

Training on fold [2/3]

bestTest 

In [20]:
print_error(catboost_cls.predict(test_pool), y_test)

Root mean square error: 0.1276739987586883


0.1276739987586883

In [1]:
#special for Kaggle

In [21]:
test_dataset = pd.read_csv('test.csv')
test_dataset = prepare_dataset(test_dataset)

cat_features = test_dataset.select_dtypes(['object']).columns.tolist()

y_pred = catboost_cls.predict(test_dataset)

y_pred = np.expm1(y_pred)

In [22]:
y_pred

array([120995.43352071, 156772.53467243, 180902.51969985, ...,
       168663.0707533 , 122296.12663951, 230285.94474493])

In [23]:
my_submission = pd.read_csv('sample_submission.csv')

In [24]:
my_submission

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


In [25]:
my_submission['SalePrice'] = y_pred

In [26]:
my_submission

Unnamed: 0,Id,SalePrice
0,1461,120995.433521
1,1462,156772.534672
2,1463,180902.519700
3,1464,195579.875315
4,1465,198189.797487
...,...,...
1454,2915,87224.945809
1455,2916,85925.232111
1456,2917,168663.070753
1457,2918,122296.126640


In [27]:
my_submission.to_csv('my_submission.csv', index=False)