# CatBoost (Category Boosting)

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the raw table and work on a copy with incomplete rows removed.
hit = pd.read_csv("Hitters.csv")
df = hit.copy().dropna()

# One-hot encode the three categorical columns; only one indicator per
# column is carried into the feature matrix further down.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])

# Target column, then the remaining non-categorical columns cast to float.
y = df["Salary"]
X_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


In [3]:
from catboost import CatBoostRegressor

In [4]:
# Train a CatBoost regressor with the library's default settings.
# fit() hands back the estimator it was called on, so `catb` and
# `catb_model` refer to the same fitted object.
catb = CatBoostRegressor()
catb.fit(X_train, y_train)
catb_model = catb

Learning rate set to 0.031674
0:	learn: 437.6430699	total: 136ms	remaining: 2m 16s
1:	learn: 431.3923642	total: 138ms	remaining: 1m 8s
2:	learn: 424.8820360	total: 139ms	remaining: 46.1s
3:	learn: 418.2514904	total: 140ms	remaining: 34.8s
4:	learn: 412.6394021	total: 141ms	remaining: 28s
5:	learn: 406.6247020	total: 142ms	remaining: 23.5s
6:	learn: 400.5321206	total: 143ms	remaining: 20.3s
7:	learn: 394.6683437	total: 144ms	remaining: 17.8s
8:	learn: 388.2496484	total: 144ms	remaining: 15.9s
9:	learn: 382.9448842	total: 145ms	remaining: 14.4s
10:	learn: 377.2600080	total: 146ms	remaining: 13.1s
11:	learn: 372.4829606	total: 147ms	remaining: 12.1s
12:	learn: 366.6823437	total: 148ms	remaining: 11.2s
13:	learn: 362.6076230	total: 149ms	remaining: 10.5s
14:	learn: 358.0107745	total: 150ms	remaining: 9.83s
15:	learn: 353.2802665	total: 151ms	remaining: 9.26s
16:	learn: 348.5646265	total: 151ms	remaining: 8.76s
17:	learn: 343.6407912	total: 152ms	remaining: 8.31s
18:	learn: 339.2363847	tota

## Prediction

In [5]:
# Predict on the held-out rows and report the root-mean-squared error.
y_pred = catb_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

351.194631344607

## Model Tuning

In [7]:
# Candidate hyper-parameter values for the search: 4 x 4 x 6 = 96 combinations.
catb_grid = dict(
    iterations=[200, 500, 1000, 2000],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    depth=list(range(3, 9)),
)

In [8]:
# Wrap a fresh default regressor in a grid search over `catb_grid`,
# scored with 5-fold cross-validation. n_jobs=-1 asks scikit-learn to use
# all available processors; verbose=2 prints progress for each fit.
catb = CatBoostRegressor()
catb_cv_model = GridSearchCV(
    estimator=catb,
    param_grid=catb_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [13]:
# Fitting the model in the cell below takes roughly
# 45 minutes.
# It also creates a folder named catboost_info in the
# working directory.

In [None]:
# Run the hyper-parameter search on the training split only, so the test
# split stays unseen during tuning.
catb_cv_model.fit(X_train, y_train)

In [None]:
# Show the parameter combination that scored best in the search.
catb_cv_model.best_params_

In [None]:
# Hyper-parameters for the final model are fixed by hand here
# (presumably copied from an earlier run's best_params_ — TODO confirm).
tuned_params = {'iterations': 200, 'learning_rate': 0.01, 'depth': 8}

# fit() returns the estimator itself, so construction and training chain.
catb_tuned = CatBoostRegressor(**tuned_params).fit(X_train, y_train)

In [12]:
# Root-mean-squared error of the tuned model on the held-out rows.
y_pred = catb_tuned.predict(X_test)
tuned_mse = mean_squared_error(y_test, y_pred)
np.sqrt(tuned_mse)

369.6970696250705