In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from catboost import CatBoostClassifier
import warnings

In [2]:
# Load the HR analytics data; 'left' is the target, every other column is a feature.
hr = pd.read_csv(
    r"C:\Training\Academy\Statistics (Python)\Cases\human-resources-analytics\HR_comma_sep.csv"
)
y = hr['left']
X = hr.drop(columns='left')

# Hold out 30% of the rows as a test set, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

CatBoost without One-Hot Encoding (categorical columns are passed via `cat_features`)

In [14]:
# The columns named in cat_features are declared to CatBoost as categorical;
# no encoder is applied to X_train beforehand.
cgbm = CatBoostClassifier(
    n_estimators=50,
    cat_features=['Department', 'salary'],
    random_state=24,
)
cgbm.fit(X_train, y_train)

Learning rate set to 0.438486
0:	learn: 0.4127012	total: 42ms	remaining: 2.06s
1:	learn: 0.2883445	total: 75.7ms	remaining: 1.81s
2:	learn: 0.2171350	total: 110ms	remaining: 1.72s
3:	learn: 0.1787252	total: 148ms	remaining: 1.7s
4:	learn: 0.1554308	total: 186ms	remaining: 1.67s
5:	learn: 0.1402544	total: 230ms	remaining: 1.69s
6:	learn: 0.1274649	total: 268ms	remaining: 1.65s
7:	learn: 0.1223568	total: 306ms	remaining: 1.61s
8:	learn: 0.1151863	total: 345ms	remaining: 1.57s
9:	learn: 0.1090791	total: 382ms	remaining: 1.53s
10:	learn: 0.1055714	total: 420ms	remaining: 1.49s
11:	learn: 0.1027904	total: 460ms	remaining: 1.46s
12:	learn: 0.1001674	total: 499ms	remaining: 1.42s
13:	learn: 0.0980211	total: 538ms	remaining: 1.38s
14:	learn: 0.0960815	total: 576ms	remaining: 1.34s
15:	learn: 0.0946345	total: 614ms	remaining: 1.3s
16:	learn: 0.0927250	total: 652ms	remaining: 1.26s
17:	learn: 0.0906349	total: 695ms	remaining: 1.23s
18:	learn: 0.0885028	total: 733ms	remaining: 1.2s
19:	learn: 0.0

<catboost.core.CatBoostClassifier at 0x1d4e2c1ef30>

In [16]:
# Predict on the held-out split and report the accuracy against y_test.
y_pred = cgbm.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9773282951767059


Grid Search CV

In [23]:
# Shuffled 5-fold stratified CV with a fixed seed, searched over a
# 2 x 3 x 5 = 30-candidate grid and ranked by ROC AUC.
# Note: the search is fitted on the full X, y rather than on X_train.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {
    'n_estimators': [10, 50],
    'max_depth': [2, 3, 4],
    'learning_rate': np.linspace(0.001, 1, 5),
}
gcv = GridSearchCV(
    estimator=cgbm,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=3,
)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 0.6925472	total: 12.6ms	remaining: 113ms
1:	learn: 0.6919348	total: 30.1ms	remaining: 120ms
2:	learn: 0.6913254	total: 48.1ms	remaining: 112ms
3:	learn: 0.6907172	total: 58.2ms	remaining: 87.3ms
4:	learn: 0.6901085	total: 67.4ms	remaining: 67.4ms
5:	learn: 0.6895010	total: 82.6ms	remaining: 55.1ms
6:	learn: 0.6888947	total: 100ms	remaining: 43ms
7:	learn: 0.6882913	total: 114ms	remaining: 28.5ms
8:	learn: 0.6876875	total: 130ms	remaining: 14.5ms
9:	learn: 0.6870981	total: 144ms	remaining: 0us
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.911 total time=   0.1s
0:	learn: 0.6925453	total: 9.88ms	remaining: 88.9ms
1:	learn: 0.6919295	total: 26ms	remaining: 104ms
2:	learn: 0.6913171	total: 40.5ms	remaining: 94.4ms
3:	learn: 0.6907059	total: 49.6ms	remaining: 74.4ms
4:	learn: 0.6900937	total: 58.6ms	remaining: 58.6ms
5:	learn: 0.6894828	total: 73ms	remaining: 48.6ms
6:	learn: 0.6888731	total: 

In [25]:
# Best hyper-parameter combination, then its cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'learning_rate': 1.0, 'max_depth': 4, 'n_estimators': 50}
0.9893002633546979


## Medical Cost Personal Dataset (Regression on `charges`)

In [42]:
from catboost import CatBoostRegressor

In [30]:
import os

# Switch the working directory to the dataset folder, then read the file by its bare name.
os.chdir(r"C:\Training\Academy\Statistics (Python)\Cases\Medical Cost Personal")
medical = pd.read_csv("insurance.csv")

# 'charges' is the regression target; every other column is a feature.
# Note: this rebinds X and y, which previously held the HR data.
y = medical['charges']
X = medical.drop(columns='charges')

In [38]:
# Names of the columns to treat as categorical: every non-numeric column of X.
# select_dtypes is used instead of `X.dtypes == object` because that comparison
# misses text columns stored under pandas' string dtype or the category dtype.
X.select_dtypes(exclude='number').columns.tolist()

['sex', 'smoker', 'region']

In [48]:
# Declare every non-numeric column of X as categorical for CatBoost.
# select_dtypes is used instead of `X.dtypes == object` because that comparison
# misses text columns stored under pandas' string dtype or the category dtype,
# which would leave cat_features empty and pass raw strings as numeric features.
cat_cols = X.select_dtypes(exclude='number').columns.tolist()
cgbm = CatBoostRegressor(random_state=24, cat_features=cat_cols)

# Shuffled 5-fold CV with a fixed seed (plain KFold: the target is continuous),
# searched over a 2 x 3 x 5 = 30-candidate grid and ranked by R^2.
kfold = KFold(n_splits=5, shuffle=True, random_state=24)
params = {
    'n_estimators': [10, 50],
    'max_depth': [2, 3, 4],
    'learning_rate': np.linspace(0.001, 1, 5),
}
gcv = GridSearchCV(cgbm, param_grid=params, cv=kfold, scoring='r2', verbose=3)
gcv.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0:	learn: 12036.2666510	total: 8.96ms	remaining: 80.6ms
1:	learn: 12027.5885074	total: 17.3ms	remaining: 69.3ms
2:	learn: 12018.7579696	total: 36ms	remaining: 84.1ms
3:	learn: 12010.0080421	total: 47.5ms	remaining: 71.3ms
4:	learn: 12001.6612206	total: 58.8ms	remaining: 58.8ms
5:	learn: 11992.8131380	total: 64.9ms	remaining: 43.3ms
6:	learn: 11984.5229977	total: 76.1ms	remaining: 32.6ms
7:	learn: 11975.9771850	total: 87.3ms	remaining: 21.8ms
8:	learn: 11967.4506001	total: 98.3ms	remaining: 10.9ms
9:	learn: 11958.6367854	total: 110ms	remaining: 0us
[CV 1/5] END learning_rate=0.001, max_depth=2, n_estimators=10;, score=0.014 total time=   0.1s
0:	learn: 12050.3139519	total: 6.03ms	remaining: 54.3ms
1:	learn: 12041.5605243	total: 12.3ms	remaining: 49.4ms
2:	learn: 12032.6556923	total: 23.9ms	remaining: 55.8ms
3:	learn: 12023.9006792	total: 35.8ms	remaining: 53.7ms
4:	learn: 12015.5102852	total: 47.8ms	remaining: 47.8ms
5:	learn

In [50]:
# Best hyper-parameter combination, then its cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'learning_rate': 0.5005, 'max_depth': 2, 'n_estimators': 50}
0.8584148378430502
