In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

In [47]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis = 1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.25, random_state=42)


In [49]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.5-cp312-cp312-macosx_11_0_universal2.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.5-cp312-cp312-macosx_11_0_universal2.whl (26.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.1/26.1 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz, catboost
Successfully installed catboost-1.2.5 graphviz-0.20.3


In [51]:
from catboost import CatBoostRegressor

In [53]:
catb_model= CatBoostRegressor().fit(X_train, y_train)

Learning rate set to 0.031674
0:	learn: 437.6430699	total: 59.6ms	remaining: 59.6s
1:	learn: 431.3923642	total: 60.9ms	remaining: 30.4s
2:	learn: 424.8820360	total: 63.4ms	remaining: 21.1s
3:	learn: 418.2514904	total: 64.2ms	remaining: 16s
4:	learn: 412.6394021	total: 65.2ms	remaining: 13s
5:	learn: 406.6247020	total: 66.9ms	remaining: 11.1s
6:	learn: 400.5321206	total: 68ms	remaining: 9.64s
7:	learn: 394.6683437	total: 69ms	remaining: 8.55s
8:	learn: 388.2496484	total: 69.9ms	remaining: 7.7s
9:	learn: 382.9448842	total: 70.9ms	remaining: 7.02s
10:	learn: 377.2600080	total: 71.9ms	remaining: 6.46s
11:	learn: 372.4829606	total: 72.8ms	remaining: 6s
12:	learn: 366.6823437	total: 73.9ms	remaining: 5.61s
13:	learn: 362.6076230	total: 74.8ms	remaining: 5.27s
14:	learn: 358.0107745	total: 75.6ms	remaining: 4.97s
15:	learn: 353.2802665	total: 76.4ms	remaining: 4.7s
16:	learn: 348.5646265	total: 77.3ms	remaining: 4.47s
17:	learn: 343.6407912	total: 78ms	remaining: 4.26s
18:	learn: 339.2363847	

In [55]:
y_pred=catb_model.predict(X_test)

In [57]:
np.sqrt(mean_squared_error(y_test, y_pred))

351.194631344607

In [None]:
# model tuning

In [59]:
catb_params= {"iterations":[200,500,100],
              "learning_rate":[0.01,0.1],
              "depth":[3,6,8]}

In [62]:
catb_model=CatBoostRegressor()

In [64]:
catb_cv_model= GridSearchCV(catb_model,
                          catb_params,
                          cv=5,
                          n_jobs=-1,
                          verbose=2).fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
0:	learn: 432.7654990	total: 56.6ms	remaining: 11.3s
1:	learn: 430.7063071	total: 56.9ms	remaining: 5.64s
2:	learn: 428.6446707	total: 57.3ms	remaining: 3.76s
3:	learn: 426.5692639	total: 57.6ms	remaining: 2.82s
4:	learn: 424.5071715	total: 58ms	remaining: 2.26s
5:	learn: 422.4941196	total: 58.2ms	remaining: 1.88s
6:	learn: 420.6993247	total: 58.5ms	remaining: 1.61s
7:	learn: 418.7319320	total: 58.7ms	remaining: 1.41s
8:	learn: 416.8557986	total: 59ms	remaining: 1.25s
9:	learn: 415.2303544	total: 59.6ms	remaining: 1.13s
10:	learn: 413.2318844	total: 60.1ms	remaining: 1.03s
11:	learn: 411.5760332	total: 60.5ms	remaining: 948ms
12:	learn: 409.8316008	total: 61.1ms	remaining: 878ms
13:	learn: 408.1620000	total: 61.4ms	remaining: 816ms
14:	learn: 406.3481403	total: 61.8ms	remaining: 762ms
15:	learn: 404.5361495	total: 62.3ms	remaining: 716ms
16:	learn: 402.7515483	total: 62.7ms	remaining: 674ms
17:	learn: 401.0065014	total: 62.9m

In [66]:
catb_cv_model.best_params_

{'depth': 3, 'iterations': 200, 'learning_rate': 0.1}

In [68]:
catb_tuned=CatBoostRegressor(depth =3, iterations=500, learning_rate=0.1).fit(X_train, y_train)

0:	learn: 425.7900818	total: 819us	remaining: 409ms
1:	learn: 404.8723520	total: 1.55ms	remaining: 386ms
2:	learn: 387.4057666	total: 1.99ms	remaining: 330ms
3:	learn: 372.2801584	total: 2.31ms	remaining: 286ms
4:	learn: 358.9204229	total: 2.63ms	remaining: 261ms
5:	learn: 347.0083933	total: 3.06ms	remaining: 252ms
6:	learn: 336.0130818	total: 3.35ms	remaining: 236ms
7:	learn: 324.3923300	total: 3.73ms	remaining: 229ms
8:	learn: 314.8690957	total: 4.06ms	remaining: 221ms
9:	learn: 308.5075563	total: 4.36ms	remaining: 214ms
10:	learn: 298.8587285	total: 4.6ms	remaining: 205ms
11:	learn: 294.7655438	total: 4.84ms	remaining: 197ms
12:	learn: 288.0697862	total: 5.11ms	remaining: 191ms
13:	learn: 282.6697154	total: 5.4ms	remaining: 188ms
14:	learn: 277.6121667	total: 5.7ms	remaining: 184ms
15:	learn: 273.4383979	total: 5.98ms	remaining: 181ms
16:	learn: 269.1556201	total: 6.29ms	remaining: 179ms
17:	learn: 264.8098704	total: 6.59ms	remaining: 176ms
18:	learn: 261.6700768	total: 7.1ms	remain

In [70]:
y_pred=catb_tuned.predict(X_test)


In [74]:
np.sqrt(mean_squared_error(y_test, y_pred))

335.69740780164267