## Diamond dataset
https://www.kaggle.com/datasets/natedir/diamonds

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

### Готовим данные

In [33]:
df_diamonds = pd.read_csv('diamonds.csv')
df_diamonds.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [34]:
df_diamonds.drop(['Unnamed: 0'], axis=1,inplace=True)

In [35]:
df_diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [36]:
y = df_diamonds['price']
X = df_diamonds.drop(['price'], axis=1)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, len(y_train))
print(X_test.shape, len(y_test))

(43152, 9) 43152
(10788, 9) 10788


### Учим модель

In [38]:
cat_features=['cut', 'color', 'clarity']

In [39]:
model = CatBoostRegressor(iterations=100,
                          learning_rate=1,
                          depth=5)
# Fit model
model.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 1536.8508789	total: 4.84ms	remaining: 479ms
1:	learn: 1242.9798386	total: 10.3ms	remaining: 503ms
2:	learn: 1044.8484081	total: 15.7ms	remaining: 508ms
3:	learn: 951.3271921	total: 19.9ms	remaining: 477ms
4:	learn: 894.0832520	total: 23.5ms	remaining: 447ms
5:	learn: 859.0847833	total: 27.3ms	remaining: 427ms
6:	learn: 827.3026822	total: 31.5ms	remaining: 418ms
7:	learn: 799.4628893	total: 35.2ms	remaining: 405ms
8:	learn: 756.1927229	total: 39ms	remaining: 394ms
9:	learn: 741.5853611	total: 42.4ms	remaining: 382ms
10:	learn: 731.1648726	total: 46.1ms	remaining: 373ms
11:	learn: 698.1339506	total: 49.8ms	remaining: 365ms
12:	learn: 692.4772928	total: 53.4ms	remaining: 358ms
13:	learn: 683.3417204	total: 56.7ms	remaining: 348ms
14:	learn: 671.5967504	total: 60.4ms	remaining: 342ms
15:	learn: 667.4825386	total: 63.5ms	remaining: 334ms
16:	learn: 648.2426713	total: 66.7ms	remaining: 326ms
17:	learn: 638.4483466	total: 69.8ms	remaining: 318ms
18:	learn: 631.0412949	total: 72.7ms	

<catboost.core.CatBoostRegressor at 0x7fd2237d8040>

In [40]:
# Get predictions
mean_absolute_error(model.predict(X_train), y_train)

283.4295739995182

In [41]:
mean_absolute_error(model.predict(X_test), y_test)

308.5291969248766

In [42]:
model.save_model('diamond_catboost',
           format="cbm",
           export_parameters=None,
           pool=None)

In [43]:
model_load = CatBoostRegressor()
model_load.load_model('diamond_catboost', format='cbm')
mean_absolute_error(model_load.predict(X_test), y_test)

308.5291969248766

In [44]:
X_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
10176,1.1,Ideal,H,SI2,62.0,55.0,6.61,6.65,4.11
16083,1.29,Ideal,H,SI1,62.6,56.0,6.96,6.93,4.35
13420,1.2,Premium,I,SI1,61.1,58.0,6.88,6.8,4.18
20407,1.5,Ideal,F,SI1,60.9,56.0,7.43,7.36,4.5
8909,0.9,Very Good,F,VS2,61.7,57.0,6.17,6.21,3.82
