In [16]:
## load basic tools
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from catboost import CatBoostRegressor
import warnings
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import joblib
warnings.filterwarnings('ignore')

In [11]:
# Load data
data = pd.read_csv('../DatasetForML.csv')
data.head()

Unnamed: 0,symboling,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,cylindernumber,enginesize,boreratio,...,enginetype_OHCV,enginetype_ROTOR,fuelsystem_2BBL,fuelsystem_4BBL,fuelsystem_IDI,fuelsystem_MFI,fuelsystem_MPFI,fuelsystem_SPDI,fuelsystem_SPFI,price_category
0,3,2,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,1,0,0,1
1,3,2,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,1,0,0,1
2,1,2,94.5,171.2,65.5,52.4,2823,6,152,2.68,...,1,0,0,0,0,0,1,0,0,1
3,2,4,99.8,176.6,66.2,54.3,2337,4,109,3.19,...,0,0,0,0,0,0,1,0,0,1
4,2,4,99.4,176.6,66.4,54.3,2824,5,136,3.19,...,0,0,0,0,0,0,1,0,0,1


In [12]:
# Correlation matrix filtered by correlation coefficient > 0.8 (absolute)
corr= data.corr()
max_corr = (abs(corr) >= 0.8)
sup_filter = np.triu(np.ones(corr.shape), k=1).astype(bool)
corr_filtered = corr[max_corr]
corr_filtered = corr_filtered.dropna(thresh=2).dropna(axis=1, how='all')


In [13]:
corr_features = list(corr_filtered.loc['price',:].dropna().drop('price').index)
corr_features

['curbweight', 'enginesize', 'horsepower']

In [14]:
X_red = data[corr_features]
y=data['price']
X_red_train, X_red_test, y_train, y_test = train_test_split(X_red, y, test_size=0.2, random_state=42)

In [15]:
pipeline = Pipeline([('scaler', StandardScaler()), ('model', CatBoostRegressor())])
pipeline.fit(X_red_train, y_train)
y_pred = pipeline.predict(X_red_test)
score = r2_score(y_test, y_pred)
print(score)

Learning rate set to 0.03077
0:	learn: 7577.9023636	total: 2.25ms	remaining: 2.24s
1:	learn: 7436.3933034	total: 2.93ms	remaining: 1.46s
2:	learn: 7278.7264613	total: 3.71ms	remaining: 1.23s
3:	learn: 7143.8575216	total: 4.19ms	remaining: 1.04s
4:	learn: 7014.6666019	total: 5.03ms	remaining: 1s
5:	learn: 6874.0397162	total: 6.2ms	remaining: 1.03s
6:	learn: 6755.2358085	total: 6.96ms	remaining: 987ms
7:	learn: 6629.2030995	total: 9.57ms	remaining: 1.19s
8:	learn: 6495.9752855	total: 10.3ms	remaining: 1.14s
9:	learn: 6390.1572033	total: 11ms	remaining: 1.09s
10:	learn: 6280.1869720	total: 11.7ms	remaining: 1.05s
11:	learn: 6171.5800765	total: 12.4ms	remaining: 1.02s
12:	learn: 6066.0633871	total: 13.1ms	remaining: 997ms
13:	learn: 5962.1291086	total: 13.8ms	remaining: 969ms
14:	learn: 5837.3584476	total: 14.3ms	remaining: 937ms
15:	learn: 5728.6386017	total: 14.9ms	remaining: 918ms
16:	learn: 5612.9055119	total: 15.8ms	remaining: 916ms
17:	learn: 5505.3121588	total: 16.5ms	remaining: 902

In [17]:
file_name = 'catboost_regressor.plk'
joblib.dump(pipeline, filename=file_name, compress=1)
print(f'Catboost pipeline saved into {file_name}' ) 

Catboost pipeline saved into catboost_regressor.plk
