In [47]:
import pandas as pd

df = pd.read_csv("diamonds.csv", sep=",", decimal=".")
df = df.drop(columns="Unnamed: 0")
df['cut'].value_counts()

cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64

In [56]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

X = df.drop('cut', axis=1)
y = df['cut']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [81]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Определить процедуру препроцессинга для числовых значений
numeric_transformer = Pipeline(steps=[
	('imputer', SimpleImputer(strategy='mean')),
	('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
	('encoder', OneHotEncoder(drop='first'))
])

# Скомбинировать процедуры препроцессинга данных
preprocessor = ColumnTransformer(
	transformers=[
    	('num', numeric_transformer, numeric_features),
    	('cat', categorical_transformer, categorical_features)
	]
)

# Применить процедуру препроцессинга к обучающим данным
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

sampling_strategy={
    "Fair": 3000,
    'Good': 4000,
    'Very Good': 20000,
}
smote = SMOTE(random_state=10, sampling_strategy=sampling_strategy)
X_resampled, y_resampled = smote.fit_resample(X_train_processed, y_train)

In [94]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=2000, learning_rate=0.3, depth=3, random_state=42)
model.fit(X_train_processed, y_train)
predict = model.predict(X_test_processed)

0:	learn: 1.1971740	total: 7.8ms	remaining: 15.6s
1:	learn: 1.0507850	total: 14.9ms	remaining: 14.9s
2:	learn: 0.9678202	total: 22.5ms	remaining: 15s
3:	learn: 0.9181344	total: 29.8ms	remaining: 14.9s
4:	learn: 0.8815195	total: 36.2ms	remaining: 14.4s
5:	learn: 0.8526740	total: 43.1ms	remaining: 14.3s
6:	learn: 0.8322853	total: 49.3ms	remaining: 14s
7:	learn: 0.8176435	total: 55.8ms	remaining: 13.9s
8:	learn: 0.8045019	total: 62.4ms	remaining: 13.8s
9:	learn: 0.7945736	total: 68.6ms	remaining: 13.7s
10:	learn: 0.7810852	total: 75.3ms	remaining: 13.6s
11:	learn: 0.7728866	total: 83.1ms	remaining: 13.8s
12:	learn: 0.7665094	total: 89.4ms	remaining: 13.7s
13:	learn: 0.7621505	total: 96.8ms	remaining: 13.7s
14:	learn: 0.7582480	total: 103ms	remaining: 13.7s
15:	learn: 0.7550711	total: 109ms	remaining: 13.6s
16:	learn: 0.7496801	total: 116ms	remaining: 13.5s
17:	learn: 0.7455661	total: 122ms	remaining: 13.5s
18:	learn: 0.7409243	total: 128ms	remaining: 13.3s
19:	learn: 0.7368831	total: 133m

In [95]:
from sklearn.metrics import classification_report

report = classification_report(y_test, predict, output_dict=True)
report = pd.DataFrame(report).transpose()

report

Unnamed: 0,precision,recall,f1-score,support
Fair,0.927835,0.897756,0.912548,401.0
Good,0.794667,0.704492,0.746867,1269.0
Ideal,0.825501,0.913514,0.86728,5365.0
Premium,0.825823,0.82108,0.823445,3482.0
Very Good,0.690245,0.598383,0.64104,2968.0
accuracy,0.800148,0.800148,0.800148,0.800148
macro avg,0.812814,0.787045,0.798236,13485.0
weighted avg,0.795956,0.800148,0.796181,13485.0
