In [None]:
# URL of the training CSV; it is passed directly to pd.read_csv when the data is loaded.
# NOTE(review): the `dl=1` query parameter presumably requests a direct file download
# from Dropbox rather than the HTML preview page — confirm the link is still valid.
TRAIN = "https://www.dropbox.com/scl/fi/5zy935lqpaqr9lat76ung/music_genre_train.csv?rlkey=ccovu9ml8pfi9whk1ba26zdda&dl=1"

In [None]:
pip install catboost




In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# Load the training data
train_data = pd.read_csv(TRAIN)

# Impute missing values: median for numeric columns, most frequent value otherwise.
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place form
# emits a FutureWarning in recent pandas and does not modify the frame at all
# once copy-on-write is enabled.
for column in train_data.columns:
    if train_data[column].isnull().any():
        if pd.api.types.is_numeric_dtype(train_data[column]):
            fill_value = train_data[column].median()
        else:
            # Covers `object` as well as dedicated string dtypes
            fill_value = train_data[column].mode()[0]
        train_data[column] = train_data[column].fillna(fill_value)

# Replace negative `duration_ms` values with the median of the positive durations
median_duration = train_data.loc[train_data['duration_ms'] > 0, 'duration_ms'].median()
train_data.loc[train_data['duration_ms'] < 0, 'duration_ms'] = median_duration

# Drop the target plus the highly correlated / unneeded columns
X = train_data.drop(columns=['music_genre', 'instance_id', 'track_name', 'obtained_date', 'energy'])

# One-hot encode the categorical features
X = pd.get_dummies(X, columns=['key', 'mode'], drop_first=True)
y = train_data['music_genre']

# Encode the class labels as integers
le = LabelEncoder()
y = le.fit_transform(y)

# Split BEFORE oversampling. Running SMOTE on the full data set first leaks
# information: synthetic points interpolated from training rows end up in the
# validation set (and vice versa), and the validation set no longer reflects the
# real class distribution, so the reported validation score is not trustworthy.
X_train_raw, X_val, y_train_raw, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance only the training portion with SMOTE; the validation set stays untouched.
# NOTE(review): SMOTE interpolates the one-hot columns too, producing fractional
# values for them — SMOTENC may be a better fit; confirm before switching.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_balanced, y_balanced

# Base CatBoost classifier: fixed seed, training log suppressed
catboost_model = CatBoostClassifier(random_state=42, silent=True)

# Hyperparameter grid explored by the search (3 * 3 * 3 * 4 = 108 combinations)
param_grid = dict(
    iterations=[100, 200, 300],
    depth=[4, 6, 8],
    learning_rate=[0.01, 0.1, 0.2],
    l2_leaf_reg=[1, 3, 5, 7],
)

# Exhaustive 3-fold cross-validated search, scored by accuracy, run in parallel
grid_search = GridSearchCV(
    catboost_model,
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Estimator carrying the best parameter combination found by the search
best_catboost_model = grid_search.best_estimator_

# Predict on both splits with the tuned model
y_train_pred = best_catboost_model.predict(X_train)
y_val_pred = best_catboost_model.predict(X_val)

# Accuracy on each split; per-class report and confusion matrix on validation only
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred, target_names=le.classes_)

# Emit the results in a fixed order: accuracies, report, confusion matrix
for label, value in (
    ("Train Accuracy:", train_accuracy),
    ("Validation Accuracy:", val_accuracy),
    ("\nClassification Report:\n", classification_rep),
    ("\nConfusion Matrix:\n", conf_matrix),
):
    print(label, value)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Train Accuracy: 0.8122813156053184
Validation Accuracy: 0.5437368789363191

Classification Report:
               precision    recall  f1-score   support

 Alternative       0.32      0.27      0.29       571
       Anime       0.56      0.57      0.56       571
       Blues       0.43      0.46      0.45       572
   Classical       0.85      0.88      0.87       572
     Country       0.52      0.59      0.55       572
  Electronic       0.64      0.61      0.62       571
     Hip-Hop       0.61      0.63      0.62       572
        Jazz       0.60      0.66      0.63       572
         Rap       0.50      0.53      0.52       571
        Rock       0.31      0.24      0.27       572

    accuracy                           0.54      5716
   macro avg       0.53      0.54      0.54      5716
weighted avg       0.53      0.54      0.54      5716


Confusion Matrix:
 [[153  63  57   3  54  48  38  30  50  75]
 [ 50 324  24  