In [166]:
import os
import fnmatch
import numpy as np
import librosa
import librosa.display
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import mode
import optuna

# Извлечение данных. Треки длительностью 6 секунд

In [5]:
# Папка с аудиофайлами
folder_path = 'C:/Users/Mary/Desktop/Диплом/all_music/'
files = fnmatch.filter(os.listdir(folder_path), '*.mp3')

# Списки для хранения признаков и имен файлов
chroma_features_list = []
mel_list = []
mfcc_list = []
tempo_list = []
file_names = []

# Размер интервала в секундах
interval_duration = 6  # секунды

for file in files:
    audio_path = os.path.join(folder_path, file)
    y, sr = librosa.load(audio_path, sr=None, res_type='kaiser_fast')

    # Определяем длину трека в секундах
    track_duration = librosa.get_duration(y=y, sr=sr)

    # Разбиваем трек на интервалы по 6 секунд
    num_intervals = int(track_duration // interval_duration)

    # Обрабатываем каждый интервал по очереди
    for i in range(num_intervals):
        start_sample = i * interval_duration * sr
        end_sample = (i + 1) * interval_duration * sr
        y_interval = y[int(start_sample):int(end_sample)]

        # Извлекаем признаки для каждого интервала

        def extract_statistics(feature):
            return np.hstack([
                np.mean(feature, axis=1),    # Среднее
                np.std(feature, axis=1),     # Стандартное отклонение
                np.median(feature, axis=1),  # Медиана
                np.max(feature, axis=1) - np.min(feature, axis=1)  # Размах
            ])

        # 1. Хромаграмма CQT
        chroma_cqt = librosa.feature.chroma_cqt(y=y_interval, sr=sr, bins_per_octave=60, n_chroma=60)
        chroma_cqt = np.log1p(chroma_cqt)
        chroma_cqt_stats = extract_statistics(chroma_cqt)

        # 2. Хромаграмма STFT
        chroma_stft = librosa.feature.chroma_stft(y=y_interval, sr=sr)
        chroma_stft = np.log1p(chroma_stft)
        chroma_stft_stats = extract_statistics(chroma_stft)

        # 3. Хромаграмма CENS
        chroma_cens = librosa.feature.chroma_cens(y=y_interval, sr=sr)
        chroma_cens = np.log1p(chroma_cens)
        chroma_cens_stats = extract_statistics(chroma_cens)

        # Объединяем хромаграммы в один вектор
        chroma_features = np.hstack([chroma_cqt_stats, chroma_stft_stats, chroma_cens_stats])

        # 4. Mel-спектрограмма
        mel = librosa.feature.melspectrogram(y=y_interval, sr=sr)
        mel_list.append(mel.mean(axis=1))

        # 5. MFCC
        mfcc = librosa.feature.mfcc(y=y_interval, sr=sr, n_mfcc=40)
        mfcc_list.append(mfcc.mean(axis=1))

        # 6. Темп (BPM)
        tempo, _ = librosa.beat.beat_track(y=y_interval, sr=sr)
        tempo_list.append(tempo)

        # Добавляем признаки в список
        chroma_features_list.append(chroma_features)
        file_names.append(file)

# Преобразуем в numpy массив
chroma_array = np.array(chroma_features_list)
mel_array = np.array(mel_list)
mfcc_array = np.array(mfcc_list)

# Нормализация признаков
scaler = StandardScaler()
chroma_array_scaled = scaler.fit_transform(chroma_array)
mel_array_scaled = scaler.fit_transform(mel_array)
mfcc_array_scaled = scaler.fit_transform(mfcc_array)

# Выводим информацию о данных
print(f'Обработано {len(chroma_features_list)} интервалов.')
print(f'Форма массива хромаграмм: {chroma_array_scaled.shape}')
print(f'Форма массива MFCC: {mfcc_array_scaled.shape}')
print(f'Форма массива Mel-спектрограмм: {mel_array_scaled.shape}')


Обработано 4111 интервалов.
Форма массива хромаграмм: (4111, 336)
Форма массива MFCC: (4111, 40)
Форма массива Mel-спектрограмм: (4111, 128)


In [7]:
total_expected_intervals = 0

for file in files:
    audio_path = os.path.join(folder_path, file)
    y, sr = librosa.load(audio_path, sr=None, res_type='kaiser_fast')
    track_duration = librosa.get_duration(y=y, sr=sr)
    num_intervals = int(track_duration // interval_duration)
    print(f"{file}: {track_duration:.2f} сек → {num_intervals} интервалов")
    total_expected_intervals += num_intervals

print(f"Всего ожидается интервалов: {total_expected_intervals}")

MT0000004637.mp3: 30.06 сек → 5 интервалов
MT0000011357.mp3: 29.18 сек → 4 интервалов
MT0000011975.mp3: 30.06 сек → 5 интервалов
MT0000040632.mp3: 30.06 сек → 5 интервалов
MT0000044741.mp3: 30.06 сек → 5 интервалов
MT0000054705.mp3: 30.06 сек → 5 интервалов
MT0000082187.mp3: 30.06 сек → 5 интервалов
MT0000088320.mp3: 30.06 сек → 5 интервалов
MT0000092267.mp3: 30.06 сек → 5 интервалов
MT0000133200.mp3: 30.06 сек → 5 интервалов
MT0000202045.mp3: 30.06 сек → 5 интервалов
MT0000203193.mp3: 30.06 сек → 5 интервалов
MT0000203272.mp3: 29.13 сек → 4 интервалов
MT0000216849.mp3: 30.06 сек → 5 интервалов
MT0000218346.mp3: 30.06 сек → 5 интервалов
MT0000235880.mp3: 29.94 сек → 4 интервалов
MT0000249842.mp3: 30.06 сек → 5 интервалов
MT0000255724.mp3: 30.06 сек → 5 интервалов
MT0000299291.mp3: 30.06 сек → 5 интервалов
MT0000300896.mp3: 30.06 сек → 5 интервалов
MT0000315392.mp3: 30.10 сек → 5 интервалов
MT0000336135.mp3: 30.06 сек → 5 интервалов
MT0000348553.mp3: 30.06 сек → 5 интервалов
MT000036402

# Треки разной длины. Делим на фиксированное количество частей(5)

In [10]:
import os
import fnmatch
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Путь к папке
folder_path = 'C:/Users/Mary/Desktop/Диплом/all_music/'
files = fnmatch.filter(os.listdir(folder_path), '*.mp3')

# Списки для хранения признаков
chroma_features_list = []
mel_list = []
mfcc_list = []
tempo_list = []
file_names = []
interval_indices = []

# Обработка каждого аудиофайла
for file in files:
    audio_path = os.path.join(folder_path, file)
    y, sr = librosa.load(audio_path, sr=None, res_type='kaiser_fast')
    total_samples = len(y)

    num_segments = 5
    segment_length = total_samples // num_segments

    for i in range(num_segments):
        start_sample = i * segment_length
        end_sample = (i + 1) * segment_length if i < num_segments - 1 else total_samples
        y_interval = y[start_sample:end_sample]

        def extract_statistics(feature):
            return np.hstack([
                np.mean(feature, axis=1),
                np.std(feature, axis=1),
                np.median(feature, axis=1),
                np.max(feature, axis=1) - np.min(feature, axis=1)
            ])

        chroma_cqt = librosa.feature.chroma_cqt(y=y_interval, sr=sr, bins_per_octave=60, n_chroma=60)
        chroma_cqt = np.log1p(chroma_cqt)
        chroma_cqt_stats = extract_statistics(chroma_cqt)

        chroma_stft = librosa.feature.chroma_stft(y=y_interval, sr=sr)
        chroma_stft = np.log1p(chroma_stft)
        chroma_stft_stats = extract_statistics(chroma_stft)

        chroma_cens = librosa.feature.chroma_cens(y=y_interval, sr=sr)
        chroma_cens = np.log1p(chroma_cens)
        chroma_cens_stats = extract_statistics(chroma_cens)

        chroma_features = np.hstack([chroma_cqt_stats, chroma_stft_stats, chroma_cens_stats])

        mel = librosa.feature.melspectrogram(y=y_interval, sr=sr)
        mel_list.append(mel.mean(axis=1))

        mfcc = librosa.feature.mfcc(y=y_interval, sr=sr, n_mfcc=40)
        mfcc_list.append(mfcc.mean(axis=1))

        tempo, _ = librosa.beat.beat_track(y=y_interval, sr=sr)
        tempo_list.append(tempo)

        chroma_features_list.append(chroma_features)
        file_names.append(file)
        interval_indices.append(i)




In [12]:
# Преобразуем признаки в массивы
chroma_array = np.array(chroma_features_list)
mel_array = np.array(mel_list)
mfcc_array = np.array(mfcc_list)
tempo_array = np.array(tempo_list).reshape(-1, 1)

# Нормализация
scaler = StandardScaler()
chroma_array_scaled = scaler.fit_transform(chroma_array)
mel_array_scaled = scaler.fit_transform(mel_array)
mfcc_array_scaled = scaler.fit_transform(mfcc_array)
tempo_array_scaled = scaler.fit_transform(tempo_array)

# Объединяем признаки
X_full = np.hstack([chroma_array_scaled, mel_array_scaled, mfcc_array_scaled, tempo_array_scaled])

# Формируем имена колонок
chroma_cols = [f'chroma_{i}' for i in range(chroma_array.shape[1])]
mel_cols = [f'mel_{i}' for i in range(mel_array.shape[1])]
mfcc_cols = [f'mfcc_{i}' for i in range(mfcc_array.shape[1])]
tempo_col = ['tempo']
all_columns = chroma_cols + mel_cols + mfcc_cols + tempo_col

# Создаём датафрейм
df = pd.DataFrame(X_full, columns=all_columns)
df['file_name'] = file_names
df['interval_number'] = interval_indices

# Просмотр результатов
print(df.head())
print(f'Всего строк в датафрейме: {len(df)}')


   chroma_0  chroma_1  chroma_2  chroma_3  chroma_4  chroma_5  chroma_6  \
0  0.268737  0.023156  0.098043  0.040503  0.049885  0.212958  0.528250   
1  0.605032 -0.167954 -0.416713 -0.451981 -0.382166 -0.265434 -0.043554   
2 -0.350068 -0.480495 -0.290440 -0.157244  0.035751  0.247772  0.580638   
3  0.660533 -0.248691 -0.275367 -0.159114 -0.211003 -0.116964  0.206494   
4 -0.488292 -0.616932 -0.412675 -0.210915 -0.131289 -0.012906  0.390483   

   chroma_7  chroma_8  chroma_9  ...   mfcc_33   mfcc_34   mfcc_35   mfcc_36  \
0  0.847383  1.015245  1.485166  ...  0.797992  0.413381 -0.145381 -0.833907   
1  0.254138  0.535075  1.253548  ...  1.015882  1.074985 -0.195630 -0.756741   
2  0.794555  1.430729  1.389776  ...  1.478598  1.318274  0.148898 -0.431756   
3  0.271629  0.395723  0.262516  ...  0.549146  1.228777  0.476222  0.039069   
4  0.831392  0.939970  2.180129  ...  2.191127  1.996992  0.062888 -1.399714   

    mfcc_37   mfcc_38   mfcc_39     tempo         file_name  interva

In [18]:
df.to_csv(r'C:\Users\Mary\Desktop\Диплом\df_five_parts_features.csv')

In [22]:
df.shape

(4500, 507)

# Собираем итоговый датафрейм

In [42]:
music_df = pd.read_csv('C:/Users/Mary/Desktop/Диплом/MER_audio_taffc_dataset/panda_dataset_taffc_metadata.csv')

In [46]:
music_df['Artist'] = music_df['Artist'].isna().fillna('no_name')

In [52]:
combined_df = pd.concat([df, pd.Series(file_names, name='Song')], axis=1)

In [54]:
combined_df['Song'] = combined_df['Song'].str[:12]

In [64]:
combined_df = combined_df.drop('file_name', axis=1)

In [68]:
merged_df = pd.merge(music_df, combined_df, on='Song', how='left')

In [206]:
final_df = merged_df[['Quadrant'] +['Song'] + list(merged_df.columns[-506:])]
#merged_df = pd.concat(merged_df['Quadrant'], combined_df)
final_df.head()

Unnamed: 0,Quadrant,Song,chroma_0,chroma_1,chroma_2,chroma_3,chroma_4,chroma_5,chroma_6,chroma_7,...,mfcc_32,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,tempo,interval_number
0,Q3,MT0000004637,0.268737,0.023156,0.098043,0.040503,0.049885,0.212958,0.52825,0.847383,...,1.032472,0.797992,0.413381,-0.145381,-0.833907,-0.793811,-0.550932,0.815139,1.191941,0
1,Q3,MT0000004637,0.605032,-0.167954,-0.416713,-0.451981,-0.382166,-0.265434,-0.043554,0.254138,...,0.89188,1.015882,1.074985,-0.19563,-0.756741,-0.858018,-0.896949,-0.223597,1.411365,1
2,Q3,MT0000004637,-0.350068,-0.480495,-0.29044,-0.157244,0.035751,0.247772,0.580638,0.794555,...,1.439565,1.478598,1.318274,0.148898,-0.431756,-0.679825,-0.724001,0.186107,1.191941,2
3,Q3,MT0000004637,0.660533,-0.248691,-0.275367,-0.159114,-0.211003,-0.116964,0.206494,0.271629,...,0.403377,0.549146,1.228777,0.476222,0.039069,-0.677075,-1.205169,-0.381277,1.191941,3
4,Q3,MT0000004637,-0.488292,-0.616932,-0.412675,-0.210915,-0.131289,-0.012906,0.390483,0.831392,...,1.613432,2.191127,1.996992,0.062888,-1.399714,-1.02457,-0.458077,0.92616,1.191941,4


In [80]:
final_df.shape

(4500, 507)

In [210]:
final_df.to_csv(r'C:\Users\Mary\Desktop\Диплом\df_five_parts.csv')

# Построение моделей

In [212]:
df = pd.read_csv('C:/Users/Mary/Desktop/Диплом/df_five_parts.csv')

In [214]:
df = df.drop(df.columns[[0]], axis=1)

# RFC

In [227]:
# Получаем уникальные треки
unique_tracks = df['Song'].unique()

# Делим треки на train и test
train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.10,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant'].first()
)

# Создаем train/test датасеты
train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

# Сохраняем истинные метки
test_df['Quadrant_true'] = test_df['Quadrant']

# Признаки и метки для обучения
X_train = train_df.drop(['Quadrant', 'Song'], axis=1)
y_train = train_df['Quadrant']

# Подготовка признаков для теста
X_test = test_df.drop(['Quadrant', 'Song', 'Quadrant_true'], axis=1)

# Обучение модели
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Предсказания
test_df['Quadrant_predicted'] = model.predict(X_test)

# Мажоритарное голосование по трекам
track_predictions = test_df.groupby('Song')['Quadrant_predicted'].agg(lambda x: x.mode()[0])
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

# Метрики
accuracy = accuracy_score(track_true_labels, track_predictions)
report = classification_report(track_true_labels, track_predictions)

print("Accuracy:", accuracy)
print(report)


Accuracy: 0.6777777777777778
              precision    recall  f1-score   support

          Q1       0.56      0.65      0.60        23
          Q2       0.83      0.86      0.84        22
          Q3       0.68      0.65      0.67        23
          Q4       0.67      0.55      0.60        22

    accuracy                           0.68        90
   macro avg       0.68      0.68      0.68        90
weighted avg       0.68      0.68      0.68        90



In [224]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import optuna

# 1. Делим треки на train/test по названиям песен
unique_tracks = df['Song'].unique()
train_tracks, test_tracks = train_test_split(
    unique_tracks, 
    test_size=0.15, 
    random_state=42, 
    stratify=df.groupby('Song')['Quadrant'].first()
)

# 2. Создаем train/test датафреймы
train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

# 3. Выделяем признаки и метки
X_train = train_df.drop(['Quadrant', 'Song'], axis=1)
y_train = train_df['Quadrant']
X_test = test_df.drop(['Quadrant', 'Song'], axis=1)
y_test_true = test_df['Quadrant'].copy()  # Сохраняем истинные метки

# 4. Оптимизация гиперпараметров (только по train!)
def objective(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 2, 50),
        min_samples_split=trial.suggest_float("min_samples_split", 0.01, 0.5),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 20),
        max_features=trial.suggest_float("max_features", 0.1, 1.0),
        bootstrap=trial.suggest_categorical("bootstrap", [True, False]),
        random_state=42,
        n_jobs=-1
    )
    # Кросс-валидация внутри трейна
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

# 5. Запуск Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print("Best hyperparameters:", study.best_params)

# 6. Обучение финальной модели на всем трейне
best_model = RandomForestClassifier(
    **study.best_params,
    random_state=42,
    n_jobs=-1
)
best_model.fit(X_train, y_train)

# 7. Предсказание по частям треков на тесте
test_df['Quadrant_predicted'] = best_model.predict(X_test)

# 8. Мажоритарное голосование по трекам
track_predictions = test_df.groupby('Song')['Quadrant_predicted'].agg(lambda x: x.mode()[0])
track_true_labels = test_df.groupby('Song')['Quadrant'].first()

# 9. Метрики
accuracy = accuracy_score(track_true_labels, track_predictions)
report = classification_report(track_true_labels, track_predictions)

print("Accuracy:", accuracy)
print(report)


[I 2025-05-07 20:29:32,221] A new study created in memory with name: no-name-7c121bfc-ad24-4edb-83ee-e3d39bae878c
[I 2025-05-07 20:30:23,809] Trial 0 finished with value: 0.6062745098039216 and parameters: {'n_estimators': 364, 'max_depth': 41, 'min_samples_split': 0.08260440553307664, 'min_samples_leaf': 13, 'max_features': 0.2806694722229516, 'bootstrap': True}. Best is trial 0 with value: 0.6062745098039216.
[I 2025-05-07 20:30:50,605] Trial 1 finished with value: 0.6248366013071895 and parameters: {'n_estimators': 209, 'max_depth': 11, 'min_samples_split': 0.08115168348734783, 'min_samples_leaf': 11, 'max_features': 0.18176108216292614, 'bootstrap': False}. Best is trial 1 with value: 0.6248366013071895.
[I 2025-05-07 20:32:22,935] Trial 2 finished with value: 0.48 and parameters: {'n_estimators': 481, 'max_depth': 17, 'min_samples_split': 0.3890970516954139, 'min_samples_leaf': 3, 'max_features': 0.7930066533804991, 'bootstrap': False}. Best is trial 1 with value: 0.62483660130718

Best hyperparameters: {'n_estimators': 113, 'max_depth': 39, 'min_samples_split': 0.010592267005094316, 'min_samples_leaf': 5, 'max_features': 0.41404881150661577, 'bootstrap': False}
Accuracy: 0.6296296296296297
              precision    recall  f1-score   support

          Q1       0.59      0.71      0.64        34
          Q2       0.76      0.76      0.76        33
          Q3       0.60      0.53      0.56        34
          Q4       0.58      0.53      0.55        34

    accuracy                           0.63       135
   macro avg       0.63      0.63      0.63       135
weighted avg       0.63      0.63      0.63       135



In [235]:
# Обучение модели
model = RandomForestClassifier(n_estimators = 113, max_depth = 39, min_samples_split = 0.010592267005094316, min_samples_leaf = 5, max_features = 0.41404881150661577, bootstrap = False)
model.fit(X_train, y_train)

# Мажоритарное голосование по трекам
track_predictions = test_df.groupby('Song')['Quadrant_predicted'].agg(lambda x: x.mode()[0])
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

# Метрики
accuracy = accuracy_score(track_true_labels, track_predictions)
report = classification_report(track_true_labels, track_predictions)

print("Accuracy:", accuracy)
print(report)


Accuracy: 0.6777777777777778
              precision    recall  f1-score   support

          Q1       0.56      0.65      0.60        23
          Q2       0.83      0.86      0.84        22
          Q3       0.68      0.65      0.67        23
          Q4       0.67      0.55      0.60        22

    accuracy                           0.68        90
   macro avg       0.68      0.68      0.68        90
weighted avg       0.68      0.68      0.68        90



# Взвешенное среднее

In [250]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# --- Подготовка данных ---

# Кодируем целевую переменную
label_encoder = LabelEncoder()
df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])

# Получаем уникальные треки
unique_tracks = df['Song'].unique()

# Делим треки на train и test (по целым трекам, не по строкам)
train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.10,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant_encoded'].first()
)

# Создаем train/test датасеты
train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

# Сохраняем истинные метки
test_df['Quadrant_true'] = test_df['Quadrant_encoded']

# Признаки и метки для обучения
X_train = train_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_train = train_df['Quadrant_encoded']

# Подготовка признаков для теста
X_test = test_df.drop(['Quadrant', 'Quadrant_encoded', 'Song', 'Quadrant_true'], axis=1)

# --- Обучение модели ---
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# --- Предсказания вероятностей ---
probas = model.predict_proba(X_test)
classes = model.classes_

# Добавим вероятности в test_df
for i, cls in enumerate(classes):
    test_df[f'proba_{cls}'] = probas[:, i]

# --- Взвешенное голосование по треку ---
proba_cols = [f'proba_{cls}' for cls in classes]
track_probas = test_df.groupby('Song')[proba_cols].mean()

# Выбор класса с наибольшей средней вероятностью
track_predictions_encoded = track_probas.idxmax(axis=1).apply(lambda x: int(x.replace("proba_", "")))

# Истинные метки
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

# --- Метрики ---
accuracy = accuracy_score(track_true_labels, track_predictions_encoded)
report = classification_report(
    track_true_labels,
    track_predictions_encoded,
    target_names=label_encoder.classes_
)

print("Accuracy (взвешенное голосование):", accuracy)
print(report)


Accuracy (взвешенное голосование): 0.7
              precision    recall  f1-score   support

          Q1       0.64      0.70      0.67        23
          Q2       0.83      0.86      0.84        22
          Q3       0.73      0.70      0.71        23
          Q4       0.60      0.55      0.57        22

    accuracy                           0.70        90
   macro avg       0.70      0.70      0.70        90
weighted avg       0.70      0.70      0.70        90



# XGBoost

In [254]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# --- Подготовка данных ---

# Кодируем целевую переменную
label_encoder = LabelEncoder()
df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])

# Получаем уникальные треки
unique_tracks = df['Song'].unique()

# Делим треки на train и test (по целым трекам, не по строкам)
train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.15,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant_encoded'].first()
)

# Создаем train/test датасеты
train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

# Сохраняем истинные метки
test_df['Quadrant_true'] = test_df['Quadrant_encoded']

# Признаки и метки для обучения
X_train = train_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_train = train_df['Quadrant_encoded']

# Подготовка признаков для теста
X_test = test_df.drop(['Quadrant', 'Quadrant_encoded', 'Song', 'Quadrant_true'], axis=1)

# --- Обучение модели ---
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

# --- Предсказания вероятностей ---
probas = model.predict_proba(X_test)
classes = model.classes_

# Добавим вероятности в test_df
for i, cls in enumerate(classes):
    test_df[f'proba_{cls}'] = probas[:, i]

# --- Взвешенное голосование по треку ---
proba_cols = [f'proba_{cls}' for cls in classes]
track_probas = test_df.groupby('Song')[proba_cols].mean()

# Выбор класса с наибольшей средней вероятностью
track_predictions_encoded = track_probas.idxmax(axis=1).apply(lambda x: int(x.replace("proba_", "")))

# Истинные метки
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

# --- Метрики ---
accuracy = accuracy_score(track_true_labels, track_predictions_encoded)
report = classification_report(
    track_true_labels,
    track_predictions_encoded,
    target_names=label_encoder.classes_
)

print("Accuracy (взвешенное голосование):", accuracy)
print(report)


Accuracy (взвешенное голосование): 0.674074074074074
              precision    recall  f1-score   support

          Q1       0.66      0.79      0.72        34
          Q2       0.87      0.82      0.84        33
          Q3       0.63      0.56      0.59        34
          Q4       0.55      0.53      0.54        34

    accuracy                           0.67       135
   macro avg       0.68      0.68      0.67       135
weighted avg       0.68      0.67      0.67       135



# Взвешенное среднее + Optuna

In [259]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import optuna

# --- Подготовка данных ---
label_encoder = LabelEncoder()
df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])

unique_tracks = df['Song'].unique()

train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.10,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant_encoded'].first()
)

train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

test_df['Quadrant_true'] = test_df['Quadrant_encoded']

X_train = train_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_train = train_df['Quadrant_encoded']

X_test = test_df.drop(['Quadrant', 'Quadrant_encoded', 'Song', 'Quadrant_true'], axis=1)

# --- Определение функции оптимизации для Optuna ---
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=100),
        'max_depth': trial.suggest_int('max_depth', 10, 110, step=20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, step=2),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, step=1),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt']),
        'random_state': 42
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    probas = model.predict_proba(X_test)
    classes = model.classes_

    for i, cls in enumerate(classes):
        test_df[f'proba_{cls}'] = probas[:, i]

    proba_cols = [f'proba_{cls}' for cls in classes]
    track_probas = test_df.groupby('Song')[proba_cols].mean()

    track_predictions_encoded = track_probas.idxmax(axis=1).apply(lambda x: int(x.replace("proba_", "")))
    track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

    accuracy = accuracy_score(track_true_labels, track_predictions_encoded)
    return accuracy

# --- Запуск оптимизации ---
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)

# --- Обучение модели с лучшими гиперпараметрами ---
best_model = RandomForestClassifier(**study.best_params, random_state=42)
best_model.fit(X_train, y_train)

probas = best_model.predict_proba(X_test)
classes = best_model.classes_

for i, cls in enumerate(classes):
    test_df[f'proba_{cls}'] = probas[:, i]

proba_cols = [f'proba_{cls}' for cls in classes]
track_probas = test_df.groupby('Song')[proba_cols].mean()

track_predictions_encoded = track_probas.idxmax(axis=1).apply(lambda x: int(x.replace("proba_", "")))
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

accuracy = accuracy_score(track_true_labels, track_predictions_encoded)
report = classification_report(track_true_labels, track_predictions_encoded, target_names=label_encoder.classes_)

print("Accuracy (взвешенное голосование):", accuracy)
print(report)


[I 2025-05-07 23:07:06,293] A new study created in memory with name: no-name-bf9b0069-af58-421d-9b91-6beaecb3c7d0
  warn(
[I 2025-05-07 23:07:59,825] Trial 0 finished with value: 0.7 and parameters: {'n_estimators': 500, 'max_depth': 90, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'auto'}. Best is trial 0 with value: 0.7.
  warn(
[I 2025-05-07 23:08:49,766] Trial 1 finished with value: 0.6888888888888889 and parameters: {'n_estimators': 500, 'max_depth': 90, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'auto'}. Best is trial 0 with value: 0.7.
  warn(
[I 2025-05-07 23:08:58,267] Trial 2 finished with value: 0.6888888888888889 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'auto'}. Best is trial 0 with value: 0.7.
[I 2025-05-07 23:09:08,853] Trial 3 finished with value: 0.7 and parameters: {'n_estimators': 100, 'max_depth': 90, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_

Best hyperparameters: {'n_estimators': 300, 'max_depth': 50, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Accuracy (взвешенное голосование): 0.7222222222222222
              precision    recall  f1-score   support

          Q1       0.63      0.74      0.68        23
          Q2       0.86      0.86      0.86        22
          Q3       0.73      0.70      0.71        23
          Q4       0.68      0.59      0.63        22

    accuracy                           0.72        90
   macro avg       0.73      0.72      0.72        90
weighted avg       0.73      0.72      0.72        90



In [261]:
import xgboost as xgb

# --- Подготовка данных ---
label_encoder = LabelEncoder()
df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])

unique_tracks = df['Song'].unique()

train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.10,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant_encoded'].first()
)

train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

test_df['Quadrant_true'] = test_df['Quadrant_encoded']

X_train = train_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_train = train_df['Quadrant_encoded']
X_test = test_df.drop(['Quadrant', 'Quadrant_encoded', 'Song', 'Quadrant_true'], axis=1)

# --- Определение функции оптимизации для Optuna ---
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=100),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'use_label_encoder': False,
        'eval_metric': 'mlogloss',
        'random_state': 42
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    probas = model.predict_proba(X_test)
    classes = model.classes_

    for i, cls in enumerate(classes):
        test_df[f'proba_{cls}'] = probas[:, i]

    proba_cols = [f'proba_{cls}' for cls in classes]
    track_probas = test_df.groupby('Song')[proba_cols].mean()

    track_predictions_encoded = track_probas.idxmax(axis=1).apply(lambda x: int(x.replace("proba_", "")))
    track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

    accuracy = accuracy_score(track_true_labels, track_predictions_encoded)
    return accuracy

# --- Запуск оптимизации ---
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)

# --- Обучение модели с лучшими гиперпараметрами ---
best_model = xgb.XGBClassifier(**study.best_params, use_label_encoder=False, eval_metric='mlogloss', random_state=42)
best_model.fit(X_train, y_train)

probas = best_model.predict_proba(X_test)
classes = best_model.classes_

for i, cls in enumerate(classes):
    test_df[f'proba_{cls}'] = probas[:, i]

proba_cols = [f'proba_{cls}' for cls in classes]
track_probas = test_df.groupby('Song')[proba_cols].mean()

track_predictions_encoded = track_probas.idxmax(axis=1).apply(lambda x: int(x.replace("proba_", "")))
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

accuracy = accuracy_score(track_true_labels, track_predictions_encoded)
report = classification_report(track_true_labels, track_predictions_encoded, target_names=label_encoder.classes_)

print("Accuracy (взвешенное голосование):", accuracy)
print(report)


[I 2025-05-07 23:35:18,957] A new study created in memory with name: no-name-8f930041-851d-4d80-9dc1-88488be6baad
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-07 23:36:02,635] Trial 0 finished with value: 0.6666666666666666 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.15790642541617875, 'subsample': 0.5319306149718444, 'colsample_bytree': 0.9838965683153312, 'gamma': 0.36834210951697366}. Best is trial 0 with value: 0.6666666666666666.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-05-07 23:37:20,045] Trial 1 finished with value: 0.6555555555555556 and parameters: {'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.027400969036043804, 'subsample': 0.9927728675552693, 'colsample_bytree': 0.6450838150338469, 'gamma': 0.36699177177904885}. Best is trial 0 with value: 0.6666666666666666.
Parameters: { "use_label_encoder" } are not used.

  b

Best hyperparameters: {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.1306167555154348, 'subsample': 0.6809985690046554, 'colsample_bytree': 0.6299077697675527, 'gamma': 0.06953923548252894}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy (взвешенное голосование): 0.7222222222222222
              precision    recall  f1-score   support

          Q1       0.72      0.78      0.75        23
          Q2       0.88      0.95      0.91        22
          Q3       0.70      0.61      0.65        23
          Q4       0.57      0.55      0.56        22

    accuracy                           0.72        90
   macro avg       0.72      0.72      0.72        90
weighted avg       0.72      0.72      0.72        90



# Подбор гиперпараметров и XGB

In [180]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import optuna
import pandas as pd

# --- Этап 1: Кодирование целевой переменной ---
label_encoder = LabelEncoder()
df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])

# Получаем уникальные треки
unique_tracks = df['Song'].unique()

# Делим треки на train и test
train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.15,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant_encoded'].first()
)

# Создаем train/test датасеты
train_df = df[df['Song'].isin(train_tracks)]
test_df = df[df['Song'].isin(test_tracks)]

# Признаки и метки
X_train = train_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_train = train_df['Quadrant_encoded']
X_test = test_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_test = test_df['Quadrant_encoded']

# --- Этап 2: Оптимизация гиперпараметров ---
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'verbosity': 0,
        'use_label_encoder': False
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print("Best hyperparameters:", study.best_params)

# --- Этап 3: Обучение финальной модели ---
best_model = XGBClassifier(
    **study.best_params,
    random_state=42,
    verbosity=0,
    use_label_encoder=False
)
best_model.fit(X_train, y_train)

# Предсказания и обратное декодирование
test_df['Quadrant_encoded'] = best_model.predict(X_test)
test_df['Quadrant_predicted'] = label_encoder.inverse_transform(test_df['Quadrant_encoded'])

# Мажоритарное голосование
track_predictions = test_df.groupby('Song')['Quadrant_predicted'].agg(lambda x: x.mode()[0])
track_true_labels = test_df.groupby('Song')['Quadrant'].first()

# Метрики
accuracy = accuracy_score(track_true_labels, track_predictions)
report = classification_report(track_true_labels, track_predictions)

print("Accuracy:", accuracy)
print(report)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])
[I 2025-05-07 19:18:34,055] A new study created in memory with name: no-name-90cf111e-be79-48d6-b249-7b045525e88d
[I 2025-05-07 19:18:57,647] Trial 0 finished with value: 0.6281481481481481 and parameters: {'n_estimators': 108, 'max_depth': 6, 'learning_rate': 0.0202225534609789, 'subsample': 0.8617846284895918, 'colsample_bytree': 0.8443188047621206, 'gamma': 2.7786450016043336, 'reg_alpha': 3.7846295774664176, 'reg_lambda': 3.494898055368176}. Best is trial 0 with value: 0.6281481481481481.
[I 2025-05-07 19:19:26,664] Trial 1 finished with value: 0.6474074074074074 and parameters: {'n_estimators': 280, 'max_depth': 9, 'learning_rate': 0.06657777591472111, 

Best hyperparameters: {'n_estimators': 267, 'max_depth': 10, 'learning_rate': 0.012061780773956976, 'subsample': 0.6831818777114562, 'colsample_bytree': 0.6632851105280485, 'gamma': 2.7345480107386457, 'reg_alpha': 0.010485174920886964, 'reg_lambda': 1.698396786566164}
Accuracy: 0.6592592592592592
              precision    recall  f1-score   support

          Q1       0.59      0.68      0.63        34
          Q2       0.77      0.82      0.79        33
          Q3       0.70      0.56      0.62        34
          Q4       0.59      0.59      0.59        34

    accuracy                           0.66       135
   macro avg       0.66      0.66      0.66       135
weighted avg       0.66      0.66      0.66       135



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Quadrant_encoded'] = best_model.predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Quadrant_predicted'] = label_encoder.inverse_transform(test_df['Quadrant_encoded'])


In [237]:
label_encoder = LabelEncoder()
train_df['Quadrant'] = label_encoder.fit_transform(train_df['Quadrant'])

# Признаки и метки
X_train = train_df.drop(['Quadrant', 'Song'],axis=1)  # Подставь свои признаки
y_train = train_df['Quadrant']

# Обучение модели
model = XGBClassifier(n_estimators = 267, max_depth = 10, learning_rate = 0.012061780773956976, subsample = 0.6831818777114562, colsample_bytree = 0.6632851105280485, gamma = 2.7345480107386457, reg_alpha = 0.010485174920886964, reg_lambda = 1.698396786566164)
model.fit(X_train, y_train)

In [239]:
label_encoder = LabelEncoder()
test_df['Quadrant'] = label_encoder.fit_transform(test_df['Quadrant'])

# Мажоритарное голосование по трекам
track_predictions = test_df.groupby('Song')['Quadrant_predicted'].agg(lambda x: x.mode()[0])
track_true_labels = test_df.groupby('Song')['Quadrant_true'].first()

# Метрики
accuracy = accuracy_score(track_true_labels, track_predictions)
report = classification_report(track_true_labels, track_predictions)

print("Accuracy:", accuracy)
print(report)

Accuracy: 0.6777777777777778
              precision    recall  f1-score   support

          Q1       0.56      0.65      0.60        23
          Q2       0.83      0.86      0.84        22
          Q3       0.68      0.65      0.67        23
          Q4       0.67      0.55      0.60        22

    accuracy                           0.68        90
   macro avg       0.68      0.68      0.68        90
weighted avg       0.68      0.68      0.68        90



# Более сильный поиск по сетке с кросс-валидацией

In [242]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import optuna
import pandas as pd
import numpy as np

# --- Этап 1: Кодирование целевой переменной ---
label_encoder = LabelEncoder()
df['Quadrant_encoded'] = label_encoder.fit_transform(df['Quadrant'])

# Получаем уникальные треки
unique_tracks = df['Song'].unique()

# Делим треки на train и test
train_tracks, test_tracks = train_test_split(
    unique_tracks,
    test_size=0.15,
    random_state=42,
    stratify=df.groupby('Song')['Quadrant_encoded'].first()
)

# Создаем train/test датасеты
train_df = df[df['Song'].isin(train_tracks)].copy()
test_df = df[df['Song'].isin(test_tracks)].copy()

# Признаки и метки
X_train = train_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_train = train_df['Quadrant_encoded']
groups = train_df['Song']  # Для GroupKFold

X_test = test_df.drop(['Quadrant', 'Quadrant_encoded', 'Song'], axis=1)
y_test = test_df['Quadrant_encoded']

# --- Этап 2: Оптимизация гиперпараметров с GroupKFold ---
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'verbosity': 0,
        'use_label_encoder': False
    }

    model = XGBClassifier(**params)

    group_kfold = GroupKFold(n_splits=3)
    scores = []

    for train_idx, val_idx in group_kfold.split(X_train, y_train, groups=groups):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        scores.append(accuracy_score(y_val, preds))

    return np.mean(scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print("Best hyperparameters:", study.best_params)

# --- Этап 3: Обучение финальной модели на всем тренировочном датасете ---
best_model = XGBClassifier(
    **study.best_params,
    random_state=42,
    verbosity=0,
    use_label_encoder=False
)
best_model.fit(X_train, y_train)

# Предсказания и обратное декодирование
test_df['Quadrant_encoded'] = best_model.predict(X_test)
test_df['Quadrant_predicted'] = label_encoder.inverse_transform(test_df['Quadrant_encoded'])

# Мажоритарное голосование по треку
track_predictions = test_df.groupby('Song')['Quadrant_predicted'].agg(lambda x: x.mode()[0])
track_true_labels = test_df.groupby('Song')['Quadrant'].first()

# Метрики
accuracy = accuracy_score(track_true_labels, track_predictions)
report = classification_report(track_true_labels, track_predictions)

print("Accuracy:", accuracy)
print(report)


[I 2025-05-07 21:51:34,006] A new study created in memory with name: no-name-43316c83-7e76-4d55-948c-a9f2de39aad5
[I 2025-05-07 21:52:30,572] Trial 0 finished with value: 0.5840522875816992 and parameters: {'n_estimators': 155, 'max_depth': 5, 'learning_rate': 0.012650498511991952, 'subsample': 0.531423888600521, 'colsample_bytree': 0.5101134918699682, 'gamma': 3.0526096260536577, 'reg_alpha': 1.8040819732872566, 'reg_lambda': 4.466096181303164}. Best is trial 0 with value: 0.5840522875816992.
[I 2025-05-07 21:53:34,079] Trial 1 finished with value: 0.6047058823529412 and parameters: {'n_estimators': 244, 'max_depth': 3, 'learning_rate': 0.04069000625670161, 'subsample': 0.7186364465336084, 'colsample_bytree': 0.7325852693265797, 'gamma': 1.7516392892581167, 'reg_alpha': 1.6137915105126095, 'reg_lambda': 0.6852271775491608}. Best is trial 1 with value: 0.6047058823529412.
[I 2025-05-07 21:56:26,911] Trial 2 finished with value: 0.5976470588235294 and parameters: {'n_estimators': 263, '

Best hyperparameters: {'n_estimators': 295, 'max_depth': 4, 'learning_rate': 0.10943403243496633, 'subsample': 0.6000782322243383, 'colsample_bytree': 0.5845535582249093, 'gamma': 2.4476097959409953, 'reg_alpha': 0.6675989299116594, 'reg_lambda': 0.5347733994615613}
Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

          Q1       0.60      0.74      0.66        34
          Q2       0.76      0.79      0.78        33
          Q3       0.71      0.50      0.59        34
          Q4       0.63      0.65      0.64        34

    accuracy                           0.67       135
   macro avg       0.67      0.67      0.66       135
weighted avg       0.67      0.67      0.66       135

