In [1]:
import numpy as np
import optuna
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBRegressor

from env import DATA_DIR
from utils import load_jsons_data, delete_parts_of_lyrics
from language_detect import safe_decode

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build the working frame from the JSON files under DATA_DIR, requesting the
# text column ("lyrics") and the regression target ("popularity").
df = load_jsons_data(
    data_dir=DATA_DIR,
    columns=["lyrics", "popularity"],
)

In [8]:
# Normalise the lyrics text in place, one step per line.
df["lyrics"] = df["lyrics"].apply(safe_decode)
df["lyrics"] = df["lyrics"].str.lower()
# regex=True is required here: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, so without it this line (and the
# digit removal below) silently leaves the text unchanged.
df["lyrics"] = df["lyrics"].str.replace(r'[^\w\s]', '', regex=True)
# NOTE(review): delete_parts_of_lyrics receives text with punctuation already
# stripped. If the helper relies on punctuation (e.g. bracketed section
# markers), it should run before the line above - confirm against utils.
df["lyrics"] = df["lyrics"].apply(delete_parts_of_lyrics)
df["lyrics"] = df["lyrics"].str.replace(r'\d+', '', regex=True)
df["lyrics"] = df["lyrics"].str.strip()
# Digits are removed by the `\d+` replacement above.
# TODO: consider lemmatisation or stemming.
# TODO: consider stop-word filtering.

In [9]:
# Hold out 20% of the rows for the final evaluation; the seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["lyrics"],
    df["popularity"],
    test_size=0.2,
    random_state=42,
)

In [10]:
# TF-IDF: combine word-level and character-level features, without `stop_words`.
# Both vectorizers share the same document-frequency bounds.
TFIDF_DF_BOUNDS = {"min_df": 2, "max_df": 0.95}

# Word n-grams of length 2 to 4.
word_vectorizer = TfidfVectorizer(
    ngram_range=(2, 4),
    max_features=75000,
    **TFIDF_DF_BOUNDS,
)

# Character n-grams of length 3 to 6, built inside word boundaries ('char_wb').
char_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    max_features=20000,
    **TFIDF_DF_BOUNDS,
)

In [11]:
# Fit TF-IDF on the training split only, then apply the fitted vocabularies
# to the test split.
X_train_word = word_vectorizer.fit_transform(X_train)
X_train_char = char_vectorizer.fit_transform(X_train)
# Stack the two feature blocks column-wise while keeping them sparse.
# Converting to dense arrays first would allocate a rows x (up to 95,000)
# float matrix, which is needlessly memory-hungry; TruncatedSVD consumes
# sparse input directly.
X_train_tfidf = sparse.hstack([X_train_word, X_train_char], format="csr")

X_test_word = word_vectorizer.transform(X_test)
X_test_char = char_vectorizer.transform(X_test)
X_test_tfidf = sparse.hstack([X_test_word, X_test_char], format="csr")

In [12]:
# Reduce dimensionality (speeds up XGBoost).
# random_state is fixed because TruncatedSVD's default solver is randomized;
# without a seed the projected features change on every run.
# NOTE: this cell overwrites X_train_tfidf / X_test_tfidf with their projected
# versions, so re-run the TF-IDF cell before re-running this one.
svd = TruncatedSVD(n_components=700, random_state=42)
X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)

In [13]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated RMSE of an XGBRegressor.

    Args:
        trial: the Optuna trial used to sample the hyperparameters below.

    Returns:
        The mean RMSE across the folds (lower is better).
    """
    # Hyperparameters sampled by Optuna for this trial.
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.03, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 3, 7),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.3, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.2, 0.4),
    }

    # Settings that stay fixed across all trials.
    # NOTE(review): n_jobs=-1 is requested both here and in cross_val_score;
    # check whether the nested parallelism oversubscribes the CPU.
    regressor = XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **tuned,
    )

    # Cross-validation on the training features.
    neg_rmse_per_fold = cross_val_score(
        regressor,
        X_train_tfidf,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )

    # The scorer returns negated RMSE; flip the sign so the study minimises RMSE.
    return -neg_rmse_per_fold.mean()

# Seed the sampler so the sequence of suggested hyperparameters is reproducible;
# the default sampler is unseeded and yields a different search on every run.
# The number of completed trials can still vary between machines because of
# the wall-clock timeout.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=100, timeout=1500)

print("Лучшие параметры:")
print(study.best_params)
print(f"Лучший RMSE: {study.best_value:.4f}")

[I 2025-03-25 01:44:23,715] A new study created in memory with name: no-name-e8231bdc-6eca-4264-a353-c3b45d1ca084
[I 2025-03-25 01:44:31,279] Trial 0 finished with value: 24.05092163861521 and parameters: {'n_estimators': 551, 'max_depth': 2, 'learning_rate': 0.02435479894600591, 'subsample': 0.6253686576179355, 'colsample_bytree': 0.8539913151551519, 'gamma': 5.035526623962243, 'reg_alpha': 0.341098830723397, 'reg_lambda': 0.20136320607827773}. Best is trial 0 with value: 24.05092163861521.
[I 2025-03-25 01:44:45,502] Trial 1 finished with value: 24.180449896882994 and parameters: {'n_estimators': 744, 'max_depth': 3, 'learning_rate': 0.004477840306287856, 'subsample': 0.9225867492455198, 'colsample_bytree': 0.9459393739837837, 'gamma': 5.672172153018584, 'reg_alpha': 0.3437764540290475, 'reg_lambda': 0.2550134616616448}. Best is trial 0 with value: 24.05092163861521.
[I 2025-03-25 01:44:52,551] Trial 2 finished with value: 24.744864654798686 and parameters: {'n_estimators': 383, 'max

Лучшие параметры:
{'n_estimators': 959, 'max_depth': 2, 'learning_rate': 0.015616998322782292, 'subsample': 0.6416964877752129, 'colsample_bytree': 0.7093356342840186, 'gamma': 5.95215280097236, 'reg_alpha': 0.4288120265060383, 'reg_lambda': 0.38160712432873833}
Лучший RMSE: 24.0088


In [14]:
# Refit on the full training split with the best hyperparameters found by the
# study, then evaluate once on the held-out test split.
best_model = XGBRegressor(
    **study.best_params,
    n_jobs=-1,
    random_state=42
)

best_model.fit(X_train_tfidf, y_train)
y_pred = best_model.predict(X_test_tfidf)

# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"📊 RMSE на тесте: {rmse:.4f}")

📊 RMSE на тесте: 24.0056
