In [46]:
import numpy as np
import optuna

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBRegressor

from env import DATA_DIR
from utils import load_jsons_data, delete_parts_of_lyrics
from language_detect import safe_decode

In [61]:
df = load_jsons_data(data_dir=DATA_DIR, 
                    columns=["lyrics", "popularity", 'track_name'])

In [62]:
df=df[df['popularity']>2]

pattern = r' - (live|acoustic|acoustic version)$'
df = df[~df['track_name'].str.contains(pattern, regex=True)]

  df = df[~df['track_name'].str.contains(pattern, regex=True)]


In [63]:
df["lyrics"] = df["lyrics"].apply(safe_decode)
df["lyrics"] = df["lyrics"].str.lower()
df["lyrics"] = df["lyrics"].str.replace(r'[^\w\s]', '')
df["lyrics"] = df["lyrics"].apply(delete_parts_of_lyrics)
df["lyrics"] = df["lyrics"].str.replace(r'\d+', '')
df["lyrics"] = df["lyrics"].str.strip()
# Может стоит удалить цифры?
# Может стоит использовать лемматизацию или стемминг?
# Фильтрация стоп-слов?

In [64]:
X_train, X_test, y_train, y_test = train_test_split(df["lyrics"], df["popularity"], test_size=0.2, random_state=42)

In [65]:
# TF-IDF: используем слова + символы, без `stop_words`
word_vectorizer = TfidfVectorizer(ngram_range=(2, 4), 
                                  max_features=75000, 
                                  min_df=2, 
                                  max_df=0.95)
char_vectorizer = TfidfVectorizer(analyzer='char_wb', 
                                  ngram_range=(3, 6), 
                                  max_features=20000,
                                  min_df=2,
                                  max_df=0.95)

In [66]:
# Обучаем TF-IDF
X_train_word = word_vectorizer.fit_transform(X_train)
X_train_char  = char_vectorizer.fit_transform(X_train)
X_train_tfidf = np.hstack((X_train_word.toarray(), X_train_char.toarray()))

X_test_word = word_vectorizer.transform(X_test)
X_test_char = char_vectorizer.transform(X_test)
X_test_tfidf = np.hstack((X_test_word.toarray(), X_test_char.toarray()))

In [53]:
# Снижаем размерность (ускоряем XGBoost)
svd = TruncatedSVD(n_components=700)
X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)

In [54]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.03, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 3, 7),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.3, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.2, 0.4),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1
    }

    model = XGBRegressor(**params)
    
    # Кросс-валидация
    score = cross_val_score(
        model, X_train_tfidf, y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1
    )
    
    return -score.mean()  # хотим минимизировать RMSE

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=1500)

print("Лучшие параметры:")
print(study.best_params)
print(f"Лучший RMSE: {study.best_value:.4f}")

[I 2025-04-08 19:29:09,852] A new study created in memory with name: no-name-7048ff64-5289-420b-8152-7d2505761e5c
[I 2025-04-08 19:29:30,489] Trial 0 finished with value: 17.36873944600423 and parameters: {'n_estimators': 839, 'max_depth': 2, 'learning_rate': 0.006019924851477963, 'subsample': 0.7306335148013986, 'colsample_bytree': 0.6992722105886697, 'gamma': 5.1973888928670435, 'reg_alpha': 0.44953253521852676, 'reg_lambda': 0.3612623801462084}. Best is trial 0 with value: 17.36873944600423.
[I 2025-04-08 19:30:05,065] Trial 1 finished with value: 17.092056274414062 and parameters: {'n_estimators': 650, 'max_depth': 4, 'learning_rate': 0.012879491886962668, 'subsample': 0.7204779637398986, 'colsample_bytree': 0.6010352585657243, 'gamma': 3.4428993376674035, 'reg_alpha': 0.44516865067998507, 'reg_lambda': 0.2360814420531145}. Best is trial 1 with value: 17.092056274414062.
[I 2025-04-08 19:30:47,745] Trial 2 finished with value: 17.222214380900066 and parameters: {'n_estimators': 390

Лучшие параметры:
{'n_estimators': 965, 'max_depth': 3, 'learning_rate': 0.016933451552045956, 'subsample': 0.7687094762942901, 'colsample_bytree': 0.7064611659246383, 'gamma': 6.363461660812275, 'reg_alpha': 0.4559335994022725, 'reg_lambda': 0.37322554691805326}
Лучший RMSE: 17.0897


In [None]:
best_model = XGBRegressor(
    **study.best_params,
    n_jobs=-1,
    random_state=42
)

best_model.fit(X_train_tfidf, y_train)
y_pred = best_model.predict(X_test_tfidf)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE на тесте: {rmse:.4f}")