In [1]:
import numpy as np
import optuna
from scipy.sparse import hstack as sparse_hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBRegressor

from env import DATA_DIR
from src.utils import load_jsons_data, delete_parts_of_lyrics
from src.lang_identification import safe_decode
from src.df_filters import DFFilters

  from .autonotebook import tqdm as notebook_tqdm


/Users/mac/.local/share/virtualenvs/Recommended_System-b--jxVTG/lib/python3.8/site-packages


In [2]:
# Columns requested from the JSON loader (order preserved).
lyrics_columns = [
    "lyrics",
    "popularity",
    "track_name",
    "track_id",
    "release_date",
    "track_artist",
]
df = load_jsons_data(data_dir=DATA_DIR, columns=lyrics_columns)

In [3]:
# Run the dataset filters in a fixed order; every step receives the frame
# returned by the previous one.
dffilter = DFFilters()
df = (
    df
    .pipe(dffilter.popularity, 2)
    .pipe(dffilter.language, "tracks/language_mapping_trh-0.8.json", white_list=["en"])
    .pipe(dffilter.release_date, end_year=2020)
    .pipe(dffilter.live)
    .pipe(dffilter.acoustic)
    .pipe(dffilter.remix)
    .pipe(dffilter.cover)
    .pipe(dffilter.smart_cover, sim_threshold=0.75)
)

  df["is_live"] = df["track_name"].str.contains(LIVE_REGEX)
Smart cover: 100%|██████████| 7782/7782 [00:05<00:00, 1477.45it/s]


In [4]:
# Normalise the lyrics text before vectorisation.
# `regex=True` is passed explicitly on both replacements: since pandas 2.0 the
# default for Series.str.replace is literal matching, in which case the
# patterns below would match nothing and punctuation/digits would be kept.
df["lyrics"] = df["lyrics"].apply(safe_decode)
df["lyrics"] = df["lyrics"].str.lower()
# Drop everything that is neither a word character nor whitespace.
df["lyrics"] = df["lyrics"].str.replace(r'[^\w\s]', '', regex=True)
# NOTE(review): punctuation (including brackets) is already gone when
# delete_parts_of_lyrics runs -- confirm the helper does not rely on it.
df["lyrics"] = df["lyrics"].apply(delete_parts_of_lyrics)
# Drop digit runs.
df["lyrics"] = df["lyrics"].str.replace(r'\d+', '', regex=True)
df["lyrics"] = df["lyrics"].str.strip()

In [5]:
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
lyrics_text = df["lyrics"]
popularity_target = df["popularity"]
X_train, X_test, y_train, y_test = train_test_split(
    lyrics_text,
    popularity_target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# TF-IDF: word n-grams plus character n-grams, no `stop_words`.
# Document-frequency cut-offs shared by both vectorizers.
doc_freq_bounds = {"min_df": 2, "max_df": 0.95}

word_vectorizer = TfidfVectorizer(
    ngram_range=(2, 4),
    max_features=75000,
    **doc_freq_bounds,
)
char_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    max_features=20000,
    **doc_freq_bounds,
)

In [9]:
# Fit both TF-IDF vectorizers on the training lyrics only, then apply the
# fitted vectorizers to the test lyrics.
# The word and character blocks are concatenated with scipy's sparse hstack
# rather than `.toarray()` + np.hstack: densifying matrices with up to
# 75,000 + 20,000 columns costs several GB of RAM, while the downstream
# TruncatedSVD accepts sparse input directly.
X_train_word = word_vectorizer.fit_transform(X_train)
X_train_char = char_vectorizer.fit_transform(X_train)
X_train_tfidf = sparse_hstack((X_train_word, X_train_char), format="csr")

X_test_word = word_vectorizer.transform(X_test)
X_test_char = char_vectorizer.transform(X_test)
X_test_tfidf = sparse_hstack((X_test_word, X_test_char), format="csr")

In [10]:
# Reduce dimensionality (speeds up XGBoost).
# random_state is fixed because TruncatedSVD's default solver is randomized;
# without a seed the reduced features (and every score computed from them)
# change from run to run.
# NOTE: this cell overwrites its own inputs, so it must not be re-run without
# re-running the TF-IDF cell first.
svd = TruncatedSVD(n_components=700, random_state=42)
X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)

In [11]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated RMSE of an XGBRegressor.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean RMSE over the CV folds on ``X_train_tfidf`` / ``y_train``
        (lower is better).
    """
    # Hyper-parameters sampled by Optuna for this trial.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.03, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 3, 7),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.3, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.2, 0.4),
    }
    # Settings held constant across all trials.
    fixed_settings = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1,
    }
    regressor = XGBRegressor(**search_space, **fixed_settings)

    # Cross-validation on the training split only.
    fold_scores = cross_val_score(
        regressor,
        X_train_tfidf,
        y_train,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )

    # The scorer yields negated RMSE; flip the sign so the study minimises RMSE.
    mean_rmse = -fold_scores.mean()
    return mean_rmse

# Seed the TPE sampler so the sequence of suggested hyper-parameters is
# repeatable across kernel restarts. The wall-clock `timeout` can still stop
# the search after a different number of trials on a different machine.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, timeout=1000)

# Report the best hyper-parameters and the corresponding CV RMSE.
print("Лучшие параметры:")
print(study.best_params)
print(f"Лучший RMSE: {study.best_value:.4f}")

[I 2025-04-17 22:43:26,536] A new study created in memory with name: no-name-a82263fd-4b74-45fa-b780-04c13b5af612
[I 2025-04-17 22:43:33,832] Trial 0 finished with value: 17.68401023826266 and parameters: {'n_estimators': 523, 'max_depth': 2, 'learning_rate': 0.01141670561714172, 'subsample': 0.9055574046252444, 'colsample_bytree': 0.8161463233767876, 'gamma': 3.7494807305212103, 'reg_alpha': 0.3083322660062723, 'reg_lambda': 0.2753468534633884}. Best is trial 0 with value: 17.68401023826266.
[I 2025-04-17 22:43:38,998] Trial 1 finished with value: 17.8397363506886 and parameters: {'n_estimators': 473, 'max_depth': 2, 'learning_rate': 0.0043477620768367226, 'subsample': 0.8059051853129856, 'colsample_bytree': 0.8256284061498117, 'gamma': 4.9062131414204, 'reg_alpha': 0.3207393926773418, 'reg_lambda': 0.20917927719648619}. Best is trial 0 with value: 17.68401023826266.
[I 2025-04-17 22:43:44,991] Trial 2 finished with value: 17.614080045946803 and parameters: {'n_estimators': 770, 'max_

Лучшие параметры:
{'n_estimators': 723, 'max_depth': 3, 'learning_rate': 0.02354488670077883, 'subsample': 0.6674420979110127, 'colsample_bytree': 0.8196247568155655, 'gamma': 4.024993223520536, 'reg_alpha': 0.3188541125419953, 'reg_lambda': 0.3470885820348283}
Лучший RMSE: 17.4588


In [12]:
# Refit on the full training split with the best hyper-parameters found by
# the study. The objective is passed explicitly so the final model uses the
# same configuration as the models scored during tuning.
best_model = XGBRegressor(
    **study.best_params,
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42
)

best_model.fit(X_train_tfidf, y_train)
y_pred = best_model.predict(X_test_tfidf)

# RMSE is taken as sqrt(MSE) instead of `squared=False`: that keyword is
# deprecated in newer scikit-learn releases and later removed, while this
# form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE на тесте: {rmse:.4f}")

RMSE на тесте: 17.5855
