In [2]:
import numpy as np
import optuna

from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBRegressor

from env import DATA_DIR
from utils import load_jsons_data, delete_parts_of_lyrics

ImportError: cannot import name 'load_jsons_data' from 'utils' (/Users/mac/Desktop/Recommended System/utils.py)

In [34]:
# Load only the two fields this notebook uses: the lyrics text (model input)
# and the popularity score (regression target).
df = load_jsons_data(
    data_dir=DATA_DIR,
    columns=["lyrics", "popularity"],
)

In [35]:
# Normalize the lyrics text before vectorization:
#   1. lower-case,
#   2. drop every character that is neither a word character nor whitespace,
#   3. apply the project helper `delete_parts_of_lyrics`,
#   4. drop digit runs,
#   5. trim leading/trailing whitespace.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, so without it steps 2 and 4 would
# silently leave the text unchanged.
df["lyrics"] = (
    df["lyrics"]
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .apply(delete_parts_of_lyrics)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)
# Digits are already removed by the `\d+` step above.
# TODO: consider lemmatization or stemming.
# TODO: consider stop-word filtering.

In [36]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["lyrics"],
    df["popularity"],
    test_size=0.2,
    random_state=42,
)

In [37]:
# TF-IDF features: word n-grams plus character n-grams, no `stop_words` filtering.
# Both vectorizers ignore terms that occur in fewer than 2 documents or in more
# than 95% of documents.

# Word-level n-grams of 2 to 4 tokens, capped at 75,000 features.
word_vectorizer = TfidfVectorizer(
    ngram_range=(2, 4),
    min_df=2,
    max_df=0.95,
    max_features=75000,
)

# Character n-grams of 3 to 6 characters built inside word boundaries
# ('char_wb'), capped at 20,000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    min_df=2,
    max_df=0.95,
    max_features=20000,
)

In [38]:
# Fit both TF-IDF vectorizers on the training texts only; the test texts are
# merely transformed, so no vocabulary or IDF statistics leak from the test set.
X_train_word = word_vectorizer.fit_transform(X_train)
X_train_char = char_vectorizer.fit_transform(X_train)
# Stack the word and character blocks column-wise while staying sparse.
# Converting to dense arrays first would allocate up to 95,000 float columns
# per document; TruncatedSVD accepts sparse input directly.
X_train_tfidf = sparse.hstack([X_train_word, X_train_char], format="csr")

X_test_word = word_vectorizer.transform(X_test)
X_test_char = char_vectorizer.transform(X_test)
X_test_tfidf = sparse.hstack([X_test_word, X_test_char], format="csr")

In [39]:
# Reduce dimensionality to 500 components (speeds up XGBoost).
# `random_state` pins the randomized SVD solver so the reduced features, and
# every score computed from them, are reproducible across runs.
# NOTE: this cell overwrites X_train_tfidf / X_test_tfidf with the reduced
# matrices, so it is not idempotent: re-run the TF-IDF cell before re-running it.
svd = TruncatedSVD(n_components=500, random_state=42)
X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)

In [40]:
def objective(trial):
    """Optuna objective: cross-validated RMSE of an XGBoost regressor.

    Draws one hyperparameter configuration from ``trial``, builds an
    ``XGBRegressor`` with it and scores it with 3-fold cross-validation on the
    notebook-level ``X_train_tfidf`` / ``y_train``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean RMSE over the folds (lower is better).
    """
    # Tunable hyperparameters (sampled in this fixed order).
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 300, 1000),
        max_depth=trial.suggest_int('max_depth', 2, 5),
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.03, log=True),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 3, 7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.3, 0.5),
        reg_lambda=trial.suggest_float('reg_lambda', 0.2, 0.4),
    )

    # Settings that are held constant for every trial.
    regressor = XGBRegressor(
        **sampled,
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
    )

    # Cross-validation; the scorer reports *negative* RMSE per fold.
    neg_rmse_per_fold = cross_val_score(
        regressor,
        X_train_tfidf,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )

    # Flip the sign so the study minimizes a positive RMSE.
    mean_neg_rmse = neg_rmse_per_fold.mean()
    return -mean_neg_rmse

# Hyperparameter search: minimize the cross-validated RMSE returned by `objective`.
# The TPE sampler (Optuna's default) is seeded so the sequence of suggested
# configurations is reproducible. The search stops after 100 trials or
# 1500 seconds, whichever comes first, so the number of completed trials can
# still differ between machines.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, timeout=1500)

print("Лучшие параметры:")
print(study.best_params)
print(f"Лучший RMSE: {study.best_value:.4f}")

[I 2025-03-21 23:43:48,883] A new study created in memory with name: no-name-3bda876a-ad9f-4c92-847a-ad3e048be3d5
[I 2025-03-21 23:43:59,356] Trial 0 finished with value: 24.231581518446642 and parameters: {'n_estimators': 530, 'max_depth': 4, 'learning_rate': 0.0033423789022136805, 'subsample': 0.9758119413632873, 'colsample_bytree': 0.6081248510260877, 'gamma': 4.984260597354197, 'reg_alpha': 0.30834580989593857, 'reg_lambda': 0.3454978113070515}. Best is trial 0 with value: 24.231581518446642.
[I 2025-03-21 23:44:04,296] Trial 1 finished with value: 24.142122392087998 and parameters: {'n_estimators': 786, 'max_depth': 2, 'learning_rate': 0.015651497227233312, 'subsample': 0.7106518901039729, 'colsample_bytree': 0.8350715903082455, 'gamma': 5.601266786091614, 'reg_alpha': 0.45444979247350037, 'reg_lambda': 0.24458239654253877}. Best is trial 1 with value: 24.142122392087998.
[I 2025-03-21 23:44:17,630] Trial 2 finished with value: 24.321947196095536 and parameters: {'n_estimators': 7

Лучшие параметры:
{'n_estimators': 932, 'max_depth': 3, 'learning_rate': 0.005888180885970952, 'subsample': 0.6090890269311064, 'colsample_bytree': 0.6356377234292379, 'gamma': 6.182573796736076, 'reg_alpha': 0.42201004371006007, 'reg_lambda': 0.32161787647887524}
Лучший RMSE: 24.0761


In [41]:
# Refit XGBoost on the full training set with the best hyperparameters found
# by the study. The objective is stated explicitly so the final model is
# configured the same way as the models scored during tuning.
best_model = XGBRegressor(
    **study.best_params,
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42
)

best_model.fit(X_train_tfidf, y_train)
y_pred = best_model.predict(X_test_tfidf)

# RMSE on the held-out test set. The `squared=False` argument of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"📊 RMSE на тесте: {rmse:.4f}")

📊 RMSE на тесте: 23.4784
