In [2]:
import os
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from glob import glob

In [3]:
DATA_DIR = "/Users/royalskifm/Desktop/with_lyrics" 
data_list = []

In [4]:
for file_path in glob(os.path.join(DATA_DIR, "*.json")):
    with open(file_path, "r") as f:
        data = json.load(f)
        if "lyrics" in data and "popularity" in data:
            data_list.append({
                "lyrics": data["lyrics"],
                "popularity": data["popularity"]
            })

df = pd.DataFrame(data_list)

In [5]:
import re
def delete_parts_of_lyrics(text: str) -> str:
    parts_of_lyrics = re.findall(r'\[(.*?)\]', text)
    for part in parts_of_lyrics:
        text = text.replace(f"[{part}]", "")
    return text

In [6]:
df

Unnamed: 0,lyrics,popularity
0,[Verse 1]\nSmoking cigarettes on the roof\nYou...,4
1,"[B…ônd 1]\nZoomaqazind…ô, vaƒüzalƒ±n tinind…ô\nBir ...",21
2,[Intro]\nLife's alright in Devil Town\nWe're r...,72
3,[B√∂l√ºm]\nGel g√ºnahƒ±n boynuma gel\nDur birazcƒ±k...,0
4,[Verse 1:]\nI could chase after greatness\nFir...,28
...,...,...
5722,"[Î∞©ÌÉÑÏÜåÎÖÑÎã® Outro: Ego Í∞ÄÏÇ¨]\n\n[Intro: RM, DJ Friz]\...",63
5723,[Chorus]\nYou thought I wouldn't come around t...,47
5724,"[B√∂l√ºm 1]\nAvcƒ± deƒüilim, vuramam, tuzak kurama...",26
5725,Seis ainda v√£o colar no meu show\nSeis ainda v...,40


In [7]:
df["lyrics"] = df["lyrics"].str.lower()
df["lyrics"] = df["lyrics"].str.replace(r'[^\w\s]', '')
df["lyrics"] = df["lyrics"].apply(delete_parts_of_lyrics)
df["lyrics"] = df["lyrics"].str.replace(r'\d+', '')
df["lyrics"] = df["lyrics"].str.strip()


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df["lyrics"], df["popularity"], test_size=0.2, random_state=42)

In [9]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

# üìå TF-IDF: –∏—Å–ø–æ–ª—å–∑—É–µ–º —Å–ª–æ–≤–∞ + —Å–∏–º–≤–æ–ª—ã, –±–µ–∑ `stop_words`
word_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=25000, min_df=2, max_df=0.95)
char_vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 6), max_features=200000)

# üìå –û–±—É—á–∞–µ–º TF-IDF
X_train_word = word_vectorizer.fit_transform(X_train)
X_train_char = char_vectorizer.fit_transform(X_train)
X_train_tfidf = np.hstack((X_train_word.toarray(), X_train_char.toarray()))

X_test_word = word_vectorizer.transform(X_test)
X_test_char = char_vectorizer.transform(X_test)
X_test_tfidf = np.hstack((X_test_word.toarray(), X_test_char.toarray()))

# üìå –°–Ω–∏–∂–∞–µ–º —Ä–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å (—É—Å–∫–æ—Ä—è–µ–º XGBoost)
svd = TruncatedSVD(n_components=300)
X_train_tfidf = svd.fit_transform(X_train_tfidf)
X_test_tfidf = svd.transform(X_test_tfidf)


In [10]:
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.1,
    random_state=42
) 
model.fit(X_train_tfidf, y_train)

In [18]:
import numpy as np
y_pred = model.predict(X_test_tfidf)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")


RMSE: 25.60666372467532


In [12]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1
    }

    model = XGBRegressor(**params)
    
    # –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è
    score = cross_val_score(
        model, X_train_tfidf, y_train,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1
    )
    
    return -score.mean()  # —Ö–æ—Ç–∏–º –º–∏–Ω–∏–º–∏–∑–∏—Ä–æ–≤–∞—Ç—å RMSE

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=900)  # 50 –∑–∞–ø—É—Å–∫–æ–≤ –∏–ª–∏ 15 –º–∏–Ω—É—Ç

print("–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã:")
print(study.best_params)
print(f"–õ—É—á—à–∏–π RMSE: {study.best_value:.4f}")

[I 2025-03-21 18:36:52,996] A new study created in memory with name: no-name-203acf7a-6a25-4b8d-ac98-2d9edbdf846b
[I 2025-03-21 18:37:18,534] Trial 0 finished with value: 27.82259798427788 and parameters: {'n_estimators': 277, 'max_depth': 8, 'learning_rate': 0.29519020026078546, 'subsample': 0.9340062883729057, 'colsample_bytree': 0.9863125171876019, 'gamma': 2.456336327288, 'reg_alpha': 0.09260711127483323, 'reg_lambda': 0.14287807652448126}. Best is trial 0 with value: 27.82259798427788.
[I 2025-03-21 18:37:59,823] Trial 1 finished with value: 25.913784654727824 and parameters: {'n_estimators': 505, 'max_depth': 9, 'learning_rate': 0.07867985056591743, 'subsample': 0.8959252292372546, 'colsample_bytree': 0.6899935208270186, 'gamma': 1.5486950123635075, 'reg_alpha': 0.7715241814324901, 'reg_lambda': 0.5074466418319903}. Best is trial 1 with value: 25.913784654727824.
[I 2025-03-21 18:38:15,361] Trial 2 finished with value: 26.41972797400392 and parameters: {'n_estimators': 579, 'max_

–õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã:
{'n_estimators': 795, 'max_depth': 3, 'learning_rate': 0.011381683993033076, 'subsample': 0.6229389452545009, 'colsample_bytree': 0.9748065818485535, 'gamma': 0.03673021234970032, 'reg_alpha': 0.9071719563968507, 'reg_lambda': 0.7363217787471088}
–õ—É—á—à–∏–π RMSE: 24.0856


In [16]:
best_model = XGBRegressor(
    **study.best_params,
    n_jobs=-1,
    random_state=42
)

best_model.fit(X_train_tfidf, y_train)
y_pred = best_model.predict(X_test_tfidf)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"üìä RMSE –Ω–∞ —Ç–µ—Å—Ç–µ: {rmse:.4f}")

üìä RMSE –Ω–∞ —Ç–µ—Å—Ç–µ: 23.7726
