In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.linear_model import  LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
import optuna

In [5]:
import joblib
import os

# Tratamento dos dados

In [15]:
spotify_youtube_pt1 = pd.read_csv("Dados/spotify_youtube_up_pt1.csv")
spotify_youtube_pt2 = pd.read_csv("Dados/spotify_youtube_up_pt2.csv")

In [16]:
data_extracao = pd.to_datetime("2023-02-07")
spotify_youtube_pt1["upload_date"] = pd.to_datetime(spotify_youtube_pt1["upload_date"])
spotify_youtube_pt1["dias_na_plataforma"] = (data_extracao - spotify_youtube_pt1["upload_date"]).dt.days

spotify_youtube_pt2["upload_date"] = pd.to_datetime(spotify_youtube_pt2["upload_date"])
spotify_youtube_pt2["dias_na_plataforma"] = (data_extracao - spotify_youtube_pt2["upload_date"]).dt.days

In [17]:
spotify_youtube = pd.concat([spotify_youtube_pt1,spotify_youtube_pt2])

In [18]:
spotify_youtube = spotify_youtube[spotify_youtube['Album_type'] != 'compilation']
spotify_youtube_dummies= pd.get_dummies(spotify_youtube,columns=['Album_type'], dtype=int)

In [19]:
spotify_youtube_dummies['artist_number'] = spotify_youtube_dummies['Artist'].str.split(',').str.len()
spotify_youtube_dummies['engagement_rate'] = spotify_youtube_dummies.apply(lambda row: (row['Likes'] + row['Comments']) / row['Views'] * 100 if row['Views'] > 0 else 0, axis=1)

In [20]:
columns_drop=['Unnamed: 0', 'Artist', 'Url_spotify', 'Track', 'Album', 'Uri','Url_youtube', 'Title', 'Channel', 'Views', 'Likes','Comments', 'Description', 'Licensed', 'official_video', 'upload_date']
spotify_youtube_df=spotify_youtube_dummies.drop(columns=columns_drop)

In [21]:
spotify_youtube_limpo=spotify_youtube_df.dropna(subset=['Danceability','Energy','Key','Loudness','Speechiness','Acousticness', 'Instrumentalness','Liveness','Valence','Tempo','Duration_ms','Stream','engagement_rate'])

In [22]:
spotify_youtube_limpo.columns

Index(['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
       'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
       'Duration_ms', 'Stream', 'dias_na_plataforma', 'Album_type_album',
       'Album_type_single', 'artist_number', 'engagement_rate'],
      dtype='object')

# Preparação dos dados para os experimentos

In [23]:
youtube_30_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<31]
youtube_90_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<91]
youtube_365_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<366]

In [24]:
spotify_youtube_df_366_more = spotify_youtube_limpo.drop(youtube_365_df.index)
spotify_youtube_df_91_365 = youtube_365_df.drop(youtube_90_df.index)
spotify_youtube_df_31_90 = youtube_365_df.drop(youtube_30_df.index)

In [25]:
youtube_30_df=youtube_30_df[youtube_30_df['engagement_rate']<9.338507725109846]
spotify_youtube_df_31_90=spotify_youtube_df_31_90[spotify_youtube_df_31_90['engagement_rate']<4.602401258613792]
spotify_youtube_df_91_365=spotify_youtube_df_91_365[spotify_youtube_df_91_365['engagement_rate']<4.253191569584401]
spotify_youtube_df_366_more=spotify_youtube_df_366_more[spotify_youtube_df_366_more['engagement_rate']<2.251193229196198]

In [26]:
youtube_30_df_x = youtube_30_df.drop('engagement_rate',axis=1)
youtube_30_df_y = youtube_30_df['engagement_rate']

spotify_youtube_df_31_90_x = spotify_youtube_df_31_90.drop('engagement_rate',axis=1)
spotify_youtube_df_31_90_y = spotify_youtube_df_31_90['engagement_rate']

spotify_youtube_df_91_365_x = spotify_youtube_df_91_365.drop('engagement_rate',axis=1)
spotify_youtube_df_91_365_y = spotify_youtube_df_91_365['engagement_rate']

spotify_youtube_df_366_more_x = spotify_youtube_df_366_more.drop('engagement_rate',axis=1)
spotify_youtube_df_366_more_y = spotify_youtube_df_366_more['engagement_rate']

In [27]:
spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=42)
spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20, random_state=42)

In [28]:
# Dados de treino 
sy_x_train=pd.concat([spotify_youtube_30X_train,spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x,spotify_youtube_df_366_more_x])
sy_y_train=pd.concat([spotify_youtube_30y_train,spotify_youtube_df_31_90_y,spotify_youtube_df_91_365_y,spotify_youtube_df_366_more_y])

# Normalização do treino
scaler_all = MinMaxScaler()
sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)

In [29]:
# Normalização do treino de 30 dias
sy_30_x_train_scaled = scaler_all.transform(spotify_youtube_30X_train)

In [30]:
# Para testar o desempenho do modelo com videos com ate 30 dias
sy_30_x_test=spotify_youtube_30X_test
sy_30_y_test=spotify_youtube_30y_test

# validação de 30 dias
sy_30_x_val=spotify_youtube_30X_val
sy_30_y_val=spotify_youtube_30y_val

In [31]:
# Normalizção dos dados de teste de 30 dias
sy_30_x_test_scaled = scaler_all.transform(sy_30_x_test)

# Normalização dos dados de validação de 30 dias
sy_30_x_val_scaled = scaler_all.transform(sy_30_x_val)

# Tuning dos modelos

## Random Forest

Resultados:

GridSearch:
* Média dos MSEs: 3.3750
* Desvio padrão dos MSEs: 0.4742
* Melhor MSE individual: 2.5401 (r5)
* Tempo: 82m 17s

Optuna:
* Média dos MSEs: 3.2699
* Desvio padrão dos MSEs: 0.4399
* Melhor MSE individual: 2.4719 (r5)
* Tempo: 73m 31s


Fazendo o tuning com GridSearch

In [None]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_samples': [0.5, 0.75],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [30, 40, 50],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

# Inicializando o modelo Random Forest
rf = RandomForestRegressor(random_state=42)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=ps,  # Passa o PredefinedSplit
    scoring='neg_mean_squared_error',
    verbose=0,
    n_jobs=1
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_rf = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
sy_30_x_test_scaled_df = pd.DataFrame(
    sy_30_x_test_scaled,
    columns=sy_x_train.columns,
    index=sy_30_x_test.index
)
predictions = best_rf.predict(sy_30_x_test_scaled_df)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_grid.pkl')
joblib.dump(best_rf, model_path)
print(f"Modelo salvo em: {model_path}")

Mean Squared Error (mse) on test data: 2.9452172749232077
Modelo salvo em: Modelos\best_random_forest_model_grid.pkl


In [32]:
# Hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_samples': [0.5, 0.75],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [30, 40, 50],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")
    spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25,random_state=40+i)
    spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20,random_state=40+i)

    # Dados de treino
    sy_x_train = pd.concat([spotify_youtube_30X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([spotify_youtube_30y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler_all.transform(spotify_youtube_30X_val)
    sy_30_x_test_scaled = scaler_all.transform(spotify_youtube_30X_test)

    sy_30_y_val = spotify_youtube_30y_val
    sy_30_y_test = spotify_youtube_30y_test

    # Reconstrução dos DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=spotify_youtube_30X_val.columns, index=spotify_youtube_30X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=spotify_youtube_30X_test.index)

    # Junta treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(spotify_youtube_30X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Inicializa o modelo base
    rf = RandomForestRegressor(random_state=i)

    # GridSearch
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=ps,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1
    )

    grid_search.fit(X_full, y_full)

    # Recupera os melhores hiperparâmetros
    best_params = grid_search.best_params_

    # Reentreina o modelo com os melhores hiperparâmetros
    best_rf = RandomForestRegressor(**best_params, random_state=42)
    best_rf.fit(X_full, y_full)

    # Previsão no teste
    predictions = best_rf.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    # Atualiza o melhor modelo
    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = best_rf

# Cálculo da média e do desvio padrão
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_rf_model_grid.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 3.3749

Rodada 2/10
MSE da rodada 2: 3.0917

Rodada 3/10
MSE da rodada 3: 2.8221

Rodada 4/10
MSE da rodada 4: 4.0446

Rodada 5/10
MSE da rodada 5: 2.5401

Rodada 6/10
MSE da rodada 6: 4.1256

Rodada 7/10
MSE da rodada 7: 3.7498

Rodada 8/10
MSE da rodada 8: 3.3979

Rodada 9/10
MSE da rodada 9: 3.2707

Rodada 10/10
MSE da rodada 10: 3.3323

Resumo após 10 execuções:
Média dos MSEs: 3.3750
Desvio padrão dos MSEs: 0.4742
Melhor MSE individual: 2.5401
Melhor modelo salvo em: Modelos\best_rf_model_grid.pkl


Fazendo o tuning com Optuna

In [35]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 10, 50, step=10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10, step=2)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4, step=1)
    max_samples = trial.suggest_float("max_samples", 0.5, 1.0, step=0.05)
    max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2'])


    
    

    # Criando o modelo com os hiperparâmetros sugeridos
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_samples=max_samples,
        max_features=max_features,
        
        
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    rf.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = rf.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params
print("Melhores hiperparâmetros:", best_params)

# Treinando o modelo final com os melhores hiperparâmetros
best_rf = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=42
)
best_rf.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_rf.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_optuna.pkl')
joblib.dump(best_rf, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-21 18:41:19,427] A new study created in memory with name: no-name-a6adede2-6e97-44a2-b830-b9359b490f1d
[I 2025-04-21 18:41:29,856] Trial 0 finished with value: 3.6594923027500004 and parameters: {'n_estimators': 350, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples': 0.95, 'max_features': 'log2'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:37,267] Trial 1 finished with value: 3.8796388232432424 and parameters: {'n_estimators': 350, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.55, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:44,639] Trial 2 finished with value: 3.7630106068662728 and parameters: {'n_estimators': 350, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples': 0.9, 'max_features': 'log2'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:51,941] Trial 3 finished with value: 3.87240710807

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'log2'}
Mean Squared Error (MSE) on test data: 2.951254082497033
Modelo salvo em: Modelos\best_random_forest_model_optuna.pkl


In [33]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + val
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int("n_estimators", 50, 500, step=50),
            'max_depth': trial.suggest_int("max_depth", 10, 50, step=10),
            'min_samples_split': trial.suggest_int("min_samples_split", 2, 10, step=2),
            'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 4),
            'max_samples': trial.suggest_float("max_samples", 0.5, 1.0, step=0.05),
            'max_features': trial.suggest_categorical("max_features", ['sqrt', 'log2']),
            'random_state': 42,
        }

        model = RandomForestRegressor(**params)
        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    # Rodando o estudo
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50, show_progress_bar=False)

    best_params = study.best_params
    print("Melhores hiperparâmetros:", best_params)

    # Treina modelo final com treino + val
    final_model = RandomForestRegressor(**best_params)
    final_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = final_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = final_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-15 18:19:25,496] A new study created in memory with name: no-name-1da90ebf-1d01-4e67-b2e1-49ee0618b53c



Rodada 1/10


[I 2025-05-15 18:19:39,999] Trial 0 finished with value: 4.874145207100956 and parameters: {'n_estimators': 450, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_samples': 0.95, 'max_features': 'log2'}. Best is trial 0 with value: 4.874145207100956.
[I 2025-05-15 18:19:51,322] Trial 1 finished with value: 4.7684852235299475 and parameters: {'n_estimators': 400, 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_samples': 0.75, 'max_features': 'log2'}. Best is trial 1 with value: 4.7684852235299475.
[I 2025-05-15 18:19:58,969] Trial 2 finished with value: 4.893610873194328 and parameters: {'n_estimators': 400, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_samples': 0.75, 'max_features': 'log2'}. Best is trial 1 with value: 4.7684852235299475.
[I 2025-05-15 18:20:09,921] Trial 3 finished with value: 4.869855265281316 and parameters: {'n_estimators': 350, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samp

Melhores hiperparâmetros: {'n_estimators': 250, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'log2'}


[I 2025-05-15 18:24:59,201] A new study created in memory with name: no-name-e1d04c96-3ea4-4cb9-b6a4-abab33016356


MSE da rodada 1: 3.2359

Rodada 2/10


[I 2025-05-15 18:25:12,193] Trial 0 finished with value: 2.820276864938503 and parameters: {'n_estimators': 500, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_samples': 0.75, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.820276864938503.
[I 2025-05-15 18:25:14,504] Trial 1 finished with value: 2.8816714303493964 and parameters: {'n_estimators': 100, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_samples': 0.6, 'max_features': 'log2'}. Best is trial 0 with value: 2.820276864938503.
[I 2025-05-15 18:25:17,841] Trial 2 finished with value: 2.939392315150119 and parameters: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_samples': 0.55, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.820276864938503.
[I 2025-05-15 18:25:21,532] Trial 3 finished with value: 2.8988544207276057 and parameters: {'n_estimators': 150, 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_sample

Melhores hiperparâmetros: {'n_estimators': 350, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_samples': 0.8500000000000001, 'max_features': 'sqrt'}


[I 2025-05-15 18:33:12,719] A new study created in memory with name: no-name-468ab62f-71c6-45d1-9422-0953dee8cc40


MSE da rodada 2: 3.1012

Rodada 3/10


[I 2025-05-15 18:33:21,155] Trial 0 finished with value: 3.246648454103933 and parameters: {'n_estimators': 200, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 'log2'}. Best is trial 0 with value: 3.246648454103933.
[I 2025-05-15 18:33:26,170] Trial 1 finished with value: 3.647755652134939 and parameters: {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_samples': 0.6, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.246648454103933.
[I 2025-05-15 18:33:32,519] Trial 2 finished with value: 3.728823174874727 and parameters: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_samples': 0.5, 'max_features': 'log2'}. Best is trial 0 with value: 3.246648454103933.
[I 2025-05-15 18:33:38,921] Trial 3 finished with value: 3.819271106719016 and parameters: {'n_estimators': 300, 'max_depth': 40, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples':

Melhores hiperparâmetros: {'n_estimators': 100, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'log2'}


[I 2025-05-15 18:39:47,265] A new study created in memory with name: no-name-5f1ca11a-891f-4bb1-ab15-987258abe481


MSE da rodada 3: 2.7327

Rodada 4/10


[I 2025-05-15 18:39:53,322] Trial 0 finished with value: 4.128970605985816 and parameters: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.8, 'max_features': 'log2'}. Best is trial 0 with value: 4.128970605985816.
[I 2025-05-15 18:39:57,515] Trial 1 finished with value: 4.247697062830802 and parameters: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.7, 'max_features': 'log2'}. Best is trial 0 with value: 4.128970605985816.
[I 2025-05-15 18:40:07,262] Trial 2 finished with value: 4.292574428699605 and parameters: {'n_estimators': 300, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_samples': 1.0, 'max_features': 'log2'}. Best is trial 0 with value: 4.128970605985816.
[I 2025-05-15 18:40:27,267] Trial 3 finished with value: 4.204460548768183 and parameters: {'n_estimators': 500, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0

Melhores hiperparâmetros: {'n_estimators': 350, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 0.9, 'max_features': 'log2'}


[I 2025-05-15 18:48:39,164] A new study created in memory with name: no-name-c1c9e67a-9a77-40dd-8247-5c6dd45aa2e7


MSE da rodada 4: 3.9780

Rodada 5/10


[I 2025-05-15 18:48:43,921] Trial 0 finished with value: 3.684248970709111 and parameters: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 0.8500000000000001, 'max_features': 'log2'}. Best is trial 0 with value: 3.684248970709111.
[I 2025-05-15 18:48:48,678] Trial 1 finished with value: 3.954070697983908 and parameters: {'n_estimators': 200, 'max_depth': 50, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_samples': 0.6, 'max_features': 'log2'}. Best is trial 0 with value: 3.684248970709111.
[I 2025-05-15 18:48:55,137] Trial 2 finished with value: 3.6581318103460787 and parameters: {'n_estimators': 200, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.65, 'max_features': 'log2'}. Best is trial 2 with value: 3.6581318103460787.
[I 2025-05-15 18:49:09,750] Trial 3 finished with value: 3.776361604972313 and parameters: {'n_estimators': 500, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_leaf': 3

Melhores hiperparâmetros: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 0.9, 'max_features': 'sqrt'}


[I 2025-05-15 18:55:28,029] A new study created in memory with name: no-name-7405e21b-a6d6-4805-940b-a5e64c4845c0


MSE da rodada 5: 2.4719

Rodada 6/10


[I 2025-05-15 18:55:31,696] Trial 0 finished with value: 3.812118625104253 and parameters: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_samples': 0.65, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.812118625104253.
[I 2025-05-15 18:55:32,895] Trial 1 finished with value: 3.451892326486708 and parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples': 0.6, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.451892326486708.
[I 2025-05-15 18:55:36,125] Trial 2 finished with value: 3.583021874890902 and parameters: {'n_estimators': 100, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.8, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.451892326486708.
[I 2025-05-15 18:55:37,896] Trial 3 finished with value: 3.3711187231100155 and parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0

Melhores hiperparâmetros: {'n_estimators': 200, 'max_depth': 50, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 'sqrt'}


[I 2025-05-15 19:00:09,429] A new study created in memory with name: no-name-587a8dc9-9bbf-4d96-b718-51651edb0fe4


MSE da rodada 6: 3.6887

Rodada 7/10


[I 2025-05-15 19:00:16,091] Trial 0 finished with value: 3.17140597503435 and parameters: {'n_estimators': 200, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_samples': 0.95, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.17140597503435.
[I 2025-05-15 19:00:20,289] Trial 1 finished with value: 3.4142181801752165 and parameters: {'n_estimators': 250, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.6, 'max_features': 'log2'}. Best is trial 0 with value: 3.17140597503435.
[I 2025-05-15 19:00:35,010] Trial 2 finished with value: 3.078912192190106 and parameters: {'n_estimators': 450, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_samples': 0.8, 'max_features': 'log2'}. Best is trial 2 with value: 3.078912192190106.
[I 2025-05-15 19:00:35,892] Trial 3 finished with value: 3.445274486901812 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_samples': 0.6

Melhores hiperparâmetros: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'sqrt'}


[I 2025-05-15 19:08:44,756] A new study created in memory with name: no-name-18ce5354-48ca-4837-ae90-669ed3628943


MSE da rodada 7: 3.7547

Rodada 8/10


[I 2025-05-15 19:08:47,927] Trial 0 finished with value: 4.055577018340708 and parameters: {'n_estimators': 150, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_samples': 0.5, 'max_features': 'log2'}. Best is trial 0 with value: 4.055577018340708.
[I 2025-05-15 19:09:02,358] Trial 1 finished with value: 3.7631116272604634 and parameters: {'n_estimators': 500, 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_samples': 0.75, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.7631116272604634.
[I 2025-05-15 19:09:10,929] Trial 2 finished with value: 4.015530202270866 and parameters: {'n_estimators': 400, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_samples': 1.0, 'max_features': 'log2'}. Best is trial 1 with value: 3.7631116272604634.
[I 2025-05-15 19:09:13,638] Trial 3 finished with value: 3.93680733120726 and parameters: {'n_estimators': 100, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples

Melhores hiperparâmetros: {'n_estimators': 500, 'max_depth': 40, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_samples': 0.8, 'max_features': 'log2'}


[I 2025-05-15 19:17:34,332] A new study created in memory with name: no-name-60a6cdde-cd85-46d3-96bf-56d46d23fefb


MSE da rodada 8: 3.4321

Rodada 9/10


[I 2025-05-15 19:17:35,933] Trial 0 finished with value: 3.5485916857759876 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_samples': 0.65, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.5485916857759876.
[I 2025-05-15 19:17:45,477] Trial 1 finished with value: 3.454479405601435 and parameters: {'n_estimators': 400, 'max_depth': 30, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_samples': 0.7, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.454479405601435.
[I 2025-05-15 19:17:58,102] Trial 2 finished with value: 3.429720125837772 and parameters: {'n_estimators': 450, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_samples': 0.9, 'max_features': 'log2'}. Best is trial 2 with value: 3.429720125837772.
[I 2025-05-15 19:18:07,196] Trial 3 finished with value: 3.3452059937445693 and parameters: {'n_estimators': 350, 'max_depth': 50, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples

Melhores hiperparâmetros: {'n_estimators': 100, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'sqrt'}


[I 2025-05-15 19:21:31,948] A new study created in memory with name: no-name-b08f7573-8c1a-449d-85b2-6d236e0d19cf


MSE da rodada 9: 3.0667

Rodada 10/10


[I 2025-05-15 19:21:37,496] Trial 0 finished with value: 3.514290712232265 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_samples': 0.5, 'max_features': 'log2'}. Best is trial 0 with value: 3.514290712232265.
[I 2025-05-15 19:21:39,014] Trial 1 finished with value: 3.2432724693522728 and parameters: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_samples': 0.9, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.2432724693522728.
[I 2025-05-15 19:21:42,219] Trial 2 finished with value: 3.5675122004078506 and parameters: {'n_estimators': 150, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples': 0.6, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.2432724693522728.
[I 2025-05-15 19:21:48,598] Trial 3 finished with value: 3.5197959141957624 and parameters: {'n_estimators': 350, 'max_depth': 40, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_samp

Melhores hiperparâmetros: {'n_estimators': 500, 'max_depth': 30, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 'sqrt'}
MSE da rodada 10: 3.2367

Resumo após 10 execuções:
Média dos MSEs: 3.2699
Desvio padrão dos MSEs: 0.4399
Melhor MSE individual: 2.4719
Melhor modelo salvo em: Modelos\best_random_forest_model_optuna.pkl


## Linear Regressor

Resultados:

GridSearch:
* Média dos MSEs: 7.2316
* Desvio padrão dos MSEs: 0.7596
* Melhor MSE individual: 6.1034 (r3)
* Tempo: 22s


Optuna:
* Média dos MSEs: 7.2316
* Desvio padrão dos MSEs: 0.7596
* Melhor MSE individual: 6.1034 (r3)
* Tempo: 0.3s


Fazendo tuning com GridSearch

In [47]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Inicializando o modelo Random Forest
lin_reg = LinearRegression() # Regressor logistico

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=lin_reg,
    param_grid={},  # Sem hiperparâmetros para ajustar
    cv=ps,  # Passa o PredefinedSplit
    scoring='neg_mean_squared_error',
    verbose=3,
    n_jobs=-1,
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_lin_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_lin_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_Linear_Regressor_model_grid.pkl')
joblib.dump(best_lin_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Mean Squared Error (mse) on test data: 6.18329638505496
Modelo salvo em: Modelos\best_Linear_Regressor_model_grid.pkl




In [34]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + val
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Modelo
    lin_reg = LinearRegression()

    # GridSearch (sem parâmetros a otimizar nesse caso)
    grid_search = GridSearchCV(
        estimator=lin_reg,
        param_grid={},  # LinearRegression não tem hiperparâmetros relevantes aqui
        cv=ps,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1,
    )
    grid_search.fit(X_full, y_full)
    best_model = grid_search.best_estimator_

    # Avaliação
    predictions = best_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = best_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_Linear_Regressor_model_grid.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 8.4844

Rodada 2/10
MSE da rodada 2: 6.4912

Rodada 3/10
MSE da rodada 3: 6.1034

Rodada 4/10
MSE da rodada 4: 7.5465

Rodada 5/10
MSE da rodada 5: 6.3378

Rodada 6/10
MSE da rodada 6: 7.8595

Rodada 7/10
MSE da rodada 7: 6.5832

Rodada 8/10
MSE da rodada 8: 7.7659

Rodada 9/10
MSE da rodada 9: 7.8448

Rodada 10/10
MSE da rodada 10: 7.2996

Resumo após 10 execuções:
Média dos MSEs: 7.2316
Desvio padrão dos MSEs: 0.7596
Melhor MSE individual: 6.1034
Melhor modelo salvo em: Modelos\best_Linear_Regressor_model_grid.pkl


Fazendo o Tuning com Optuna

In [48]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Inicializando o modelo Linear Regression
    lin_reg = LinearRegression()

    # Treinando o modelo nos dados de treino
    lin_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = lin_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1)  # Apenas 1 tentativa, pois não há hiperparâmetros para ajustar

# Treinando o modelo final
best_lin_reg = LinearRegression()
best_lin_reg.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_lin_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_linear_regressor_model_optuna.pkl')
joblib.dump(best_lin_reg, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-17 15:47:13,647] A new study created in memory with name: no-name-8decee07-315f-4476-8cb2-7e5be21e71fa
[I 2025-04-17 15:47:13,671] Trial 0 finished with value: 7.789384479414533 and parameters: {}. Best is trial 0 with value: 7.789384479414533.


Mean Squared Error (MSE) on test data: 6.245837740756641
Modelo salvo em: Modelos\best_linear_regressor_model_optuna.pkl


In [35]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados (treino/val/teste)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40 + i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40 + i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit: -1 (treino), 0 (validação)
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna (sem hiperparâmetros para ajustar)
    def objective(trial):
        model = LinearRegression()
        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=1, show_progress_bar=False)

    # Treino final com treino + validação
    best_model = LinearRegression()
    best_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = best_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = best_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_linear_regressor_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-15 19:31:54,820] A new study created in memory with name: no-name-9636fcd6-5582-4feb-86a4-560710a6a460
[I 2025-05-15 19:31:54,829] Trial 0 finished with value: 8.13508917899053 and parameters: {}. Best is trial 0 with value: 8.13508917899053.
[I 2025-05-15 19:31:54,848] A new study created in memory with name: no-name-3baa140e-faf4-4a36-a04b-724007e89ef8
[I 2025-05-15 19:31:54,857] Trial 0 finished with value: 6.42852090501063 and parameters: {}. Best is trial 0 with value: 6.42852090501063.
[I 2025-05-15 19:31:54,874] A new study created in memory with name: no-name-0952f4be-e85a-4aca-990c-3ad4d05ee5eb
[I 2025-05-15 19:31:54,884] Trial 0 finished with value: 7.661134602522894 and parameters: {}. Best is trial 0 with value: 7.661134602522894.
[I 2025-05-15 19:31:54,900] A new study created in memory with name: no-name-b2e8c402-7524-47af-8662-b9adc1b962c4
[I 2025-05-15 19:31:54,910] Trial 0 finished with value: 7.930440101465996 and parameters: {}. Best is trial 0 with value:


Rodada 1/10
MSE da rodada 1: 8.4844

Rodada 2/10
MSE da rodada 2: 6.4912

Rodada 3/10
MSE da rodada 3: 6.1034

Rodada 4/10
MSE da rodada 4: 7.5465

Rodada 5/10
MSE da rodada 5: 6.3378

Rodada 6/10


[I 2025-05-15 19:31:54,980] A new study created in memory with name: no-name-1e1bccd3-3ce7-441a-ab7e-3e2b5ab0b2d1
[I 2025-05-15 19:31:54,989] Trial 0 finished with value: 6.851325876517844 and parameters: {}. Best is trial 0 with value: 6.851325876517844.


MSE da rodada 6: 7.8595

Rodada 7/10


[I 2025-05-15 19:31:55,008] A new study created in memory with name: no-name-2bf0b422-63e8-4d4f-ba6b-d661ad80a3ca
[I 2025-05-15 19:31:55,017] Trial 0 finished with value: 7.517792949401467 and parameters: {}. Best is trial 0 with value: 7.517792949401467.
[I 2025-05-15 19:31:55,034] A new study created in memory with name: no-name-c1f4c31b-4fe5-4e62-988e-f574ae4b1fa6
[I 2025-05-15 19:31:55,044] Trial 0 finished with value: 6.836980298646069 and parameters: {}. Best is trial 0 with value: 6.836980298646069.
[I 2025-05-15 19:31:55,062] A new study created in memory with name: no-name-7239c5d1-198b-49b4-be80-4e44facbbbed
[I 2025-05-15 19:31:55,071] Trial 0 finished with value: 7.542051830787146 and parameters: {}. Best is trial 0 with value: 7.542051830787146.


MSE da rodada 7: 6.5832

Rodada 8/10
MSE da rodada 8: 7.7659

Rodada 9/10
MSE da rodada 9: 7.8448

Rodada 10/10
MSE da rodada 10: 7.2996

Resumo após 10 execuções:
Média dos MSEs: 7.2316
Desvio padrão dos MSEs: 0.7596
Melhor MSE individual: 6.1034
Melhor modelo salvo em: Modelos\best_linear_regressor_model_optuna.pkl


## XGBoost

Resultados:

GridSearch:
* Média dos MSEs: 3.3049
* Desvio padrão dos MSEs: 0.4324
* Melhor MSE individual: 2.7117 (r5)
* Tempo: 612m 57s


Optuna:
* Média dos MSEs: 3.3797
* Desvio padrão dos MSEs: 0.4148
* Melhor MSE individual: 2.8503 (r5)
* Tempo: 36m 52s



GridSearch

In [None]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500,600,700,800,900,1000],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 1.0],  # mantido apenas uma vez
    'max_depth': [3, 5, 7, 9, 11],
}

# Inicializando o modelo Random Forest
xgb_reg = LinearRegression() # Regressor logistico
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Métrica de avaliação
    cv=ps,  # Número de folds para validação cruzada
    verbose=1,  # Mostra o progresso do tuning
    n_jobs=1  # Usa todos os núcleos disponíveis
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo

best_xgb_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_xgb_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_grid.pkl')
joblib.dump(best_xgb_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 5280 candidates, totalling 5280 fits
Mean Squared Error (mse) on test data: 2.795087678116888
Modelo salvo em: Modelos\best_xgb_model_grid.pkl


In [36]:
# Hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [ 500, 600, 700, 800, 900, 1000],
    'learning_rate': [0.001, 0.01],
    'gamma': [0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Dados de treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    sy_30_y_val = y_val
    sy_30_y_test = y_test

    # Reconstruindo DataFrames escalados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=X_test.index)

    # Junção de treino + validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Grid search
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    grid_search = GridSearchCV(
        estimator=xgb_reg,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=ps,
        verbose=0,
        n_jobs=1
    )
    grid_search.fit(X_full, y_full)

    # Re-treinar modelo com melhores hiperparâmetros, agora só com treino (sem val)
    best_params = grid_search.best_params_
    retrain_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    retrain_model.fit(X_full, y_full)

    # Avaliação final
    predictions = retrain_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = retrain_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo da melhor rodada
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_grid.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 3.1043

Rodada 2/10
MSE da rodada 2: 3.0657

Rodada 3/10
MSE da rodada 3: 2.9701

Rodada 4/10
MSE da rodada 4: 4.2957

Rodada 5/10
MSE da rodada 5: 2.7117

Rodada 6/10
MSE da rodada 6: 3.6845

Rodada 7/10
MSE da rodada 7: 3.6084

Rodada 8/10
MSE da rodada 8: 3.3837

Rodada 9/10
MSE da rodada 9: 3.0400

Rodada 10/10
MSE da rodada 10: 3.1848

Resumo após 10 execuções:
Média dos MSEs: 3.3049
Desvio padrão dos MSEs: 0.4324
Melhor MSE individual: 2.7117
Melhor modelo salvo em: Modelos\best_xgb_model_grid.pkl


Optuna

In [30]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)


# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
    n_estimators = trial.suggest_int("n_estimators", 50, 1000, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10, step=1)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3, step=0.0005)
    subsample = trial.suggest_float("subsample", 0.6, 1.0, step=0.1)
    gamma = trial.suggest_float("gamma", 0, 0.5, step=0.1)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10, step=1)
    
    

    # Criando o modelo com os hiperparâmetros sugeridos
    xgb_reg = xgb.XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        objective='reg:squarederror',
        gamma=gamma,
        min_child_weight=min_child_weight,
        
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    xgb_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = xgb_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params
print("Melhores hiperparâmetros:", best_params)


# Treinando o modelo final com os melhores hiperparâmetros
best_xgb = xgb.XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    gamma=best_params["gamma"],
    min_child_weight=best_params["min_child_weight"],
    objective='reg:squarederror',
    random_state=42
)
best_xgb.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_xgb.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_optuna.pkl')
joblib.dump(best_xgb, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-24 14:19:00,280] A new study created in memory with name: no-name-d0c9ec92-0047-421f-aae5-462d1c780571
[I 2025-04-24 14:19:01,015] Trial 0 finished with value: 2.9025841908281023 and parameters: {'n_estimators': 250, 'max_depth': 7, 'learning_rate': 0.0485, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 7}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:01,633] Trial 1 finished with value: 3.101864530634946 and parameters: {'n_estimators': 900, 'max_depth': 9, 'learning_rate': 0.10250000000000001, 'subsample': 0.9, 'gamma': 0.5, 'min_child_weight': 9}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:01,911] Trial 2 finished with value: 3.3768841974123456 and parameters: {'n_estimators': 50, 'max_depth': 8, 'learning_rate': 0.1, 'subsample': 0.9, 'gamma': 0.0, 'min_child_weight': 1}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:02,453] Trial 3 finished with value: 3.1724126058006505 and parameters: {'n_estim

Melhores hiperparâmetros: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.127, 'subsample': 0.6, 'gamma': 0.2, 'min_child_weight': 8}
Mean Squared Error (MSE) on test data: 2.8858393687086332
Modelo salvo em: Modelos\best_xgb_model_optuna.pkl


In [37]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatenando dados externos para o treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=X_test.index)

    # Concatenação treino + val para o fit
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int("n_estimators", 500, 1000, step=100),
            'max_depth': trial.suggest_int("max_depth", 3, 9),
            'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.01, step=0.001),
            'subsample': trial.suggest_float("subsample", 0.8, 1.0, step=0.1),
            'gamma': trial.suggest_float("gamma", 0.1, 0.3, step=0.1),
            'min_child_weight': trial.suggest_int("min_child_weight", 1, 3),
            'objective': 'reg:squarederror',
            'random_state': 42,
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    # Rodando o estudo
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50, show_progress_bar=False)

    best_params = study.best_params
    print("Melhores hiperparâmetros:", best_params)

    # Treina modelo final com treino + val
    final_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    final_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = final_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = final_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-16 00:50:47,316] A new study created in memory with name: no-name-39b1d705-2314-4c58-af1a-8fded5509991



Rodada 1/10


[I 2025-05-16 00:50:49,477] Trial 0 finished with value: 4.54213720113432 and parameters: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.004, 'subsample': 1.0, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with value: 4.54213720113432.
[I 2025-05-16 00:50:50,551] Trial 1 finished with value: 5.139370761043498 and parameters: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 0 with value: 4.54213720113432.
[I 2025-05-16 00:50:52,601] Trial 2 finished with value: 4.536041589295235 and parameters: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.006, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 2 with value: 4.536041589295235.
[I 2025-05-16 00:50:55,748] Trial 3 finished with value: 4.828432101644955 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.002, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 2 with va

Melhores hiperparâmetros: {'n_estimators': 900, 'max_depth': 4, 'learning_rate': 0.009000000000000001, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 2}


[I 2025-05-16 00:51:50,567] A new study created in memory with name: no-name-7fc67318-f2af-4feb-abd4-26bd0b130aff


MSE da rodada 1: 3.0226

Rodada 2/10


[I 2025-05-16 00:51:54,874] Trial 0 finished with value: 3.3442447301686618 and parameters: {'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with value: 3.3442447301686618.
[I 2025-05-16 00:51:57,988] Trial 1 finished with value: 3.5219098636988946 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.003, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 0 with value: 3.3442447301686618.
[I 2025-05-16 00:51:58,671] Trial 2 finished with value: 3.176142437288894 and parameters: {'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.004, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 2 with value: 3.176142437288894.
[I 2025-05-16 00:51:59,701] Trial 3 finished with value: 3.0802344564803374 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 3 

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.002, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 3}


[I 2025-05-16 00:53:31,181] A new study created in memory with name: no-name-9e5e9627-d5ff-4c36-8aa7-4ebcd506d92a


MSE da rodada 2: 3.4598

Rodada 3/10


[I 2025-05-16 00:53:34,539] Trial 0 finished with value: 3.223870279752981 and parameters: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.01, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 0 with value: 3.223870279752981.
[I 2025-05-16 00:53:39,146] Trial 1 finished with value: 3.3076502705631325 and parameters: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.005, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 0 with value: 3.223870279752981.
[I 2025-05-16 00:53:43,210] Trial 2 finished with value: 3.6015774655540027 and parameters: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.007, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 0 with value: 3.223870279752981.
[I 2025-05-16 00:53:44,283] Trial 3 finished with value: 3.2396454650378512 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with 

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 9, 'learning_rate': 0.008, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 3}


[I 2025-05-16 00:56:20,413] A new study created in memory with name: no-name-506ac132-269f-41c9-8530-879bcea17e35


MSE da rodada 3: 3.1137

Rodada 4/10


[I 2025-05-16 00:56:23,733] Trial 0 finished with value: 3.7304970745238064 and parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.005, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 0 with value: 3.7304970745238064.
[I 2025-05-16 00:56:26,417] Trial 1 finished with value: 3.9593492196221165 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.007, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 0 with value: 3.7304970745238064.
[I 2025-05-16 00:56:26,836] Trial 2 finished with value: 6.39460483101121 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.001, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 0 with value: 3.7304970745238064.
[I 2025-05-16 00:56:27,869] Trial 3 finished with value: 3.8979257762087265 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.005, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 3}. Best is trial 0 w

Melhores hiperparâmetros: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.008, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 1}


[I 2025-05-16 00:59:03,316] A new study created in memory with name: no-name-d935e0e2-7abc-405e-bbbf-3704da2da4b8


MSE da rodada 4: 4.2927

Rodada 5/10


[I 2025-05-16 00:59:04,424] Trial 0 finished with value: 3.6998682128603186 and parameters: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.005, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 0 with value: 3.6998682128603186.
[I 2025-05-16 00:59:06,538] Trial 1 finished with value: 3.6178720334901295 and parameters: {'n_estimators': 700, 'max_depth': 7, 'learning_rate': 0.007, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 1 with value: 3.6178720334901295.
[I 2025-05-16 00:59:07,373] Trial 2 finished with value: 3.8553384467569747 and parameters: {'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 3}. Best is trial 1 with value: 3.6178720334901295.
[I 2025-05-16 00:59:08,871] Trial 3 finished with value: 3.7092148613139106 and parameters: {'n_estimators': 700, 'max_depth': 6, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 1 

Melhores hiperparâmetros: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.008, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 3}


[I 2025-05-16 01:00:21,722] A new study created in memory with name: no-name-c3f94124-0722-4b04-bd63-e44c4bdabc84


MSE da rodada 5: 2.8503

Rodada 6/10


[I 2025-05-16 01:00:23,448] Trial 0 finished with value: 4.917430470676186 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.001, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 0 with value: 4.917430470676186.
[I 2025-05-16 01:00:24,234] Trial 1 finished with value: 3.7722168900756197 and parameters: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.005, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 1 with value: 3.7722168900756197.
[I 2025-05-16 01:00:29,830] Trial 2 finished with value: 4.008086277256018 and parameters: {'n_estimators': 700, 'max_depth': 9, 'learning_rate': 0.002, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 1 with value: 3.7722168900756197.
[I 2025-05-16 01:00:30,909] Trial 3 finished with value: 3.6069341916228375 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.005, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 3 wi

Melhores hiperparâmetros: {'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.007, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 1}


[I 2025-05-16 01:02:28,133] A new study created in memory with name: no-name-093cffce-eb5b-4163-8167-4b264abb541f


MSE da rodada 6: 3.6279

Rodada 7/10


[I 2025-05-16 01:02:34,701] Trial 0 finished with value: 3.1547943070714077 and parameters: {'n_estimators': 1000, 'max_depth': 9, 'learning_rate': 0.006, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 0 with value: 3.1547943070714077.
[I 2025-05-16 01:02:36,618] Trial 1 finished with value: 3.189255874498177 and parameters: {'n_estimators': 900, 'max_depth': 6, 'learning_rate': 0.003, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 0 with value: 3.1547943070714077.
[I 2025-05-16 01:02:38,217] Trial 2 finished with value: 3.310027211227439 and parameters: {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.007, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 0 with value: 3.1547943070714077.
[I 2025-05-16 01:02:41,572] Trial 3 finished with value: 3.1614845224056283 and parameters: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.01, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 0 wi

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.006, 'subsample': 1.0, 'gamma': 0.1, 'min_child_weight': 3}


[I 2025-05-16 01:03:41,628] A new study created in memory with name: no-name-6441132e-ec0c-48ac-91db-41ca22c969eb


MSE da rodada 7: 3.7977

Rodada 8/10


[I 2025-05-16 01:03:42,537] Trial 0 finished with value: 3.6947568145980085 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.006, 'subsample': 1.0, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with value: 3.6947568145980085.
[I 2025-05-16 01:03:45,804] Trial 1 finished with value: 4.626215317671392 and parameters: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.001, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 0 with value: 3.6947568145980085.
[I 2025-05-16 01:03:47,353] Trial 2 finished with value: 3.6229780531061144 and parameters: {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.007, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 2 with value: 3.6229780531061144.
[I 2025-05-16 01:03:55,598] Trial 3 finished with value: 3.5966784911640213 and parameters: {'n_estimators': 1000, 'max_depth': 9, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 3

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 1.0, 'gamma': 0.1, 'min_child_weight': 3}


[I 2025-05-16 01:04:41,030] A new study created in memory with name: no-name-2f0631ed-26ca-4dd2-80ea-076ef110550b


MSE da rodada 8: 3.4408

Rodada 9/10


[I 2025-05-16 01:04:42,416] Trial 0 finished with value: 3.1881430963649535 and parameters: {'n_estimators': 700, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 0 with value: 3.1881430963649535.
[I 2025-05-16 01:04:43,127] Trial 1 finished with value: 3.336705770460789 and parameters: {'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.005, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 0 with value: 3.1881430963649535.
[I 2025-05-16 01:04:43,681] Trial 2 finished with value: 3.4130438152013434 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.005, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 0 with value: 3.1881430963649535.
[I 2025-05-16 01:04:47,631] Trial 3 finished with value: 3.362486280021815 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.005, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 0 wit

Melhores hiperparâmetros: {'n_estimators': 800, 'max_depth': 8, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 2}


[I 2025-05-16 01:06:37,921] A new study created in memory with name: no-name-0c1577d0-5a74-4522-bd02-6d154687cdc0


MSE da rodada 9: 3.1157

Rodada 10/10


[I 2025-05-16 01:06:39,617] Trial 0 finished with value: 3.411908163976777 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 0 with value: 3.411908163976777.
[I 2025-05-16 01:06:41,239] Trial 1 finished with value: 3.368152288847281 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 1 with value: 3.368152288847281.
[I 2025-05-16 01:06:45,377] Trial 2 finished with value: 3.774086261922247 and parameters: {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.008, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 1 with value: 3.368152288847281.
[I 2025-05-16 01:06:46,677] Trial 3 finished with value: 3.3108440669955157 and parameters: {'n_estimators': 900, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 3 with val

Melhores hiperparâmetros: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 1}
MSE da rodada 10: 3.0761

Resumo após 10 execuções:
Média dos MSEs: 3.3797
Desvio padrão dos MSEs: 0.4148
Melhor MSE individual: 2.8503
Melhor modelo salvo em: Modelos\best_xgb_model_optuna.pkl


## MLP

Resultados:

GridSearch:
* Média dos MSEs: 4.9773
* Desvio padrão dos MSEs: 0.9203
* Melhor MSE individual: 3.5314 (r3)
* Tempo: 220m 3s

Optuna:
* Média dos MSEs: 4.4607
* Desvio padrão dos MSEs: 0.6952
* Melhor MSE individual: 3.0931 (r3)
* Tempo: 364m 16s


GridSearch

In [25]:


# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100),(50, 50, 50), (100, 50, 50), (100, 100, 50), (100, 100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01]
}

# Inicializando o modelo MLP
mlp_reg = MLPRegressor(random_state=42, max_iter=500)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=mlp_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Métrica de avaliação
    cv=ps,  # Número de folds para validação cruzada
    verbose=3,  # Mostra o progresso do tuning
    n_jobs=-1  # Usa todos os núcleos disponíveis
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_mlp_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_mlp_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_grid.pkl')
joblib.dump(best_mlp_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 216 candidates, totalling 216 fits
Mean Squared Error (MSE) on test data: 3.3342401641444734
Modelo salvo em: Modelos\best_mlp_model_grid.pkl




In [38]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'hidden_layer_sizes': [ (100, 100), (50, 50, 50), (100, 50, 50), (100, 100, 50), (100, 100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001]
}

# Variáveis para armazenar os melhores modelos e MSE
best_mlp_model = None
best_mse = float('inf')
mse_list = []

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Dividindo os dados de até 30 dias em treino/validação/teste
    X_train_val, X_test, y_train_val, y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=40+i)

    # Concatenação com os demais dados
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_x_val_scaled = scaler_all.transform(X_val)
    sy_x_test_scaled = scaler_all.transform(X_test)

    # Convertendo de volta para DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_x_val_scaled_df = pd.DataFrame(sy_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_x_test_scaled_df = pd.DataFrame(sy_x_test_scaled, columns=sy_x_train.columns, index=X_test.index)

    # Concatena treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # Cria PredefinedSplit
    validation_fold = np.concatenate([np.full(len(sy_x_train), -1), np.zeros(len(X_val))])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Inicializa o modelo base
    mlp_reg = MLPRegressor(random_state=i, max_iter=1000)

    # GridSearchCV
    grid_search = GridSearchCV(
        estimator=mlp_reg,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=ps,
        verbose=0,
        n_jobs=-1
    )

    # Treinamento
    grid_search.fit(X_full, y_full)

    # Obtém os melhores hiperparâmetros
    best_params = grid_search.best_params_

    # Reentreina o modelo com os melhores hiperparâmetros usando treino + validação
    best_mlp = MLPRegressor(**best_params, random_state=42, max_iter=1000)
    best_mlp.fit(X_full, y_full)

    # Avaliação no conjunto de teste
    predictions = best_mlp.predict(sy_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    # Verifica se é o melhor MSE
    if mse < best_mse:
        best_mse = mse
        best_mlp_model = best_mlp

# Resumo final
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_grid.pkl')
joblib.dump(best_mlp_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 4.9939

Rodada 2/10
MSE da rodada 2: 6.5292

Rodada 3/10
MSE da rodada 3: 3.5314

Rodada 4/10
MSE da rodada 4: 4.5876

Rodada 5/10
MSE da rodada 5: 4.0421

Rodada 6/10
MSE da rodada 6: 6.5722

Rodada 7/10
MSE da rodada 7: 4.5585

Rodada 8/10
MSE da rodada 8: 4.6216

Rodada 9/10
MSE da rodada 9: 5.0581

Rodada 10/10
MSE da rodada 10: 5.2786

Resumo após 10 execuções:
Média dos MSEs: 4.9773
Desvio padrão dos MSEs: 0.9203
Melhor modelo salvo em: Modelos\best_mlp_model_grid.pkl


Optuna

In [23]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
     # Escolhe número de camadas: 1, 2 ou 3
    n_layers = trial.suggest_int('n_layers', 1, 3)
    
    # Para cada camada, escolher número de neurônios
    hidden_layer_sizes = []
    for i in range(n_layers):
        num_units = trial.suggest_int(f'n_units_layer_{i}', 50, 200, step=10)
        hidden_layer_sizes.append(num_units)
    
    hidden_layer_sizes = tuple(hidden_layer_sizes)  # vira tupla, como o MLP espera
    activation = trial.suggest_categorical("activation", ["relu", "tanh"])
    solver = trial.suggest_categorical("solver", ["adam", "sgd"])
    alpha = trial.suggest_float("alpha", 0.0001, 0.01, log=True)
    learning_rate = trial.suggest_categorical("learning_rate", ["constant", "adaptive"])

    # Criando o modelo com os hiperparâmetros sugeridos
    mlp_reg = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=1000,
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    mlp_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = mlp_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params

# Construindo o hidden_layer_sizes a partir dos melhores parâmetros
hidden_layer_sizes = []
for i in range(best_params["n_layers"]):
    hidden_layer_sizes.append(best_params[f"n_units_layer_{i}"])
best_params["hidden_layer_sizes"] = tuple(hidden_layer_sizes)

print("Melhores hiperparâmetros:", best_params)

# Treinando o modelo final com os melhores hiperparâmetros
best_mlp = MLPRegressor(
    hidden_layer_sizes=best_params["hidden_layer_sizes"],
    activation=best_params["activation"],
    solver=best_params["solver"],
    alpha=best_params["alpha"],
    learning_rate=best_params["learning_rate"],
    max_iter=1000,
    random_state=42
)
best_mlp.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_mlp.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_optuna.pkl')
joblib.dump(best_mlp, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-28 09:49:18,683] A new study created in memory with name: no-name-593c19ba-4856-4ffc-84a5-7ada84bd71bc
[I 2025-04-28 09:49:53,402] Trial 0 finished with value: 4.253595743081349 and parameters: {'n_layers': 1, 'n_units_layer_0': 70, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.006071323986349375, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.253595743081349.
[I 2025-04-28 09:54:09,362] Trial 1 finished with value: 4.783693931466111 and parameters: {'n_layers': 2, 'n_units_layer_0': 180, 'n_units_layer_1': 190, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00045592803267607867, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.253595743081349.
[I 2025-04-28 09:54:51,381] Trial 2 finished with value: 4.579312869609497 and parameters: {'n_layers': 3, 'n_units_layer_0': 110, 'n_units_layer_1': 80, 'n_units_layer_2': 100, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.002598256618167818, 'learning_rate': 'constant'}. Best is trial 0 with val

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00020915691387984604, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (140,)}
Mean Squared Error (MSE) on test data: 3.092867299798431
Modelo salvo em: Modelos\best_mlp_model_optuna.pkl


In [39]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + val
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna
    def objective(trial):
        # Camadas ocultas
        n_layers = trial.suggest_int('n_layers', 1, 3)
        hidden_layer_sizes = tuple(
            trial.suggest_int(f'n_units_layer_{i}', 50, 200, step=10) for i in range(n_layers)
        )

        # Outros hiperparâmetros
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        solver = trial.suggest_categorical("solver", ["adam", "sgd"])
        alpha = trial.suggest_float("alpha", 0.0001, 0.01, log=True)
        learning_rate = trial.suggest_categorical("learning_rate", ["constant", "adaptive"])

        model = MLPRegressor(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            learning_rate=learning_rate,
            max_iter=1000,
            random_state=42
        )

        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    # Rodando Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50, show_progress_bar=False)

    best_params = study.best_params
    hidden_layer_sizes = tuple(
        best_params[f"n_units_layer_{i}"] for i in range(best_params["n_layers"])
    )
    best_params["hidden_layer_sizes"] = hidden_layer_sizes
    print("Melhores hiperparâmetros:", best_params)

    # Treinando o modelo final com melhores parâmetros
    final_model = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=best_params["activation"],
        solver=best_params["solver"],
        alpha=best_params["alpha"],
        learning_rate=best_params["learning_rate"],
        max_iter=1000,
        random_state=42
    )
    final_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = final_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = final_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10


[I 2025-05-16 04:21:55,947] A new study created in memory with name: no-name-db192197-c67c-49e7-a0fa-06634a5d3c70
[I 2025-05-16 04:23:38,869] Trial 0 finished with value: 6.5057385420621 and parameters: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 60, 'n_units_layer_2': 180, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.006246073595358389, 'learning_rate': 'constant'}. Best is trial 0 with value: 6.5057385420621.
[I 2025-05-16 04:24:37,841] Trial 1 finished with value: 5.797260388170723 and parameters: {'n_layers': 2, 'n_units_layer_0': 60, 'n_units_layer_1': 90, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0008694448276900267, 'learning_rate': 'constant'}. Best is trial 1 with value: 5.797260388170723.
[I 2025-05-16 04:25:06,467] Trial 2 finished with value: 5.37260367920598 and parameters: {'n_layers': 1, 'n_units_layer_0': 200, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00020584121403161232, 'learning_rate': 'constant'}. Best is trial 2 with value: 5

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 50, 'n_units_layer_1': 150, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00022012702593168323, 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 150)}


[I 2025-05-16 04:59:37,752] A new study created in memory with name: no-name-c8cfeed8-15d3-4b20-8954-d2f45d625ae7


MSE da rodada 1: 4.8616

Rodada 2/10


[I 2025-05-16 05:00:53,636] Trial 0 finished with value: 4.261860525483937 and parameters: {'n_layers': 3, 'n_units_layer_0': 160, 'n_units_layer_1': 90, 'n_units_layer_2': 190, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.005667052677851232, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 4.261860525483937.
[I 2025-05-16 05:01:49,415] Trial 1 finished with value: 3.8242315538771545 and parameters: {'n_layers': 2, 'n_units_layer_0': 130, 'n_units_layer_1': 70, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.00013321465946595498, 'learning_rate': 'constant'}. Best is trial 1 with value: 3.8242315538771545.
[I 2025-05-16 05:02:40,898] Trial 2 finished with value: 4.665270096571207 and parameters: {'n_layers': 2, 'n_units_layer_0': 130, 'n_units_layer_1': 80, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.004394967487046408, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 3.8242315538771545.
[I 2025-05-16 05:02:47,673] Trial 3 finished with value: 6.40708417

Melhores hiperparâmetros: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 80, 'n_units_layer_2': 90, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0013162372472530753, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (200, 80, 90)}


[I 2025-05-16 05:52:19,324] A new study created in memory with name: no-name-7ba2df38-6564-4a45-9e0d-bbb6b6beba66


MSE da rodada 2: 3.6378

Rodada 3/10


[I 2025-05-16 05:52:55,237] Trial 0 finished with value: 4.313210611151709 and parameters: {'n_layers': 2, 'n_units_layer_0': 80, 'n_units_layer_1': 70, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0018464564589938156, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.313210611151709.
[I 2025-05-16 05:53:17,420] Trial 1 finished with value: 4.65133326895701 and parameters: {'n_layers': 2, 'n_units_layer_0': 130, 'n_units_layer_1': 140, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.006087733526620902, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 4.313210611151709.
[I 2025-05-16 05:54:02,222] Trial 2 finished with value: 5.467703122819719 and parameters: {'n_layers': 3, 'n_units_layer_0': 180, 'n_units_layer_1': 80, 'n_units_layer_2': 70, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0006063703765189923, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.313210611151709.
[I 2025-05-16 05:55:09,111] Trial 3 finished with value: 5.265834342676

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 90, 'n_units_layer_1': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.009979314187988971, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (90, 140)}


[I 2025-05-16 06:37:14,474] A new study created in memory with name: no-name-9e820683-0302-4c34-bb49-da7eb7d69b4d


MSE da rodada 3: 3.0931

Rodada 4/10


[I 2025-05-16 06:37:42,647] Trial 0 finished with value: 7.5901148525817685 and parameters: {'n_layers': 2, 'n_units_layer_0': 120, 'n_units_layer_1': 110, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0002692849495499339, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 7.5901148525817685.
[I 2025-05-16 06:38:58,261] Trial 1 finished with value: 6.222874555335551 and parameters: {'n_layers': 3, 'n_units_layer_0': 140, 'n_units_layer_1': 130, 'n_units_layer_2': 190, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0002810794774654313, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 6.222874555335551.
[I 2025-05-16 06:39:11,173] Trial 2 finished with value: 7.954248876195448 and parameters: {'n_layers': 2, 'n_units_layer_0': 130, 'n_units_layer_1': 170, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0006319959416578072, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 6.222874555335551.
[I 2025-05-16 06:40:50,236] Trial 3 finished with value: 5.492115

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 110, 'n_units_layer_1': 130, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.00029469914560248996, 'learning_rate': 'constant', 'hidden_layer_sizes': (110, 130)}


[I 2025-05-16 07:31:14,336] A new study created in memory with name: no-name-fc6c3ae8-ff31-4e14-9147-05e9c4a71c86


MSE da rodada 4: 5.2265

Rodada 5/10


[I 2025-05-16 07:32:32,823] Trial 0 finished with value: 5.8724018763737265 and parameters: {'n_layers': 3, 'n_units_layer_0': 70, 'n_units_layer_1': 70, 'n_units_layer_2': 140, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0007136427218590088, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 5.8724018763737265.
[I 2025-05-16 07:34:02,763] Trial 1 finished with value: 4.229709491583373 and parameters: {'n_layers': 3, 'n_units_layer_0': 130, 'n_units_layer_1': 130, 'n_units_layer_2': 170, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.000696165748346772, 'learning_rate': 'constant'}. Best is trial 1 with value: 4.229709491583373.
[I 2025-05-16 07:34:11,213] Trial 2 finished with value: 4.458332905467392 and parameters: {'n_layers': 1, 'n_units_layer_0': 50, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0003112205062380622, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 4.229709491583373.
[I 2025-05-16 07:35:56,138] Trial 3 finished with value: 3.8892681

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 70, 'n_units_layer_1': 90, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00023604168215555412, 'learning_rate': 'constant', 'hidden_layer_sizes': (70, 90)}


[I 2025-05-16 08:41:08,955] A new study created in memory with name: no-name-3980c762-c1a8-40f1-978f-e83939ed8dd3


MSE da rodada 5: 4.2390

Rodada 6/10


[I 2025-05-16 08:43:14,795] Trial 0 finished with value: 6.705496268499695 and parameters: {'n_layers': 3, 'n_units_layer_0': 190, 'n_units_layer_1': 180, 'n_units_layer_2': 140, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.00028756626215812806, 'learning_rate': 'constant'}. Best is trial 0 with value: 6.705496268499695.
[I 2025-05-16 08:45:11,950] Trial 1 finished with value: 6.605429397431271 and parameters: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 120, 'n_units_layer_2': 110, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.00019059130488203904, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 6.605429397431271.
[I 2025-05-16 08:47:46,409] Trial 2 finished with value: 5.358742292289397 and parameters: {'n_layers': 3, 'n_units_layer_0': 110, 'n_units_layer_1': 130, 'n_units_layer_2': 170, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00031347602397998047, 'learning_rate': 'adaptive'}. Best is trial 2 with value: 5.358742292289397.
[I 2025-05-16 0

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 60, 'n_units_layer_1': 150, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.00013695597222834212, 'learning_rate': 'constant', 'hidden_layer_sizes': (60, 150)}


[I 2025-05-16 09:48:07,657] A new study created in memory with name: no-name-44dde26a-b4ea-4e2b-87ae-a5a84f45cca3


MSE da rodada 6: 4.8913

Rodada 7/10


[I 2025-05-16 09:48:21,408] Trial 0 finished with value: 5.635735086683 and parameters: {'n_layers': 1, 'n_units_layer_0': 80, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.006233326661645354, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 5.635735086683.
[I 2025-05-16 09:49:41,446] Trial 1 finished with value: 5.102695903518435 and parameters: {'n_layers': 3, 'n_units_layer_0': 140, 'n_units_layer_1': 80, 'n_units_layer_2': 60, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.00874793772546605, 'learning_rate': 'constant'}. Best is trial 1 with value: 5.102695903518435.
[I 2025-05-16 09:50:05,347] Trial 2 finished with value: 5.176991954175594 and parameters: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.00021590589832832139, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 5.102695903518435.
[I 2025-05-16 09:50:37,540] Trial 3 finished with value: 4.12756379582351 and parameters: {'n_layers': 1, 'n_units_layer_0':

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 200, 'n_units_layer_1': 110, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.002492739448828337, 'learning_rate': 'constant', 'hidden_layer_sizes': (200, 110)}


[I 2025-05-16 10:53:39,282] A new study created in memory with name: no-name-16825940-f5f2-400c-9424-f84aee826390


MSE da rodada 7: 4.9782

Rodada 8/10


[I 2025-05-16 10:54:14,384] Trial 0 finished with value: 4.726227811156442 and parameters: {'n_layers': 3, 'n_units_layer_0': 50, 'n_units_layer_1': 130, 'n_units_layer_2': 200, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.006029223331000863, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.726227811156442.
[I 2025-05-16 10:58:07,447] Trial 1 finished with value: 5.726628750328109 and parameters: {'n_layers': 2, 'n_units_layer_0': 120, 'n_units_layer_1': 180, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0001399578645666875, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.726227811156442.
[I 2025-05-16 10:59:18,951] Trial 2 finished with value: 6.573606999070633 and parameters: {'n_layers': 3, 'n_units_layer_0': 80, 'n_units_layer_1': 120, 'n_units_layer_2': 100, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0016162859802728205, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.726227811156442.
[I 2025-05-16 10:59:28,402] Trial 3 finished

Melhores hiperparâmetros: {'n_layers': 3, 'n_units_layer_0': 50, 'n_units_layer_1': 130, 'n_units_layer_2': 200, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.006029223331000863, 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 130, 200)}


[I 2025-05-16 11:31:26,577] A new study created in memory with name: no-name-83394706-3d80-4365-a5c4-894b575f5bc0


MSE da rodada 8: 5.3754

Rodada 9/10


[I 2025-05-16 11:31:45,051] Trial 0 finished with value: 3.526410107969894 and parameters: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00014830403011176717, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 3.526410107969894.
[I 2025-05-16 11:31:50,861] Trial 1 finished with value: 6.892555298064288 and parameters: {'n_layers': 1, 'n_units_layer_0': 110, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0052516052536935465, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 3.526410107969894.
[I 2025-05-16 11:33:40,034] Trial 2 finished with value: 5.052311272646139 and parameters: {'n_layers': 2, 'n_units_layer_0': 150, 'n_units_layer_1': 110, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0021635817058741845, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 3.526410107969894.
[I 2025-05-16 11:33:56,218] Trial 3 finished with value: 6.9242899810019285 and parameters: {'n_layers': 2, 'n_units_layer_0': 200, 'n_u

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0001513229016733997, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (140,)}


[I 2025-05-16 12:09:54,590] A new study created in memory with name: no-name-673aac55-86ec-4a53-95d3-d3c90ea5b647


MSE da rodada 9: 4.0866

Rodada 10/10


[I 2025-05-16 12:10:39,843] Trial 0 finished with value: 4.175392913077266 and parameters: {'n_layers': 2, 'n_units_layer_0': 70, 'n_units_layer_1': 70, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0003445967483426899, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 4.175392913077266.
[I 2025-05-16 12:13:09,891] Trial 1 finished with value: 4.462126894463748 and parameters: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 180, 'n_units_layer_2': 120, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.00010587618736854864, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.175392913077266.
[I 2025-05-16 12:13:15,587] Trial 2 finished with value: 7.579525967203289 and parameters: {'n_layers': 1, 'n_units_layer_0': 70, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0006870565766027743, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 4.175392913077266.
[I 2025-05-16 12:13:21,385] Trial 3 finished with value: 7.580751216341393 and parameters: {

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 190, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0001278071903126698, 'learning_rate': 'constant', 'hidden_layer_sizes': (190,)}
MSE da rodada 10: 4.2174

Resumo após 10 execuções:
Média dos MSEs: 4.4607
Desvio padrão dos MSEs: 0.6952
Melhor MSE individual: 3.0931
Melhor modelo salvo em: Modelos\best_mlp_model_optuna.pkl
