In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import  LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
import optuna

In [4]:
import joblib
import os

# Tratamento dos dados

In [5]:
spotify_youtube_pt1 = pd.read_csv("Dados/spotify_youtube_up_pt1.csv")
spotify_youtube_pt2 = pd.read_csv("Dados/spotify_youtube_up_pt2.csv")

In [6]:
data_extracao = pd.to_datetime("2023-02-07")
spotify_youtube_pt1["upload_date"] = pd.to_datetime(spotify_youtube_pt1["upload_date"])
spotify_youtube_pt1["dias_na_plataforma"] = (data_extracao - spotify_youtube_pt1["upload_date"]).dt.days

spotify_youtube_pt2["upload_date"] = pd.to_datetime(spotify_youtube_pt2["upload_date"])
spotify_youtube_pt2["dias_na_plataforma"] = (data_extracao - spotify_youtube_pt2["upload_date"]).dt.days

In [7]:
spotify_youtube = pd.concat([spotify_youtube_pt1,spotify_youtube_pt2])

In [8]:
spotify_youtube = spotify_youtube[spotify_youtube['Album_type'] != 'compilation']
spotify_youtube_dummies= pd.get_dummies(spotify_youtube,columns=['Album_type'], dtype=int)

In [9]:
spotify_youtube_dummies['artist_number'] = spotify_youtube_dummies['Artist'].str.split(',').str.len()
spotify_youtube_dummies['engagement_rate'] = spotify_youtube_dummies.apply(lambda row: (row['Likes'] + row['Comments']) / row['Views'] * 100 if row['Views'] > 0 else 0, axis=1)

In [10]:
columns_drop=['Unnamed: 0', 'Artist', 'Url_spotify', 'Track', 'Album', 'Uri','Url_youtube', 'Title', 'Channel', 'Views', 'Likes','Comments', 'Description', 'Licensed', 'official_video', 'upload_date']
spotify_youtube_df=spotify_youtube_dummies.drop(columns=columns_drop)

In [11]:
spotify_youtube_limpo=spotify_youtube_df.dropna(subset=['Danceability','Energy','Key','Loudness','Speechiness','Acousticness', 'Instrumentalness','Liveness','Valence','Tempo','Duration_ms','Stream','engagement_rate'])

In [12]:
spotify_youtube_limpo.columns

Index(['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
       'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
       'Duration_ms', 'Stream', 'dias_na_plataforma', 'Album_type_album',
       'Album_type_single', 'artist_number', 'engagement_rate'],
      dtype='object')

# Preparação dos dados para os experimentos

In [13]:
youtube_30_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<31]
youtube_90_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<91]
youtube_365_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<366]

In [14]:
spotify_youtube_df_366_more = spotify_youtube_limpo.drop(youtube_365_df.index)
spotify_youtube_df_91_365 = youtube_365_df.drop(youtube_90_df.index)
spotify_youtube_df_31_90 = youtube_365_df.drop(youtube_30_df.index)

In [15]:
youtube_30_df=youtube_30_df[youtube_30_df['engagement_rate']<9.338507725109846]
spotify_youtube_df_31_90=spotify_youtube_df_31_90[spotify_youtube_df_31_90['engagement_rate']<4.602401258613792]
spotify_youtube_df_91_365=spotify_youtube_df_91_365[spotify_youtube_df_91_365['engagement_rate']<4.253191569584401]
spotify_youtube_df_366_more=spotify_youtube_df_366_more[spotify_youtube_df_366_more['engagement_rate']<2.251193229196198]

In [16]:
youtube_30_df_x = youtube_30_df.drop('engagement_rate',axis=1)
youtube_30_df_y = youtube_30_df['engagement_rate']

spotify_youtube_df_31_90_x = spotify_youtube_df_31_90.drop('engagement_rate',axis=1)
spotify_youtube_df_31_90_y = spotify_youtube_df_31_90['engagement_rate']

spotify_youtube_df_91_365_x = spotify_youtube_df_91_365.drop('engagement_rate',axis=1)
spotify_youtube_df_91_365_y = spotify_youtube_df_91_365['engagement_rate']

spotify_youtube_df_366_more_x = spotify_youtube_df_366_more.drop('engagement_rate',axis=1)
spotify_youtube_df_366_more_y = spotify_youtube_df_366_more['engagement_rate']

In [17]:
spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=42)
spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20, random_state=42)

In [18]:
# Dados de treino 
sy_x_train=pd.concat([spotify_youtube_30X_train,spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x,spotify_youtube_df_366_more_x])
sy_y_train=pd.concat([spotify_youtube_30y_train,spotify_youtube_df_31_90_y,spotify_youtube_df_91_365_y,spotify_youtube_df_366_more_y])

# Normalização do treino
scaler_all = MinMaxScaler()
sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)

In [19]:
# Normalização do treino de 30 dias
sy_30_x_train_scaled = scaler_all.transform(spotify_youtube_30X_train)

In [20]:
# Para testar o desempenho do modelo com videos com ate 30 dias
sy_30_x_test=spotify_youtube_30X_test
sy_30_y_test=spotify_youtube_30y_test

# validação de 30 dias
sy_30_x_val=spotify_youtube_30X_val
sy_30_y_val=spotify_youtube_30y_val

In [21]:
# Normalizção dos dados de teste de 30 dias
sy_30_x_test_scaled = scaler_all.transform(sy_30_x_test)

# Normalização dos dados de validação de 30 dias
sy_30_x_val_scaled = scaler_all.transform(sy_30_x_val)

# Modelos Padrões

Random Forest Base

In [22]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")
    spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25,random_state=40+i)
    spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20,random_state=40+i)

    # Dados de treino
    sy_x_train = pd.concat([spotify_youtube_30X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([spotify_youtube_30y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler_all.transform(spotify_youtube_30X_val)
    sy_30_x_test_scaled = scaler_all.transform(spotify_youtube_30X_test)

    sy_30_y_val = spotify_youtube_30y_val
    sy_30_y_test = spotify_youtube_30y_test

    # Reconstrução dos DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=spotify_youtube_30X_val.columns, index=spotify_youtube_30X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=spotify_youtube_30X_test.index)

    # Junta treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # Reentreina o modelo com os melhores hiperparâmetros
    rf = RandomForestRegressor(n_estimators=100,random_state=42)
    rf.fit(X_full, y_full)

    # Previsão no teste
    predictions = rf.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")


# Cálculo da média e do desvio padrão
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")



Rodada 1/10
MSE da rodada 1: 2.9555

Rodada 2/10
MSE da rodada 2: 3.2737

Rodada 3/10
MSE da rodada 3: 2.9854

Rodada 4/10
MSE da rodada 4: 3.7640

Rodada 5/10
MSE da rodada 5: 2.6423

Rodada 6/10
MSE da rodada 6: 3.4468

Rodada 7/10
MSE da rodada 7: 3.7576

Rodada 8/10
MSE da rodada 8: 2.9551

Rodada 9/10
MSE da rodada 9: 2.7762

Rodada 10/10
MSE da rodada 10: 2.8014

Resumo após 10 execuções:
Média dos MSEs: 3.1358
Desvio padrão dos MSEs: 0.3833


Linear Regressor Base

In [28]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")
    spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25,random_state=40+i)
    spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20,random_state=40+i)

    # Dados de treino
    sy_x_train = pd.concat([spotify_youtube_30X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([spotify_youtube_30y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler_all.transform(spotify_youtube_30X_val)
    sy_30_x_test_scaled = scaler_all.transform(spotify_youtube_30X_test)

    sy_30_y_val = spotify_youtube_30y_val
    sy_30_y_test = spotify_youtube_30y_test

    # Reconstrução dos DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=spotify_youtube_30X_val.columns, index=spotify_youtube_30X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=spotify_youtube_30X_test.index)

    # Junta treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # Reentreina o modelo com os melhores hiperparâmetros
    lr = LinearRegression()
    lr.fit(X_full, y_full)

    # Previsão no teste
    predictions = lr.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")


# Cálculo da média e do desvio padrão
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")


Rodada 1/10
MSE da rodada 1: 8.4844

Rodada 2/10
MSE da rodada 2: 6.4912

Rodada 3/10
MSE da rodada 3: 6.1034

Rodada 4/10
MSE da rodada 4: 7.5465

Rodada 5/10
MSE da rodada 5: 6.3378

Rodada 6/10
MSE da rodada 6: 7.8595

Rodada 7/10
MSE da rodada 7: 6.5832

Rodada 8/10
MSE da rodada 8: 7.7659

Rodada 9/10
MSE da rodada 9: 7.8448

Rodada 10/10
MSE da rodada 10: 7.2996

Resumo após 10 execuções:
Média dos MSEs: 7.2316
Desvio padrão dos MSEs: 0.7596


XGBoost Base

In [33]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")
    spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25,random_state=40+i)
    spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20,random_state=40+i)

    # Dados de treino
    sy_x_train = pd.concat([spotify_youtube_30X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([spotify_youtube_30y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler_all.transform(spotify_youtube_30X_val)
    sy_30_x_test_scaled = scaler_all.transform(spotify_youtube_30X_test)

    sy_30_y_val = spotify_youtube_30y_val
    sy_30_y_test = spotify_youtube_30y_test

    # Reconstrução dos DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=spotify_youtube_30X_val.columns, index=spotify_youtube_30X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=spotify_youtube_30X_test.index)

    # Junta treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # Reentreina o modelo com os melhores hiperparâmetros
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_reg.fit(X_full, y_full)

    # Previsão no teste
    predictions = xgb_reg.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")


# Cálculo da média e do desvio padrão
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")


Rodada 1/10
MSE da rodada 1: 3.2294

Rodada 2/10
MSE da rodada 2: 3.2208

Rodada 3/10
MSE da rodada 3: 2.8943

Rodada 4/10
MSE da rodada 4: 4.0578

Rodada 5/10
MSE da rodada 5: 2.9070

Rodada 6/10
MSE da rodada 6: 3.4138

Rodada 7/10
MSE da rodada 7: 4.1988

Rodada 8/10
MSE da rodada 8: 3.4271

Rodada 9/10
MSE da rodada 9: 3.1767

Rodada 10/10
MSE da rodada 10: 3.1811

Resumo após 10 execuções:
Média dos MSEs: 3.3707
Desvio padrão dos MSEs: 0.4148


MLP Base

In [34]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")
    spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25,random_state=40+i)
    spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20,random_state=40+i)

    # Dados de treino
    sy_x_train = pd.concat([spotify_youtube_30X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([spotify_youtube_30y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler_all.transform(spotify_youtube_30X_val)
    sy_30_x_test_scaled = scaler_all.transform(spotify_youtube_30X_test)

    sy_30_y_val = spotify_youtube_30y_val
    sy_30_y_test = spotify_youtube_30y_test

    # Reconstrução dos DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=spotify_youtube_30X_val.columns, index=spotify_youtube_30X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=spotify_youtube_30X_test.index)

    # Junta treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # Reentreina o modelo com os melhores hiperparâmetros
    mlp_reg = MLPRegressor(random_state=42, max_iter=1000)
    mlp_reg.fit(X_full, y_full)

    # Previsão no teste
    predictions = mlp_reg.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")


# Cálculo da média e do desvio padrão
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")


Rodada 1/10
MSE da rodada 1: 4.8853

Rodada 2/10
MSE da rodada 2: 3.5492

Rodada 3/10
MSE da rodada 3: 3.2495

Rodada 4/10
MSE da rodada 4: 4.5642

Rodada 5/10
MSE da rodada 5: 3.6619

Rodada 6/10
MSE da rodada 6: 4.5303

Rodada 7/10
MSE da rodada 7: 4.0225

Rodada 8/10
MSE da rodada 8: 4.1663

Rodada 9/10
MSE da rodada 9: 3.9171

Rodada 10/10
MSE da rodada 10: 4.5673

Resumo após 10 execuções:
Média dos MSEs: 4.1114
Desvio padrão dos MSEs: 0.4997


# Tuning dos modelos

## Random Forest

Resultados:

GridSearch:
* Média dos MSEs: 3.3750
* Desvio padrão dos MSEs: 0.4742
* Melhor MSE individual: 2.5401 (r5)
* Tempo: 82m 17s

Optuna:
* Média dos MSEs: 3.3223
* Desvio padrão dos MSEs: 0.4055
* Melhor MSE individual: 2.5819 (r5)
* Tempo: 118m 38s


Fazendo o tuning com GridSearch

In [None]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_samples': [0.5, 0.75],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [30, 40, 50],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

# Inicializando o modelo Random Forest
rf = RandomForestRegressor(random_state=42)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=ps,  # Passa o PredefinedSplit
    scoring='neg_mean_squared_error',
    verbose=0,
    n_jobs=1
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_rf = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
sy_30_x_test_scaled_df = pd.DataFrame(
    sy_30_x_test_scaled,
    columns=sy_x_train.columns,
    index=sy_30_x_test.index
)
predictions = best_rf.predict(sy_30_x_test_scaled_df)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_grid.pkl')
joblib.dump(best_rf, model_path)
print(f"Modelo salvo em: {model_path}")

Mean Squared Error (mse) on test data: 2.9452172749232077
Modelo salvo em: Modelos\best_random_forest_model_grid.pkl


In [32]:
# Hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [300, 400, 500],
    'max_samples': [0.5, 0.75],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [30, 40, 50],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]
}

mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")
    spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25,random_state=40+i)
    spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20,random_state=40+i)

    # Dados de treino
    sy_x_train = pd.concat([spotify_youtube_30X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([spotify_youtube_30y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler_all.transform(spotify_youtube_30X_val)
    sy_30_x_test_scaled = scaler_all.transform(spotify_youtube_30X_test)

    sy_30_y_val = spotify_youtube_30y_val
    sy_30_y_test = spotify_youtube_30y_test

    # Reconstrução dos DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=spotify_youtube_30X_val.columns, index=spotify_youtube_30X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=spotify_youtube_30X_test.index)

    # Junta treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(spotify_youtube_30X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Inicializa o modelo base
    rf = RandomForestRegressor(random_state=i)

    # GridSearch
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=ps,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1
    )

    grid_search.fit(X_full, y_full)

    # Recupera os melhores hiperparâmetros
    best_params = grid_search.best_params_

    # Reentreina o modelo com os melhores hiperparâmetros
    best_rf = RandomForestRegressor(**best_params, random_state=42)
    best_rf.fit(X_full, y_full)

    # Previsão no teste
    predictions = best_rf.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    # Atualiza o melhor modelo
    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = best_rf

# Cálculo da média e do desvio padrão
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_rf_model_grid.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 3.3749

Rodada 2/10
MSE da rodada 2: 3.0917

Rodada 3/10
MSE da rodada 3: 2.8221

Rodada 4/10
MSE da rodada 4: 4.0446

Rodada 5/10
MSE da rodada 5: 2.5401

Rodada 6/10
MSE da rodada 6: 4.1256

Rodada 7/10
MSE da rodada 7: 3.7498

Rodada 8/10
MSE da rodada 8: 3.3979

Rodada 9/10
MSE da rodada 9: 3.2707

Rodada 10/10
MSE da rodada 10: 3.3323

Resumo após 10 execuções:
Média dos MSEs: 3.3750
Desvio padrão dos MSEs: 0.4742
Melhor MSE individual: 2.5401
Melhor modelo salvo em: Modelos\best_rf_model_grid.pkl


Fazendo o tuning com Optuna

In [35]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 10, 50, step=10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10, step=2)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4, step=1)
    max_samples = trial.suggest_float("max_samples", 0.5, 1.0, step=0.05)
    max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2'])


    
    

    # Criando o modelo com os hiperparâmetros sugeridos
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_samples=max_samples,
        max_features=max_features,
        
        
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    rf.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = rf.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params
print("Melhores hiperparâmetros:", best_params)

# Treinando o modelo final com os melhores hiperparâmetros
best_rf = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=42
)
best_rf.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_rf.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_optuna.pkl')
joblib.dump(best_rf, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-21 18:41:19,427] A new study created in memory with name: no-name-a6adede2-6e97-44a2-b830-b9359b490f1d
[I 2025-04-21 18:41:29,856] Trial 0 finished with value: 3.6594923027500004 and parameters: {'n_estimators': 350, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples': 0.95, 'max_features': 'log2'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:37,267] Trial 1 finished with value: 3.8796388232432424 and parameters: {'n_estimators': 350, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.55, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:44,639] Trial 2 finished with value: 3.7630106068662728 and parameters: {'n_estimators': 350, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples': 0.9, 'max_features': 'log2'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:51,941] Trial 3 finished with value: 3.87240710807

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'log2'}
Mean Squared Error (MSE) on test data: 2.951254082497033
Modelo salvo em: Modelos\best_random_forest_model_optuna.pkl


In [39]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + val
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int("n_estimators", 50, 500, step=50),
            'max_depth': trial.suggest_int("max_depth", 10, 50, step=10),
            'min_samples_split': trial.suggest_int("min_samples_split", 2, 10, step=2),
            'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 4),
            'max_samples': trial.suggest_float("max_samples", 0.5, 1.0, step=0.05),
            'max_features': trial.suggest_categorical("max_features", ['sqrt', 'log2']),
            'random_state': 42,
        }

        model = RandomForestRegressor(**params)
        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    # Rodando o estudo
    SEED = 40+i
    study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=100, show_progress_bar=False)

    best_params = study.best_params
    print("Melhores hiperparâmetros:", best_params)

    # Treina modelo final com treino + val
    final_model = RandomForestRegressor(**best_params)
    final_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = final_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = final_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-25 12:49:05,777] A new study created in memory with name: no-name-a70399c7-c1a8-49f6-bc3e-8c13b280a12d



Rodada 1/10


[I 2025-05-25 12:49:10,192] Trial 0 finished with value: 4.808343760416967 and parameters: {'n_estimators': 250, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_samples': 0.7, 'max_features': 'log2'}. Best is trial 0 with value: 4.808343760416967.
[I 2025-05-25 12:49:19,254] Trial 1 finished with value: 4.7865543811276225 and parameters: {'n_estimators': 350, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.8, 'max_features': 'sqrt'}. Best is trial 1 with value: 4.7865543811276225.
[I 2025-05-25 12:49:21,004] Trial 2 finished with value: 4.580151644821913 and parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'sqrt'}. Best is trial 2 with value: 4.580151644821913.
[I 2025-05-25 12:49:27,468] Trial 3 finished with value: 4.83198334470398 and parameters: {'n_estimators': 250, 'max_depth': 40, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_samples':

Melhores hiperparâmetros: {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.9, 'max_features': 'sqrt'}


[I 2025-05-25 13:05:27,607] A new study created in memory with name: no-name-fbc413ed-a523-4657-b562-c0085236792b


MSE da rodada 1: 3.2295

Rodada 2/10


[I 2025-05-25 13:05:29,874] Trial 0 finished with value: 2.897851736360735 and parameters: {'n_estimators': 150, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 0.55, 'max_features': 'sqrt'}. Best is trial 0 with value: 2.897851736360735.
[I 2025-05-25 13:05:38,807] Trial 1 finished with value: 2.736977751221431 and parameters: {'n_estimators': 350, 'max_depth': 50, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_samples': 0.65, 'max_features': 'log2'}. Best is trial 1 with value: 2.736977751221431.
[I 2025-05-25 13:05:43,841] Trial 2 finished with value: 2.740713354428065 and parameters: {'n_estimators': 250, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_samples': 0.9, 'max_features': 'log2'}. Best is trial 1 with value: 2.736977751221431.
[I 2025-05-25 13:05:55,764] Trial 3 finished with value: 2.6923605089190814 and parameters: {'n_estimators': 400, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples'

Melhores hiperparâmetros: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.8, 'max_features': 'sqrt'}


[I 2025-05-25 13:23:34,936] A new study created in memory with name: no-name-9b048e57-ecff-4e13-aa2c-9dc39c886fb6


MSE da rodada 2: 3.0357

Rodada 3/10


[I 2025-05-25 13:23:39,170] Trial 0 finished with value: 3.8052561664961657 and parameters: {'n_estimators': 200, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples': 0.55, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.8052561664961657.
[I 2025-05-25 13:23:54,354] Trial 1 finished with value: 3.3935597873712884 and parameters: {'n_estimators': 450, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.3935597873712884.
[I 2025-05-25 13:23:56,071] Trial 2 finished with value: 3.750145888906079 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_samples': 0.7, 'max_features': 'log2'}. Best is trial 1 with value: 3.3935597873712884.
[I 2025-05-25 13:23:59,321] Trial 3 finished with value: 3.406349088877511 and parameters: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_sampl

Melhores hiperparâmetros: {'n_estimators': 200, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.8500000000000001, 'max_features': 'sqrt'}


[I 2025-05-25 13:39:33,347] A new study created in memory with name: no-name-0896abb1-9d4a-465d-aca8-2da1cbce2e61


MSE da rodada 3: 2.8248

Rodada 4/10


[I 2025-05-25 13:39:36,358] Trial 0 finished with value: 4.229890418849297 and parameters: {'n_estimators': 100, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.65, 'max_features': 'sqrt'}. Best is trial 0 with value: 4.229890418849297.
[I 2025-05-25 13:39:42,275] Trial 1 finished with value: 4.279142721794562 and parameters: {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_samples': 0.9, 'max_features': 'sqrt'}. Best is trial 0 with value: 4.229890418849297.
[I 2025-05-25 13:39:51,159] Trial 2 finished with value: 4.3123738439914305 and parameters: {'n_estimators': 450, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_samples': 0.5, 'max_features': 'log2'}. Best is trial 0 with value: 4.229890418849297.
[I 2025-05-25 13:40:05,127] Trial 3 finished with value: 4.191382040228378 and parameters: {'n_estimators': 500, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_samples'

Melhores hiperparâmetros: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 'log2'}


[I 2025-05-25 13:46:04,764] A new study created in memory with name: no-name-2e2c8cfa-6a60-418d-a2b2-f3d4355f6c36


MSE da rodada 4: 3.9399

Rodada 5/10


[I 2025-05-25 13:46:12,064] Trial 0 finished with value: 3.8275729169328487 and parameters: {'n_estimators': 450, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_samples': 0.65, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.8275729169328487.
[I 2025-05-25 13:46:18,398] Trial 1 finished with value: 3.814044818241503 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.75, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.814044818241503.
[I 2025-05-25 13:46:22,398] Trial 2 finished with value: 3.8161005301213824 and parameters: {'n_estimators': 150, 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_samples': 0.8500000000000001, 'max_features': 'log2'}. Best is trial 1 with value: 3.814044818241503.
[I 2025-05-25 13:46:25,332] Trial 3 finished with value: 3.7693660421260473 and parameters: {'n_estimators': 100, 'max_depth': 30, 'min_samples_split': 10, 'min_samples_lea

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_samples': 0.95, 'max_features': 'sqrt'}


[I 2025-05-25 13:52:45,054] A new study created in memory with name: no-name-477bb054-4d39-4f91-975b-48b9e8308f89


MSE da rodada 5: 2.5819

Rodada 6/10


[I 2025-05-25 13:52:59,821] Trial 0 finished with value: 3.6440652649697087 and parameters: {'n_estimators': 500, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.7, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.6440652649697087.
[I 2025-05-25 13:53:01,954] Trial 1 finished with value: 3.9179265840524313 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.8500000000000001, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.6440652649697087.
[I 2025-05-25 13:53:10,318] Trial 2 finished with value: 3.7198266804565177 and parameters: {'n_estimators': 350, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples': 0.7, 'max_features': 'log2'}. Best is trial 0 with value: 3.6440652649697087.
[I 2025-05-25 13:53:13,022] Trial 3 finished with value: 3.7239086600857805 and parameters: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_lea

Melhores hiperparâmetros: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 1.0, 'max_features': 'log2'}


[I 2025-05-25 14:01:51,669] A new study created in memory with name: no-name-4b4c0575-98f1-4667-a60d-67bb4155b18a


MSE da rodada 6: 3.7912

Rodada 7/10


[I 2025-05-25 14:02:00,852] Trial 0 finished with value: 3.3836717291455436 and parameters: {'n_estimators': 400, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_samples': 0.65, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.3836717291455436.
[I 2025-05-25 14:02:05,819] Trial 1 finished with value: 3.400784263532126 and parameters: {'n_estimators': 250, 'max_depth': 50, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_samples': 0.5, 'max_features': 'log2'}. Best is trial 0 with value: 3.3836717291455436.
[I 2025-05-25 14:02:16,420] Trial 2 finished with value: 3.2972849067075827 and parameters: {'n_estimators': 450, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_samples': 0.7, 'max_features': 'sqrt'}. Best is trial 2 with value: 3.2972849067075827.
[I 2025-05-25 14:02:19,554] Trial 3 finished with value: 3.140296535133092 and parameters: {'n_estimators': 150, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_sampl

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_samples': 0.8, 'max_features': 'log2'}


[I 2025-05-25 14:09:31,895] A new study created in memory with name: no-name-19067f0c-8a1a-45df-a08f-6966ef738999


MSE da rodada 7: 3.7018

Rodada 8/10


[I 2025-05-25 14:09:34,845] Trial 0 finished with value: 3.9302887886262825 and parameters: {'n_estimators': 100, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_samples': 0.8500000000000001, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.9302887886262825.
[I 2025-05-25 14:09:40,179] Trial 1 finished with value: 3.9045483663774276 and parameters: {'n_estimators': 250, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.5, 'max_features': 'log2'}. Best is trial 1 with value: 3.9045483663774276.
[I 2025-05-25 14:09:47,579] Trial 2 finished with value: 4.057039503283099 and parameters: {'n_estimators': 350, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_samples': 0.6, 'max_features': 'log2'}. Best is trial 1 with value: 3.9045483663774276.
[I 2025-05-25 14:09:50,296] Trial 3 finished with value: 4.130514681230659 and parameters: {'n_estimators': 100, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf':

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'sqrt'}


[I 2025-05-25 14:26:09,571] A new study created in memory with name: no-name-a9774824-ea53-40b2-8353-a01dd26718e8


MSE da rodada 8: 3.4160

Rodada 9/10


[I 2025-05-25 14:26:11,306] Trial 0 finished with value: 3.519107754589934 and parameters: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.9, 'max_features': 'log2'}. Best is trial 0 with value: 3.519107754589934.
[I 2025-05-25 14:26:17,754] Trial 1 finished with value: 3.3272957446483655 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.55, 'max_features': 'log2'}. Best is trial 1 with value: 3.3272957446483655.
[I 2025-05-25 14:26:24,199] Trial 2 finished with value: 3.4362789887800718 and parameters: {'n_estimators': 250, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_samples': 0.6, 'max_features': 'sqrt'}. Best is trial 1 with value: 3.3272957446483655.
[I 2025-05-25 14:26:25,202] Trial 3 finished with value: 3.465445607528785 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_samples'

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_samples': 0.8, 'max_features': 'log2'}


[I 2025-05-25 14:34:22,665] A new study created in memory with name: no-name-be30c390-d28a-4795-a0fc-52c1093b5f7c


MSE da rodada 9: 3.3865

Rodada 10/10


[I 2025-05-25 14:34:28,167] Trial 0 finished with value: 3.2966441326765477 and parameters: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_samples': 0.8500000000000001, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.2966441326765477.
[I 2025-05-25 14:34:32,115] Trial 1 finished with value: 3.4277093292079757 and parameters: {'n_estimators': 150, 'max_depth': 40, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples': 0.75, 'max_features': 'log2'}. Best is trial 0 with value: 3.2966441326765477.
[I 2025-05-25 14:34:43,382] Trial 2 finished with value: 3.226776569715065 and parameters: {'n_estimators': 450, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_samples': 0.65, 'max_features': 'log2'}. Best is trial 2 with value: 3.226776569715065.
[I 2025-05-25 14:34:54,017] Trial 3 finished with value: 3.340520589608775 and parameters: {'n_estimators': 400, 'max_depth': 50, 'min_samples_split': 8, 'min_samples_le

Melhores hiperparâmetros: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_samples': 0.8, 'max_features': 'log2'}
MSE da rodada 10: 3.3159

Resumo após 10 execuções:
Média dos MSEs: 3.3223
Desvio padrão dos MSEs: 0.4055
Melhor MSE individual: 2.5819
Melhor modelo salvo em: Modelos\best_random_forest_model_optuna.pkl


## Linear Regressor

Resultados:

GridSearch:
* Média dos MSEs: 7.2316
* Desvio padrão dos MSEs: 0.7596
* Melhor MSE individual: 6.1034 (r3)
* Tempo: 22s


Optuna:
* Média dos MSEs: 7.2316
* Desvio padrão dos MSEs: 0.7596
* Melhor MSE individual: 6.1034 (r3)
* Tempo: 0.3s


Fazendo tuning com GridSearch

In [47]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Inicializando o modelo Random Forest
lin_reg = LinearRegression() # Regressor logistico

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=lin_reg,
    param_grid={},  # Sem hiperparâmetros para ajustar
    cv=ps,  # Passa o PredefinedSplit
    scoring='neg_mean_squared_error',
    verbose=3,
    n_jobs=-1,
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_lin_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_lin_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_Linear_Regressor_model_grid.pkl')
joblib.dump(best_lin_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Mean Squared Error (mse) on test data: 6.18329638505496
Modelo salvo em: Modelos\best_Linear_Regressor_model_grid.pkl




In [34]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + val
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Modelo
    lin_reg = LinearRegression()

    # GridSearch (sem parâmetros a otimizar nesse caso)
    grid_search = GridSearchCV(
        estimator=lin_reg,
        param_grid={},  # LinearRegression não tem hiperparâmetros relevantes aqui
        cv=ps,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1,
    )
    grid_search.fit(X_full, y_full)
    best_model = grid_search.best_estimator_

    # Avaliação
    predictions = best_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = best_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_Linear_Regressor_model_grid.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 8.4844

Rodada 2/10
MSE da rodada 2: 6.4912

Rodada 3/10
MSE da rodada 3: 6.1034

Rodada 4/10
MSE da rodada 4: 7.5465

Rodada 5/10
MSE da rodada 5: 6.3378

Rodada 6/10
MSE da rodada 6: 7.8595

Rodada 7/10
MSE da rodada 7: 6.5832

Rodada 8/10
MSE da rodada 8: 7.7659

Rodada 9/10
MSE da rodada 9: 7.8448

Rodada 10/10
MSE da rodada 10: 7.2996

Resumo após 10 execuções:
Média dos MSEs: 7.2316
Desvio padrão dos MSEs: 0.7596
Melhor MSE individual: 6.1034
Melhor modelo salvo em: Modelos\best_Linear_Regressor_model_grid.pkl


Fazendo o Tuning com Optuna

In [48]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Inicializando o modelo Linear Regression
    lin_reg = LinearRegression()

    # Treinando o modelo nos dados de treino
    lin_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = lin_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1)  # Apenas 1 tentativa, pois não há hiperparâmetros para ajustar

# Treinando o modelo final
best_lin_reg = LinearRegression()
best_lin_reg.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_lin_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_linear_regressor_model_optuna.pkl')
joblib.dump(best_lin_reg, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-17 15:47:13,647] A new study created in memory with name: no-name-8decee07-315f-4476-8cb2-7e5be21e71fa
[I 2025-04-17 15:47:13,671] Trial 0 finished with value: 7.789384479414533 and parameters: {}. Best is trial 0 with value: 7.789384479414533.


Mean Squared Error (MSE) on test data: 6.245837740756641
Modelo salvo em: Modelos\best_linear_regressor_model_optuna.pkl


In [35]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados (treino/val/teste)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40 + i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40 + i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x, spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames normalizados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit: -1 (treino), 0 (validação)
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna (sem hiperparâmetros para ajustar)
    def objective(trial):
        model = LinearRegression()
        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=1, show_progress_bar=False)

    # Treino final com treino + validação
    best_model = LinearRegression()
    best_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = best_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = best_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_linear_regressor_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-15 19:31:54,820] A new study created in memory with name: no-name-9636fcd6-5582-4feb-86a4-560710a6a460
[I 2025-05-15 19:31:54,829] Trial 0 finished with value: 8.13508917899053 and parameters: {}. Best is trial 0 with value: 8.13508917899053.
[I 2025-05-15 19:31:54,848] A new study created in memory with name: no-name-3baa140e-faf4-4a36-a04b-724007e89ef8
[I 2025-05-15 19:31:54,857] Trial 0 finished with value: 6.42852090501063 and parameters: {}. Best is trial 0 with value: 6.42852090501063.
[I 2025-05-15 19:31:54,874] A new study created in memory with name: no-name-0952f4be-e85a-4aca-990c-3ad4d05ee5eb
[I 2025-05-15 19:31:54,884] Trial 0 finished with value: 7.661134602522894 and parameters: {}. Best is trial 0 with value: 7.661134602522894.
[I 2025-05-15 19:31:54,900] A new study created in memory with name: no-name-b2e8c402-7524-47af-8662-b9adc1b962c4
[I 2025-05-15 19:31:54,910] Trial 0 finished with value: 7.930440101465996 and parameters: {}. Best is trial 0 with value:


Rodada 1/10
MSE da rodada 1: 8.4844

Rodada 2/10
MSE da rodada 2: 6.4912

Rodada 3/10
MSE da rodada 3: 6.1034

Rodada 4/10
MSE da rodada 4: 7.5465

Rodada 5/10
MSE da rodada 5: 6.3378

Rodada 6/10


[I 2025-05-15 19:31:54,980] A new study created in memory with name: no-name-1e1bccd3-3ce7-441a-ab7e-3e2b5ab0b2d1
[I 2025-05-15 19:31:54,989] Trial 0 finished with value: 6.851325876517844 and parameters: {}. Best is trial 0 with value: 6.851325876517844.


MSE da rodada 6: 7.8595

Rodada 7/10


[I 2025-05-15 19:31:55,008] A new study created in memory with name: no-name-2bf0b422-63e8-4d4f-ba6b-d661ad80a3ca
[I 2025-05-15 19:31:55,017] Trial 0 finished with value: 7.517792949401467 and parameters: {}. Best is trial 0 with value: 7.517792949401467.
[I 2025-05-15 19:31:55,034] A new study created in memory with name: no-name-c1f4c31b-4fe5-4e62-988e-f574ae4b1fa6
[I 2025-05-15 19:31:55,044] Trial 0 finished with value: 6.836980298646069 and parameters: {}. Best is trial 0 with value: 6.836980298646069.
[I 2025-05-15 19:31:55,062] A new study created in memory with name: no-name-7239c5d1-198b-49b4-be80-4e44facbbbed
[I 2025-05-15 19:31:55,071] Trial 0 finished with value: 7.542051830787146 and parameters: {}. Best is trial 0 with value: 7.542051830787146.


MSE da rodada 7: 6.5832

Rodada 8/10
MSE da rodada 8: 7.7659

Rodada 9/10
MSE da rodada 9: 7.8448

Rodada 10/10
MSE da rodada 10: 7.2996

Resumo após 10 execuções:
Média dos MSEs: 7.2316
Desvio padrão dos MSEs: 0.7596
Melhor MSE individual: 6.1034
Melhor modelo salvo em: Modelos\best_linear_regressor_model_optuna.pkl


## XGBoost

Resultados:

GridSearch:
* Média dos MSEs: 3.3049
* Desvio padrão dos MSEs: 0.4324
* Melhor MSE individual: 2.7117 (r5)
* Tempo: 612m 57s


Optuna:
* Média dos MSEs: 3.3086
* Desvio padrão dos MSEs: 0.4402
* Melhor MSE individual: 2.7636 (r5)
* Tempo: 32m 58s



GridSearch

In [None]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500,600,700,800,900,1000],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 1.0],  # mantido apenas uma vez
    'max_depth': [3, 5, 7, 9, 11],
}

# Inicializando o modelo Random Forest
xgb_reg = LinearRegression() # Regressor logistico
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Métrica de avaliação
    cv=ps,  # Número de folds para validação cruzada
    verbose=1,  # Mostra o progresso do tuning
    n_jobs=1  # Usa todos os núcleos disponíveis
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo

best_xgb_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_xgb_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_grid.pkl')
joblib.dump(best_xgb_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 5280 candidates, totalling 5280 fits
Mean Squared Error (mse) on test data: 2.795087678116888
Modelo salvo em: Modelos\best_xgb_model_grid.pkl


In [36]:
# Hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [ 500, 600, 700, 800, 900, 1000],
    'learning_rate': [0.001, 0.01],
    'gamma': [0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Dados de treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    sy_30_y_val = y_val
    sy_30_y_test = y_test

    # Reconstruindo DataFrames escalados
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=X_test.index)

    # Junção de treino + validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, sy_30_y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Grid search
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    grid_search = GridSearchCV(
        estimator=xgb_reg,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=ps,
        verbose=0,
        n_jobs=1
    )
    grid_search.fit(X_full, y_full)

    # Re-treinar modelo com melhores hiperparâmetros, agora só com treino (sem val)
    best_params = grid_search.best_params_
    retrain_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    retrain_model.fit(X_full, y_full)

    # Avaliação final
    predictions = retrain_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(sy_30_y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = retrain_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo da melhor rodada
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_grid.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 3.1043

Rodada 2/10
MSE da rodada 2: 3.0657

Rodada 3/10
MSE da rodada 3: 2.9701

Rodada 4/10
MSE da rodada 4: 4.2957

Rodada 5/10
MSE da rodada 5: 2.7117

Rodada 6/10
MSE da rodada 6: 3.6845

Rodada 7/10
MSE da rodada 7: 3.6084

Rodada 8/10
MSE da rodada 8: 3.3837

Rodada 9/10
MSE da rodada 9: 3.0400

Rodada 10/10
MSE da rodada 10: 3.1848

Resumo após 10 execuções:
Média dos MSEs: 3.3049
Desvio padrão dos MSEs: 0.4324
Melhor MSE individual: 2.7117
Melhor modelo salvo em: Modelos\best_xgb_model_grid.pkl


Optuna

In [30]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)


# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
    n_estimators = trial.suggest_int("n_estimators", 50, 1000, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10, step=1)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3, step=0.0005)
    subsample = trial.suggest_float("subsample", 0.6, 1.0, step=0.1)
    gamma = trial.suggest_float("gamma", 0, 0.5, step=0.1)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10, step=1)
    
    

    # Criando o modelo com os hiperparâmetros sugeridos
    xgb_reg = xgb.XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        objective='reg:squarederror',
        gamma=gamma,
        min_child_weight=min_child_weight,
        
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    xgb_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = xgb_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params
print("Melhores hiperparâmetros:", best_params)


# Treinando o modelo final com os melhores hiperparâmetros
best_xgb = xgb.XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    gamma=best_params["gamma"],
    min_child_weight=best_params["min_child_weight"],
    objective='reg:squarederror',
    random_state=42
)
best_xgb.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_xgb.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_optuna.pkl')
joblib.dump(best_xgb, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-24 14:19:00,280] A new study created in memory with name: no-name-d0c9ec92-0047-421f-aae5-462d1c780571
[I 2025-04-24 14:19:01,015] Trial 0 finished with value: 2.9025841908281023 and parameters: {'n_estimators': 250, 'max_depth': 7, 'learning_rate': 0.0485, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 7}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:01,633] Trial 1 finished with value: 3.101864530634946 and parameters: {'n_estimators': 900, 'max_depth': 9, 'learning_rate': 0.10250000000000001, 'subsample': 0.9, 'gamma': 0.5, 'min_child_weight': 9}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:01,911] Trial 2 finished with value: 3.3768841974123456 and parameters: {'n_estimators': 50, 'max_depth': 8, 'learning_rate': 0.1, 'subsample': 0.9, 'gamma': 0.0, 'min_child_weight': 1}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:02,453] Trial 3 finished with value: 3.1724126058006505 and parameters: {'n_estim

Melhores hiperparâmetros: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.127, 'subsample': 0.6, 'gamma': 0.2, 'min_child_weight': 8}
Mean Squared Error (MSE) on test data: 2.8858393687086332
Modelo salvo em: Modelos\best_xgb_model_optuna.pkl


In [38]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatenando dados externos para o treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=sy_x_train.columns, index=X_test.index)

    # Concatenação treino + val para o fit
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int("n_estimators", 500, 1000, step=100),
            'max_depth': trial.suggest_int("max_depth", 3, 9),
            'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.01, step=0.001),
            'subsample': trial.suggest_float("subsample", 0.8, 1.0, step=0.1),
            'gamma': trial.suggest_float("gamma", 0.1, 0.3, step=0.1),
            'min_child_weight': trial.suggest_int("min_child_weight", 1, 3),
            'objective': 'reg:squarederror',
            'random_state': 42,
        }

        model = xgb.XGBRegressor(**params)
        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    # Rodando o estudo
    SEED = 40+i
    study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=100, show_progress_bar=False)

    best_params = study.best_params
    print("Melhores hiperparâmetros:", best_params)

    # Treina modelo final com treino + val
    final_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)
    final_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = final_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = final_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-25 12:16:13,974] A new study created in memory with name: no-name-9e354ee7-2556-4897-b2b2-02f2973019bb



Rodada 1/10


[I 2025-05-25 12:16:14,766] Trial 0 finished with value: 4.313682541371598 and parameters: {'n_estimators': 700, 'max_depth': 3, 'learning_rate': 0.008, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 0 with value: 4.313682541371598.
[I 2025-05-25 12:16:16,939] Trial 1 finished with value: 4.8133525907018555 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.008, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 0 with value: 4.313682541371598.
[I 2025-05-25 12:16:19,815] Trial 2 finished with value: 6.304409701935316 and parameters: {'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.001, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 0 with value: 4.313682541371598.
[I 2025-05-25 12:16:20,850] Trial 3 finished with value: 4.875013382772824 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.002, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 0 with 

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.009000000000000001, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 2}


[I 2025-05-25 12:18:13,141] A new study created in memory with name: no-name-5a5f11fc-06de-46e9-92be-f72f61f305ff


MSE da rodada 1: 3.0002

Rodada 2/10


[I 2025-05-25 12:18:13,601] Trial 0 finished with value: 3.2785479949501006 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.007, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 0 with value: 3.2785479949501006.
[I 2025-05-25 12:18:15,241] Trial 1 finished with value: 3.42030080589969 and parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.01, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 0 with value: 3.2785479949501006.
[I 2025-05-25 12:18:16,058] Trial 2 finished with value: 3.1860803440382797 and parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.005, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 2 with value: 3.1860803440382797.
[I 2025-05-25 12:18:17,284] Trial 3 finished with value: 3.2628713140792565 and parameters: {'n_estimators': 900, 'max_depth': 5, 'learning_rate': 0.007, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 2 wit

Melhores hiperparâmetros: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.004, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 3}


[I 2025-05-25 12:20:45,254] A new study created in memory with name: no-name-871d6249-a58a-4dfb-ac1e-99677061bb3a


MSE da rodada 2: 3.4127

Rodada 3/10


[I 2025-05-25 12:20:50,343] Trial 0 finished with value: 3.5823172795442275 and parameters: {'n_estimators': 700, 'max_depth': 9, 'learning_rate': 0.008, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 0 with value: 3.5823172795442275.
[I 2025-05-25 12:20:54,019] Trial 1 finished with value: 3.769944155126382 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.007, 'subsample': 1.0, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with value: 3.5823172795442275.
[I 2025-05-25 12:20:54,923] Trial 2 finished with value: 3.8051398008091817 and parameters: {'n_estimators': 900, 'max_depth': 4, 'learning_rate': 0.002, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 0 with value: 3.5823172795442275.
[I 2025-05-25 12:20:55,880] Trial 3 finished with value: 3.359451813747423 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.007, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 3 wi

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.008, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 2}


[I 2025-05-25 12:25:46,522] A new study created in memory with name: no-name-42d69200-14ce-443c-bb1d-a4a77399c3f2


MSE da rodada 3: 2.9175

Rodada 4/10


[I 2025-05-25 12:25:48,407] Trial 0 finished with value: 4.874905070022703 and parameters: {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.002, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with value: 4.874905070022703.
[I 2025-05-25 12:25:50,022] Trial 1 finished with value: 5.46588103405765 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.001, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 3}. Best is trial 0 with value: 4.874905070022703.
[I 2025-05-25 12:25:50,570] Trial 2 finished with value: 3.710843884122236 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.009000000000000001, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 2 with value: 3.710843884122236.
[I 2025-05-25 12:25:53,281] Trial 3 finished with value: 4.2207368376591266 and parameters: {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.009000000000000001, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 9, 'learning_rate': 0.009000000000000001, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 1}


[I 2025-05-25 12:32:14,409] A new study created in memory with name: no-name-2bf79155-315e-4706-b88c-08dc20baa8b2


MSE da rodada 4: 4.2988

Rodada 5/10


[I 2025-05-25 12:32:15,145] Trial 0 finished with value: 3.848234946800163 and parameters: {'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.008, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 0 with value: 3.848234946800163.
[I 2025-05-25 12:32:15,996] Trial 1 finished with value: 3.5197511325645663 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.006, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 1 with value: 3.5197511325645663.
[I 2025-05-25 12:32:16,457] Trial 2 finished with value: 3.851840714656967 and parameters: {'n_estimators': 700, 'max_depth': 3, 'learning_rate': 0.003, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 1 with value: 3.5197511325645663.
[I 2025-05-25 12:32:17,233] Trial 3 finished with value: 3.6917549200542146 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.007, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 3}. Best is trial 1 wi

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.008, 'subsample': 1.0, 'gamma': 0.2, 'min_child_weight': 2}


[I 2025-05-25 12:35:03,478] A new study created in memory with name: no-name-208d9858-e426-4141-95a2-180ed99fe04d


MSE da rodada 5: 2.7636

Rodada 6/10


[I 2025-05-25 12:35:05,385] Trial 0 finished with value: 3.4547298925710583 and parameters: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 0 with value: 3.4547298925710583.
[I 2025-05-25 12:35:05,896] Trial 1 finished with value: 4.814302941879142 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.002, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 0 with value: 3.4547298925710583.
[I 2025-05-25 12:35:07,639] Trial 2 finished with value: 3.2767890073179444 and parameters: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.007, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 2 with value: 3.2767890073179444.
[I 2025-05-25 12:35:08,694] Trial 3 finished with value: 3.4964158400309486 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.006, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Best is trial 2

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 7, 'learning_rate': 0.009000000000000001, 'subsample': 0.8, 'gamma': 0.2, 'min_child_weight': 2}


[I 2025-05-25 12:39:30,249] A new study created in memory with name: no-name-a40d86bc-69b0-45d3-a7c1-1f148f77899a


MSE da rodada 6: 3.6244

Rodada 7/10


[I 2025-05-25 12:39:32,797] Trial 0 finished with value: 3.0825319714940314 and parameters: {'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.003, 'subsample': 1.0, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 0 with value: 3.0825319714940314.
[I 2025-05-25 12:39:33,790] Trial 1 finished with value: 3.1289513907003306 and parameters: {'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 1}. Best is trial 0 with value: 3.0825319714940314.
[I 2025-05-25 12:39:37,497] Trial 2 finished with value: 3.0561978614191756 and parameters: {'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.009000000000000001, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 2 with value: 3.0561978614191756.
[I 2025-05-25 12:39:39,900] Trial 3 finished with value: 4.608698998152458 and parameters: {'n_estimators': 700, 'max_depth': 7, 'learning_rate': 0.001, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}. Bes

Melhores hiperparâmetros: {'n_estimators': 900, 'max_depth': 4, 'learning_rate': 0.008, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 3}


[I 2025-05-25 12:41:24,092] A new study created in memory with name: no-name-acbd8b34-1b48-47d5-8cbe-70dcadd41c97


MSE da rodada 7: 3.7135

Rodada 8/10


[I 2025-05-25 12:41:27,018] Trial 0 finished with value: 3.7480602965588425 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.008, 'subsample': 0.9, 'gamma': 0.3, 'min_child_weight': 3}. Best is trial 0 with value: 3.7480602965588425.
[I 2025-05-25 12:41:28,155] Trial 1 finished with value: 3.5138966980022626 and parameters: {'n_estimators': 800, 'max_depth': 5, 'learning_rate': 0.008, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 1 with value: 3.5138966980022626.
[I 2025-05-25 12:41:28,812] Trial 2 finished with value: 3.5593557496865444 and parameters: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.007, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 3}. Best is trial 1 with value: 3.5138966980022626.
[I 2025-05-25 12:41:29,390] Trial 3 finished with value: 3.3849369915511147 and parameters: {'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 3 w

Melhores hiperparâmetros: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 1}


[I 2025-05-25 12:43:03,077] A new study created in memory with name: no-name-a020a82f-ae43-4894-9f37-6d037d176ef0


MSE da rodada 8: 3.2527

Rodada 9/10


[I 2025-05-25 12:43:07,536] Trial 0 finished with value: 3.8822434314753402 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.003, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 0 with value: 3.8822434314753402.
[I 2025-05-25 12:43:09,562] Trial 1 finished with value: 3.266088810138984 and parameters: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.006, 'subsample': 0.9, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 1 with value: 3.266088810138984.
[I 2025-05-25 12:43:13,334] Trial 2 finished with value: 3.258628753590342 and parameters: {'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.005, 'subsample': 0.9, 'gamma': 0.2, 'min_child_weight': 2}. Best is trial 2 with value: 3.258628753590342.
[I 2025-05-25 12:43:14,246] Trial 3 finished with value: 3.9865701889081406 and parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.002, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 1}. Best is trial 2 wit

Melhores hiperparâmetros: {'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.01, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 2}


[I 2025-05-25 12:47:15,636] A new study created in memory with name: no-name-d463d522-3ece-4d8b-829b-29ff95838ee3


MSE da rodada 9: 3.0832

Rodada 10/10


[I 2025-05-25 12:47:16,240] Trial 0 finished with value: 3.1497123809998744 and parameters: {'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 0 with value: 3.1497123809998744.
[I 2025-05-25 12:47:17,063] Trial 1 finished with value: 3.1312048769675234 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.008, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 2}. Best is trial 1 with value: 3.1312048769675234.
[I 2025-05-25 12:47:20,183] Trial 2 finished with value: 3.31958481714163 and parameters: {'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.009000000000000001, 'subsample': 0.8, 'gamma': 0.3, 'min_child_weight': 1}. Best is trial 1 with value: 3.1312048769675234.
[I 2025-05-25 12:47:20,907] Trial 3 finished with value: 3.2366361514018327 and parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.008, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 2}. Best

Melhores hiperparâmetros: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01, 'subsample': 1.0, 'gamma': 0.3, 'min_child_weight': 2}
MSE da rodada 10: 3.0191

Resumo após 10 execuções:
Média dos MSEs: 3.3086
Desvio padrão dos MSEs: 0.4402
Melhor MSE individual: 2.7636
Melhor modelo salvo em: Modelos\best_xgb_model_optuna.pkl


## MLP

Resultados:

GridSearch:
* Média dos MSEs: 4.9773
* Desvio padrão dos MSEs: 0.9203
* Melhor MSE individual: 3.5314 (r3)
* Tempo: 220m 3s

Optuna:
* Média dos MSEs: 4.2563
* Desvio padrão dos MSEs: 0.5586
* Melhor MSE individual: 3.1479 (r3)
* Tempo: 828m 50s



GridSearch

In [25]:


# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100),(50, 50, 50), (100, 50, 50), (100, 100, 50), (100, 100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01]
}

# Inicializando o modelo MLP
mlp_reg = MLPRegressor(random_state=42, max_iter=500)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=mlp_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Métrica de avaliação
    cv=ps,  # Número de folds para validação cruzada
    verbose=3,  # Mostra o progresso do tuning
    n_jobs=-1  # Usa todos os núcleos disponíveis
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_mlp_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_mlp_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_grid.pkl')
joblib.dump(best_mlp_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 216 candidates, totalling 216 fits
Mean Squared Error (MSE) on test data: 3.3342401641444734
Modelo salvo em: Modelos\best_mlp_model_grid.pkl




In [38]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'hidden_layer_sizes': [ (100, 100), (50, 50, 50), (100, 50, 50), (100, 100, 50), (100, 100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001]
}

# Variáveis para armazenar os melhores modelos e MSE
best_mlp_model = None
best_mse = float('inf')
mse_list = []

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Dividindo os dados de até 30 dias em treino/validação/teste
    X_train_val, X_test, y_train_val, y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.20, random_state=40+i)

    # Concatenação com os demais dados
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler_all = MinMaxScaler()
    sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)
    sy_x_val_scaled = scaler_all.transform(X_val)
    sy_x_test_scaled = scaler_all.transform(X_test)

    # Convertendo de volta para DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_x_val_scaled_df = pd.DataFrame(sy_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_x_test_scaled_df = pd.DataFrame(sy_x_test_scaled, columns=sy_x_train.columns, index=X_test.index)

    # Concatena treino e validação
    X_full = pd.concat([sy_x_train_scaled_df, sy_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # Cria PredefinedSplit
    validation_fold = np.concatenate([np.full(len(sy_x_train), -1), np.zeros(len(X_val))])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Inicializa o modelo base
    mlp_reg = MLPRegressor(random_state=i, max_iter=1000)

    # GridSearchCV
    grid_search = GridSearchCV(
        estimator=mlp_reg,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=ps,
        verbose=0,
        n_jobs=-1
    )

    # Treinamento
    grid_search.fit(X_full, y_full)

    # Obtém os melhores hiperparâmetros
    best_params = grid_search.best_params_

    # Reentreina o modelo com os melhores hiperparâmetros usando treino + validação
    best_mlp = MLPRegressor(**best_params, random_state=42, max_iter=1000)
    best_mlp.fit(X_full, y_full)

    # Avaliação no conjunto de teste
    predictions = best_mlp.predict(sy_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    # Verifica se é o melhor MSE
    if mse < best_mse:
        best_mse = mse
        best_mlp_model = best_mlp

# Resumo final
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_grid.pkl')
joblib.dump(best_mlp_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")


Rodada 1/10
MSE da rodada 1: 4.9939

Rodada 2/10
MSE da rodada 2: 6.5292

Rodada 3/10
MSE da rodada 3: 3.5314

Rodada 4/10
MSE da rodada 4: 4.5876

Rodada 5/10
MSE da rodada 5: 4.0421

Rodada 6/10
MSE da rodada 6: 6.5722

Rodada 7/10
MSE da rodada 7: 4.5585

Rodada 8/10
MSE da rodada 8: 4.6216

Rodada 9/10
MSE da rodada 9: 5.0581

Rodada 10/10
MSE da rodada 10: 5.2786

Resumo após 10 execuções:
Média dos MSEs: 4.9773
Desvio padrão dos MSEs: 0.9203
Melhor modelo salvo em: Modelos\best_mlp_model_grid.pkl


Optuna

In [23]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
     # Escolhe número de camadas: 1, 2 ou 3
    n_layers = trial.suggest_int('n_layers', 1, 3)
    
    # Para cada camada, escolher número de neurônios
    hidden_layer_sizes = []
    for i in range(n_layers):
        num_units = trial.suggest_int(f'n_units_layer_{i}', 50, 200, step=10)
        hidden_layer_sizes.append(num_units)
    
    hidden_layer_sizes = tuple(hidden_layer_sizes)  # vira tupla, como o MLP espera
    activation = trial.suggest_categorical("activation", ["relu", "tanh"])
    solver = trial.suggest_categorical("solver", ["adam", "sgd"])
    alpha = trial.suggest_float("alpha", 0.0001, 0.01, log=True)
    learning_rate = trial.suggest_categorical("learning_rate", ["constant", "adaptive"])

    # Criando o modelo com os hiperparâmetros sugeridos
    mlp_reg = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=1000,
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    mlp_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = mlp_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params

# Construindo o hidden_layer_sizes a partir dos melhores parâmetros
hidden_layer_sizes = []
for i in range(best_params["n_layers"]):
    hidden_layer_sizes.append(best_params[f"n_units_layer_{i}"])
best_params["hidden_layer_sizes"] = tuple(hidden_layer_sizes)

print("Melhores hiperparâmetros:", best_params)

# Treinando o modelo final com os melhores hiperparâmetros
best_mlp = MLPRegressor(
    hidden_layer_sizes=best_params["hidden_layer_sizes"],
    activation=best_params["activation"],
    solver=best_params["solver"],
    alpha=best_params["alpha"],
    learning_rate=best_params["learning_rate"],
    max_iter=1000,
    random_state=42
)
best_mlp.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_mlp.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_optuna.pkl')
joblib.dump(best_mlp, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-28 09:49:18,683] A new study created in memory with name: no-name-593c19ba-4856-4ffc-84a5-7ada84bd71bc
[I 2025-04-28 09:49:53,402] Trial 0 finished with value: 4.253595743081349 and parameters: {'n_layers': 1, 'n_units_layer_0': 70, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.006071323986349375, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.253595743081349.
[I 2025-04-28 09:54:09,362] Trial 1 finished with value: 4.783693931466111 and parameters: {'n_layers': 2, 'n_units_layer_0': 180, 'n_units_layer_1': 190, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00045592803267607867, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.253595743081349.
[I 2025-04-28 09:54:51,381] Trial 2 finished with value: 4.579312869609497 and parameters: {'n_layers': 3, 'n_units_layer_0': 110, 'n_units_layer_1': 80, 'n_units_layer_2': 100, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.002598256618167818, 'learning_rate': 'constant'}. Best is trial 0 with val

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00020915691387984604, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (140,)}
Mean Squared Error (MSE) on test data: 3.092867299798431
Modelo salvo em: Modelos\best_mlp_model_optuna.pkl


In [35]:
mse_list = []
best_overall_model = None
lowest_mse = float('inf')

for i in range(10):
    print(f"\nRodada {i+1}/10")

    # Divisão dos dados
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=40+i
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.20, random_state=40+i
    )

    # Concatena dados externos com treino
    sy_x_train = pd.concat([X_train, spotify_youtube_df_31_90_x,spotify_youtube_df_91_365_x, spotify_youtube_df_366_more_x])
    sy_y_train = pd.concat([y_train, spotify_youtube_df_31_90_y, spotify_youtube_df_91_365_y, spotify_youtube_df_366_more_y])

    # Normalização
    scaler = MinMaxScaler()
    sy_x_train_scaled = scaler.fit_transform(sy_x_train)
    sy_30_x_val_scaled = scaler.transform(X_val)
    sy_30_x_test_scaled = scaler.transform(X_test)

    # Reconstruindo DataFrames
    sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
    sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=X_val.columns, index=X_val.index)
    sy_30_x_test_scaled_df = pd.DataFrame(sy_30_x_test_scaled, columns=X_test.columns, index=X_test.index)

    # Junta treino + val
    X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
    y_full = pd.concat([sy_y_train, y_val])

    # PredefinedSplit
    validation_fold = np.concatenate([
        np.full(len(sy_x_train), -1),
        np.zeros(len(X_val))
    ])
    ps = PredefinedSplit(test_fold=validation_fold)

    # Função objetivo do Optuna
    def objective(trial):
        # Camadas ocultas
        n_layers = trial.suggest_int('n_layers', 1, 3)
        hidden_layer_sizes = tuple(
            trial.suggest_int(f'n_units_layer_{i}', 50, 200, step=10) for i in range(n_layers)
        )

        # Outros hiperparâmetros
        activation = trial.suggest_categorical("activation", ["relu", "tanh"])
        solver = trial.suggest_categorical("solver", ["adam", "sgd"])
        alpha = trial.suggest_float("alpha", 0.0001, 0.01, log=True)
        learning_rate = trial.suggest_categorical("learning_rate", ["constant", "adaptive"])

        model = MLPRegressor(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            learning_rate=learning_rate,
            max_iter=1000,
            random_state=42
        )

        model.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])
        preds = model.predict(X_full[ps.test_fold == 0])
        mse = mean_squared_error(y_full[ps.test_fold == 0], preds)
        return mse

    # Rodando Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=100, show_progress_bar=False)

    best_params = study.best_params
    hidden_layer_sizes = tuple(
        best_params[f"n_units_layer_{i}"] for i in range(best_params["n_layers"])
    )
    best_params["hidden_layer_sizes"] = hidden_layer_sizes
    print("Melhores hiperparâmetros:", best_params)

    # Treinando o modelo final com melhores parâmetros
    final_model = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=best_params["activation"],
        solver=best_params["solver"],
        alpha=best_params["alpha"],
        learning_rate=best_params["learning_rate"],
        max_iter=1000,
        random_state=42
    )
    final_model.fit(X_full, y_full)

    # Avaliação em teste
    predictions = final_model.predict(sy_30_x_test_scaled_df)
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    print(f"MSE da rodada {i+1}: {mse:.4f}")

    if mse < lowest_mse:
        lowest_mse = mse
        best_overall_model = final_model

# Estatísticas finais
mean_mse = np.mean(mse_list)
std_mse = np.std(mse_list)
print("\nResumo após 10 execuções:")
print(f"Média dos MSEs: {mean_mse:.4f}")
print(f"Desvio padrão dos MSEs: {std_mse:.4f}")
print(f"Melhor MSE individual: {lowest_mse:.4f}")

# Salvando o melhor modelo
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_optuna.pkl')
joblib.dump(best_overall_model, model_path)
print(f"Melhor modelo salvo em: {model_path}")

[I 2025-05-24 16:09:28,448] A new study created in memory with name: no-name-42fc4ec3-290d-4626-8769-9df644b87a46



Rodada 1/10


[I 2025-05-24 16:11:15,032] Trial 0 finished with value: 6.880261135114375 and parameters: {'n_layers': 3, 'n_units_layer_0': 100, 'n_units_layer_1': 190, 'n_units_layer_2': 120, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0027882251860144314, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 6.880261135114375.
[I 2025-05-24 16:12:37,936] Trial 1 finished with value: 5.283525609185303 and parameters: {'n_layers': 2, 'n_units_layer_0': 90, 'n_units_layer_1': 70, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.001706611658457262, 'learning_rate': 'constant'}. Best is trial 1 with value: 5.283525609185303.
[I 2025-05-24 16:13:16,434] Trial 2 finished with value: 5.830873299833873 and parameters: {'n_layers': 2, 'n_units_layer_0': 50, 'n_units_layer_1': 100, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0002468821363106685, 'learning_rate': 'constant'}. Best is trial 1 with value: 5.283525609185303.
[I 2025-05-24 16:13:38,607] Trial 3 finished with value: 6.4131697436

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 50, 'n_units_layer_1': 190, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00023327205617672243, 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 190)}


[I 2025-05-24 17:45:19,099] A new study created in memory with name: no-name-4988a398-e93b-4f36-bda3-355c844bdabd


MSE da rodada 1: 4.2729

Rodada 2/10


[I 2025-05-24 17:46:19,472] Trial 0 finished with value: 3.2431414201352533 and parameters: {'n_layers': 2, 'n_units_layer_0': 180, 'n_units_layer_1': 70, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0037623635087424352, 'learning_rate': 'constant'}. Best is trial 0 with value: 3.2431414201352533.
[I 2025-05-24 17:46:41,011] Trial 1 finished with value: 5.655951708281969 and parameters: {'n_layers': 1, 'n_units_layer_0': 200, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.00020614653435051025, 'learning_rate': 'constant'}. Best is trial 0 with value: 3.2431414201352533.
[I 2025-05-24 17:46:51,267] Trial 2 finished with value: 5.0698849985448495 and parameters: {'n_layers': 1, 'n_units_layer_0': 120, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.006822075277037641, 'learning_rate': 'constant'}. Best is trial 0 with value: 3.2431414201352533.
[I 2025-05-24 17:47:35,524] Trial 3 finished with value: 3.1909663825848007 and parameters: {'n_layers': 3, 'n_units_layer_0': 80, 'n

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 170, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00034959475843565846, 'learning_rate': 'constant', 'hidden_layer_sizes': (170,)}


[I 2025-05-24 19:13:55,204] A new study created in memory with name: no-name-dcbffdb3-4761-49db-b812-57f626de0055


MSE da rodada 2: 3.4474

Rodada 3/10


[I 2025-05-24 19:14:27,572] Trial 0 finished with value: 4.626471179776763 and parameters: {'n_layers': 3, 'n_units_layer_0': 180, 'n_units_layer_1': 70, 'n_units_layer_2': 60, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0077460295679693935, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 4.626471179776763.
[I 2025-05-24 19:16:23,525] Trial 1 finished with value: 5.1806615292580735 and parameters: {'n_layers': 3, 'n_units_layer_0': 180, 'n_units_layer_1': 150, 'n_units_layer_2': 130, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.000509237239306106, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.626471179776763.
[I 2025-05-24 19:16:26,809] Trial 2 finished with value: 7.830133329457586 and parameters: {'n_layers': 1, 'n_units_layer_0': 70, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.008422607920004266, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.626471179776763.
[I 2025-05-24 19:16:42,904] Trial 3 finished with value: 7.0776914602

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 150, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0003802496058878936, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (150,)}


[I 2025-05-24 20:17:05,646] A new study created in memory with name: no-name-777a72f5-319e-4d68-8ac5-bb40b2fa2cc4


MSE da rodada 3: 3.1479

Rodada 4/10


[I 2025-05-24 20:17:12,654] Trial 0 finished with value: 7.47257384709118 and parameters: {'n_layers': 1, 'n_units_layer_0': 80, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.001226329506203848, 'learning_rate': 'constant'}. Best is trial 0 with value: 7.47257384709118.
[I 2025-05-24 20:18:12,304] Trial 1 finished with value: 7.032193500432574 and parameters: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 110, 'n_units_layer_2': 70, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0071706397843001686, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 7.032193500432574.
[I 2025-05-24 20:19:15,714] Trial 2 finished with value: 5.08161057812522 and parameters: {'n_layers': 2, 'n_units_layer_0': 190, 'n_units_layer_1': 140, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0022774900902892176, 'learning_rate': 'constant'}. Best is trial 2 with value: 5.08161057812522.
[I 2025-05-24 20:20:35,420] Trial 3 finished with value: 6.292405285505327 and parameters: {'n_la

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 130, 'n_units_layer_1': 80, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.0016178792493721798, 'learning_rate': 'constant', 'hidden_layer_sizes': (130, 80)}


[I 2025-05-24 21:38:33,608] A new study created in memory with name: no-name-589f2dc8-053b-44ca-839f-f54a8ee61c7f


MSE da rodada 4: 4.6751

Rodada 5/10


[I 2025-05-24 21:38:51,481] Trial 0 finished with value: 5.959029595762432 and parameters: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0029938456177299854, 'learning_rate': 'constant'}. Best is trial 0 with value: 5.959029595762432.
[I 2025-05-24 21:39:43,902] Trial 1 finished with value: 5.018450573045866 and parameters: {'n_layers': 3, 'n_units_layer_0': 90, 'n_units_layer_1': 70, 'n_units_layer_2': 160, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.00013481032616505944, 'learning_rate': 'constant'}. Best is trial 1 with value: 5.018450573045866.
[I 2025-05-24 21:40:03,885] Trial 2 finished with value: 6.6476451295165315 and parameters: {'n_layers': 2, 'n_units_layer_0': 190, 'n_units_layer_1': 100, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.00012849599347754246, 'learning_rate': 'constant'}. Best is trial 1 with value: 5.018450573045866.
[I 2025-05-24 21:41:34,880] Trial 3 finished with value: 4.048516637644167 and parameters:

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 60, 'n_units_layer_1': 180, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.002099866531521651, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (60, 180)}


[I 2025-05-24 23:41:50,211] A new study created in memory with name: no-name-d8ba82b3-36bb-4b88-b233-9bff6378af50


MSE da rodada 5: 4.5774

Rodada 6/10


[I 2025-05-24 23:43:12,817] Trial 0 finished with value: 5.262760294592608 and parameters: {'n_layers': 3, 'n_units_layer_0': 160, 'n_units_layer_1': 70, 'n_units_layer_2': 100, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.006791700656190395, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 5.262760294592608.
[I 2025-05-24 23:43:22,432] Trial 1 finished with value: 6.892152141222682 and parameters: {'n_layers': 1, 'n_units_layer_0': 110, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0009408371543878778, 'learning_rate': 'constant'}. Best is trial 0 with value: 5.262760294592608.
[I 2025-05-24 23:43:34,381] Trial 2 finished with value: 6.941120947882771 and parameters: {'n_layers': 1, 'n_units_layer_0': 150, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0005649283632634445, 'learning_rate': 'constant'}. Best is trial 0 with value: 5.262760294592608.
[I 2025-05-24 23:45:55,823] Trial 3 finished with value: 4.929946927781972 and parameters: {'n_layers': 3, 'n_units_

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 50, 'n_units_layer_1': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.004602745902610535, 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 140)}


[I 2025-05-25 01:12:42,730] A new study created in memory with name: no-name-c1050e10-3e8d-4dfc-8341-c95dd5cb2808


MSE da rodada 6: 5.0725

Rodada 7/10


[I 2025-05-25 01:13:48,560] Trial 0 finished with value: 3.5102912786364633 and parameters: {'n_layers': 3, 'n_units_layer_0': 90, 'n_units_layer_1': 170, 'n_units_layer_2': 60, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0001452146970216462, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 3.5102912786364633.
[I 2025-05-25 01:14:57,001] Trial 1 finished with value: 6.20445211851914 and parameters: {'n_layers': 2, 'n_units_layer_0': 200, 'n_units_layer_1': 110, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0001433097790441612, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 3.5102912786364633.
[I 2025-05-25 01:15:20,401] Trial 2 finished with value: 6.283490947222752 and parameters: {'n_layers': 2, 'n_units_layer_0': 70, 'n_units_layer_1': 50, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0001384648725856564, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 3.5102912786364633.
[I 2025-05-25 01:15:25,651] Trial 3 finished with value: 6.93519896

Melhores hiperparâmetros: {'n_layers': 2, 'n_units_layer_0': 140, 'n_units_layer_1': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.004499278034221205, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (140, 140)}


[I 2025-05-25 03:16:18,951] A new study created in memory with name: no-name-92200f1e-b029-4923-9df4-bb2ac7bd0efa


MSE da rodada 7: 4.3068

Rodada 8/10


[I 2025-05-25 03:16:29,817] Trial 0 finished with value: 5.833259988098093 and parameters: {'n_layers': 1, 'n_units_layer_0': 110, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.002051349432825086, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 5.833259988098093.
[I 2025-05-25 03:18:09,018] Trial 1 finished with value: 7.548660713091339 and parameters: {'n_layers': 3, 'n_units_layer_0': 140, 'n_units_layer_1': 170, 'n_units_layer_2': 120, 'activation': 'tanh', 'solver': 'adam', 'alpha': 0.00016528904721758498, 'learning_rate': 'constant'}. Best is trial 0 with value: 5.833259988098093.
[I 2025-05-25 03:18:31,467] Trial 2 finished with value: 5.7177056883020585 and parameters: {'n_layers': 1, 'n_units_layer_0': 160, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.005590764937739448, 'learning_rate': 'adaptive'}. Best is trial 2 with value: 5.7177056883020585.
[I 2025-05-25 03:19:23,051] Trial 3 finished with value: 6.069415501292699 and parameters: {'n_layers': 2, 'n_u

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 120, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0005977506048956559, 'learning_rate': 'constant', 'hidden_layer_sizes': (120,)}


[I 2025-05-25 03:57:09,351] A new study created in memory with name: no-name-da6e2008-89bb-41fa-907e-8e35b521cede


MSE da rodada 8: 4.7548

Rodada 9/10


[I 2025-05-25 03:57:55,551] Trial 0 finished with value: 6.050619573208713 and parameters: {'n_layers': 2, 'n_units_layer_0': 80, 'n_units_layer_1': 130, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0005232610351294011, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 6.050619573208713.
[I 2025-05-25 03:59:15,901] Trial 1 finished with value: 4.447773707332383 and parameters: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 120, 'n_units_layer_2': 70, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0005790417716624081, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 4.447773707332383.
[I 2025-05-25 04:00:46,084] Trial 2 finished with value: 4.634210330706036 and parameters: {'n_layers': 3, 'n_units_layer_0': 70, 'n_units_layer_1': 100, 'n_units_layer_2': 110, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0034317179641306333, 'learning_rate': 'constant'}. Best is trial 1 with value: 4.447773707332383.
[I 2025-05-25 04:00:48,067] Trial 3 finished 

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.000135368282242128, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (140,)}


[I 2025-05-25 04:49:11,949] A new study created in memory with name: no-name-a329969c-8cda-4251-841a-2c6b4a8c804f


MSE da rodada 9: 4.0525

Rodada 10/10


[I 2025-05-25 04:49:16,733] Trial 0 finished with value: 7.592177947601678 and parameters: {'n_layers': 1, 'n_units_layer_0': 120, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0009910550557502393, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 7.592177947601678.
[I 2025-05-25 04:50:20,009] Trial 1 finished with value: 6.546902497704214 and parameters: {'n_layers': 3, 'n_units_layer_0': 120, 'n_units_layer_1': 130, 'n_units_layer_2': 170, 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.005421942579717518, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 6.546902497704214.
[I 2025-05-25 04:51:18,797] Trial 2 finished with value: 5.0265945794939855 and parameters: {'n_layers': 3, 'n_units_layer_0': 200, 'n_units_layer_1': 60, 'n_units_layer_2': 170, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.006868271803615584, 'learning_rate': 'adaptive'}. Best is trial 2 with value: 5.0265945794939855.
[I 2025-05-25 04:52:44,057] Trial 3 finished with value: 5.292277371

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 190, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0003714649583567011, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (190,)}
MSE da rodada 10: 4.2561

Resumo após 10 execuções:
Média dos MSEs: 4.2563
Desvio padrão dos MSEs: 0.5586
Melhor MSE individual: 3.1479
Melhor modelo salvo em: Modelos\best_mlp_model_optuna.pkl
