In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import  LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
import optuna

In [4]:
import joblib
import os

# Tratamento dos dados

In [5]:
spotify_youtube_pt1 = pd.read_csv("Dados/spotify_youtube_up_pt1.csv")
spotify_youtube_pt2 = pd.read_csv("Dados/spotify_youtube_up_pt2.csv")

In [6]:
data_extracao = pd.to_datetime("2023-02-07")
spotify_youtube_pt1["upload_date"] = pd.to_datetime(spotify_youtube_pt1["upload_date"])
spotify_youtube_pt1["dias_na_plataforma"] = (data_extracao - spotify_youtube_pt1["upload_date"]).dt.days

spotify_youtube_pt2["upload_date"] = pd.to_datetime(spotify_youtube_pt2["upload_date"])
spotify_youtube_pt2["dias_na_plataforma"] = (data_extracao - spotify_youtube_pt2["upload_date"]).dt.days

In [7]:
spotify_youtube = pd.concat([spotify_youtube_pt1,spotify_youtube_pt2])

In [8]:
spotify_youtube_dummies= pd.get_dummies(spotify_youtube,columns=['Album_type'], dtype=int)

In [9]:
spotify_youtube_dummies['artist_number'] = spotify_youtube_dummies['Artist'].str.split(',').str.len()
spotify_youtube_dummies['engagement_rate'] = spotify_youtube_dummies.apply(lambda row: (row['Likes'] + row['Comments']) / row['Views'] * 100 if row['Views'] > 0 else 0, axis=1)

In [10]:
columns_drop=['Unnamed: 0', 'Artist', 'Url_spotify', 'Track', 'Album', 'Uri','Url_youtube', 'Title', 'Channel', 'Views', 'Likes','Comments', 'Description', 'Licensed', 'official_video', 'upload_date']
spotify_youtube_df=spotify_youtube_dummies.drop(columns=columns_drop)

In [11]:
spotify_youtube_limpo=spotify_youtube_df.dropna(subset=['Danceability','Energy','Key','Loudness','Speechiness','Acousticness', 'Instrumentalness','Liveness','Valence','Tempo','Duration_ms','Stream','engagement_rate'])

# Preparação dos dados para os experimentos

In [12]:
youtube_30_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<31]
youtube_90_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<91]
youtube_365_df=spotify_youtube_limpo[spotify_youtube_limpo['dias_na_plataforma']<366]

In [13]:
spotify_youtube_df_366_more = spotify_youtube_limpo.drop(youtube_365_df.index)
spotify_youtube_df_91_365 = youtube_365_df.drop(youtube_90_df.index)
spotify_youtube_df_31_90 = youtube_365_df.drop(youtube_30_df.index)

In [14]:
youtube_30_df=youtube_30_df[youtube_30_df['engagement_rate']<9.338507725109846]
spotify_youtube_df_31_90=spotify_youtube_df_31_90[spotify_youtube_df_31_90['engagement_rate']<4.602401258613792]
spotify_youtube_df_91_365=spotify_youtube_df_91_365[spotify_youtube_df_91_365['engagement_rate']<4.253191569584401]
spotify_youtube_df_366_more=spotify_youtube_df_366_more[spotify_youtube_df_366_more['engagement_rate']<2.251193229196198]

In [15]:
youtube_30_df_x = youtube_30_df.drop('engagement_rate',axis=1)
youtube_30_df_y = youtube_30_df['engagement_rate']

spotify_youtube_df_31_90_x = spotify_youtube_df_31_90.drop('engagement_rate',axis=1)
spotify_youtube_df_31_90_y = spotify_youtube_df_31_90['engagement_rate']

spotify_youtube_df_91_365_x = spotify_youtube_df_91_365.drop('engagement_rate',axis=1)
spotify_youtube_df_91_365_y = spotify_youtube_df_91_365['engagement_rate']

spotify_youtube_df_366_more_x = spotify_youtube_df_366_more.drop('engagement_rate',axis=1)
spotify_youtube_df_366_more_y = spotify_youtube_df_366_more['engagement_rate']

In [16]:
spotify_youtube_30X_train_val, spotify_youtube_30X_test, spotify_youtube_30y_train_val, spotify_youtube_30y_test = train_test_split(youtube_30_df_x, youtube_30_df_y, test_size=0.25, random_state=42)
spotify_youtube_df_31_90X_train, spotify_youtube_df_31_90X_test, spotify_youtube_df_31_90y_train, spotify_youtube_df_31_90y_test = train_test_split(spotify_youtube_df_31_90_x, spotify_youtube_df_31_90_y, test_size=0.25, random_state=42)
spotify_youtube_df_91_365X_train, spotify_youtube_df_91_365X_test, spotify_youtube_df_91_365y_train, spotify_youtube_df_91_365y_test = train_test_split(spotify_youtube_df_91_365_x, spotify_youtube_df_91_365_y, test_size=0.25, random_state=42)
spotify_youtube_30X_train, spotify_youtube_30X_val, spotify_youtube_30y_train, spotify_youtube_30y_val = train_test_split(spotify_youtube_30X_train_val, spotify_youtube_30y_train_val, test_size=0.20, random_state=42)

In [17]:
# Dados de treino 
sy_x_train=pd.concat([spotify_youtube_30X_train,spotify_youtube_df_31_90X_train,spotify_youtube_df_91_365X_train,spotify_youtube_df_366_more_x])
sy_y_train=pd.concat([spotify_youtube_30y_train,spotify_youtube_df_31_90y_train,spotify_youtube_df_91_365y_train,spotify_youtube_df_366_more_y])

# Normalização do treino
scaler_all = MinMaxScaler()
sy_x_train_scaled = scaler_all.fit_transform(sy_x_train)

In [18]:
# Normalização do treino de 30 dias
sy_30_x_train_scaled = scaler_all.transform(spotify_youtube_30X_train)

In [19]:
# Dados para treinar o modelo com dados de 0 a 90 dias
sy_90_x_train=pd.concat([spotify_youtube_30X_train,spotify_youtube_df_31_90X_train])
sy_90_y_train=pd.concat([spotify_youtube_30y_train,spotify_youtube_df_31_90y_train])

# Normalização do treino de 90 dias
sy_90_x_train_scaled = scaler_all.transform(sy_90_x_train)

In [20]:
# Dados para treinar o modelo com dados de 0 a 365 dias
sy_365_x_train=pd.concat([spotify_youtube_30X_train,spotify_youtube_df_31_90X_train,spotify_youtube_df_91_365X_train])
sy_365_y_train=pd.concat([spotify_youtube_30y_train,spotify_youtube_df_31_90y_train,spotify_youtube_df_91_365y_train])

# Normalização do treino de 90 dias
sy_365_x_train_scaled = scaler_all.transform(sy_365_x_train)

In [21]:
# Para testar o desempenho do modelo com videos com ate 30 dias
sy_30_x_test=spotify_youtube_30X_test
sy_30_y_test=spotify_youtube_30y_test

# validação de 30 dias
sy_30_x_val=spotify_youtube_30X_val
sy_30_y_val=spotify_youtube_30y_val

# Para testar o desempenho do modelo com videos com ate 90 dias
sy_90_x_test=pd.concat([spotify_youtube_30X_test,spotify_youtube_df_31_90X_test])
sy_90_y_test=pd.concat([spotify_youtube_30y_test,spotify_youtube_df_31_90y_test])

# Para testar o desempenho do modelo com videos com ate 1 ano
sy_365_x_test=pd.concat([spotify_youtube_30X_test,spotify_youtube_df_31_90X_test,spotify_youtube_df_91_365X_test])
sy_365_y_test=pd.concat([spotify_youtube_30y_test,spotify_youtube_df_31_90y_test,spotify_youtube_df_91_365y_test])

In [22]:
# Normalizção dos dados de teste de 30 dias
sy_30_x_test_scaled = scaler_all.transform(sy_30_x_test)

# Normalizção dos dados de teste de 90 dias
sy_90_x_test_scaled = scaler_all.transform(sy_90_x_test)

# Normalizção dos dados de teste de 365 dias
sy_365_x_test_scaled = scaler_all.transform(sy_365_x_test)

# Normalização dos dados de validação de 30 dias
sy_30_x_val_scaled = scaler_all.transform(sy_30_x_val)

# Tuning dos modelos

## Random Forest

Fazendo o tuning com GridSearch

In [36]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_samples': [0.5, 0.75, 1.0],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Inicializando o modelo Random Forest
rf = RandomForestRegressor(random_state=42)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=ps,  # Passa o PredefinedSplit
    scoring='neg_mean_squared_error',
    verbose=0,
    n_jobs=1
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_rf = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
sy_30_x_test_scaled_df = pd.DataFrame(
    sy_30_x_test_scaled,
    columns=sy_x_train.columns,
    index=sy_30_x_test.index
)
predictions = best_rf.predict(sy_30_x_test_scaled_df)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_grid.pkl')
joblib.dump(best_rf, model_path)
print(f"Modelo salvo em: {model_path}")

Mean Squared Error (mse) on test data: 2.9452172749232077
Modelo salvo em: Modelos\best_random_forest_model_grid.pkl


Fazendo o tuning com Optuna

In [35]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 10, 50, step=10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10, step=2)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4, step=1)
    max_samples = trial.suggest_float("max_samples", 0.5, 1.0, step=0.05)
    max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2'])


    
    

    # Criando o modelo com os hiperparâmetros sugeridos
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_samples=max_samples,
        max_features=max_features,
        
        
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    rf.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = rf.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params
print("Melhores hiperparâmetros:", best_params)

# Treinando o modelo final com os melhores hiperparâmetros
best_rf = RandomForestRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    random_state=42
)
best_rf.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_rf.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_random_forest_model_optuna.pkl')
joblib.dump(best_rf, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-21 18:41:19,427] A new study created in memory with name: no-name-a6adede2-6e97-44a2-b830-b9359b490f1d
[I 2025-04-21 18:41:29,856] Trial 0 finished with value: 3.6594923027500004 and parameters: {'n_estimators': 350, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_samples': 0.95, 'max_features': 'log2'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:37,267] Trial 1 finished with value: 3.8796388232432424 and parameters: {'n_estimators': 350, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_samples': 0.55, 'max_features': 'sqrt'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:44,639] Trial 2 finished with value: 3.7630106068662728 and parameters: {'n_estimators': 350, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_samples': 0.9, 'max_features': 'log2'}. Best is trial 0 with value: 3.6594923027500004.
[I 2025-04-21 18:41:51,941] Trial 3 finished with value: 3.87240710807

Melhores hiperparâmetros: {'n_estimators': 50, 'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_samples': 0.95, 'max_features': 'log2'}
Mean Squared Error (MSE) on test data: 2.951254082497033
Modelo salvo em: Modelos\best_random_forest_model_optuna.pkl


## Linear Regressor

Fazendo tuning com GridSearch

In [47]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Inicializando o modelo Random Forest
lin_reg = LinearRegression() # Regressor logistico

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=lin_reg,
    param_grid={},  # Sem hiperparâmetros para ajustar
    cv=ps,  # Passa o PredefinedSplit
    scoring='neg_mean_squared_error',
    verbose=3,
    n_jobs=-1,
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_lin_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_lin_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_Linear_Regressor_model_grid.pkl')
joblib.dump(best_lin_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Mean Squared Error (mse) on test data: 6.18329638505496
Modelo salvo em: Modelos\best_Linear_Regressor_model_grid.pkl




Fazendo o Tuning com Optuna

In [48]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Inicializando o modelo Linear Regression
    lin_reg = LinearRegression()

    # Treinando o modelo nos dados de treino
    lin_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = lin_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1)  # Apenas 1 tentativa, pois não há hiperparâmetros para ajustar

# Treinando o modelo final
best_lin_reg = LinearRegression()
best_lin_reg.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_lin_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_linear_regressor_model_optuna.pkl')
joblib.dump(best_lin_reg, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-17 15:47:13,647] A new study created in memory with name: no-name-8decee07-315f-4476-8cb2-7e5be21e71fa
[I 2025-04-17 15:47:13,671] Trial 0 finished with value: 7.789384479414533 and parameters: {}. Best is trial 0 with value: 7.789384479414533.


Mean Squared Error (MSE) on test data: 6.245837740756641
Modelo salvo em: Modelos\best_linear_regressor_model_optuna.pkl


## XGBoost

GridSearch

In [29]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500,600,700,800,900,1000],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.8, 1.0],  # mantido apenas uma vez
    'max_depth': [3, 5, 7, 9, 11],
}

# Inicializando o modelo Random Forest
xgb_reg = LinearRegression() # Regressor logistico
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Métrica de avaliação
    cv=ps,  # Número de folds para validação cruzada
    verbose=1,  # Mostra o progresso do tuning
    n_jobs=1  # Usa todos os núcleos disponíveis
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo

best_xgb_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_xgb_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (mse) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_grid.pkl')
joblib.dump(best_xgb_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 5280 candidates, totalling 5280 fits
Mean Squared Error (mse) on test data: 2.795087678116888
Modelo salvo em: Modelos\best_xgb_model_grid.pkl


Optuna

In [30]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)


# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
    n_estimators = trial.suggest_int("n_estimators", 50, 1000, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10, step=1)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3, step=0.0005)
    subsample = trial.suggest_float("subsample", 0.6, 1.0, step=0.1)
    gamma = trial.suggest_float("gamma", 0, 0.5, step=0.1)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10, step=1)
    
    

    # Criando o modelo com os hiperparâmetros sugeridos
    xgb_reg = xgb.XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        objective='reg:squarederror',
        gamma=gamma,
        min_child_weight=min_child_weight,
        
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    xgb_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = xgb_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params
print("Melhores hiperparâmetros:", best_params)


# Treinando o modelo final com os melhores hiperparâmetros
best_xgb = xgb.XGBRegressor(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
    subsample=best_params["subsample"],
    gamma=best_params["gamma"],
    min_child_weight=best_params["min_child_weight"],
    objective='reg:squarederror',
    random_state=42
)
best_xgb.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_xgb.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_xgb_model_optuna.pkl')
joblib.dump(best_xgb, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-24 14:19:00,280] A new study created in memory with name: no-name-d0c9ec92-0047-421f-aae5-462d1c780571
[I 2025-04-24 14:19:01,015] Trial 0 finished with value: 2.9025841908281023 and parameters: {'n_estimators': 250, 'max_depth': 7, 'learning_rate': 0.0485, 'subsample': 0.8, 'gamma': 0.1, 'min_child_weight': 7}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:01,633] Trial 1 finished with value: 3.101864530634946 and parameters: {'n_estimators': 900, 'max_depth': 9, 'learning_rate': 0.10250000000000001, 'subsample': 0.9, 'gamma': 0.5, 'min_child_weight': 9}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:01,911] Trial 2 finished with value: 3.3768841974123456 and parameters: {'n_estimators': 50, 'max_depth': 8, 'learning_rate': 0.1, 'subsample': 0.9, 'gamma': 0.0, 'min_child_weight': 1}. Best is trial 0 with value: 2.9025841908281023.
[I 2025-04-24 14:19:02,453] Trial 3 finished with value: 3.1724126058006505 and parameters: {'n_estim

Melhores hiperparâmetros: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.127, 'subsample': 0.6, 'gamma': 0.2, 'min_child_weight': 8}
Mean Squared Error (MSE) on test data: 2.8858393687086332
Modelo salvo em: Modelos\best_xgb_model_optuna.pkl


## MLP

GridSearch

In [25]:


# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100),(50, 50, 50), (100, 50, 50), (100, 100, 50), (100, 100, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01]
}

# Inicializando o modelo MLP
mlp_reg = MLPRegressor(random_state=42, max_iter=500)

# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Configurando o GridSearchCV com PredefinedSplit
grid_search = GridSearchCV(
    estimator=mlp_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Métrica de avaliação
    cv=ps,  # Número de folds para validação cruzada
    verbose=3,  # Mostra o progresso do tuning
    n_jobs=-1  # Usa todos os núcleos disponíveis
)

# Treinando o GridSearchCV
grid_search.fit(X_full, y_full)

# Obtendo o melhor modelo
best_mlp_reg = grid_search.best_estimator_

# Avaliando o modelo nos dados de teste
predictions = best_mlp_reg.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_grid.pkl')
joblib.dump(best_mlp_reg, model_path)
print(f"Modelo salvo em: {model_path}")

Fitting 1 folds for each of 216 candidates, totalling 216 fits
Mean Squared Error (MSE) on test data: 3.3342401641444734
Modelo salvo em: Modelos\best_mlp_model_grid.pkl




Optuna

In [23]:
# Converte os arrays normalizados de volta para DataFrame
sy_x_train_scaled_df = pd.DataFrame(sy_x_train_scaled, columns=sy_x_train.columns, index=sy_x_train.index)
sy_30_x_val_scaled_df = pd.DataFrame(sy_30_x_val_scaled, columns=sy_30_x_val.columns, index=sy_30_x_val.index)

# Junta treino e validação
X_full = pd.concat([sy_x_train_scaled_df, sy_30_x_val_scaled_df])
y_full = pd.concat([sy_y_train, sy_30_y_val])

# Cria vetor de validação: -1 para treino, 0 para validação
validation_fold = np.concatenate([
    np.full(len(sy_x_train), -1),
    np.zeros(len(sy_30_x_val))
])

# Cria o PredefinedSplit
ps = PredefinedSplit(test_fold=validation_fold)

# Função objetivo para o Optuna
def objective(trial):
    # Sugerindo os hiperparâmetros
     # Escolhe número de camadas: 1, 2 ou 3
    n_layers = trial.suggest_int('n_layers', 1, 3)
    
    # Para cada camada, escolher número de neurônios
    hidden_layer_sizes = []
    for i in range(n_layers):
        num_units = trial.suggest_int(f'n_units_layer_{i}', 50, 200, step=10)
        hidden_layer_sizes.append(num_units)
    
    hidden_layer_sizes = tuple(hidden_layer_sizes)  # vira tupla, como o MLP espera
    activation = trial.suggest_categorical("activation", ["relu", "tanh"])
    solver = trial.suggest_categorical("solver", ["adam", "sgd"])
    alpha = trial.suggest_float("alpha", 0.0001, 0.01, log=True)
    learning_rate = trial.suggest_categorical("learning_rate", ["constant", "adaptive"])

    # Criando o modelo com os hiperparâmetros sugeridos
    mlp_reg = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        max_iter=1000,
        random_state=42
    )

    # Treinando o modelo nos dados de treino
    mlp_reg.fit(X_full[ps.test_fold == -1], y_full[ps.test_fold == -1])

    # Avaliando o modelo nos dados de validação
    predictions = mlp_reg.predict(X_full[ps.test_fold == 0])
    mse = mean_squared_error(y_full[ps.test_fold == 0], predictions)

    return mse  # O objetivo é minimizar o MSE

# Criando o estudo do Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)  # Número de tentativas

# Obtendo os melhores hiperparâmetros
best_params = study.best_params

# Construindo o hidden_layer_sizes a partir dos melhores parâmetros
hidden_layer_sizes = []
for i in range(best_params["n_layers"]):
    hidden_layer_sizes.append(best_params[f"n_units_layer_{i}"])
best_params["hidden_layer_sizes"] = tuple(hidden_layer_sizes)

print("Melhores hiperparâmetros:", best_params)

# Treinando o modelo final com os melhores hiperparâmetros
best_mlp = MLPRegressor(
    hidden_layer_sizes=best_params["hidden_layer_sizes"],
    activation=best_params["activation"],
    solver=best_params["solver"],
    alpha=best_params["alpha"],
    learning_rate=best_params["learning_rate"],
    max_iter=1000,
    random_state=42
)
best_mlp.fit(sy_x_train_scaled, sy_y_train)

# Avaliando o modelo nos dados de teste
predictions = best_mlp.predict(sy_30_x_test_scaled)
mse = mean_squared_error(sy_30_y_test, predictions)
print(f"Mean Squared Error (MSE) on test data: {mse}")

# Salvando o modelo na pasta 'Modelos'
os.makedirs('Modelos', exist_ok=True)
model_path = os.path.join('Modelos', 'best_mlp_model_optuna.pkl')
joblib.dump(best_mlp, model_path)
print(f"Modelo salvo em: {model_path}")

[I 2025-04-28 09:49:18,683] A new study created in memory with name: no-name-593c19ba-4856-4ffc-84a5-7ada84bd71bc
[I 2025-04-28 09:49:53,402] Trial 0 finished with value: 4.253595743081349 and parameters: {'n_layers': 1, 'n_units_layer_0': 70, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.006071323986349375, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.253595743081349.
[I 2025-04-28 09:54:09,362] Trial 1 finished with value: 4.783693931466111 and parameters: {'n_layers': 2, 'n_units_layer_0': 180, 'n_units_layer_1': 190, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00045592803267607867, 'learning_rate': 'constant'}. Best is trial 0 with value: 4.253595743081349.
[I 2025-04-28 09:54:51,381] Trial 2 finished with value: 4.579312869609497 and parameters: {'n_layers': 3, 'n_units_layer_0': 110, 'n_units_layer_1': 80, 'n_units_layer_2': 100, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.002598256618167818, 'learning_rate': 'constant'}. Best is trial 0 with val

Melhores hiperparâmetros: {'n_layers': 1, 'n_units_layer_0': 140, 'activation': 'relu', 'solver': 'adam', 'alpha': 0.00020915691387984604, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (140,)}
Mean Squared Error (MSE) on test data: 3.092867299798431
Modelo salvo em: Modelos\best_mlp_model_optuna.pkl
