In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Carregar os conjuntos de dados
train_data = pd.read_csv('../data/splits/train_data.csv')
val_data = pd.read_csv('../data/splits/val_data.csv')
test_data = pd.read_csv('../data/splits/test_data.csv')

# Verificar as colunas nos conjuntos de dados
print("Train data columns:", train_data.columns)
print("Validation data columns:", val_data.columns)
print("Test data columns:", test_data.columns)

# Definir a coluna de destino (target)
target_column = 'popularity'  # Substitua pelo nome correto se necessário

# Separar recursos e rótulos
X_train = train_data.drop(target_column, axis=1)
y_train = train_data[target_column]
X_val = val_data.drop(target_column, axis=1)
y_val = val_data[target_column]
X_test = test_data.drop(target_column, axis=1)
y_test = test_data[target_column]

# Identificar colunas categóricas e numéricas
categorical_columns = X_train.select_dtypes(include=['object']).columns
numerical_columns = X_train.select_dtypes(exclude=['object']).columns

# Criar transformadores para colunas categóricas e numéricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])

# Inicializar modelos
models = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier()
}

# Treinar e avaliar modelos usando um pipeline
for model_name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'{model_name} Validation Accuracy: {accuracy:.4f}')
    print(classification_report(y_val, y_pred))

# Exemplo de ajuste de hiperparâmetros para o RandomForestClassifier
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30]
}
grid_search = GridSearchCV(Pipeline(steps=[('preprocessor', preprocessor), ('model', RandomForestClassifier())]), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Melhor modelo encontrado
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_val)
print(f'Best RandomForest Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')
print(classification_report(y_val, y_pred))

# Exemplo: Aplicar regularização ao Logistic Regression
log_reg = Pipeline(steps=[('preprocessor', preprocessor), ('model', LogisticRegression(C=0.1, max_iter=1000))])
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
print(f'Logistic Regression with Regularization Validation Accuracy: {accuracy_score(y_val, y_pred):.4f}')
print(classification_report(y_val, y_pred))

# Salvar o modelo treinado
joblib.dump(best_rf_model, '../models/best_random_forest.pkl')

# Carregar o modelo salvo (exemplo)
# loaded_model = joblib.load('../models/best_random_forest.pkl')
# y_test_pred = loaded_model.predict(X_test)
# print(f'Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}')
# print(classification_report(y_test, y_test_pred))


Train data columns: Index(['artist', 'song', 'duration_ms', 'explicit', 'year', 'popularity',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'genre_Dance/Electronic', 'genre_Folk/Acoustic, pop',
       'genre_Folk/Acoustic, rock', 'genre_Folk/Acoustic, rock, pop',
       'genre_R&B', 'genre_World/Traditional, Folk/Acoustic',
       'genre_World/Traditional, hip hop', 'genre_World/Traditional, pop',
       'genre_World/Traditional, pop, Folk/Acoustic',
       'genre_World/Traditional, rock', 'genre_World/Traditional, rock, pop',
       'genre_country', 'genre_country, latin', 'genre_easy listening',
       'genre_hip hop', 'genre_hip hop, Dance/Electronic',
       'genre_hip hop, R&B', 'genre_hip hop, country',
       'genre_hip hop, latin, Dance/Electronic', 'genre_hip hop, pop',
       'genre_hip hop, pop, Dance/Electronic', 'genre_hip hop, pop, R&B',
       'genre_hip hop, po

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


GradientBoosting Validation Accuracy: 0.0875
              precision    recall  f1-score   support

           0       0.07      0.95      0.13        20
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         3
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         2
          43       0.00      0.00      0.00         4
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         2
          47       0.00      0.00      0.00         1
          48       0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.

LogisticRegression Validation Accuracy: 0.0781
              precision    recall  f1-score   support

           0       0.07      0.90      0.13        20
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         3
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         2
          43       0.00      0.00      0.00         4
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         2
          47       0.00      0.00      0.00         1
          48       0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression with Regularization Validation Accuracy: 0.0813
              precision    recall  f1-score   support

           0       0.07      0.90      0.14        20
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
          32       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         3
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         2
          43       0.00      0.00      0.00         4
          45       0.00      0.00      0.00         1
          46       0.00      0.00      0.00         2
          47       0.00      0.00      0.00         1
          48 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['../models/best_random_forest.pkl']