# Projeto Final - Fundamentos de Machine Learning
## Modelos de Regressão 

# 0.0. Imports

In [8]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import ParameterGrid
from sklearn import metrics as mt

import warnings
warnings.filterwarnings('ignore')

## 0.1. Funções

In [9]:
def get_metrics(y, y_pred):
    scores = pd.DataFrame({
        'r2': [mt.r2_score(y, y_pred)],
        'MSE': [mt.mean_squared_error(y, y_pred)], 
        'RMSE' : [mt.root_mean_squared_error(y, y_pred)],
        'MAE': [mt.mean_absolute_error(y, y_pred)], 
        'MAPE': [mt.mean_absolute_percentage_error(y, y_pred)]
    })
    return scores

def get_score_table(scores_train, scores_val, scores_test):
    score_comp = pd.concat([scores_train, scores_val, scores_test])
    score_comp.index = ['train', 'validation', 'test']
    return score_comp

def search_best_params(x_train, y_train, x_val, y_val, algorithm, parameter_grid):
    # DataFrame for storing metrics for each parameter set
    best_mae = 5000
    best_params = {}
    # Iterate over each set of parameters in the grid
    for params in parameter_grid:
        # Create and fit the model with the current set of parameters
        model = algorithm(**params)
        model.fit(x_train, y_train.values.ravel())
        # Predict using the validation set
        y_pred_val = model.predict(x_val)
        # Get metrics and convert to DataFrame
        current_metrics = get_metrics(y_val, y_pred_val)
        current_mae = current_metrics['MAE'].iloc[0]
        if current_mae < best_mae:
            best_mae = current_mae
            best_params = params
    
    return best_mae, best_params

def modeling(x_train, y_train, x_val, y_val, x_test, y_test, algorithm, parameters):
    model = algorithm(**parameters).fit(x_train, y_train.values.ravel())
    y_pred_train = model.predict(x_train)
    y_pred_val = model.predict(x_val)
    y_pred_test = model.predict(x_test)

    scores_train = get_metrics(y_train, y_pred_train)
    scores_val = get_metrics(y_val, y_pred_val)
    scores_test = get_metrics(y_test, y_pred_test)
    scores = get_score_table(scores_train, scores_val, scores_test)

    return scores

# 1.0. Loading Data

In [10]:
x_train = pd.read_csv('data/regressao/X_training.csv')
y_train = pd.read_csv('data/regressao/y_training.csv')

x_val = pd.read_csv('data/regressao/X_validation.csv')
y_val = pd.read_csv('data/regressao/y_val.csv')

x_test = pd.read_csv('data/regressao/X_test.csv')
y_test = pd.read_csv('data/regressao/y_test.csv')

In [11]:
display(x_train.head(3))
display(y_train.head(3))
display(x_test.head(3))
display(y_test.head(3))
display(x_val.head(3))
display(y_val.head(3))

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0.205673,0.0921,0.72,0.802,0.0,0.090909,0.694,0.431778,1,0.0582,0.103876,0.8,0.723
1,-0.240409,0.737,0.483,0.412,0.0,0.636364,0.116,-0.262732,1,0.0402,1.711532,0.6,0.247
2,-0.12577,0.274,0.671,0.565,6.5e-05,1.0,0.37,0.013612,0,0.16,1.009176,0.8,0.561


Unnamed: 0,song_popularity
0,79.0
1,86.0
2,63.0


Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,-1.662539,0.073,0.754,0.342,0.0,0.909091,0.193,-0.319043,0,0.0991,0.276006,0.8,0.423
1,-0.054995,0.191,0.687,0.792,0.0,0.454545,0.167,1.226398,1,0.0452,-0.733694,0.8,0.671
2,0.564739,0.318,0.63,0.478,6.5e-05,1.0,0.0942,-0.817506,1,0.0275,-0.995967,0.8,0.279


Unnamed: 0,song_popularity
0,87.0
1,100.0
2,37.0


Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,0.143252,0.0259,0.616,0.933,0.0,0.0,0.359,0.962307,0,0.0513,0.309853,0.8,0.806
1,0.363603,0.000188,0.49,0.972,0.0299,0.909091,0.368,0.765216,0,0.111,-0.908089,0.8,0.376
2,-1.409083,0.694,0.876,0.167,0.912,1.0,0.369,-1.733352,1,0.0885,-0.975052,0.8,0.845


Unnamed: 0,song_popularity
0,60.0
1,48.0
2,43.0


# 2.0. Ensaios

## 2.1. Linear Regression

In [12]:
lr_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, LinearRegression, {})
lr_scores['model_name'] = 'Linear Regression'
lr_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.046058,455.996112,21.354065,16.998249,8.653186,Linear Regression
validation,0.039925,458.447042,21.411376,17.039754,8.682542,Linear Regression
test,0.052317,461.427719,21.480869,17.129965,8.521859,Linear Regression


## 2.2. Decision Tree Regression

In [13]:
dt_param = ParameterGrid({
    'max_depth' : np.arange(1,50,1)
})
best_score_dt = search_best_params(x_train, y_train, x_val, y_val, DecisionTreeRegressor, dt_param)

In [14]:
best_mae_tree = best_score_dt[0]
best_depth_tree = list(best_score_dt[1].values())[0]
print('Melhor MAE é: {} com melhor max_depth = {}'.format(best_mae_tree, best_depth_tree))

Melhor MAE é: 16.72056151506039 com melhor max_depth = 7


In [15]:
best_params_dt = {'max_depth' : best_depth_tree}
dt_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, DecisionTreeRegressor, best_params_dt)
dt_scores['model_name'] = 'Decision Tree'
dt_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.191565,386.442081,19.65813,15.505405,6.535123,Decision Tree
validation,0.058818,449.425251,21.199652,16.724101,7.997339,Decision Tree
test,0.077381,449.224004,21.194905,16.856217,7.453107,Decision Tree


## 2.3. Random Forest Regressor

In [16]:
rf_param = ParameterGrid({
    'n_estimators' : [1,5,10],
    'max_depth' : np.arange(1,20,2)
})
best_score_rf = search_best_params(x_train, y_train, x_val, y_val, RandomForestRegressor, rf_param)
best_score_rf

(13.995592372236672, {'max_depth': 19, 'n_estimators': 10})

In [17]:
best_accuracy_rf = best_score_rf[0]
best_depth_rf = list(best_score_rf[1].values())[0]
best_estimators_rf = list(best_score_rf[1].values())[1]
print('Melhor acurácia é: {} com melhor max_depth = {} e melhor estimador = {}'.format(best_accuracy_rf, best_depth_rf, best_estimators_rf))

Melhor acurácia é: 13.995592372236672 com melhor max_depth = 19 e melhor estimador = 10


In [18]:
best_params_rf = {'max_depth' : best_depth_rf,
                  'n_estimators' : best_estimators_rf}
rf_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, RandomForestRegressor, best_params_rf)
rf_scores['model_name'] = 'Random Forest'
rf_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.844462,74.349328,8.622606,6.159131,2.686078,Random Forest
validation,0.247422,359.364904,18.956922,14.030857,7.210582,Random Forest
test,0.286721,347.296028,18.63588,13.877213,6.530233,Random Forest


## 2.4. Polynomial Regression

In [19]:
best_degree = 0
best_score = 5000
degrees = range(2,5,1)
for degree in degrees:
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    x_poly_train = poly.fit_transform(x_train)
    x_poly_val = poly.fit_transform(x_val)
    model = LinearRegression().fit(x_poly_train, y_train)
    y_pred_val = model.predict(x_poly_val)
    scores = get_metrics(y_val, y_pred_val)
    current_mae = scores['MAE'].iloc[0]
    if current_mae < best_score:
        best_score = current_mae
        best_degree = degree
print('Menor MAE: ',best_score)
print('Melhor degree: ',best_degree)

Menor MAE:  16.749939090484027
Melhor degree:  2


In [20]:
poly = PolynomialFeatures(degree=best_degree, include_bias=False)
x_poly_train = poly.fit_transform(x_train)
x_poly_val = poly.fit_transform(x_val)
x_poly_test = poly.fit_transform(x_test)
pr_scores = modeling(x_poly_train, y_train, x_poly_val, y_val, x_poly_test, y_test, LinearRegression, {})
pr_scores['model_name'] = 'Polynomial Regression'
pr_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.094195,432.98621,20.808321,16.458032,8.35054,Polynomial Regression
validation,0.066477,445.768223,21.113224,16.749939,8.547931,Polynomial Regression
test,0.090079,443.041256,21.048545,16.720535,8.242464,Polynomial Regression


## 2.5. Linear Regression Lasso

In [21]:
lrl_param = ParameterGrid({
    'alpha' : [0.01, 0.1, 1, 10],
    'max_iter' : [1000, 2000]
})
best_score_lrl = search_best_params(x_train, y_train, x_val, y_val, Lasso, lrl_param)
best_score_lrl

(17.03824289549502, {'alpha': 0.01, 'max_iter': 1000})

In [22]:
best_mae_lrl = best_score_lrl[0]
best_alpha_lrl = list(best_score_lrl[1].values())[0]
best_max_iter_lrl = list(best_score_lrl[1].values())[1]
print('Melhor MAE é: {} com melhor alpha = {} e max_iter = {}'.format(best_mae_lrl, best_alpha_lrl, best_max_iter_lrl))

Melhor MAE é: 17.03824289549502 com melhor alpha = 0.01 e max_iter = 1000


In [23]:
best_params_lrl = {'alpha' : best_alpha_lrl,
                  'max_iter' : best_max_iter_lrl}
lrl_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, Lasso, best_params_lrl)
lrl_scores['model_name'] = 'Linear Regression Lasso'
lrl_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.04593,456.057415,21.355501,17.002115,8.66067,Linear Regression Lasso
validation,0.039928,458.445407,21.411338,17.038243,8.686215,Linear Regression Lasso
test,0.051981,461.591607,21.484683,17.130186,8.539474,Linear Regression Lasso


## 2.6. Linear Regression Ridge

In [24]:
lrr_param = ParameterGrid({
    'alpha' : [0.01, 0.1, 1, 10, 100, 1000],
    'max_iter' : [500, 1000, 2000]
})
best_score_lrr = search_best_params(x_train, y_train, x_val, y_val, Ridge, lrr_param)

In [25]:
best_mae_lrr = best_score_lrr[0]
best_alpha_lrr = list(best_score_lrr[1].values())[0]
best_max_iter_lrr = list(best_score_lrr[1].values())[1]
print('Melhor MAE é: {} com melhor alpha = {} e max_iter = {}'.format(best_mae_lrr, best_alpha_lrr, best_max_iter_lrr))

Melhor MAE é: 17.034552675630103 com melhor alpha = 100 e max_iter = 500


In [26]:
best_params_lrr = {'alpha' : best_alpha_lrr,
                  'max_iter' : best_max_iter_lrr}
lrr_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, Ridge, best_params_lrr)
lrr_scores['model_name'] = 'Linear Regression Ridge'
lrr_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.043985,456.987024,21.377255,17.016602,8.664151,Linear Regression Ridge
validation,0.038881,458.945555,21.423015,17.034553,8.674899,Linear Regression Ridge
test,0.048924,463.079905,21.519291,17.141974,8.577784,Linear Regression Ridge


## 2.7. Linear Regression Elastic Net

In [27]:
lren_param = ParameterGrid({
    'alpha' : [0.01, 0.1, 0.5, 1, 10, 100],
    'max_iter' : [500, 1000],
    'l1_ratio' : [0.0, 0.1, 0.5, 0.7, 1.0]
})
best_score_lren = search_best_params(x_train, y_train, x_val, y_val, ElasticNet, lren_param)
best_score_lren

(17.034077957880456, {'alpha': 0.01, 'l1_ratio': 0.5, 'max_iter': 500})

In [28]:
best_mae_lren = best_score_lren[0]
best_alpha_lren = list(best_score_lren[1].values())[0]
best_l1_ratio_lren = list(best_score_lren[1].values())[1]
best_max_iter_lren = list(best_score_lren[1].values())[2]
print('Melhor MAE é: {} com melhor alpha = {} , max_iter = {} e l1_ratio = {}'.format(best_mae_lren, best_alpha_lren, best_max_iter_lren, best_l1_ratio_lren))

Melhor MAE é: 17.034077957880456 com melhor alpha = 0.01 , max_iter = 500 e l1_ratio = 0.5


In [29]:
best_params_lren = {'alpha' : best_alpha_lren,
                  'max_iter' : best_max_iter_lren,
                   'l1_ratio': best_l1_ratio_lren}
lren_scores = modeling(x_train, y_train, x_val, y_val, x_test, y_test, ElasticNet, best_params_lren)
lren_scores['model_name'] = 'Linear Regression Elastic Net'
lren_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.045055,456.475703,21.365292,17.008724,8.663312,Linear Regression Elastic Net
validation,0.039538,458.631708,21.415688,17.034078,8.679104,Linear Regression Elastic Net
test,0.050521,462.302115,21.501212,17.133874,8.563382,Linear Regression Elastic Net


## 2.8. Polynomial Regression Lasso

In [30]:
prl_param = ParameterGrid({
    'alpha' : [0.01, 0.1, 1, 10],
    'max_iter' : [1000, 2000]
})
best_score_prl = search_best_params(x_poly_train, y_train, x_poly_val, y_val, Lasso, prl_param)
best_score_prl

(16.73238601818895, {'alpha': 0.01, 'max_iter': 1000})

In [31]:
best_mae_prl = best_score_prl[0]
best_alpha_prl = list(best_score_prl[1].values())[0]
best_max_iter_prl = list(best_score_prl[1].values())[1]
print('Melhor MAE é: {} com melhor alpha = {} e max_iter = {}'.format(best_mae_prl, best_alpha_prl, best_max_iter_prl))

Melhor MAE é: 16.73238601818895 com melhor alpha = 0.01 e max_iter = 1000


In [32]:
best_params_prl = {'alpha' : best_alpha_prl,
                  'max_iter' : best_max_iter_prl}
prl_scores = modeling(x_poly_train, y_train, x_poly_val, y_val, x_poly_test, y_test, Lasso, best_params_prl)
prl_scores['model_name'] = 'Polynomial Regression Lasso'
prl_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.086817,436.512957,20.892893,16.540954,8.432707,Polynomial Regression Lasso
validation,0.068473,444.814973,21.090637,16.732386,8.591033,Polynomial Regression Lasso
test,0.085899,445.07689,21.096846,16.760885,8.322458,Polynomial Regression Lasso


## 2.9. Polynomial Regression Ridge

In [33]:
prr_param = ParameterGrid({
    'alpha' : [0.01, 0.1, 1, 10, 100, 1000],
    'max_iter' : [500, 1000, 2000]
})
best_score_prr = search_best_params(x_poly_train, y_train, x_poly_val, y_val, Ridge, prr_param)

In [34]:
best_mae_prr = best_score_prr[0]
best_alpha_prr = list(best_score_prr[1].values())[0]
best_max_iter_prr = list(best_score_prr[1].values())[1]
print('Melhor MAE é: {} com melhor alpha = {} e max_iter = {}'.format(best_mae_prr, best_alpha_prr, best_max_iter_prr))

Melhor MAE é: 16.738740611690595 com melhor alpha = 1 e max_iter = 500


In [35]:
best_params_prr = {'alpha' : best_alpha_prr,
                  'max_iter' : best_max_iter_prr}
prr_scores = modeling(x_poly_train, y_train, x_poly_val, y_val, x_poly_test, y_test, Ridge, best_params_prr)
prr_scores['model_name'] = 'Polynomial Regression Ridge'
prr_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.093171,433.475457,20.820073,16.471972,8.372689,Polynomial Regression Ridge
validation,0.067699,445.18441,21.099394,16.738741,8.568992,Polynomial Regression Ridge
test,0.089167,443.4853,21.059091,16.728879,8.288682,Polynomial Regression Ridge


## 2.10. Polynomial Regression Elastic Net

In [36]:
pren_param = ParameterGrid({
    'alpha' : [0.01, 0.1, 0.5, 1, 10, 100],
    'max_iter' : [500, 1000],
    'l1_ratio' : [0.0, 0.1, 0.5, 0.7, 1.0]
})
best_score_pren = search_best_params(x_poly_train, y_train, x_poly_val, y_val, ElasticNet, pren_param)
best_score_pren

(16.73238601818895, {'alpha': 0.01, 'l1_ratio': 1.0, 'max_iter': 500})

In [37]:
best_mae_pren = best_score_pren[0]
best_alpha_pren = list(best_score_pren[1].values())[0]
best_l1_ratio_pren = list(best_score_pren[1].values())[1]
best_max_iter_pren = list(best_score_pren[1].values())[2]
print('Melhor MAE é: {} com melhor alpha = {} , max_iter = {} e l1_ratio = {}'.format(best_mae_pren, best_alpha_pren, best_max_iter_pren, best_l1_ratio_pren))

Melhor MAE é: 16.73238601818895 com melhor alpha = 0.01 , max_iter = 500 e l1_ratio = 1.0


In [38]:
best_params_pren = {'alpha' : best_alpha_pren,
                  'max_iter' : best_max_iter_pren,
                   'l1_ratio': best_l1_ratio_pren}
pren_scores = modeling(x_poly_train, y_train, x_poly_val, y_val, x_poly_test, y_test, ElasticNet, best_params_pren)
pren_scores['model_name'] = 'Polynomial Regression Elastic Net'
pren_scores

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.086817,436.512957,20.892893,16.540954,8.432707,Polynomial Regression Elastic Net
validation,0.068473,444.814973,21.090637,16.732386,8.591033,Polynomial Regression Elastic Net
test,0.085899,445.07689,21.096846,16.760885,8.322458,Polynomial Regression Elastic Net


# 3.0. Comparando os modelos

In [39]:
scores = pd.concat([lr_scores, dt_scores, rf_scores, pr_scores, lrl_scores, lrr_scores, lren_scores, prl_scores, prr_scores, pren_scores])

## 3.1. Treinamento

In [40]:
scores.loc['train']

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
train,0.046058,455.996112,21.354065,16.998249,8.653186,Linear Regression
train,0.191565,386.442081,19.65813,15.505405,6.535123,Decision Tree
train,0.844462,74.349328,8.622606,6.159131,2.686078,Random Forest
train,0.094195,432.98621,20.808321,16.458032,8.35054,Polynomial Regression
train,0.04593,456.057415,21.355501,17.002115,8.66067,Linear Regression Lasso
train,0.043985,456.987024,21.377255,17.016602,8.664151,Linear Regression Ridge
train,0.045055,456.475703,21.365292,17.008724,8.663312,Linear Regression Elastic Net
train,0.086817,436.512957,20.892893,16.540954,8.432707,Polynomial Regression Lasso
train,0.093171,433.475457,20.820073,16.471972,8.372689,Polynomial Regression Ridge
train,0.086817,436.512957,20.892893,16.540954,8.432707,Polynomial Regression Elastic Net


## 3.2. Validação

In [41]:
scores.loc['validation']

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
validation,0.039925,458.447042,21.411376,17.039754,8.682542,Linear Regression
validation,0.058818,449.425251,21.199652,16.724101,7.997339,Decision Tree
validation,0.247422,359.364904,18.956922,14.030857,7.210582,Random Forest
validation,0.066477,445.768223,21.113224,16.749939,8.547931,Polynomial Regression
validation,0.039928,458.445407,21.411338,17.038243,8.686215,Linear Regression Lasso
validation,0.038881,458.945555,21.423015,17.034553,8.674899,Linear Regression Ridge
validation,0.039538,458.631708,21.415688,17.034078,8.679104,Linear Regression Elastic Net
validation,0.068473,444.814973,21.090637,16.732386,8.591033,Polynomial Regression Lasso
validation,0.067699,445.18441,21.099394,16.738741,8.568992,Polynomial Regression Ridge
validation,0.068473,444.814973,21.090637,16.732386,8.591033,Polynomial Regression Elastic Net


## 3.3. Teste

In [42]:
scores.loc['test']

Unnamed: 0,r2,MSE,RMSE,MAE,MAPE,model_name
test,0.052317,461.427719,21.480869,17.129965,8.521859,Linear Regression
test,0.077381,449.224004,21.194905,16.856217,7.453107,Decision Tree
test,0.286721,347.296028,18.63588,13.877213,6.530233,Random Forest
test,0.090079,443.041256,21.048545,16.720535,8.242464,Polynomial Regression
test,0.051981,461.591607,21.484683,17.130186,8.539474,Linear Regression Lasso
test,0.048924,463.079905,21.519291,17.141974,8.577784,Linear Regression Ridge
test,0.050521,462.302115,21.501212,17.133874,8.563382,Linear Regression Elastic Net
test,0.085899,445.07689,21.096846,16.760885,8.322458,Polynomial Regression Lasso
test,0.089167,443.4853,21.059091,16.728879,8.288682,Polynomial Regression Ridge
test,0.085899,445.07689,21.096846,16.760885,8.322458,Polynomial Regression Elastic Net
