<a href="https://colab.research.google.com/github/MaxVieiraSantiago/Machine-Learning-Algorithms/blob/master/Linear%20Regression/Regress%C3%A3o_RIDGE%2C_LASSO%2C_ELASTIC_NET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Regressão RIDGE, LASSO e ELASTIC NET**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Absolute path of the gasto_almoco.xlsx spreadsheet under /content/drive.
# NOTE(review): no Drive-mount step is visible in this notebook; presumably the
# drive has to be mounted before this cell can run — confirm.
file = '/content/drive/MyDrive/Udemy/gasto_almoco.xlsx'
# Read the Excel sheet into the DataFrame used by the analysis cells below.
dados = pd.read_excel(file)

## **Análise Inicial**

In [3]:
dados.head()

Unnamed: 0,dia,gasto_dia_reais,num_refeicoes,gasto_acum_reais
0,1,26,1,26
1,2,18,2,44
2,3,37,3,81
3,4,21,4,102
4,5,39,5,141


**Objetivo: Criação de um modelo de regressão para variáveis altamente correlacionadas (multicolinearidade)**

In [4]:
dados.shape

(64, 4)

### **Valores Missing**

In [5]:
dados.isnull().sum()

dia                 0
gasto_dia_reais     0
num_refeicoes       0
gasto_acum_reais    0
dtype: int64

In [6]:
dados.dtypes

dia                 int64
gasto_dia_reais     int64
num_refeicoes       int64
gasto_acum_reais    int64
dtype: object

### **Ausência de Multicolinearidade**

Somente entre as variáveis independentes.\
Considera-se que há multicolinearidade quando |r| > 0.9.

In [7]:
# Pairwise Spearman (rank) correlation between all columns of `dados`.
# NOTE(review): the matrix covers every column, including gasto_acum_reais,
# which later cells use as the regression target.
correlacoes = dados.corr(method='spearman')
correlacoes

Unnamed: 0,dia,gasto_dia_reais,num_refeicoes,gasto_acum_reais
dia,1.0,0.140909,0.999989,0.999989
gasto_dia_reais,0.140909,1.0,0.142091,0.142091
num_refeicoes,0.999989,0.142091,1.0,1.0
gasto_acum_reais,0.999989,0.142091,1.0,1.0


**dia x num_refeicoes** = 0.999989 -> *problema de multicolinearidade*

Usando Fator de Inflação de Variância - VIF (tem que ser **menor** que 5)

In [8]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [9]:
# Matrix used for the VIF check: every column of `dados` plus the constant
# column inserted by add_constant.
# NOTE(review): this still contains gasto_acum_reais, which later cells use as
# the regression target — confirm it should take part in the VIF computation.
vif = add_constant(dados)
# One variance inflation factor per column, labelled with the column name.
pd.Series(
    {coluna: variance_inflation_factor(vif.values, posicao)
     for posicao, coluna in enumerate(vif.columns)}
)

const                 11.762488
dia                  836.171143
gasto_dia_reais        1.402142
num_refeicoes       2777.880784
gasto_acum_reais    3306.046633
dtype: float64

## **Modelo RIDGE**

In [10]:
# Working frame without the gasto_dia_reais column. drop() is called without
# inplace, so `dados` itself is left untouched.
gasto = dados.drop(columns=['gasto_dia_reais'])
gasto.head()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais
0,1,1,26
1,2,2,44
2,3,3,81
3,4,4,102
4,5,5,141


In [11]:
# Predictors (dia, num_refeicoes) and target (gasto_acum_reais) as NumPy
# arrays, picked by column label rather than by position.
X = gasto[['dia', 'num_refeicoes']].values
y = gasto['gasto_acum_reais'].values

In [12]:
from sklearn.linear_model import Ridge

In [13]:
# The regularization strength (alpha) should be tuned for the data.
# NOTE(review): alpha = 0 applies no penalty; the scikit-learn Ridge docs
# describe that case as equivalent to ordinary least squares — confirm that
# this is the intended setting for a Ridge demonstration.
modelo_ridge = Ridge(alpha=0).fit(X, y)
# Coefficient of determination (R^2) measured on the training data itself.
modelo_ridge.score(X, y)

0.9996807593215069

In [14]:
modelo_ridge.intercept_

2.6345750647848263

In [15]:
modelo_ridge.coef_

array([ 4.89145178, 25.63954273])

**Equação: gasto_acum = 2.6346 + (4.8915 * dia) + (25.6395 * num_refeicoes)**

In [16]:
# In-sample predictions: the model is applied to the same X it was fitted on.
previsao_ridge = modelo_ridge.predict(X)
# Display the full prediction array.
previsao_ridge

array([  33.16556957,   63.69656407,   94.22755858,  124.75855308,
        155.28954758,  160.18099936,  190.71199387,  221.24298837,
        251.77398287,  282.30497738,  312.83597188,  343.36696638,
        373.89796089,  404.42895539,  460.59949262,  491.13048712,
        521.66148163,  552.19247613,  582.72347063,  613.25446514,
        643.78545964,  674.31645415,  704.84744865,  735.37844315,
        765.90943766,  796.44043216,  826.97142666,  857.50242117,
        888.03341567,  918.56441018,  949.09540468,  979.62639918,
       1010.15739369, 1040.68838819, 1071.21938269, 1101.7503772 ,
       1132.2813717 , 1162.81236621, 1193.34336071, 1223.87435521,
       1254.40534972, 1284.93634422, 1315.46733872, 1345.99833323,
       1376.52932773, 1407.06032223, 1437.59131674, 1468.12231124,
       1498.65330575, 1529.18430025, 1559.71529475, 1590.24628926,
       1620.77728376, 1651.30827826, 1681.83927277, 1712.37026727,
       1742.90126178, 1773.43225628, 1803.96325078, 1834.49424

In [17]:
# Store the Ridge predictions already held in `previsao_ridge` instead of
# calling predict(X) a second time; the values are the same and this matches
# how the ElasticNet predictions are stored further down.
gasto['previsao_ridge'] = previsao_ridge

In [18]:
gasto.head()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge
0,1,1,26,33.16557
1,2,2,44,63.696564
2,3,3,81,94.227559
3,4,4,102,124.758553
4,5,5,141,155.289548


In [19]:
gasto.describe()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge
count,64.0,64.0,64.0,64.0
mean,32.5,32.546875,996.09375,996.09375
std,18.618987,19.121109,581.353012,581.260209
min,1.0,1.0,26.0,33.16557
25%,16.75,16.75,518.75,514.028733
50%,32.5,32.5,1006.0,994.891896
75%,48.25,48.25,1471.75,1475.75506
max,64.0,68.0,2068.0,2059.176394


In [20]:
# Absolute residual of the Ridge model for each row.
gasto['erro_ridge_abs'] = (gasto['gasto_acum_reais'] - gasto['previsao_ridge']).abs()

In [21]:
gasto.head()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge,erro_ridge_abs
0,1,1,26,33.16557,7.16557
1,2,2,44,63.696564,19.696564
2,3,3,81,94.227559,13.227559
3,4,4,102,124.758553,22.758553
4,5,5,141,155.289548,14.289548


In [22]:
gasto.describe()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge,erro_ridge_abs
count,64.0,64.0,64.0,64.0,64.0
mean,32.5,32.546875,996.09375,996.09375,8.807401
std,18.618987,19.121109,581.353012,581.260209,5.393756
min,1.0,1.0,26.0,33.16557,0.40535
25%,16.75,16.75,518.75,514.028733,4.800202
50%,32.5,32.5,1006.0,994.891896,8.42047
75%,48.25,48.25,1471.75,1475.75506,12.139439
max,64.0,68.0,2068.0,2059.176394,22.758553


In [23]:
# Mean absolute error (MAE) of the Ridge predictions on the training data.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=y, y_pred=previsao_ridge)

8.807401102211182

In [24]:
# Mean squared error (MSE) of the Ridge predictions on the training data.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y, y_pred=previsao_ridge)

106.20834811864499

In [26]:
# Training-set R-squared of the Ridge model, printed as a percentage with two decimals.
print(f'R-squared: {modelo_ridge.score(X, y) * 100:.2f}%')

R-squared: 99.97%


## **Modelo LASSO**

In [157]:
# Same predictors and target as in the Ridge section, selected by column label
# so the extra columns added to `gasto` since then cannot shift the selection.
X = gasto[['dia', 'num_refeicoes']].values
y = gasto['gasto_acum_reais'].values

In [158]:
from sklearn.linear_model import Lasso

In [159]:
# The regularization strength (alpha) should be tuned for the data.
# NOTE(review): alpha = 28 is hard-coded with no search shown here — confirm
# how it was chosen.
modelo_lasso = Lasso(alpha=28).fit(X, y)
# Coefficient of determination (R^2) measured on the training data itself.
modelo_lasso.score(X, y)

0.9996550235468806

In [160]:
modelo_lasso.intercept_

2.4884777044605926

In [161]:
modelo_lasso.coef_

array([ 8.09213073, 22.44796231])

**Equação: gasto_acum = 2.4885 + (8.0921 * dia) + (22.4480 * num_refeicoes)**

In [162]:
previsao_lasso = modelo_lasso.predict(X)

In [163]:
# Store the Lasso predictions already held in `previsao_lasso` instead of
# calling predict(X) a second time; the values are the same and this matches
# how the ElasticNet predictions are stored further down.
gasto['previsao_lasso'] = previsao_lasso

In [164]:
gasto.head()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge,erro_ridge_abs,previsao_lasso
0,1,1,26,33.16557,7.16557,33.028571
1,2,2,44,63.696564,19.696564,63.568664
2,3,3,81,94.227559,13.227559,94.108757
3,4,4,102,124.758553,22.758553,124.64885
4,5,5,141,155.289548,14.289548,155.188943


In [165]:
# Absolute residual of the Lasso model for each row.
gasto['erro_lasso_abs'] = (gasto['gasto_acum_reais'] - gasto['previsao_lasso']).abs()

In [166]:
gasto.head()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge,erro_ridge_abs,previsao_lasso,erro_lasso_abs
0,1,1,26,33.16557,7.16557,33.028571,7.028571
1,2,2,44,63.696564,19.696564,63.568664,19.568664
2,3,3,81,94.227559,13.227559,94.108757,13.108757
3,4,4,102,124.758553,22.758553,124.64885,22.64885
4,5,5,141,155.289548,14.289548,155.188943,14.188943


In [167]:
gasto.describe()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge,erro_ridge_abs,previsao_lasso,erro_lasso_abs
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,32.5,32.546875,996.09375,996.09375,8.807401,996.09375,8.983458
std,18.618987,19.121109,581.353012,581.260209,5.393756,579.795335,5.882911
min,1.0,1.0,26.0,33.16557,0.40535,33.028571,0.233904
25%,16.75,16.75,518.75,514.028733,4.800202,514.035036,4.666379
50%,32.5,32.5,1006.0,994.891896,8.42047,995.041502,7.9078
75%,48.25,48.25,1471.75,1475.75506,12.139439,1476.047967,12.408963
max,64.0,68.0,2068.0,2059.176394,22.758553,2046.846282,22.64885


In [168]:
# Mean absolute error (MAE) of the Lasso predictions on the training data.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=y, y_pred=previsao_lasso)

8.983457670916096

In [169]:
# Mean squared error (MSE) of the Lasso predictions on the training data.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y, y_pred=previsao_lasso)

114.7703964250214

## **Modelo ELASTIC NET**

In [170]:
# Same predictors and target as in the previous sections, selected by column
# label so the extra columns added to `gasto` cannot shift the selection.
X = gasto[['dia', 'num_refeicoes']].values
y = gasto['gasto_acum_reais'].values

In [171]:
from sklearn.linear_model import ElasticNet

In [172]:
# Elastic Net with an even L1/L2 mix (l1_ratio = 0.5) and a very small penalty.
# NOTE(review): the recorded output of this cell contains a fragment pointing at
# cd_fast.enet_coordinate_descent, which looks like a solver warning
# (presumably non-convergence) — consider a larger max_iter or scaled features
# and confirm the reported coefficients.
modelo_elastic = ElasticNet(alpha=0.001, l1_ratio=0.5).fit(X, y)
# Coefficient of determination (R^2) measured on the training data itself.
modelo_elastic.score(X, y)

  model = cd_fast.enet_coordinate_descent(


0.9996471069279312

In [173]:
modelo_elastic.intercept_

-0.9111899003638655

In [174]:
modelo_elastic.coef_

array([ 9.13016866, 21.51587391])

**Equação: gasto_acum = -0.9112 + (9.1302 * dia) + (21.5159 * num_refeicoes)**

In [175]:
previsao_elastic = modelo_elastic.predict(X)

In [176]:
gasto['previsao_elastic'] = previsao_elastic

In [177]:
# Absolute residual of the Elastic Net model for each row.
gasto['erro_elastic_abs'] = np.abs(y - previsao_elastic)

In [178]:
gasto.head()

Unnamed: 0,dia,num_refeicoes,gasto_acum_reais,previsao_ridge,erro_ridge_abs,previsao_lasso,erro_lasso_abs,previsao_elastic,erro_elastic_abs
0,1,1,26,33.16557,7.16557,33.028571,7.028571,29.734853,3.734853
1,2,2,44,63.696564,19.696564,63.568664,19.568664,60.380895,16.380895
2,3,3,81,94.227559,13.227559,94.108757,13.108757,91.026938,10.026938
3,4,4,102,124.758553,22.758553,124.64885,22.64885,121.67298,19.67298
4,5,5,141,155.289548,14.289548,155.188943,14.188943,152.319023,11.319023


In [179]:
# Mean absolute error (MAE) of the Elastic Net predictions on the training data.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=y, y_pred=previsao_elastic)

9.242296852806257

In [180]:
# Mean squared error (MSE) of the Elastic Net predictions on the training data.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y, y_pred=previsao_elastic)

117.40418051941032