# Outros tipos de Regressão

$\quad$ Esse tipo de regressão é utilizado para dados que não são linearmente separáveis. Normalmente podemos aplicar nesses caso a regressão polinomial para resolver esses problemas.

## Regressão Polinomial

### Base Plano de Saúde

In [60]:
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [61]:
base_plano_saude2 = pd.read_csv('plano_saude2.csv')
base_plano_saude2

Unnamed: 0,idade,custo
0,18,470
1,23,520
2,28,630
3,33,830
4,38,1150
5,43,1530
6,48,2040
7,53,3080
8,58,5100
9,63,10100


In [62]:
x_plano_saude2 = base_plano_saude2.iloc[:, 0:1].values
y_plano_saude2 = base_plano_saude2.iloc[:, 1].values

In [63]:
x_plano_saude2

array([[18],
       [23],
       [28],
       [33],
       [38],
       [43],
       [48],
       [53],
       [58],
       [63]], dtype=int64)

In [64]:
y_plano_saude2

array([  470,   520,   630,   830,  1150,  1530,  2040,  3080,  5100,
       10100], dtype=int64)

In [65]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 4)
x_plano_saude2_poly = poly.fit_transform(x_plano_saude2)

In [66]:
x_plano_saude2.shape

(10, 1)

In [67]:
x_plano_saude2_poly.shape

(10, 5)

In [68]:
regressor_saude_polinomial = LinearRegression()
regressor_saude_polinomial.fit(x_plano_saude2_poly, y_plano_saude2)

In [69]:
# b0
regressor_saude_polinomial.intercept_

16561.974652809193

In [70]:
# b1 (n)
regressor_saude_polinomial.coef_

array([ 0.00000000e+00, -2.12242253e+03,  9.90404200e+01, -1.95058276e+00,
        1.40792541e-02])

In [71]:
novo = [[40]]
novo = poly.transform(novo)
novo

array([[1.00e+00, 4.00e+01, 1.60e+03, 6.40e+04, 2.56e+06]])

In [72]:
regressor_saude_polinomial.predict(novo)

array([1335.33958117])

In [73]:
previsoes = regressor_saude_polinomial.predict(x_plano_saude2_poly)
previsoes

array([ 549.65035612,  345.85081673,  616.53846088,  975.83916058,
       1249.06759962, 1472.72727352, 1894.51048957, 2973.29836684,
       5379.16083613, 9993.35664002])

In [74]:
grafico = px.scatter(x = x_plano_saude2[:, 0], y = y_plano_saude2)
grafico.add_scatter(x = x_plano_saude2[:,0], y = previsoes, name = 'Regressão')
grafico.show()

### Base preço das casas

In [75]:
base_casas = pd.read_csv('house_prices.csv')
base_casas

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [76]:
x_casas = base_casas.iloc[:, 3:19].values
y_casas = base_casas.iloc[:, 2].values

In [77]:
from sklearn.model_selection import train_test_split
x_casas_treinamento, x_casas_teste, y_casas_treinamento, y_casas_teste = train_test_split(x_casas, y_casas, test_size=0.3, random_state = 0)

In [78]:
x_casas_treinamento.shape

(15129, 16)

In [79]:
x_casas_teste.shape

(6484, 16)

In [80]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2)
x_casas_treinamento_poly = poly.fit_transform(x_casas_treinamento)
x_casas_teste_poly = poly.transform(x_casas_teste)

In [81]:
x_casas_treinamento_poly.shape, x_casas_teste_poly.shape

((15129, 153), (6484, 153))

In [82]:
regressor_casas_poly = LinearRegression()
regressor_casas_poly.fit(x_casas_treinamento_poly, y_casas_treinamento)

In [83]:
regressor_casas_poly.score(x_casas_treinamento_poly, y_casas_treinamento)

0.7924863532415345

In [84]:
regressor_casas_poly.score(x_casas_teste_poly, y_casas_teste)

0.7945486443447527

In [85]:
previsoes = regressor_casas_poly.predict(x_casas_teste_poly)
previsoes

array([ 442197.02063529, 1956399.27396442,  576589.98631182, ...,
        350582.4526704 ,  246776.53699639,  221178.8176958 ])

In [86]:
y_casas_teste

array([ 297000., 1578000.,  562100., ...,  380000.,  268000.,  206000.])

In [87]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mean_absolute_error(y_casas_teste, previsoes)

105307.1638579619

## Ávore de Decisão

### Base plano de Saúde

In [88]:
x_plano_saude2

array([[18],
       [23],
       [28],
       [33],
       [38],
       [43],
       [48],
       [53],
       [58],
       [63]], dtype=int64)

In [89]:
y_plano_saude2

array([  470,   520,   630,   830,  1150,  1530,  2040,  3080,  5100,
       10100], dtype=int64)

In [90]:
from sklearn.tree import DecisionTreeRegressor
regressor_arvore_saude = DecisionTreeRegressor()
regressor_arvore_saude.fit(x_plano_saude2, y_plano_saude2)

In [91]:
previsoes = regressor_arvore_saude.predict(x_plano_saude2)
previsoes

array([  470.,   520.,   630.,   830.,  1150.,  1530.,  2040.,  3080.,
        5100., 10100.])

In [92]:
regressor_arvore_saude.score(x_plano_saude2, y_plano_saude2)

1.0

In [93]:
grafico = px.scatter(x = x_plano_saude2.ravel(), y = y_plano_saude2)
grafico.add_scatter(x = x_plano_saude2.ravel(), y = previsoes, name='Regressão')
grafico.show()

In [94]:
x_teste_arvore = np.arange(min(x_plano_saude2), max(x_plano_saude2), 0.1)
x_teste_arvore


Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)



array([18. , 18.1, 18.2, 18.3, 18.4, 18.5, 18.6, 18.7, 18.8, 18.9, 19. ,
       19.1, 19.2, 19.3, 19.4, 19.5, 19.6, 19.7, 19.8, 19.9, 20. , 20.1,
       20.2, 20.3, 20.4, 20.5, 20.6, 20.7, 20.8, 20.9, 21. , 21.1, 21.2,
       21.3, 21.4, 21.5, 21.6, 21.7, 21.8, 21.9, 22. , 22.1, 22.2, 22.3,
       22.4, 22.5, 22.6, 22.7, 22.8, 22.9, 23. , 23.1, 23.2, 23.3, 23.4,
       23.5, 23.6, 23.7, 23.8, 23.9, 24. , 24.1, 24.2, 24.3, 24.4, 24.5,
       24.6, 24.7, 24.8, 24.9, 25. , 25.1, 25.2, 25.3, 25.4, 25.5, 25.6,
       25.7, 25.8, 25.9, 26. , 26.1, 26.2, 26.3, 26.4, 26.5, 26.6, 26.7,
       26.8, 26.9, 27. , 27.1, 27.2, 27.3, 27.4, 27.5, 27.6, 27.7, 27.8,
       27.9, 28. , 28.1, 28.2, 28.3, 28.4, 28.5, 28.6, 28.7, 28.8, 28.9,
       29. , 29.1, 29.2, 29.3, 29.4, 29.5, 29.6, 29.7, 29.8, 29.9, 30. ,
       30.1, 30.2, 30.3, 30.4, 30.5, 30.6, 30.7, 30.8, 30.9, 31. , 31.1,
       31.2, 31.3, 31.4, 31.5, 31.6, 31.7, 31.8, 31.9, 32. , 32.1, 32.2,
       32.3, 32.4, 32.5, 32.6, 32.7, 32.8, 32.9, 33

In [95]:
x_teste_arvore.shape

(450,)

In [96]:
x_teste_arvore = x_teste_arvore.reshape(-1,1)
x_teste_arvore.shape

(450, 1)

In [97]:
grafico = px.scatter(x = x_plano_saude2.ravel(), y = y_plano_saude2)
grafico.add_scatter(x = x_teste_arvore.ravel(), y = regressor_arvore_saude.predict(x_teste_arvore), name='Regressão')
grafico.show()

In [98]:
regressor_arvore_saude.predict([[40]])

array([1150.])

### Base preço das casas

In [99]:
x_casas_treinamento.shape

(15129, 16)

In [100]:
x_casas_treinamento

array([[ 4.00000e+00,  1.50000e+00,  1.39000e+03, ...,  9.81330e+04,
         4.77224e+01, -1.22332e+02],
       [ 3.00000e+00,  1.50000e+00,  1.45000e+03, ...,  9.81330e+04,
         4.77725e+01, -1.22349e+02],
       [ 5.00000e+00,  2.75000e+00,  2.86000e+03, ...,  9.80520e+04,
         4.77082e+01, -1.22104e+02],
       ...,
       [ 3.00000e+00,  2.25000e+00,  2.36000e+03, ...,  9.80420e+04,
         4.73856e+01, -1.22158e+02],
       [ 4.00000e+00,  2.00000e+00,  2.37000e+03, ...,  9.80010e+04,
         4.72831e+01, -1.22279e+02],
       [ 4.00000e+00,  2.25000e+00,  2.38000e+03, ...,  9.80080e+04,
         4.76126e+01, -1.22120e+02]])

In [101]:
y_casas_treinamento

array([400000., 430000., 720000., ..., 431000., 411000., 699900.])

In [102]:
x_casas_teste.shape

(6484, 16)

In [103]:
regressor_arvore_casas = DecisionTreeRegressor()
regressor_arvore_casas.fit(x_casas_treinamento, y_casas_treinamento)

In [104]:
regressor_arvore_casas.score(x_casas_treinamento, y_casas_treinamento)

0.9992712177726063

In [105]:
regressor_arvore_casas.score(x_casas_teste, y_casas_teste)

0.7202785127319334

In [106]:
previsoes = regressor_arvore_casas.predict(x_casas_teste)
previsoes

array([ 288000., 1835000.,  507000., ...,  289000.,  215000.,  201000.])

In [107]:
y_casas_teste

array([ 297000., 1578000.,  562100., ...,  380000.,  268000.,  206000.])

In [108]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_casas_teste, previsoes)

99807.55081739667

## Random Forest

### Base plano de saúde

In [109]:
x_plano_saude2

array([[18],
       [23],
       [28],
       [33],
       [38],
       [43],
       [48],
       [53],
       [58],
       [63]], dtype=int64)

In [110]:
y_plano_saude2

array([  470,   520,   630,   830,  1150,  1530,  2040,  3080,  5100,
       10100], dtype=int64)

In [111]:
from sklearn.ensemble import RandomForestRegressor
regressor_random_forest_saude = RandomForestRegressor(n_estimators = 10)
regressor_random_forest_saude.fit(x_plano_saude2, y_plano_saude2)

In [112]:
regressor_random_forest_saude.score(x_plano_saude2, y_plano_saude2)

0.9493886465806126

In [113]:
x_teste_arvore

array([[18. ],
       [18.1],
       [18.2],
       [18.3],
       [18.4],
       [18.5],
       [18.6],
       [18.7],
       [18.8],
       [18.9],
       [19. ],
       [19.1],
       [19.2],
       [19.3],
       [19.4],
       [19.5],
       [19.6],
       [19.7],
       [19.8],
       [19.9],
       [20. ],
       [20.1],
       [20.2],
       [20.3],
       [20.4],
       [20.5],
       [20.6],
       [20.7],
       [20.8],
       [20.9],
       [21. ],
       [21.1],
       [21.2],
       [21.3],
       [21.4],
       [21.5],
       [21.6],
       [21.7],
       [21.8],
       [21.9],
       [22. ],
       [22.1],
       [22.2],
       [22.3],
       [22.4],
       [22.5],
       [22.6],
       [22.7],
       [22.8],
       [22.9],
       [23. ],
       [23.1],
       [23.2],
       [23.3],
       [23.4],
       [23.5],
       [23.6],
       [23.7],
       [23.8],
       [23.9],
       [24. ],
       [24.1],
       [24.2],
       [24.3],
       [24.4],
       [24.5],
       [24

In [114]:
grafico = px.scatter(x=x_plano_saude2.ravel(), y = y_plano_saude2)
grafico.add_scatter(x = x_teste_arvore.ravel(), y = regressor_arvore_saude.predict(x_teste_arvore), name = 'Regressão')
grafico.show()

In [115]:
regressor_random_forest_saude.predict([[40]])

array([1194.])

### Base preço das casas

In [116]:
x_casas_treinamento.shape

(15129, 16)

In [117]:
x_casas_treinamento

array([[ 4.00000e+00,  1.50000e+00,  1.39000e+03, ...,  9.81330e+04,
         4.77224e+01, -1.22332e+02],
       [ 3.00000e+00,  1.50000e+00,  1.45000e+03, ...,  9.81330e+04,
         4.77725e+01, -1.22349e+02],
       [ 5.00000e+00,  2.75000e+00,  2.86000e+03, ...,  9.80520e+04,
         4.77082e+01, -1.22104e+02],
       ...,
       [ 3.00000e+00,  2.25000e+00,  2.36000e+03, ...,  9.80420e+04,
         4.73856e+01, -1.22158e+02],
       [ 4.00000e+00,  2.00000e+00,  2.37000e+03, ...,  9.80010e+04,
         4.72831e+01, -1.22279e+02],
       [ 4.00000e+00,  2.25000e+00,  2.38000e+03, ...,  9.80080e+04,
         4.76126e+01, -1.22120e+02]])

In [118]:
y_casas_treinamento

array([400000., 430000., 720000., ..., 431000., 411000., 699900.])

In [119]:
x_casas_teste

array([[ 2.00000e+00,  1.50000e+00,  1.43000e+03, ...,  9.81250e+04,
         4.77222e+01, -1.22290e+02],
       [ 4.00000e+00,  3.25000e+00,  4.67000e+03, ...,  9.80050e+04,
         4.76350e+01, -1.22164e+02],
       [ 2.00000e+00,  7.50000e-01,  1.44000e+03, ...,  9.81070e+04,
         4.76707e+01, -1.22364e+02],
       ...,
       [ 3.00000e+00,  2.50000e+00,  2.15000e+03, ...,  9.80580e+04,
         4.74514e+01, -1.22089e+02],
       [ 3.00000e+00,  1.75000e+00,  1.48000e+03, ...,  9.80320e+04,
         4.73657e+01, -1.22280e+02],
       [ 3.00000e+00,  1.00000e+00,  1.32000e+03, ...,  9.80920e+04,
         4.73120e+01, -1.22183e+02]])

In [120]:
regressor_random_forest_casas = RandomForestRegressor(n_estimators = 100)
regressor_random_forest_casas.fit(x_casas_treinamento, y_casas_treinamento)

In [121]:
regressor_random_forest_casas.score(x_casas_treinamento, y_casas_treinamento)

0.981201528445938

In [122]:
regressor_random_forest_casas.score(x_casas_teste, y_casas_teste)

0.8809681519900077

In [123]:
previsoes = regressor_random_forest_casas.predict(x_casas_teste)
previsoes

array([ 313468.5 , 1755897.3 ,  532975.9 , ...,  368349.14,  230672.75,
        205170.  ])

In [124]:
y_casas_teste

array([ 297000., 1578000.,  562100., ...,  380000.,  268000.,  206000.])

In [125]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_casas_teste, previsoes)

68031.15268640389