# Modelos de regressão

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#metricas
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
#modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [16]:
df = pd.read_csv('house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [17]:
df.shape

(21613, 21)

In [20]:
df = df[['price', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront']]

In [21]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_lot,floors,waterfront
0,221900,3,1.0,5650,1.0,0
1,538000,3,2.25,7242,2.0,0
2,180000,2,1.0,10000,1.0,0
3,604000,4,3.0,5000,1.0,0
4,510000,3,2.0,8080,1.0,0


In [22]:
X = df.drop('price', axis=1)
y = df['price']

In [31]:
#Padronizando a escala dos valores numéricos do dataset
min_max_scaler = StandardScaler() 
X = min_max_scaler.fit_transform(X)

In [27]:
X

array([[-0.39873715, -1.44746357, -0.22832133, -0.915427  , -0.08717263],
       [-0.39873715,  0.1756067 , -0.18988538,  0.93650577, -0.08717263],
       [-1.47395936, -1.44746357, -0.12329847, -0.915427  , -0.08717263],
       ...,
       [-1.47395936, -1.77207762, -0.33213703,  0.93650577, -0.08717263],
       [-0.39873715,  0.50022075, -0.30707641,  0.93650577, -0.08717263],
       [-1.47395936, -1.77207762, -0.33875227,  0.93650577, -0.08717263]])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 ,random_state=42)

### Regressão linear

In [36]:
#Trinando o modelos
lr = LinearRegression()
lr.fit(X_train, y_train)

#Predizendo
y_pred = lr.predict(X_test)

In [43]:
mse = root_mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test,y_pred)

print(f'mse = {mse}')
print(f'rmse = {rmse}')
print(f'mape = {mape}')

mse = 307355.64158673503
rmse = 554.3966464425403
mape = 0.42234497859405845


### KNN regressor

In [71]:
knr = KNeighborsRegressor(n_neighbors=7, metric='euclidean')

In [72]:
knr.fit(X_train, y_train)
y_pred = knr.predict(X_test)

In [73]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'mape = {mape}')

mape = 0.38148032883087835


### SVM regressor SVR (support vector machine para regressão)

In [77]:
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)

y_pred = svr.predict(X_test)

In [78]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'mape = {mape}')

mape = 0.4251465902015768


In [80]:
#Passando dois hiperparâmetros
svr = SVR(kernel='linear', C=100)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'mape = {mape}')

mape = 0.36658213687926744


# Arvore de regressão

In [81]:
from sklearn.tree import DecisionTreeRegressor

In [82]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

In [83]:
y_pred = dtr.predict(X_test)

In [84]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'mape = {mape}')

mape = 0.48949507321286445


### XGBoost

In [85]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 640.0 kB/s eta 0:03:16
   ---------------------------------------- 0.2/124.9 MB 2.0 MB/s eta 0:01:04
   ---------------------------------------- 0.5/124.9 MB 4.7 MB/s eta 0:00:27
   ---------------------------------------- 0.9/124.9 MB 5.2 MB/s eta 0:00:24
   ---------------------------------------- 1.4/124.9 MB 6.9 MB/s eta 0:00:19
    --------------------------------------- 1.8/124.9 MB 7.3 MB/s eta 0:00:17
    --------------------------------------- 2.1/124.9 MB 7.0 MB/s eta 0:00:18
    --------------------------------------- 2.6/124.9 MB 7.6 MB/s eta 0:00:17
   - -------------------------------------- 3.2/124.9 MB 8.0 MB/s eta 0:00:16
   - -------------------------------------- 3.6/124.9 MB 8.3 MB/s eta 0:00:15


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [86]:
from xgboost import XGBRegressor

In [87]:
xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)

In [88]:
y_pred = xgboost.predict(X_test)

In [89]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'mape = {mape}')

mape = 0.37332829257311173


In [90]:
params = {
    'n_estimators' : 100,
    'max_depth' : 6,
    'learning_rate' : 0.1
}

In [91]:
xgboost = XGBRegressor(**params)
xgboost.fit(X_train, y_train)

In [92]:
y_pred = xgboost.predict(X_test)

In [93]:
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'mape = {mape}')

mape = 0.3733095889584748
