# Imports

In [19]:
import pandas as pd
import numpy as np
from sklearn import linear_model as lm

# Loading data

In [4]:
# Load the source CSV into the raw frame; the path is relative to the
# notebook's working directory. Later cells derive copies from df_raw.
df_raw = pd.read_csv(filepath_or_buffer='kc_house_data.csv')

In [5]:
# Preview the first five rows of the raw table to sanity-check the load.
df_raw.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Data Preparation

In [6]:
# Feature matrix: every column of the raw frame except the target ('price')
# and the 'date' column.
# NOTE(review): 'id' stays in the feature set; it looks like a record
# identifier rather than a predictor -- confirm it should be used for fitting.
x_train = df_raw.drop(columns=['price', 'date'])

# Response variable: an independent copy of the price column, so later edits
# to it cannot write back into df_raw.
y_train = df_raw.loc[:, 'price'].copy()

In [7]:
# Display the feature matrix (df_raw without 'price' and 'date'); pandas
# abbreviates long frames in the rendered output.
x_train

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,3,1.00,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,2,1.00,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,4,3.00,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,3,2.00,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,3,2.50,1530,1131,3.0,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,4,2.50,2310,5813,2.0,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,2,0.75,1020,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,3,2.50,1600,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [8]:
# Display the response series (the copied 'price' column) to check its
# length and dtype.
y_train

0        221900.0
1        538000.0
2        180000.0
3        604000.0
4        510000.0
           ...   
21608    360000.0
21609    400000.0
21610    402101.0
21611    400000.0
21612    325000.0
Name: price, Length: 21613, dtype: float64

# Model Training
Ir ao sklearn e procurar => linear regression (regressão linear)

In [11]:
# Model definition: scikit-learn linear regression with default settings.
model_lr = lm.LinearRegression()

# Model training: fit on the feature matrix against the price target.
model_lr.fit(X=x_train, y=y_train)

In [12]:
# Predict on the same rows the model was fitted on, so every metric derived
# from `pred` is an in-sample (training) figure, not a hold-out estimate.
pred = model_lr.predict(X=x_train)

# Performance Metrics

In [14]:
# Work on a deep copy so the evaluation columns added in later cells never
# modify df_raw.
df1 = df_raw.copy(deep=True)

In [15]:
# Attach the model output as a 'prediction' column next to the actual prices.
df1 = df1.assign(prediction=pred)

In [16]:
# Compare actual and predicted price for the first five rows.
df1.loc[:, ['id', 'price', 'prediction']].head(n=5)

Unnamed: 0,id,price,prediction
0,7129300520,221900.0,205766.26461
1,6414100192,538000.0,731219.64983
2,5631500400,180000.0,379097.882654
3,2487200875,604000.0,457424.134325
4,1954400510,510000.0,444789.054718


In [17]:
# Signed residual: actual price minus predicted price (positive means the
# model under-predicted).
df1['error'] = df1['price'].sub(df1['prediction'])

In [18]:
# Inspect the residual column alongside price and prediction (first five rows).
df1.head(n=5)[['id', 'price', 'prediction', 'error']]

Unnamed: 0,id,price,prediction,error
0,7129300520,221900.0,205766.26461,16133.73539
1,6414100192,538000.0,731219.64983,-193219.64983
2,5631500400,180000.0,379097.882654,-199097.882654
3,2487200875,604000.0,457424.134325,146575.865675
4,1954400510,510000.0,444789.054718,65210.945282


In [20]:
# Absolute error: magnitude of the residual, ignoring its sign.
df1['error_abs'] = df1['error'].abs()

In [21]:
# Inspect signed and absolute error side by side (first five rows).
df1.loc[:, ['id', 'price', 'prediction', 'error', 'error_abs']].head(n=5)

Unnamed: 0,id,price,prediction,error,error_abs
0,7129300520,221900.0,205766.26461,16133.73539,16133.73539
1,6414100192,538000.0,731219.64983,-193219.64983,193219.64983
2,5631500400,180000.0,379097.882654,-199097.882654,199097.882654
3,2487200875,604000.0,457424.134325,146575.865675,146575.865675
4,1954400510,510000.0,444789.054718,65210.945282,65210.945282


In [28]:
# The mean absolute error is computed and printed in the metrics cell further
# down. A bare mean expression placed here was silently discarded (only a
# cell's last expression is displayed), so it is intentionally not repeated.

# Percentage error: signed residual expressed as a fraction of the actual price.
# NOTE(review): a price of 0 would yield inf/NaN here -- confirm none exist.
df1['error_perc'] = (df1['price'] - df1['prediction']) / df1['price']

# Absolute percentage error: magnitude of the relative error, ignoring sign.
df1['error_perc_abs'] = df1['error_perc'].abs()


In [29]:
# Inspect every error column produced so far (first five rows).
df1.head(n=5)[['id', 'price', 'prediction', 'error', 'error_abs', 'error_perc', 'error_perc_abs']]

Unnamed: 0,id,price,prediction,error,error_abs,error_perc,error_perc_abs
0,7129300520,221900.0,205766.26461,16133.73539,16133.73539,0.072707,0.072707
1,6414100192,538000.0,731219.64983,-193219.64983,193219.64983,-0.359144,0.359144
2,5631500400,180000.0,379097.882654,-199097.882654,199097.882654,-1.106099,1.106099
3,2487200875,604000.0,457424.134325,146575.865675,146575.865675,0.242675,0.242675
4,1954400510,510000.0,444789.054718,65210.945282,65210.945282,0.127865,0.127865


In [31]:
# Aggregate metrics over all rows of df1:
#   mae  -- mean absolute error, in the same units as price
#   mape -- mean absolute percentage error, as a fraction (not multiplied by 100)
mae = df1['error_abs'].mean()
mape = df1['error_perc_abs'].mean()

print(f'mae = {mae}')
print(f'mape = {mape}')


mae = 125921.54419398172
mape = 0.2558051253618555
