## Imports

In [15]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F

<big>**Read data**</big>

In [56]:
data = pd.read_csv('datasets/house_prices/kc_house_data.csv')

In [57]:
pd.set_option('max_columns', None)

display(data.head())
display(data.info())

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

None

## Preprocessing

<big>**Id**</big>

In [58]:
data = data.drop('id', axis=1)

<big>**Date**</big>

In [19]:
data['year'] = data['date'].apply(lambda x: x[:4])
data['month'] = data['date'].apply(lambda x: x[4:6])

data = data.drop('date', axis=1)

<big>**Zipcode**</big>

In [20]:
len(data['zipcode'].unique())

70

In [21]:
def onehot_code(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    
    return df

In [22]:
data = onehot_code(data, 'zipcode', 'zip')

<big>**Renovated**</big>

In [25]:
len(data.query('yr_renovated == 0'))

20699

In [27]:
data = data.drop('yr_renovated', axis=1)

<big>**Splitting and scaling**</big>

In [30]:
y = data['price'].copy()
X = data.drop('price', axis=1).copy()

In [31]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [32]:
tf_X_train, tf_X_test, tf_y_train, tf_y_test = train_test_split(X, y, train_size=0.8, random_state=1)

## Training with TensorFlow

In [33]:
tf_X_train.shape

(17290, 88)

In [35]:
inputs = tf.keras.Input(shape=(88, ))
hidden = tf.keras.layers.Dense(64, activation='relu')(inputs)
hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
outputs = tf.keras.layers.Dense(1, activation='linear')(hidden)

tf_model = tf.keras.Model(inputs, outputs)

tf_model.compile(
    optimizer='adam',
    loss='mse'
)

history = tf_model.fit(
    tf_X_train,
    tf_y_train,
    validation_split=0.12,
    batch_size=32,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
tf_rmse = np.sqrt(tf_model.evaluate(tf_X_test, tf_y_test))



## Training with pytorch

In [38]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layer1 = nn.Linear(88, 64)
        self.layer2 = nn.Linear(64, 64)
        self.out = nn.Linear(64, 1)
        
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.out(x)
        
        return x
    
net = Net()

In [42]:
for param in list(net.parameters()):
    print(param.shape)

torch.Size([64, 88])
torch.Size([64])
torch.Size([64, 64])
torch.Size([64])
torch.Size([1, 64])
torch.Size([1])


In [46]:
torch_X_train = torch.tensor(tf_X_train).type(torch.float32)
torch_y_train = torch.tensor(np.array(tf_y_train)).type(torch.float32)

torch_X_test = torch.tensor(tf_X_test).type(torch.float32)
torch_y_test = torch.tensor(np.array(tf_y_test)).type(torch.float32)

In [47]:
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
criterion = nn.MSELoss()

In [55]:
for x, target in zip(torch_X_train, torch_y_train):
    optimizer.zero_grad()
    output = net(x)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()

  return F.mse_loss(input, target, reduction=self.reduction)


tensor(3.7432e+08, grad_fn=<MseLossBackward>)

In [49]:
total_loss = 0

for x, target in zip(torch_X_test, torch_y_test):
    output = net(x)
    loss = criterion(output, target)
    total_loss += loss
    
avg_loss = total_loss / len(torch_X_test)

  return F.mse_loss(input, target, reduction=self.reduction)


In [50]:
torch_rmse = torch.sqrt(avg_loss).detach().numpy()

## Results

In [52]:
print('TensorFlow RMSE:', tf_rmse)
print('   Pytorch RMSE:', torch_rmse)

TensorFlow RMSE: 231670.70036584255
   Pytorch RMSE: 201569.11
