<a href="https://colab.research.google.com/github/KevBeltrao/house_price_model/blob/master/house_price_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [276]:
import torch
from torch import nn

torch.__version__

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using: {device}')

Using: cuda


In [277]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using: {device}')

Using: cuda


# Consume dataset and treat data

* yes/no values become 1/0
* `furnishingstatus` is furnished/semi-furnished/unfurnished and becomes 2/1/0
* `price` normalizes to range from 0 to 1
* `area` normalizes to range from 0 to 1

I also shuffle the data because the CSV is ordered by price, meaning I'd train for a specific range and test for another range.

In [278]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [279]:
local_folder = '/content/drive/My Drive/Colab Notebooks/house_price'
file_name = 'housing.csv'

file_path = f'{local_folder}/{file_name}'

In [280]:
import pandas as pd

houses_dataframe = pd.read_csv(file_path)
houses_dataframe.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [281]:
houses_dataframe['mainroad'] = houses_dataframe['mainroad'].replace({ 'no': 0, 'yes': 1 })
houses_dataframe['guestroom'] = houses_dataframe['guestroom'].replace({ 'no': 0, 'yes': 1 })
houses_dataframe['basement'] = houses_dataframe['basement'].replace({ 'no': 0, 'yes': 1 })
houses_dataframe['hotwaterheating'] = houses_dataframe['hotwaterheating'].replace({ 'no': 0, 'yes': 1 })
houses_dataframe['airconditioning'] = houses_dataframe['airconditioning'].replace({ 'no': 0, 'yes': 1 })
houses_dataframe['prefarea'] = houses_dataframe['prefarea'].replace({ 'no': 0, 'yes': 1 })
houses_dataframe['furnishingstatus'] = houses_dataframe['furnishingstatus'].replace({ 'unfurnished': 0, 'semi-furnished': 1, 'furnished': 2 })

min_price = houses_dataframe['price'].min()
max_price = houses_dataframe['price'].max()
houses_dataframe['price'] = (houses_dataframe['price'] - min_price) / (max_price - min_price)

min_area = houses_dataframe['area'].min()
max_area = houses_dataframe['area'].max()
houses_dataframe['area'] = (houses_dataframe['area'] - min_area) / (max_area - min_area)

houses_dataframe.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,1.0,0.396564,4,2,3,1,0,0,0,1,2,1,2
1,0.909091,0.502405,4,4,4,1,0,0,0,1,3,0,2
2,0.909091,0.571134,3,2,2,1,0,1,0,0,2,1,1
3,0.906061,0.402062,4,2,2,1,0,1,0,1,3,1,2
4,0.836364,0.396564,4,1,2,1,1,1,0,1,2,0,2


In [282]:
shuffled_houses_dataframe = houses_dataframe.sample(frac=1, random_state=1)

y = torch.tensor(shuffled_houses_dataframe['price'], dtype=torch.float, device=device).unsqueeze(dim=1)
X = torch.tensor(shuffled_houses_dataframe.loc[:, shuffled_houses_dataframe.columns != 'price'].values, dtype=torch.float, device=device)
X[:10], y[:10]


(tensor([[0.3155, 4.0000, 2.0000, 2.0000, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000,
          1.0000, 0.0000, 2.0000],
         [0.4639, 4.0000, 1.0000, 4.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          3.0000, 0.0000, 0.0000],
         [0.6082, 4.0000, 2.0000, 2.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 1.0000],
         [0.3278, 3.0000, 1.0000, 3.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 1.0000, 0.0000],
         [0.3412, 3.0000, 1.0000, 2.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000],
         [0.1340, 2.0000, 1.0000, 1.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 1.0000],
         [0.1093, 2.0000, 1.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
          1.0000, 0.0000, 0.0000],
         [0.3402, 3.0000, 1.0000, 4.0000, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000,
          3.0000, 1.0000, 2.0000],
         [0.0722, 2.0000, 1.0000, 1.0000, 0.0000, 0.0000, 0.0000

In [283]:
data_set_length = len(X)

training_set_length = int(0.8 * data_set_length)
testing_set_length = data_set_length - training_set_length

data_set_length, training_set_length, testing_set_length

(545, 436, 109)

In [284]:
X_train, y_train = X[:training_set_length], y[:training_set_length]
X_test, y_test = X[training_set_length:], y[training_set_length:]

len(X_train), len(y_train), len(X_test), len(y_test)

(436, 436, 109, 109)

# Create model and train it

With the data treated and split between 80% for training and 20% for testing:
* create the model
* setup loss function
* setup optimizer
* train model

In [285]:
class HousePriceModel(nn.Module):
  def __init__(self):
    super().__init__()

    self.linear_layer = nn.Linear(
      in_features=12,
      out_features=1
    )

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    return self.linear_layer(x)

torch.manual_seed(42)
model = HousePriceModel()

model, model.state_dict()

(HousePriceModel(
   (linear_layer): Linear(in_features=12, out_features=1, bias=True)
 ),
 OrderedDict([('linear_layer.weight',
               tensor([[ 0.2207,  0.2396, -0.0676,  0.2652, -0.0632,  0.0583, -0.1405,  0.1695,
                         0.2545, -0.2118,  0.2509,  0.0540]])),
              ('linear_layer.bias', tensor([0.2133]))]))

In [286]:
model.to(device)
next(model.parameters()).device

device(type='cuda', index=0)

In [287]:
loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(
  params=model.parameters(),
  lr=0.0001,
)

In [288]:
torch.manual_seed(42)

epochs = 3 * 10 ** 5 # 300,000

X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
X_test = X_test.to(device)

for epoch in range(epochs):
  model.train()

  y_prediction = model(X_train)

  loss = loss_fn(y_prediction, y_train)

  optimizer.zero_grad()

  loss.backward()

  optimizer.step()

  model.eval()
  with torch.inference_mode():
    test_prediction = model(X_test)
    test_loss = loss_fn(test_prediction, y_test)

  amount_of_spaces = 3 - len(str(epoch))
  spaces = ' ' * amount_of_spaces

  loss_formatted = float("{:.2f}".format(loss * max_price))

  amount_of_spaces2 = 12 - len(str(loss_formatted))
  spaces2 = ' ' * amount_of_spaces2

  test_loss_fixed = max_price * test_loss

  if epoch % 5000 == 0:
    print(f'{epoch}:{spaces} | loss: {loss_formatted:,}{spaces2} | test_loss {test_loss_fixed:,}')



0:   | loss: 14,069,028.0   | test_loss 14,498,697.0
5000: | loss: 1,810,872.5    | test_loss 1,866,814.375
10000: | loss: 1,306,620.75   | test_loss 1,499,483.875
15000: | loss: 1,152,197.62   | test_loss 1,266,863.625
20000: | loss: 1,054,923.62   | test_loss 1,090,752.0
25000: | loss: 986,552.88    | test_loss 979,081.0
30000: | loss: 949,114.62    | test_loss 928,754.375
35000: | loss: 926,587.5     | test_loss 903,696.0
40000: | loss: 912,441.25    | test_loss 890,734.9375
45000: | loss: 904,974.31    | test_loss 882,621.125
50000: | loss: 900,625.69    | test_loss 878,311.9375
55000: | loss: 897,914.75    | test_loss 874,866.25
60000: | loss: 895,999.75    | test_loss 874,565.5
65000: | loss: 894,392.5     | test_loss 874,767.375
70000: | loss: 893,177.5     | test_loss 873,858.6875
75000: | loss: 892,385.38    | test_loss 871,251.25
80000: | loss: 891,781.88    | test_loss 868,637.0625
85000: | loss: 891,183.94    | test_loss 867,044.875
90000: | loss: 890,618.06    | test_loss 

In [289]:
comparison_tensor = torch.cat((test_prediction, y_test), dim=1)
comparison_tensor * max_price

tensor([[ 3279880.2500,  2498787.7500],
        [ 4226864.5000,  5642424.5000],
        [ 3595910.0000,  3627272.7500],
        [ 6216307.0000,  5642424.5000],
        [ 1731256.6250,  3063030.2500],
        [ 2640196.2500,  3627272.7500],
        [ 4465146.0000,  4433333.5000],
        [ 1678231.3750,  1047878.7500],
        [ 6251046.5000, 12090909.0000],
        [ 6582407.5000,  6448485.0000],
        [ 2085742.0000,  3103333.2500],
        [ 2933829.5000,  3183939.2500],
        [ 1405966.0000,  2095757.5000],
        [ 3842823.2500,  3135575.7500],
        [ 1980980.0000,  2498787.7500],
        [ 1671333.5000,  2216666.7500],
        [ 4517424.5000,  2619697.0000],
        [ 3524084.5000,  3909393.7500],
        [ 4533661.5000,  4393030.5000],
        [ 3349048.2500,  3546666.7500],
        [ 2200686.5000,  1894242.3750],
        [ 3099683.7500,  2821212.2500],
        [ 4803129.5000,  3788484.7500],
        [ 3930049.5000,  5642424.5000],
        [ 2281367.0000,  2377878.7500],


In [290]:
torch.mean((abs(test_prediction - y_test)) * max_price)

tensor(852563.9375, device='cuda:0')

In [291]:
comparison = comparison_tensor[:, 0] > comparison_tensor[:, 1]

In [292]:
predictions_greater_than_tests = torch.sum(comparison)
tests_greater_than_predictions = comparison.numel() - predictions_greater_than_tests

f'{predictions_greater_than_tests=} {tests_greater_than_predictions=}'

"predictions_greater_than_tests=tensor(63, device='cuda:0') tests_greater_than_predictions=tensor(46, device='cuda:0')"

# Save model




In [293]:
from pathlib import Path

MODEL_PATH = Path('models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)

MODEL_NAME = 'house_price_model.pth'
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

print(f'Saving model to: {MODEL_SAVE_PATH}')
torch.save(
    obj=model.state_dict(),
    f=MODEL_SAVE_PATH
)

!ls -l models


Saving model to: models/house_price_model.pth
total 4
-rw-r--r-- 1 root root 1620 Dec 22 17:25 house_price_model.pth
