Kaggle Advance House Price Prediction using PyTorch- Tabular dataset

https://docs.fast.ai/tabular.html https://www.fast.ai/2018/04/29/categorical-embeddings/ https://www.fast.ai/2018/04/29/categorical-embeddings/ https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/

In [2]:
import pandas as pd

In [15]:
df=pd.read_csv('houseprice.csv',usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
                                         "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [16]:
df.sample(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
1395,60,RL,88.0,12665,Pave,IR1,2005,1133,1349,281213
1078,120,RM,37.0,4435,Pave,Reg,2004,848,0,155900
1053,20,RL,68.0,8562,Pave,Reg,1957,1526,0,144500
311,20,RL,50.0,8000,Pave,Reg,1948,972,0,132000
907,50,RL,86.0,11500,Pave,IR1,1936,1020,1037,250000
1410,60,RL,79.0,12420,Pave,Reg,2001,944,896,230000
1385,50,RM,40.0,5436,Pave,Reg,1922,796,358,125500
1149,70,RM,50.0,9000,Pave,Reg,1920,832,650,143000
1125,20,RL,60.0,10434,Pave,Reg,1955,1005,0,115000
39,90,RL,65.0,6040,Pave,Reg,1955,1152,0,82000


In [17]:
df.shape

(1201, 10)

In [18]:
for i in df.columns:
    print("Column name {} and unique values are {}".format(i,len(df[i].unique())))

Column name MSSubClass and unique values are 15
Column name MSZoning and unique values are 5
Column name LotFrontage and unique values are 110
Column name LotArea and unique values are 869
Column name Street and unique values are 2
Column name LotShape and unique values are 4
Column name YearBuilt and unique values are 112
Column name 1stFlrSF and unique values are 678
Column name 2ndFlrSF and unique values are 368
Column name SalePrice and unique values are 597


In [19]:
import datetime
datetime.datetime.now().year

2024

In [20]:
df['Total Years']=datetime.datetime.now().year-df['YearBuilt']

In [21]:
df.drop("YearBuilt",axis=1,inplace=True)

In [22]:
cat_features=["MSSubClass", "MSZoning", "Street", "LotShape"]
out_feature="SalePrice"

In [23]:
from sklearn.preprocessing import LabelEncoder
lbl_encoders={}
for feature in cat_features:
    lbl_encoders[feature]=LabelEncoder()
    df[feature]=lbl_encoders[feature].fit_transform(df[feature])

In [24]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,5,3,65.0,8450,1,3,856,854,208500,21
1,0,3,80.0,9600,1,3,1262,0,181500,48
2,5,3,68.0,11250,1,0,920,866,223500,23
3,6,3,60.0,9550,1,0,961,756,140000,109
4,5,3,84.0,14260,1,0,1145,1053,250000,24


In [26]:
# stacking and converting into tensors
import numpy as np
cat_features=np.stack([df['MSSubClass'], df['MSZoning'], df['Street'], df['LotShape']],1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]])

In [27]:
# convert numpy to tensors
import torch
cat_features=torch.tensor(cat_features,dtype=torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [28]:
# create continuous features
cont_features=[]
for i in df.columns:
    if i in ['MSSubClass', 'MSZoning', 'Street', 'LotShape','SalePrice']:
        pass
    else:
        cont_features.append(i)

In [29]:
cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total Years']

In [30]:
# stacking continuous variables into tensors
cont_values=np.stack([df[i].values for i in cont_features],axis=1)
cont_values=torch.tensor(cont_values,dtype=torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    21.],
        [   80.,  9600.,  1262.,     0.,    48.],
        [   68., 11250.,   920.,   866.,    23.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    83.],
        [   68.,  9717.,  1078.,     0.,    74.],
        [   75.,  9937.,  1256.,     0.,    59.]])

In [31]:
# dependent feature
y=torch.tensor(df['SalePrice'].values, dtype=torch.float).reshape(-1,1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

#### Embedding Size for Categorical Numbers

In [32]:
len(df['MSSubClass'].unique())

15

In [33]:
cat_dims = [len(df[col].unique()) for col in ["MSSubClass","MSZoning","Street","LotShape"]]

In [34]:
cat_dims

[15, 5, 2, 4]

In [None]:
# Thumb Rule: Output dimension should be set based on input dimension(min(50,feature dimension//2))
embedding_dim = [(x, min(50,(x+1)//2)) for x in cat_dims]

In [36]:
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
embed_representation=nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [39]:
cat_featuresz=cat_features[:4]
cat_featuresz

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [41]:
pd.set_option('display.max_rows',500)
embedding_val=[]
for i,e in enumerate(embed_representation):
    embedding_val.append(e(cat_features[:,i]))

In [42]:
embedding_val

[tensor([[ 0.7053,  1.6639,  0.6774,  ...,  1.2852,  0.2202, -0.6388],
         [-0.2177,  0.0288, -0.7971,  ...,  0.5431,  0.5801, -0.5142],
         [ 0.7053,  1.6639,  0.6774,  ...,  1.2852,  0.2202, -0.6388],
         ...,
         [ 1.2890, -0.1303,  0.8250,  ...,  0.9110, -0.9025, -0.5493],
         [-0.2177,  0.0288, -0.7971,  ...,  0.5431,  0.5801, -0.5142],
         [-0.2177,  0.0288, -0.7971,  ...,  0.5431,  0.5801, -0.5142]],
        grad_fn=<EmbeddingBackward0>),
 tensor([[1.9121, 1.5920, 1.9095],
         [1.9121, 1.5920, 1.9095],
         [1.9121, 1.5920, 1.9095],
         ...,
         [1.9121, 1.5920, 1.9095],
         [1.9121, 1.5920, 1.9095],
         [1.9121, 1.5920, 1.9095]], grad_fn=<EmbeddingBackward0>),
 tensor([[1.9103],
         [1.9103],
         [1.9103],
         ...,
         [1.9103],
         [1.9103],
         [1.9103]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.3248,  0.6676],
         [-0.3248,  0.6676],
         [ 0.2395,  0.8146],
         ...,
   

In [43]:
# concatenating the embeddings based on rows
z=torch.cat(embedding_val,1)
z

tensor([[ 0.7053,  1.6639,  0.6774,  ...,  1.9103, -0.3248,  0.6676],
        [-0.2177,  0.0288, -0.7971,  ...,  1.9103, -0.3248,  0.6676],
        [ 0.7053,  1.6639,  0.6774,  ...,  1.9103,  0.2395,  0.8146],
        ...,
        [ 1.2890, -0.1303,  0.8250,  ...,  1.9103, -0.3248,  0.6676],
        [-0.2177,  0.0288, -0.7971,  ...,  1.9103, -0.3248,  0.6676],
        [-0.2177,  0.0288, -0.7971,  ...,  1.9103, -0.3248,  0.6676]],
       grad_fn=<CatBackward0>)

In [44]:
# Implement Dropout
dropout=nn.Dropout(.4)

In [45]:
final_embed=dropout(z)
final_embed

tensor([[ 1.1755,  0.0000,  1.1290,  ...,  3.1838, -0.5414,  0.0000],
        [-0.0000,  0.0480, -1.3284,  ...,  3.1838, -0.5414,  0.0000],
        [ 1.1755,  0.0000,  1.1290,  ...,  3.1838,  0.3992,  1.3577],
        ...,
        [ 2.1484, -0.2172,  0.0000,  ...,  3.1838, -0.5414,  0.0000],
        [-0.3629,  0.0000, -1.3284,  ...,  3.1838, -0.0000,  1.1126],
        [-0.3629,  0.0000, -0.0000,  ...,  3.1838, -0.0000,  0.0000]],
       grad_fn=<MulBackward0>)

In [71]:
##### Create a Feed Forward Neural Network
import torch
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardNN(nn.Module):

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp,out) for inp,out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((out for inp,out in embedding_dim))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

In [72]:
torch.manual_seed(100)
model=FeedForwardNN(embedding_dim,len(cont_features),1,[100,50],p=0.4)

In [73]:
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [74]:
model.layers

Sequential(
  (0): Linear(in_features=19, out_features=100, bias=True)
  (1): ReLU(inplace=True)
  (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): Dropout(p=0.4, inplace=False)
  (4): Linear(in_features=100, out_features=50, bias=True)
  (5): ReLU(inplace=True)
  (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): Dropout(p=0.4, inplace=False)
  (8): Linear(in_features=50, out_features=1, bias=True)
)

In [75]:
loss_function = nn.MSELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.01)

In [76]:
df.shape

(1201, 10)

In [77]:
cont_values

tensor([[   65.,  8450.,   856.,   854.,    21.],
        [   80.,  9600.,  1262.,     0.,    48.],
        [   68., 11250.,   920.,   866.,    23.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    83.],
        [   68.,  9717.,  1078.,     0.,    74.],
        [   75.,  9937.,  1256.,     0.,    59.]])

In [78]:
batch_size=1200
test_size=int(batch_size*0.15)
train_categorical=cat_features[:batch_size-test_size]
test_categorical=cat_features[batch_size-test_size:batch_size]
train_cont=cont_values[:batch_size-test_size]
test_cont=cont_values[batch_size-test_size:batch_size]
y_train=y[:batch_size-test_size]
y_test=y[batch_size-test_size:batch_size]

In [79]:
len(train_categorical),len(test_categorical),len(train_cont),len(test_cont),len(y_train)

(1020, 180, 1020, 180, 1020)

In [80]:
epochs=500
final_losses=[]
for i in range(epochs):
    i=i+1
    y_pred=model(train_categorical,train_cont)
    loss=torch.sqrt(loss_function(y_pred,y_train))
    final_losses.append(loss)
    if i%10==1:
        print("Epoch number: {} and the loss: {}".format(i,loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch number: 1 and the loss: 200496.75
Epoch number: 11 and the loss: 200493.46875
Epoch number: 21 and the loss: 200489.140625
Epoch number: 31 and the loss: 200482.625
Epoch number: 41 and the loss: 200473.234375
Epoch number: 51 and the loss: 200461.375
Epoch number: 61 and the loss: 200446.421875
Epoch number: 71 and the loss: 200429.359375
Epoch number: 81 and the loss: 200407.953125
Epoch number: 91 and the loss: 200383.390625
Epoch number: 101 and the loss: 200355.28125
Epoch number: 111 and the loss: 200322.140625
Epoch number: 121 and the loss: 200291.109375
Epoch number: 131 and the loss: 200252.109375
Epoch number: 141 and the loss: 200206.4375
Epoch number: 151 and the loss: 200163.390625
Epoch number: 161 and the loss: 200112.140625
Epoch number: 171 and the loss: 200059.203125
Epoch number: 181 and the loss: 200006.640625
Epoch number: 191 and the loss: 199948.140625
Epoch number: 201 and the loss: 199881.234375
Epoch number: 211 and the loss: 199815.703125
Epoch number:

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(range(epochs),final_losses)
plt.ylabel('RMSE Loss')
plt.xlabel('Epoch')

In [82]:
y_pred=""
with torch.no_grad():
    y_pred=model(test_categorical,test_cont)
    loss=torch.sqrt(loss_function(y_pred,y_test))
print('RMSE: {}'.format(loss))

RMSE: 187075.109375


In [83]:
# saving the model
torch.save(model,'Houseprice.pt')

In [84]:
torch.save(model.state_dict(),'Houseweights.pt')

In [85]:
# load the model
embs_size=[(15,8),(5,3),(2,1),(4,2)]
model1=FeedForwardNN(embs_size,5,1,[100,50],p=0.4)

In [86]:
model1.load_state_dict(torch.load('Houseweights.pt'))

  model1.load_state_dict(torch.load('Houseweights.pt'))


<All keys matched successfully>

In [88]:
model1.eval()

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)