In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import tqdm
import tqdm.notebook  # `import tqdm` alone does not load this submodule; the training cell uses tqdm.notebook.tqdm

In [None]:
# Read only the target and the nine predictor columns used in this notebook,
# then discard every row with a missing value in any of those columns.
data = pd.read_csv(
    "train.csv",
    usecols=[
        "SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
        "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF",
    ],
).dropna()
data.head()

In [None]:
# Columns fed to the network through embedding layers. Each one is replaced
# in `data` by the integer codes of a freshly fitted LabelEncoder, and its
# number of distinct values is recorded.
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape", "YearBuilt"]
cat_dims = []
for cat_col in cat_features:
    data[cat_col] = LabelEncoder().fit_transform(data[cat_col])
    cat_dims.append(int(data[cat_col].nunique()))

# One (number of categories, embedding width) pair per categorical column;
# the width is half the category count rounded up, capped at 50.
cat_input_emb_size = [(n_cat, min(50, (n_cat + 1) // 2)) for n_cat in cat_dims]

# Continuous inputs and the regression target column.
num_features = ["LotFrontage", "LotArea", "1stFlrSF", "2ndFlrSF"]
output = ["SalePrice"]

In [None]:
class TabularDataset(Dataset):
    """In-memory dataset yielding ``[numeric features, categorical codes, target]`` rows.

    Numeric features are standardized column-wise as ``(x - mean) / std``.
    By default the statistics are computed from ``X_num`` itself. Pass
    ``num_mean`` and ``num_std`` to standardize with statistics taken from
    another split (for example, apply the training split's statistics to a
    held-out split so both are on the same scale).

    Args:
        X_num: pandas object with the continuous features, one row per sample.
        X_cat: pandas object with integer category codes, one row per sample.
        Y: pandas object with the target; stored as a float32 column vector.
        num_mean: optional means to subtract instead of ``X_num.mean()``.
        num_std: optional standard deviations to divide by instead of
            ``X_num.std()``.

    Attributes:
        num_mean: the means actually subtracted.
        num_std: the standard deviations actually used as divisors.
    """

    def __init__(self, X_num, X_cat, Y, num_mean=None, num_std=None):
        self.n = X_num.shape[0]
        self.y = Y.astype(np.float32).values.reshape(-1, 1)

        self.num_mean = X_num.mean() if num_mean is None else num_mean
        std = X_num.std() if num_std is None else num_std
        if isinstance(std, pd.Series):
            # A constant column has zero spread; dividing by it would fill the
            # feature with NaN/inf. Dividing by 1 leaves it at zero instead.
            std = std.replace(0, 1.0)
        self.num_std = std

        normalized_X_num = (X_num - self.num_mean) / self.num_std
        self.x_num = normalized_X_num.astype(np.float32).values

        self.x_cat = X_cat.astype(np.int64).values

    def __len__(self):
        """Return the number of samples."""
        return self.n

    def __getitem__(self, idx):
        """Return ``[x_num, x_cat, y]`` for the sample(s) at ``idx``."""
        return [self.x_num[idx], self.x_cat[idx], self.y[idx]]

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every run, so results are comparable across re-runs.
training, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Training data: the target passed to the dataset is the natural log of the
# `output` column(s).
dataset = TabularDataset(X_num=training[num_features], X_cat=training[cat_features], Y=np.log(training[output]))
batchsize = 64
# num_workers=0 loads batches in the main process. The data already sits in
# memory, so a worker process only adds start-up cost every epoch, and on
# platforms that spawn workers a Dataset class defined inside the notebook
# may not be importable from the worker.
dataloader = DataLoader(dataset, batchsize, shuffle=True, num_workers=0)

In [None]:
class TabularMLP(nn.Module):
    """MLP with two hidden layers over embedded categorical and raw numeric inputs.

    Args:
        num_input_size: number of numeric feature columns.
        cat_input_emb_size: sequence of ``(num_categories, embedding_dim)``
            pairs, one per categorical column, in column order.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, num_input_size, cat_input_emb_size, hidden_size):
        super().__init__()

        # One embedding table per categorical column.
        self.embeddings = nn.ModuleList(
            nn.Embedding(n_categories, emb_dim)
            for n_categories, emb_dim in cat_input_emb_size
        )
        embedded_width = sum(emb_dim for _, emb_dim in cat_input_emb_size)

        # The first layer sees the numeric features next to all embeddings.
        self.fc1 = nn.Linear(num_input_size + embedded_width, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        self.emb_dropout = nn.Dropout(.1)
        self.dropout1 = nn.Dropout(.1)
        self.dropout2 = nn.Dropout(.1)

        self.output = nn.Linear(hidden_size, 1)

    def forward(self, x_num, x_cat):
        """Return one prediction per row.

        ``x_cat[:, i]`` is looked up in the i-th embedding table; ``x_num`` is
        concatenated to the embedded features along dimension 1.
        """
        embedded_columns = []
        for col_idx, embedding in enumerate(self.embeddings):
            embedded_columns.append(embedding(x_cat[:, col_idx]))
        features = self.emb_dropout(torch.cat(embedded_columns, 1))

        features = torch.cat([features, x_num], 1)

        hidden = self.dropout1(F.relu(self.fc1(features)))
        hidden = self.dropout2(F.relu(self.fc2(hidden)))
        return self.output(hidden)

In [None]:
# Seed torch's global RNG so the weight initialization below is the same on
# every run of this cell.
torch.manual_seed(0)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The numeric input width is taken from `num_features` so it cannot drift out
# of sync with the columns actually passed to the dataset.
model = TabularMLP(num_input_size=len(num_features), cat_input_emb_size=cat_input_emb_size, hidden_size=256).to(device)

In [None]:
total_epochs = 2000
criterion = nn.MSELoss()
lr = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Put the model in training mode explicitly: if this cell is re-run after the
# evaluation cell (which calls model.eval()), dropout would otherwise stay off.
model.train()

# Number of samples the loader iterates over; used to turn the summed
# per-sample loss into a per-sample average.
n_train = len(dataloader.dataset)

t_epochs = tqdm.notebook.tqdm(range(total_epochs), unit="epoch")
for epoch in t_epochs:
    t_epochs.set_description(f"Epoch {epoch}")

    total_loss = 0.0
    for X_num, X_cat, Y in dataloader:
        X_num = X_num.to(device)
        X_cat = X_cat.to(device)
        Y = Y.to(device)

        # Forward pass
        Y_ = model(X_num, X_cat)
        loss = criterion(Y_, Y)

        # criterion returns the batch mean; weight it by the batch size so the
        # smaller final batch is not over-represented in the epoch average.
        total_loss += loss.item() * X_num.size(0)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Computed once per epoch, after every batch has been accumulated.
    avg_loss = total_loss / n_train
    t_epochs.set_postfix(loss=avg_loss)

In [None]:
# Evaluation mode: dropout layers become no-ops.
model.eval()

test_dataset = TabularDataset(X_num=test[num_features], X_cat=test[cat_features], Y=np.log(test[output]))

# TabularDataset standardizes with the statistics of whatever frame it is
# given, which would put the test features on a different scale from the one
# the model was trained on. Overwrite them with values standardized by the
# *training* split's mean and standard deviation.
train_mean = training[num_features].mean()
train_std = training[num_features].std().replace(0, 1.0)  # avoid dividing by zero on a constant column
test_dataset.x_num = ((test[num_features] - train_mean) / train_std).astype(np.float32).values

# No shuffling or worker process is needed to compute an average over the
# whole split.
test_dataloader = DataLoader(test_dataset, batchsize, shuffle=False, num_workers=0)

total_loss = 0.0
# Gradients are not needed for evaluation; no_grad skips building the
# autograd graph for every batch.
with torch.no_grad():
    for X_num, X_cat, Y in test_dataloader:
        X_num = X_num.to(device)
        X_cat = X_cat.to(device)
        Y = Y.to(device)
        Y_ = model(X_num, X_cat)
        loss = criterion(Y_, Y)

        # Weight the batch-mean loss by the batch size (see final short batch).
        total_loss += loss.item() * X_num.size(0)
avg_loss = total_loss / len(test_dataset)
print(f"Avg. Loss = {avg_loss:e}")