In [115]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from config import DATA_FOLDER

In [116]:
# Read the training and test splits from the configured data folder.
train_df, test_df = (
    pd.read_csv(DATA_FOLDER / csv_name) for csv_name in ("train.csv", "test.csv")
)

In [117]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

(1460, 81)

In [118]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [119]:
# Per-column dtype and non-null count for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [120]:
# Dimensions of the test frame as (rows, columns).
test_df.shape

(1459, 80)

In [121]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [122]:
# Per-column dtype and non-null count for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In the above we've loaded our datasets, and taken a first look at them.

We can see that both datasets are relatively small, with fewer than 1,500 rows each.

Length-wise they're not much, but column-wise we have a lot of information: 79 feature columns (81 columns in train and 80 in test, counting Id and, for train, SalePrice). These columns are a mix of categorical, integer and float types. Id is our identifier column. SalePrice is our target variable to predict.

We'll need to do some preprocessing before we can train our model. Some columns contain null values, and we'll have to encode many of the categorical features.

In [123]:
# Remove the Id column: it identifies rows and is not used as a feature.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent, so re-running it does not raise a KeyError.
train_df = train_df.drop(columns="Id", errors="ignore")
test_df = test_df.drop(columns="Id", errors="ignore")

In [124]:
# Stack train and test so object-typed columns are detected across both splits.
combined_df = pd.concat([train_df, test_df])

# Names of every object-dtype column; these are the ones that get encoded.
object_cols = combined_df.select_dtypes(include="object").columns

# Ordinal encoder: categories not seen during fit are mapped to -1.
encoder = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")

# Learn the category -> number mapping from the training split only, then
# apply that same mapping to both splits.
train_df[object_cols] = encoder.fit_transform(train_df[object_cols])
test_df[object_cols] = encoder.transform(test_df[object_cols])

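Looking at our data description, we can see that the missing cells are missing on purpose: a missing value means that the specific house feature is not available. As this is the case, we treat "not available" as its own value and represent it as -1. Note that the encoder above only maps categories it has not seen during fitting to -1; missing values in train are left as NaN by the encoder (see the non-null counts below), so we fill them with -1 in a separate step further down.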

In [125]:
# Re-check dtypes and non-null counts now that the object columns are encoded.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   float64
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   float64
 5   Alley          91 non-null     float64
 6   LotShape       1460 non-null   float64
 7   LandContour    1460 non-null   float64
 8   Utilities      1460 non-null   float64
 9   LotConfig      1460 non-null   float64
 10  LandSlope      1460 non-null   float64
 11  Neighborhood   1460 non-null   float64
 12  Condition1     1460 non-null   float64
 13  Condition2     1460 non-null   float64
 14  BldgType       1460 non-null   float64
 15  HouseStyle     1460 non-null   float64
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [126]:
# Names of the training columns that contain at least one NaN.
missing_cols = [
    col_name
    for col_name, contains_nan in train_df.isna().any().items()
    if contains_nan
]

In [127]:
# Represent "feature not present" as -1 in every training column that has gaps.
train_df[missing_cols] = train_df[missing_cols].fillna(-1)
# missing_cols is derived from the training split only, so restricting the
# test fill to it would leave NaNs in any column that is complete in train but
# has gaps in test. Those NaNs would flow into the model inputs, so fill the
# whole test frame instead.
test_df = test_df.fillna(-1)

We have now cleaned our dataset quite a bit. Next is splitting it into features and target. 

In [128]:
# Features are every training column except the target, SalePrice.
X_train = train_df.drop(columns=["SalePrice"])
y_train = train_df["SalePrice"]
# The test frame is used as the test feature matrix as-is.
X_test = test_df

In [129]:
# Log-transform the target with log1p, i.e. log(1 + SalePrice).
# Reading straight from train_df rather than from y_train keeps this cell
# idempotent: re-running it cannot apply the log a second time.
y_train = np.log1p(train_df["SalePrice"])

In [130]:
# Convert both feature frames to plain NumPy arrays.
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
# Reshape the target into a single column: shape (n_samples, 1).
y_train = y_train.to_numpy().reshape(-1, 1)

In [131]:
# Split train into train and val
# Holds out 15% of the rows for validation; random_state=42 fixes the shuffle.
# NOTE(review): X_train / y_train are overwritten with the smaller training
# subset, so re-running this cell alone splits the already-split data again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

We now have all of our variables, and we'll be ready to train our model.

In [132]:
import torch.nn as nn
import torch
import torch.optim as optim

In [133]:
class MyModel(nn.Module):
    """Fully connected regressor: input_size -> 256 -> 128 -> 64 -> 1.

    ReLU follows each of the first three linear layers; the output of the
    last layer is returned without an activation.
    """

    def __init__(self, input_size):
        """Create the four linear layers for inputs with ``input_size`` features."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=64)
        self.fc4 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Run ``x`` through the network and return one value per input row."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = layer(hidden).relu()
        return self.fc4(hidden)

Above is a simple deep regression model. It consists of four fully connected layers, with ReLU activations after the first three; the final layer outputs a single value with no activation.

In [134]:
# Seed PyTorch before building the model so the random weight initialisation
# is the same on every Restart & Run All.
torch.manual_seed(42)

# Network sized to the number of feature columns in X_train.
model = MyModel(X_train.shape[1])

# Mean squared error loss, optimised with Adam at a learning rate of 0.001.
loss_fun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [135]:
EPOCHS = 100  # number of full passes over the training data
BATCH_SIZE = 12  # rows per optimisation step

In [136]:
def rmse(y_pred, y_true):
    """Root-mean-squared error between two tensors, computed with NumPy.

    Both tensors are detached from the autograd graph and converted to NumPy
    arrays; the mean is taken over all elements.
    """
    residual = y_pred.detach().numpy() - y_true.detach().numpy()
    return np.sqrt(np.mean(np.square(residual)))

In [137]:
# Build the validation tensors once; they are the same for every epoch.
X_val_t = torch.tensor(X_val, dtype=torch.float32)
y_val_t = torch.tensor(y_val, dtype=torch.float32)

# Ceiling division so a final, shorter batch is still processed.
n_batches = (len(X_train) + BATCH_SIZE - 1) // BATCH_SIZE

for epoch in range(EPOCHS):
    # model.eval() at the end of the previous epoch is sticky, so switch back
    # to training mode explicitly before each training pass.
    model.train()
    for i in range(n_batches):
        start_i = i * BATCH_SIZE
        end_i = start_i + BATCH_SIZE

        x_batch = torch.tensor(X_train[start_i:end_i], dtype=torch.float32)
        y_batch = torch.tensor(y_train[start_i:end_i], dtype=torch.float32)

        y_pred = model(x_batch)
        loss = loss_fun(y_pred, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Report the batch loss on every 10th batch.
        if i % 10 == 0:
            print(f"Epoch: {epoch}, loss: {loss}")

    # Validation pass: no gradients are needed, so skip building the autograd
    # graph for the forward pass.
    model.eval()
    with torch.no_grad():
        y_pred = model(X_val_t)
        val_loss = loss_fun(y_pred, y_val_t)
        val_rmse = rmse(y_pred, y_val_t)
    print(f"Epoch: {epoch}, val_loss: {val_loss}, val_rmse: {val_rmse}")

Epoch: 0, loss: 9227.5146484375
Epoch: 0, loss: 410.47265625
Epoch: 0, loss: 46.54493713378906
Epoch: 0, loss: 38.1705322265625
Epoch: 0, loss: 110.77234649658203
Epoch: 0, loss: 1146.1058349609375
Epoch: 0, loss: 221.2699737548828
Epoch: 0, loss: 544.9750366210938
Epoch: 0, loss: 1150.7191162109375
Epoch: 0, loss: 153.9175262451172
Epoch: 0, loss: 9.257464408874512
Epoch: 0, val_loss: 95.49334716796875, val_rmse: 9.772068977355957
Epoch: 1, loss: 93.99178314208984
Epoch: 1, loss: 19.043127059936523
Epoch: 1, loss: 2.182343006134033
Epoch: 1, loss: 1.890360951423645
Epoch: 1, loss: 72.40323638916016
Epoch: 1, loss: 14.539566040039062
Epoch: 1, loss: 6.200313091278076
Epoch: 1, loss: 223.46372985839844
Epoch: 1, loss: 417.2265930175781
Epoch: 1, loss: 14.707416534423828
Epoch: 1, loss: 5.459478378295898
Epoch: 1, val_loss: 41.173004150390625, val_rmse: 6.416619777679443
Epoch: 2, loss: 36.82200622558594
Epoch: 2, loss: 18.942731857299805
Epoch: 2, loss: 3.3559582233428955
Epoch: 2, loss