https://www.kaggle.com/competitions/home-data-for-ml-course/overview

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm

In [2]:
df_train = pd.read_csv('../Datasets/Ames/train.csv')
df_test = pd.read_csv('../Datasets/Ames/test.csv')

In [3]:
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [4]:
df_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [5]:
numerical_features = ["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
                      "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
                      "LowQualFinSF",
                      "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
                      "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea",
                      "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
                      "MoSold", "YrSold"]
categorical_features = ["MSZoning", "Neighborhood", "Condition1", "Condition2", "HouseStyle", "RoofStyle", "RoofMatl",
                        "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "BsmtFinType1", "BsmtFinType2",
                        "Heating", "CentralAir", "Functional", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]
target_features = ["SalePrice"]
ordinal_feature_mappings = {
    "Street": {"Grvl": 0, "Pave": 1},
    "Alley": {np.nan: 0, "Grvl": 1, "Pave": 2},
    "LotShape": {"Reg": 3, "IR1": 2, "IR2": 1, "IR3": 0},
    "LandContour": {"Lvl": 3, "Bnk": 2, "HLS": 1, "Low": 0},
    "Utilities": {"AllPub": 3, "NoSewr": 2, "NoSeWa": 1, "ELO": 0},
    "LotConfig": {"Inside": 0, "Corner": 1, "CulDSac": 2, "FR2": 3, "FR3": 4},
    "LandSlope": {"Gtl": 2, "Mod": 1, "Sev": 0},
    "BldgType": {"1Fam": 4, "2FmCon": 3, "Duplx": 2, "TwnhsE": 1, "TwnhsI": 0},
    "ExterQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "ExterCond": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "BsmtQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "BsmtCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, np.nan: 0},
    "HeatingQC": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "Electrical": {"SBrkr": 4, "FuseA": 3, "FuseF": 2, "FuseP": 1, "Mix": 0},
    "KitchenQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "FireplaceQu": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "GarageFinish": {"Fin": 3, "RFn": 2, "Unf": 1, np.nan: 0},
    "GarageQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "GarageCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "PavedDrive": {"Y": 2, "P": 1, "N": 0},
    "PoolQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "Fence": {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1, np.nan: 0}
}


In [6]:
all_features = set()
from itertools import chain
for col in chain(numerical_features, categorical_features, ordinal_feature_mappings.keys()):
    all_features.add(col)
for f in all_features:
    if f not in df_train.columns:
        print(f,0)
for col in df_train.columns:
    if col not in all_features:
        print(col,1)

Id 1
SalePrice 1


In [7]:
df_train = df_train.drop(columns=['Id'])
df_test = df_test.drop(columns=['Id'])
for col, mapping in ordinal_feature_mappings.items():
    df_train[col] = df_train[col].map(mapping)
    df_test[col] = df_test[col].map(mapping)

In [8]:
for col in df_train.select_dtypes(include=["int64", "float64"]).columns:
    print(col)
    df_train[col] = df_train[col].fillna(df_train[col].mean()).astype(df_train[col].dtype)
    if col != "SalePrice":
        df_test[col] = df_test[col].fillna(df_test[col].mean()).astype(df_test[col].dtype)

for col in df_train.select_dtypes(include="object"):
    df_train[col] = df_train[col].fillna("Unknown")
    df_test[col] = df_test[col].fillna("Unknown")


MSSubClass
LotFrontage
LotArea
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
BldgType
OverallQual
OverallCond
YearBuilt
YearRemodAdd
MasVnrArea
ExterQual
ExterCond
BsmtQual
BsmtCond
BsmtExposure
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
HeatingQC
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Fireplaces
FireplaceQu
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
PoolQC
Fence
MiscVal
MoSold
YrSold
SalePrice


In [9]:
df_train.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 80, dtype: int64

In [10]:
X_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features + list(ordinal_feature_mappings.keys())),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)
Y_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), target_features),
    ]
)
X_train_processed = X_preprocessor.fit_transform(df_train)
Y_train_processed = Y_preprocessor.fit_transform(df_train)
X_test_processed = X_preprocessor.transform(df_test)

In [11]:
X_train = torch.tensor(X_train_processed, dtype=torch.float32)
Y_train = torch.tensor(Y_train_processed, dtype=torch.float32)
X_test = torch.tensor(X_test_processed, dtype=torch.float32)

In [12]:
X_train

tensor([[ 0.0734, -0.2294, -0.2071,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8726,  0.4519, -0.0919,  ...,  0.0000,  1.0000,  0.0000],
        [ 0.0734, -0.0931,  0.0735,  ...,  0.0000,  1.0000,  0.0000],
        ...,
        [ 0.3099, -0.1840, -0.1478,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8726, -0.0931, -0.0802,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8726,  0.2248, -0.0581,  ...,  0.0000,  1.0000,  0.0000]])

In [13]:
Y_train

tensor([[ 0.5601],
        [ 0.2128],
        [ 0.7340],
        ...,
        [ 1.1747],
        [-0.3997],
        [-0.3067]])

In [14]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_train, Y_train)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [15]:
import torch.nn as nn

class AmesNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)


In [16]:
model = AmesNN(X_train.shape[1])

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

AmesNN(
  (net): Sequential(
    (0): Linear(in_features=225, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [18]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)


In [19]:
model.train()
for epoch in range(200):
    total_loss = 0
    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        predictions = model(X_batch)
        loss = loss_fn(predictions, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * X_batch.size(0)

    avg_loss = total_loss / len(dataloader.dataset)
    scheduler.step(avg_loss)

    print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f}")



Epoch 1 | Loss: 0.9943
Epoch 2 | Loss: 0.8853
Epoch 3 | Loss: 0.7680
Epoch 4 | Loss: 0.6320
Epoch 5 | Loss: 0.4865
Epoch 6 | Loss: 0.3497
Epoch 7 | Loss: 0.2641
Epoch 8 | Loss: 0.2209
Epoch 9 | Loss: 0.2143
Epoch 10 | Loss: 0.2020
Epoch 11 | Loss: 0.1899
Epoch 12 | Loss: 0.1897
Epoch 13 | Loss: 0.1833
Epoch 14 | Loss: 0.1765
Epoch 15 | Loss: 0.1822
Epoch 16 | Loss: 0.1726
Epoch 17 | Loss: 0.1670
Epoch 18 | Loss: 0.1616
Epoch 19 | Loss: 0.1606
Epoch 20 | Loss: 0.1488
Epoch 21 | Loss: 0.1450
Epoch 22 | Loss: 0.1558
Epoch 23 | Loss: 0.1487
Epoch 24 | Loss: 0.1345
Epoch 25 | Loss: 0.1392
Epoch 26 | Loss: 0.1452
Epoch 27 | Loss: 0.1427
Epoch 28 | Loss: 0.1316
Epoch 29 | Loss: 0.1207
Epoch 30 | Loss: 0.1271
Epoch 31 | Loss: 0.1209
Epoch 32 | Loss: 0.1256
Epoch 33 | Loss: 0.1326
Epoch 34 | Loss: 0.1241
Epoch 35 | Loss: 0.1274
Epoch 36 | Loss: 0.1211
Epoch 37 | Loss: 0.1264
Epoch 38 | Loss: 0.1156
Epoch 39 | Loss: 0.1148
Epoch 40 | Loss: 0.1217
Epoch 41 | Loss: 0.1206
Epoch 42 | Loss: 0.1180
E

In [20]:
model.eval()
X_test = X_test.to(device)
X_test
answer_tensor = model(X_test)
answer_tensor = answer_tensor.detach().cpu().numpy()
scaler = Y_preprocessor.named_transformers_['num']
answer = scaler.inverse_transform(answer_tensor)
answer

array([[118623.32],
       [143160.53],
       [176363.8 ],
       ...,
       [165054.52],
       [114329.5 ],
       [218148.8 ]], shape=(1459, 1), dtype=float32)

In [21]:
submission = pd.DataFrame(answer,columns=["SalePrice"])
submission.index = range(1461, 2920)
submission.index.name = "Id"
submission


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,118623.320312
1462,143160.531250
1463,176363.796875
1464,193593.500000
1465,170641.515625
...,...
2915,90765.554688
2916,89313.367188
2917,165054.515625
2918,114329.500000


In [22]:
submission.to_csv("AmesNN.csv")