In [57]:
import numpy as np
import pandas as pd
import torch
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tqdm import tqdm

In [58]:
df_train = pd.read_csv('../../Datasets/Boston/train.csv')
df_test = pd.read_csv('../../Datasets/Boston/test.csv')

In [59]:
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [60]:
df_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [61]:
numerical_features = ["MSSubClass", "LotFrontage", "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
                      "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
                      "LowQualFinSF",
                      "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
                      "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars", "GarageArea",
                      "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
                      "MoSold", "YrSold"]
categorical_features = ["MSZoning", "Neighborhood", "Condition1", "Condition2", "HouseStyle", "RoofStyle", "RoofMatl",
                        "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "BsmtFinType1", "BsmtFinType2",
                        "Heating", "CentralAir", "Functional", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]
target_features = ["SalePrice"]
ordinal_feature_mappings = {
    "Street": {"Grvl": 0, "Pave": 1},
    "Alley": {np.nan: 0, "Grvl": 1, "Pave": 2},
    "LotShape": {"Reg": 3, "IR1": 2, "IR2": 1, "IR3": 0},
    "LandContour": {"Lvl": 3, "Bnk": 2, "HLS": 1, "Low": 0},
    "Utilities": {"AllPub": 3, "NoSewr": 2, "NoSeWa": 1, "ELO": 0},
    "LotConfig": {"Inside": 0, "Corner": 1, "CulDSac": 2, "FR2": 3, "FR3": 4},
    "LandSlope": {"Gtl": 2, "Mod": 1, "Sev": 0},
    "BldgType": {"1Fam": 4, "2FmCon": 3, "Duplx": 2, "TwnhsE": 1, "TwnhsI": 0},
    "ExterQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "ExterCond": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "BsmtQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "BsmtCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, np.nan: 0},
    "HeatingQC": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "Electrical": {"SBrkr": 4, "FuseA": 3, "FuseF": 2, "FuseP": 1, "Mix": 0},
    "KitchenQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
    "FireplaceQu": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "GarageFinish": {"Fin": 3, "RFn": 2, "Unf": 1, np.nan: 0},
    "GarageQual": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "GarageCond": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "PavedDrive": {"Y": 2, "P": 1, "N": 0},
    "PoolQC": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0},
    "Fence": {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1, np.nan: 0}
}


In [62]:
all_features = set()
from itertools import chain
for col in chain(numerical_features, categorical_features, ordinal_feature_mappings.keys()):
    all_features.add(col)
for f in all_features:
    if f not in df_train.columns:
        print(f,0)
for col in df_train.columns:
    if col not in all_features:
        print(col,1)

Id 1
SalePrice 1


In [63]:
for col, mapping in ordinal_feature_mappings.items():
    df_train[col] = df_train[col].map(mapping)
    df_test[col] = df_test[col].map(mapping)

In [64]:
for col in df_train.select_dtypes(include=["int64", "float64"]).columns:
    df_train[col] = df_train[col].fillna(df_train[col].mean()).astype(df_train[col].dtype)
    df_test[col] = df_test[col].fillna(df_test[col].mean()).astype(df_test[col].dtype)

for col in df_train.select_dtypes(include="object"):
    df_train[col] = df_train[col].fillna("Unknown")
    df_test[col] = df_test[col].fillna("Unknown")


In [65]:
df_train.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64

In [66]:
X_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features + list(ordinal_feature_mappings.keys())),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)
Y_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), target_features),
    ]
)
X_train_processed = X_preprocessor.fit_transform(df_train)
Y_train_processed = Y_preprocessor.fit_transform(df_train)
X_test_processed = X_preprocessor.transform(df_test)
Y_test_processed = Y_preprocessor.transform(df_test)

In [67]:
X_train = torch.tensor(X_train_processed, dtype=torch.float32)
Y_train = torch.tensor(Y_train_processed, dtype=torch.float32)
X_test = torch.tensor(X_test_processed, dtype=torch.float32)
Y_test = torch.tensor(Y_test_processed, dtype=torch.float32)

In [68]:
X_train

tensor([[ 0.0734, -0.2294, -0.2071,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8726,  0.4519, -0.0919,  ...,  0.0000,  1.0000,  0.0000],
        [ 0.0734, -0.0931,  0.0735,  ...,  0.0000,  1.0000,  0.0000],
        ...,
        [ 0.3099, -0.1840, -0.1478,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8726, -0.0931, -0.0802,  ...,  0.0000,  1.0000,  0.0000],
        [-0.8726,  0.2248, -0.0581,  ...,  0.0000,  1.0000,  0.0000]])

In [69]:
Y_train

tensor([[ 0.3473],
        [ 0.0073],
        [ 0.5362],
        ...,
        [ 1.0776],
        [-0.4885],
        [-0.4208]])

In [70]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_train, Y_train)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [71]:
import torch.nn as nn

class BostonNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)


In [72]:
model = BostonNN(X_train.shape[1])

In [73]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BostonNN(
  (net): Sequential(
    (0): Linear(in_features=225, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [74]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [75]:
for epoch in tqdm(range(100)):
    loss: nn.MSELoss
    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        predictions = model(X_batch)
        loss = loss_fn(predictions, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")


  3%|▎         | 3/100 [00:00<00:11,  8.23it/s]

Epoch 1 | Loss: 0.1128
Epoch 2 | Loss: 0.0971
Epoch 3 | Loss: 0.1356
Epoch 4 | Loss: 0.4747


  7%|▋         | 7/100 [00:00<00:07, 13.26it/s]

Epoch 5 | Loss: 0.6745
Epoch 6 | Loss: 0.3703
Epoch 7 | Loss: 0.0602
Epoch 8 | Loss: 0.0697


 12%|█▏        | 12/100 [00:00<00:05, 16.97it/s]

Epoch 9 | Loss: 0.0456
Epoch 10 | Loss: 0.0900
Epoch 11 | Loss: 0.2848
Epoch 12 | Loss: 0.1463


 16%|█▌        | 16/100 [00:01<00:04, 17.61it/s]

Epoch 13 | Loss: 0.0433
Epoch 14 | Loss: 0.0603
Epoch 15 | Loss: 0.0808
Epoch 16 | Loss: 0.0971


 20%|██        | 20/100 [00:01<00:04, 17.91it/s]

Epoch 17 | Loss: 0.0950
Epoch 18 | Loss: 0.0657
Epoch 19 | Loss: 0.0967
Epoch 20 | Loss: 0.0362


 24%|██▍       | 24/100 [00:01<00:04, 18.09it/s]

Epoch 21 | Loss: 0.0645
Epoch 22 | Loss: 0.0591
Epoch 23 | Loss: 0.0735
Epoch 24 | Loss: 0.0272


 28%|██▊       | 28/100 [00:01<00:03, 18.19it/s]

Epoch 25 | Loss: 0.0521
Epoch 26 | Loss: 0.0244
Epoch 27 | Loss: 0.0519
Epoch 28 | Loss: 0.0602


 32%|███▏      | 32/100 [00:01<00:03, 17.93it/s]

Epoch 29 | Loss: 0.0494
Epoch 30 | Loss: 0.0392
Epoch 31 | Loss: 0.0409
Epoch 32 | Loss: 0.0385


 36%|███▌      | 36/100 [00:02<00:03, 18.09it/s]

Epoch 33 | Loss: 0.0445
Epoch 34 | Loss: 0.0660
Epoch 35 | Loss: 0.0432
Epoch 36 | Loss: 0.0576


 40%|████      | 40/100 [00:02<00:03, 17.94it/s]

Epoch 37 | Loss: 0.0476
Epoch 38 | Loss: 0.0313
Epoch 39 | Loss: 0.0190
Epoch 40 | Loss: 0.0202


 42%|████▏     | 42/100 [00:02<00:03, 18.03it/s]

Epoch 41 | Loss: 0.0389
Epoch 42 | Loss: 0.0392
Epoch 43 | Loss: 0.0160
Epoch 44 | Loss: 0.0284


 47%|████▋     | 47/100 [00:02<00:02, 18.81it/s]

Epoch 45 | Loss: 0.2345
Epoch 46 | Loss: 0.0323
Epoch 47 | Loss: 0.0289
Epoch 48 | Loss: 0.0301
Epoch 49 | Loss: 0.0219


 53%|█████▎    | 53/100 [00:03<00:02, 19.00it/s]

Epoch 50 | Loss: 0.0356
Epoch 51 | Loss: 0.0300
Epoch 52 | Loss: 0.0289
Epoch 53 | Loss: 0.0478


 55%|█████▌    | 55/100 [00:03<00:02, 18.16it/s]

Epoch 54 | Loss: 0.0633
Epoch 55 | Loss: 0.0267
Epoch 56 | Loss: 0.0412


 59%|█████▉    | 59/100 [00:03<00:02, 15.77it/s]

Epoch 57 | Loss: 0.0515
Epoch 58 | Loss: 0.0508
Epoch 59 | Loss: 0.0499


 63%|██████▎   | 63/100 [00:03<00:02, 16.45it/s]

Epoch 60 | Loss: 0.0349
Epoch 61 | Loss: 0.1756
Epoch 62 | Loss: 0.0293
Epoch 63 | Loss: 0.0340


 65%|██████▌   | 65/100 [00:03<00:02, 16.19it/s]

Epoch 64 | Loss: 0.0304
Epoch 65 | Loss: 0.0444
Epoch 66 | Loss: 0.1414
Epoch 67 | Loss: 0.0189


 71%|███████   | 71/100 [00:04<00:01, 16.58it/s]

Epoch 68 | Loss: 0.0130
Epoch 69 | Loss: 0.0240
Epoch 70 | Loss: 0.0247
Epoch 71 | Loss: 0.0244


 73%|███████▎  | 73/100 [00:04<00:01, 15.87it/s]

Epoch 72 | Loss: 0.0274
Epoch 73 | Loss: 0.0473
Epoch 74 | Loss: 0.0448
Epoch 75 | Loss: 0.0266


 77%|███████▋  | 77/100 [00:04<00:01, 16.01it/s]

Epoch 76 | Loss: 0.0228
Epoch 77 | Loss: 0.0343
Epoch 78 | Loss: 0.0264
Epoch 79 | Loss: 0.0259


 83%|████████▎ | 83/100 [00:05<00:01, 16.53it/s]

Epoch 80 | Loss: 0.0336
Epoch 81 | Loss: 0.0218
Epoch 82 | Loss: 0.0285
Epoch 83 | Loss: 0.0386


 85%|████████▌ | 85/100 [00:05<00:00, 16.35it/s]

Epoch 84 | Loss: 0.0273
Epoch 85 | Loss: 0.0287
Epoch 86 | Loss: 0.0414
Epoch 87 | Loss: 0.0361


 89%|████████▉ | 89/100 [00:05<00:00, 15.42it/s]

Epoch 88 | Loss: 0.0349
Epoch 89 | Loss: 0.0992
Epoch 90 | Loss: 0.0329


 93%|█████████▎| 93/100 [00:05<00:00, 15.60it/s]

Epoch 91 | Loss: 0.0270
Epoch 92 | Loss: 0.0193
Epoch 93 | Loss: 0.0200
Epoch 94 | Loss: 0.0170


 97%|█████████▋| 97/100 [00:05<00:00, 15.50it/s]

Epoch 95 | Loss: 0.0218
Epoch 96 | Loss: 0.0107
Epoch 97 | Loss: 0.0483
Epoch 98 | Loss: 0.0223


100%|██████████| 100/100 [00:06<00:00, 16.33it/s]

Epoch 99 | Loss: 0.0240
Epoch 100 | Loss: 0.0399





In [76]:

torch.save(model.state_dict(), "BostonModel.pth")

