In [407]:
import pandas as pd
import numpy as np

import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
import torch.nn as nn
import torch


In [408]:
df = pd.read_csv('train_data.csv')
df.head()


Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0


In [409]:
print("Shape:", df.shape)
print(df.dtypes)
print()

for column in df.columns:
    # print(df[column].dtype)
    if df[column].dtype not in ("int64", "float64"):
        print(column)
        print(df[column].value_counts())
        print()


Shape: (4124, 17)
SalePrice                      int64
YearBuilt                      int64
Size(sqf)                      int64
Floor                          int64
HallwayType                   object
HeatingType                   object
AptManageType                 object
N_Parkinglot(Ground)         float64
N_Parkinglot(Basement)       float64
TimeToBusStop                 object
TimeToSubway                  object
N_manager                    float64
N_elevators                  float64
SubwayStation                 object
N_FacilitiesInApt              int64
N_FacilitiesNearBy(Total)    float64
N_SchoolNearBy(Total)        float64
dtype: object

HallwayType
terraced    2485
mixed       1194
corridor     445
Name: HallwayType, dtype: int64

HeatingType
individual_heating    3938
central_heating        186
Name: HeatingType, dtype: int64

AptManageType
management_in_trust    3869
self_management         255
Name: AptManageType, dtype: int64

TimeToBusStop
0~5min         3148
5min

In [410]:
def fix_df(df: pd.DataFrame) -> pd.DataFrame:

    # Break time columns to min and max values
    df["TimeToBusStop"] = df["TimeToBusStop"].str.replace("min", "")
    df["TimeToBusStop"] = df["TimeToBusStop"].str.replace("-", "~")
    df["TimeToBusStop"] = df["TimeToBusStop"].str.replace("no_bus_stop_nearby", "100~100")
    df[["TimeToBusStopMin", "TimeToBusStopMax"]] = df["TimeToBusStop"].str.split('~', 1, expand=True)
    df["TimeToBusStopMin"] = df["TimeToBusStopMin"].astype(int)
    df["TimeToBusStopMax"] = df["TimeToBusStopMax"].astype(int)

    df["TimeToSubway"] = df["TimeToSubway"].str.replace("min", "")
    df["TimeToSubway"] = df["TimeToSubway"].str.replace("-", "~")
    df["TimeToSubway"] = df["TimeToSubway"].str.replace("no_bus_stop_nearby", "100~100")
    df[["TimeToSubwayMin", "TimeToSubwayMax"]] = df["TimeToSubway"].str.split('~', 1, expand=True)
    df["TimeToSubwayMin"] = df["TimeToSubwayMin"].astype(int)
    df["TimeToSubwayMax"] = df["TimeToSubwayMax"].astype(int)

    df.drop(columns=["TimeToBusStop", "TimeToSubway"], inplace=True)

    # One hot encoding for a string columns with a small number of values
    columns_to_onehot = ["HallwayType", "HeatingType", "AptManageType", "SubwayStation"]
    for column in columns_to_onehot:
        y = pd.get_dummies(df[column], prefix=column)
        df[y.columns.to_list()] = y
    df.drop(columns=columns_to_onehot, inplace=True)

    # Convert SalePrice if exists
    if "SalePrice" in df.columns:
        df.loc[df["SalePrice"] <= 100_000, "SalePrice"] = 1
        df.loc[df["SalePrice"] > 100_000, "SalePrice"] = 2
        df.loc[df["SalePrice"] > 350_000, "SalePrice"] = 3

    return df


In [411]:
df = fix_df(df)
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),...,AptManageType_management_in_trust,AptManageType_self_management,SubwayStation_Bangoge,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby
0,2,2006,814,3,111.0,184.0,3.0,0.0,5,6.0,...,1,0,0,0,0,0,1,0,0,0
1,1,1985,587,8,80.0,76.0,2.0,2.0,3,12.0,...,0,1,0,0,0,1,0,0,0,0
2,1,1985,587,6,80.0,76.0,2.0,2.0,3,12.0,...,0,1,0,0,0,1,0,0,0,0
3,2,2006,2056,8,249.0,536.0,5.0,11.0,5,3.0,...,1,0,0,0,0,0,0,0,1,0
4,1,1992,644,2,142.0,79.0,4.0,8.0,3,9.0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,2,2007,1928,24,0.0,1270.0,14.0,16.0,10,9.0,...,1,0,0,0,0,0,1,0,0,0
4120,2,2015,644,22,102.0,400.0,5.0,10.0,7,7.0,...,1,0,0,0,0,1,0,0,0,0
4121,2,2007,868,20,0.0,1270.0,14.0,16.0,10,9.0,...,1,0,0,0,0,0,1,0,0,0
4122,2,1978,1327,1,87.0,0.0,1.0,4.0,3,7.0,...,0,1,0,0,0,0,1,0,0,0


In [412]:
df.dtypes

SalePrice                                int64
YearBuilt                                int64
Size(sqf)                                int64
Floor                                    int64
N_Parkinglot(Ground)                   float64
N_Parkinglot(Basement)                 float64
N_manager                              float64
N_elevators                            float64
N_FacilitiesInApt                        int64
N_FacilitiesNearBy(Total)              float64
N_SchoolNearBy(Total)                  float64
TimeToBusStopMin                         int32
TimeToBusStopMax                         int32
TimeToSubwayMin                          int32
TimeToSubwayMax                          int32
HallwayType_corridor                     uint8
HallwayType_mixed                        uint8
HallwayType_terraced                     uint8
HeatingType_central_heating              uint8
HeatingType_individual_heating           uint8
AptManageType_management_in_trust        uint8
AptManageType

In [413]:
class SimpleClassifier(nn.Module):

    def __init__(
        self,
        num_inputs: int,
        num_hidden: int,
        num_outputs: int
        ) -> None:

        super().__init__()
        self.linear1 = nn.Linear(num_inputs, 2*num_hidden)
        self.act_fn = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.linear2 = nn.Linear(2*num_hidden, 3*num_hidden)
        self.linear3 = nn.Linear(3*num_hidden, num_hidden)
        self.linear4 = nn.Linear(num_hidden, num_outputs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear1(x)
        x = self.act_fn(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.act_fn(x)
        x = self.dropout(x)
        x = self.linear3(x)
        x = self.act_fn(x)
        x = self.dropout(x)
        x = self.linear4(x)
        return x


In [414]:
model = SimpleClassifier(len(df.columns)-1, 100, 3)
crit = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.002)


In [415]:
train = df.sample(frac=1, random_state=200)
test = df.drop(train.index)

train_dataset = data.TensorDataset(torch.from_numpy(train.values[:, 1:]), torch.from_numpy(train.values[:, 0]))
test_dataset = data.TensorDataset(torch.from_numpy(train.values[:, 1:]), torch.from_numpy(train.values[:, 0]))

train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)


In [416]:
for epoch in range(150):

    sum_loss = 0.0
    for data in train_data_loader:
        inputs, labels = data[0], data[1]
        print("IN", inputs.shape)
        print("LS", labels.shape)

        optimizer.zero_grad()

        preds = model(inputs.float())
        preds = preds.squeeze(dim=1)

        loss = crit(preds, labels.float())
        loss.backward()
        optimizer.step()

        sum_loss += loss
    
    print('Epoch:' , epoch + 1, "-", loss)


IN torch.Size([128, 29])
LS torch.Size([128])


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (3) must match the size of tensor b (128) at non-singleton dimension 1