In [19]:
import pandas as pd

import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
import torch.nn as nn
import torch

from sklearn.preprocessing import StandardScaler


In [20]:
torch.manual_seed = 42
# When running on the CuDNN backend, two further options must be set
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Preparing data

In [21]:
df = pd.read_csv('train_data.csv')
df.head()


Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0


In [22]:
print("Shape:", df.shape)
print(df.dtypes)
print()

for column in df.columns:
    if df[column].dtype not in ("int64", "float64"):
        print(column)
        print(df[column].value_counts())
        print()


Shape: (4124, 17)
SalePrice                      int64
YearBuilt                      int64
Size(sqf)                      int64
Floor                          int64
HallwayType                   object
HeatingType                   object
AptManageType                 object
N_Parkinglot(Ground)         float64
N_Parkinglot(Basement)       float64
TimeToBusStop                 object
TimeToSubway                  object
N_manager                    float64
N_elevators                  float64
SubwayStation                 object
N_FacilitiesInApt              int64
N_FacilitiesNearBy(Total)    float64
N_SchoolNearBy(Total)        float64
dtype: object

HallwayType
terraced    2485
mixed       1194
corridor     445
Name: HallwayType, dtype: int64

HeatingType
individual_heating    3938
central_heating        186
Name: HeatingType, dtype: int64

AptManageType
management_in_trust    3869
self_management         255
Name: AptManageType, dtype: int64

TimeToBusStop
0~5min         3148
5min

In [23]:
def fix_df(df: pd.DataFrame) -> pd.DataFrame:

    # Break time columns to min and max values
    df["TimeToBusStop"] = df["TimeToBusStop"].str.replace("min", "")
    df["TimeToBusStop"] = df["TimeToBusStop"].str.replace("-", "~")
    df["TimeToBusStop"] = df["TimeToBusStop"].str.replace("no_bus_stop_nearby", "100~100")
    df[["TimeToBusStopMin", "TimeToBusStopMax"]] = df["TimeToBusStop"].str.split('~', 1, expand=True)
    df["TimeToBusStopMin"] = df["TimeToBusStopMin"].astype(int)
    df["TimeToBusStopMax"] = df["TimeToBusStopMax"].astype(int)

    df["TimeToSubway"] = df["TimeToSubway"].str.replace("min", "")
    df["TimeToSubway"] = df["TimeToSubway"].str.replace("-", "~")
    df["TimeToSubway"] = df["TimeToSubway"].str.replace("no_bus_stop_nearby", "100~100")
    df[["TimeToSubwayMin", "TimeToSubwayMax"]] = df["TimeToSubway"].str.split('~', 1, expand=True)
    df["TimeToSubwayMin"] = df["TimeToSubwayMin"].astype(int)
    df["TimeToSubwayMax"] = df["TimeToSubwayMax"].astype(int)

    df.drop(columns=["TimeToBusStop", "TimeToSubway"], inplace=True)

    # One hot encoding for a string columns with a small number of values
    columns_to_onehot = ["HallwayType", "HeatingType", "AptManageType", "SubwayStation"]
    for column in columns_to_onehot:
        y = pd.get_dummies(df[column], prefix=column)
        df[y.columns.to_list()] = y
    df.drop(columns=columns_to_onehot, inplace=True)

    # Convert SalePrice if exists
    if "SalePrice" in df.columns:
        bins = (0, 100_000, 350_000, float("inf"))
        labels = (0, 1, 2)
        df["SalePrice"] = pd.cut(x=df["SalePrice"], bins=bins, labels=labels)

    return df


In [24]:
df = fix_df(df)
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),...,AptManageType_management_in_trust,AptManageType_self_management,SubwayStation_Bangoge,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby
0,1,2006,814,3,111.0,184.0,3.0,0.0,5,6.0,...,1,0,0,0,0,0,1,0,0,0
1,0,1985,587,8,80.0,76.0,2.0,2.0,3,12.0,...,0,1,0,0,0,1,0,0,0,0
2,0,1985,587,6,80.0,76.0,2.0,2.0,3,12.0,...,0,1,0,0,0,1,0,0,0,0
3,2,2006,2056,8,249.0,536.0,5.0,11.0,5,3.0,...,1,0,0,0,0,0,0,0,1,0
4,0,1992,644,2,142.0,79.0,4.0,8.0,3,9.0,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,2,2007,1928,24,0.0,1270.0,14.0,16.0,10,9.0,...,1,0,0,0,0,0,1,0,0,0
4120,1,2015,644,22,102.0,400.0,5.0,10.0,7,7.0,...,1,0,0,0,0,1,0,0,0,0
4121,2,2007,868,20,0.0,1270.0,14.0,16.0,10,9.0,...,1,0,0,0,0,0,1,0,0,0
4122,1,1978,1327,1,87.0,0.0,1.0,4.0,3,7.0,...,0,1,0,0,0,0,1,0,0,0


In [25]:
df.dtypes


SalePrice                              category
YearBuilt                                 int64
Size(sqf)                                 int64
Floor                                     int64
N_Parkinglot(Ground)                    float64
N_Parkinglot(Basement)                  float64
N_manager                               float64
N_elevators                             float64
N_FacilitiesInApt                         int64
N_FacilitiesNearBy(Total)               float64
N_SchoolNearBy(Total)                   float64
TimeToBusStopMin                          int32
TimeToBusStopMax                          int32
TimeToSubwayMin                           int32
TimeToSubwayMax                           int32
HallwayType_corridor                      uint8
HallwayType_mixed                         uint8
HallwayType_terraced                      uint8
HeatingType_central_heating               uint8
HeatingType_individual_heating            uint8
AptManageType_management_in_trust       

# Build Classifier

In [26]:
class SimpleClassifier(nn.Module):

    def __init__(
        self,
        num_inputs: int,
        num_hidden: int,
        num_outputs: int,
        dropout: int = 0.2
        ) -> None:

        super().__init__()
        self.linear1 = nn.Linear(num_inputs, 2*num_hidden)
        self.act_fn1 = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(2*num_hidden, 3*num_hidden)
        self.linear3 = nn.Linear(3*num_hidden, num_hidden)
        self.linear4 = nn.Linear(num_hidden, num_outputs)
        self.bn1 = nn.BatchNorm1d(2*num_hidden)
        self.bn2 = nn.BatchNorm1d(3*num_hidden)
        self.bn3 = nn.BatchNorm1d(num_hidden)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear1(x)
        x = self.act_fn1(x)
        x = self.bn1(x)

        x = self.dropout(x)
        x = self.linear2(x)
        x = self.act_fn1(x)
        x = self.bn2(x)

        x = self.dropout(x)
        x = self.linear3(x)
        x = self.act_fn1(x)
        x = self.bn3(x)

        x = self.dropout(x)
        x = self.linear4(x)

        return x


In [27]:
model = SimpleClassifier(len(df.columns)-1, 700, 3, dropout=0.5)

crit = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


In [28]:

scaler = StandardScaler()
df[df.columns[1:]] = scaler.fit_transform(df[df.columns[1:]])

train = df.sample(frac=0.7, random_state=200)
test = df.drop(train.index)

train_dataset = data.TensorDataset(torch.from_numpy(train.values[:, 1:]), torch.from_numpy(train.values[:, 0]))
test_dataset = data.TensorDataset(torch.from_numpy(test.values[:, 1:]), torch.from_numpy(test.values[:, 0]))

train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)


# Train model

In [29]:
model.train()

for epoch in range(350):

    sum_loss = 0.0
    for inputs, labels in train_data_loader:
        labels = labels.type(torch.LongTensor)

        preds = model(inputs.float())
        preds = preds.squeeze()

        loss = crit(preds, labels)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        sum_loss += loss

    print('Epoch:' , epoch + 1, "-", loss)


Epoch: 1 - tensor(0.8892, grad_fn=<NllLossBackward0>)
Epoch: 2 - tensor(0.7230, grad_fn=<NllLossBackward0>)
Epoch: 3 - tensor(0.7600, grad_fn=<NllLossBackward0>)
Epoch: 4 - tensor(0.7804, grad_fn=<NllLossBackward0>)
Epoch: 5 - tensor(0.8182, grad_fn=<NllLossBackward0>)
Epoch: 6 - tensor(0.5612, grad_fn=<NllLossBackward0>)
Epoch: 7 - tensor(0.4745, grad_fn=<NllLossBackward0>)
Epoch: 8 - tensor(0.5821, grad_fn=<NllLossBackward0>)
Epoch: 9 - tensor(0.5424, grad_fn=<NllLossBackward0>)
Epoch: 10 - tensor(0.5811, grad_fn=<NllLossBackward0>)
Epoch: 11 - tensor(0.6157, grad_fn=<NllLossBackward0>)
Epoch: 12 - tensor(0.6732, grad_fn=<NllLossBackward0>)
Epoch: 13 - tensor(0.3847, grad_fn=<NllLossBackward0>)
Epoch: 14 - tensor(0.4784, grad_fn=<NllLossBackward0>)
Epoch: 15 - tensor(0.4045, grad_fn=<NllLossBackward0>)
Epoch: 16 - tensor(0.6198, grad_fn=<NllLossBackward0>)
Epoch: 17 - tensor(0.4704, grad_fn=<NllLossBackward0>)
Epoch: 18 - tensor(0.4778, grad_fn=<NllLossBackward0>)
Epoch: 19 - tensor(

In [30]:
def get_accuracy(model: nn.Module, the_data: data.TensorDataset) -> float:
    correct = 0
    total = 0

    model.eval()

    for inputs, labels in data.DataLoader(the_data):
        output = model(inputs.float())
        pred = output.max(1, keepdim=True)[1]  # get the index of the max logit
        correct += pred.eq(labels.view_as(pred)).sum().item()
        total += inputs.shape[0]
    return correct / total


In [31]:
get_accuracy(model, test_dataset)


0.8787388843977365

# Evaluation

In [32]:
test_df = pd.read_csv("test_data.csv")
test_df.head()


Unnamed: 0,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,1993,914,10,mixed,individual_heating,management_in_trust,523.0,536.0,0~5min,15min~20min,8.0,20.0,Myung-duk,4,14.0,17.0
1,2014,907,16,terraced,individual_heating,management_in_trust,90.0,1174.0,0~5min,0-5min,7.0,20.0,Myung-duk,9,14.0,17.0
2,2007,1629,7,terraced,individual_heating,management_in_trust,7.0,605.0,0~5min,0-5min,5.0,5.0,Banwoldang,5,9.0,5.0
3,2005,743,21,mixed,individual_heating,management_in_trust,67.0,798.0,0~5min,5min~10min,6.0,0.0,Bangoge,7,13.0,15.0
4,2006,903,7,terraced,individual_heating,management_in_trust,123.0,181.0,5min~10min,0-5min,3.0,11.0,Myung-duk,4,8.0,11.0


In [33]:
test_df = fix_df(test_df)
test_df.head()


Unnamed: 0,YearBuilt,Size(sqf),Floor,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total),...,AptManageType_management_in_trust,AptManageType_self_management,SubwayStation_Bangoge,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby
0,1993,914,10,523.0,536.0,8.0,20.0,4,14.0,17.0,...,1,0,0,0,0,0,0,1,0,0
1,2014,907,16,90.0,1174.0,7.0,20.0,9,14.0,17.0,...,1,0,0,0,0,0,0,1,0,0
2,2007,1629,7,7.0,605.0,5.0,5.0,5,9.0,5.0,...,1,0,0,1,0,0,0,0,0,0
3,2005,743,21,67.0,798.0,6.0,0.0,7,13.0,15.0,...,1,0,1,0,0,0,0,0,0,0
4,2006,903,7,123.0,181.0,3.0,11.0,4,8.0,11.0,...,1,0,0,0,0,0,0,1,0,0


In [34]:
test_df = scaler.transform(test_df)


In [35]:
preds = list()

model.eval()
for inputs in torch.utils.data.DataLoader(test_df):
    output = model(inputs.float())
    pred = output.max(1, keepdim=True)[1]  # get the index of the max logit
    preds.append(int(pred.data))


In [36]:
pd.DataFrame(preds, columns=['pred']).to_csv('preds.csv', index=False, header=False)
