In [495]:
import torch
import torch.nn as nn
import torch.utils.data as data

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [496]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# every later `.to(device)` call still works on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [497]:
import pickle
from google.colab import drive
# Mount Google Drive inside the Colab runtime; the CSV files read below live under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [498]:
# Load the comma-separated training table from the mounted Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/price_dataset/train_data.csv'
df = pd.read_csv(TRAIN_CSV_PATH, sep=",")

In [None]:
# Peek at the first rows of the freshly loaded table.
df.head(n=5)

In [500]:
# Swap the raw price for two boolean threshold labels: take the column out of
# the frame first, then append the flags derived from it.
sale_price = df.pop("SalePrice")
df["over100k"] = sale_price > 100_000
df["over350k"] = sale_price > 350_000

In [None]:
# Same preview after SalePrice was replaced by the over100k / over350k flags.
df.head(n=5)

In [502]:
# Categorical feature names, split into an apartment group and a transport group.
apartment_category_columns = ["HallwayType", "HeatingType", "AptManageType"]
transport_category_columns = ["TimeToBusStop", "TimeToSubway", "SubwayStation"]

# Expand each group into indicator columns with pd.get_dummies; the two groups
# are kept as separate frames because the network consumes them as separate inputs.
apartment_category_values = pd.get_dummies(df.loc[:, apartment_category_columns])
transport_category_values = pd.get_dummies(df.loc[:, transport_category_columns])

In [None]:
# Preview only the first rows instead of dumping the whole indicator frame.
apartment_category_values.head()

In [None]:
# Preview only the first rows instead of dumping the whole indicator frame.
transport_category_values.head()

In [505]:
# Label columns, pulled out of the feature frame as standalone Series.
target_columns = ["over100k", "over350k"]
target_100k_values, target_350k_values = (df[name] for name in target_columns)

# Combined 3-way label = number of thresholds exceeded:
# 0 -> not over 100k, 1 -> over 100k but not over 350k, 2 -> over 350k.
target_all_values = target_100k_values.astype(int) + target_350k_values.astype(int)

In [506]:
# Strip the categorical and label columns from df (in place); whatever remains
# is what the following cells use as the numerical feature block.
non_numerical_columns = [
    *apartment_category_columns,
    *transport_category_columns,
    *target_columns,
]
df.drop(columns=non_numerical_columns, inplace=True)

In [507]:
TEST_SIZE = 0.2   # expected fraction of rows routed to the held-out test split
SPLIT_SEED = 42   # fixed seed so the train/test membership is identical on every run

# Boolean mask over the rows of df: True -> test row, False -> train row.
# Each row is assigned independently with probability TEST_SIZE, so the realised
# test fraction is only approximately TEST_SIZE. A dedicated seeded generator makes
# the split (and therefore every accuracy reported below) reproducible.
test_indicies = np.random.default_rng(SPLIT_SEED).random(len(df)) < TEST_SIZE

In [508]:
def _split_tensors(frame_or_series):
    """Return (train, test) float tensors for a pandas object, split by test_indicies."""
    values = frame_or_series.values
    train_part = torch.from_numpy(values[~test_indicies]).float()
    test_part = torch.from_numpy(values[test_indicies]).float()
    return train_part, test_part


# Feature blocks.
train_numerical, test_numerical = _split_tensors(df)
train_apartment_category, test_apartment_category = _split_tensors(apartment_category_values)
train_transport_category, test_transport_category = _split_tensors(transport_category_values)

# Binary targets for the two threshold models.
train_target100k, test_target100k = _split_tensors(target_100k_values)
train_target350k, test_target350k = _split_tensors(target_350k_values)

# The combined 0/1/2 target is only needed for the test rows.
test_target_all = _split_tensors(target_all_values)[1]


In [509]:
# Every dataset shares the same three feature tensors and differs only in its target.
train_features = (train_numerical, train_apartment_category, train_transport_category)
test_features = (test_numerical, test_apartment_category, test_transport_category)

train_target100k_tensor = data.TensorDataset(*train_features, train_target100k)
test_target100k_tensor = data.TensorDataset(*test_features, test_target100k)

train_target350k_tensor = data.TensorDataset(*train_features, train_target350k)
test_target350k_tensor = data.TensorDataset(*test_features, test_target350k)

# Test-only dataset carrying the combined 0/1/2 label.
test_target_all_tensor = data.TensorDataset(*test_features, test_target_all)

In [510]:
# Column counts of the three feature blocks; the network uses them as layer widths.
numerical_size, apartment_category_size, transport_category_size = (
    frame.shape[1]
    for frame in (df, apartment_category_values, transport_category_values)
)

In [None]:
# Report the width of each feature block, one per line.
print(
    f" Numerical size: {numerical_size}",
    f" Apartment category size: {apartment_category_size}",
    f" Transport category size: {transport_category_size}",
    sep="\n",
)

In [512]:
# Training hyperparameters shared by both threshold models.
EPOCHS = 200           # full passes over the training loader
BATCH_SIZE = 128       # rows per mini-batch for the train and test loaders
LR = 0.001             # Adam learning rate
WEIGHT_DECAY = 0.0001  # weight_decay passed to Adam
DROPOUT = 0.2          # drop probability of every nn.Dropout in the network

In [513]:
class NeuralNetwork(nn.Module):
    """Feed-forward network producing one raw output value per input row.

    The two indicator-encoded categorical groups each pass through their own
    same-width Linear + Tanh layer, are concatenated with the numerical features,
    and then go through three Linear -> BatchNorm -> ReLU -> Dropout stages
    (128, 64, 32 units) before a final Linear layer with a single output.
    Layer widths are read from the module-level size variables and DROPOUT.
    """

    def __init__(self):
        super().__init__()
        # Per-group transformation of the categorical inputs (width is preserved).
        self.apartment_emb_layer = nn.Linear(apartment_category_size, apartment_category_size)
        self.transport_emb_layer = nn.Linear(transport_category_size, transport_category_size)
        self.act_emb = nn.Tanh()

        # Hidden stages are registered as linear{i}, bn{i}, act{i}, drop{i} (i = 1..3).
        in_features = apartment_category_size + transport_category_size + numerical_size
        for idx, out_features in enumerate((128, 64, 32), start=1):
            setattr(self, f"linear{idx}", nn.Linear(in_features, out_features))
            setattr(self, f"bn{idx}", nn.BatchNorm1d(out_features))
            setattr(self, f"act{idx}", nn.ReLU())
            setattr(self, f"drop{idx}", nn.Dropout(DROPOUT))
            in_features = out_features
        self.linear4 = nn.Linear(in_features, 1)

        # Re-initialise the weight matrix of every Linear layer with Xavier uniform.
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x, apartment_emb, transport_emb):
        """Run the network on numerical features plus the two categorical groups."""
        apartment_features = self.act_emb(self.apartment_emb_layer(apartment_emb))
        transport_features = self.act_emb(self.transport_emb_layer(transport_emb))
        hidden = torch.cat([x, apartment_features, transport_features], dim=1)

        stages = (
            (self.linear1, self.bn1, self.act1, self.drop1),
            (self.linear2, self.bn2, self.act2, self.drop2),
            (self.linear3, self.bn3, self.act3, self.drop3),
        )
        for linear_layer, batch_norm, activation, dropout in stages:
            hidden = self.routine(linear_layer, batch_norm, activation, dropout, hidden)
        return self.linear4(hidden)

    def routine(self, linear_layer, batch_norm, activation, dropout, x):
        """Apply one hidden stage: linear -> batch norm -> activation -> dropout."""
        return dropout(activation(batch_norm(linear_layer(x))))

In [514]:
def get_accuracy(model, data_loader, device=None):
    """Return the fraction of samples in ``data_loader`` that ``model`` gets right.

    A sample counts as predicted-positive when the model's raw output is > 0,
    and as correct when that prediction equals its label.

    Args:
        model: module called as ``model(x, apartment, transport)`` that yields
            one value per row.
        data_loader: iterable of ``(x, apartment, transport, label)`` batches.
        device: where to run the batches. Defaults to the device of the model's
            first parameter (CPU if the model has none), so the function does
            not rely on a notebook-level global.

    Returns:
        ``correct / total`` as a float.

    Raises:
        ZeroDivisionError: if ``data_loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Evaluate in eval mode, but hand the model back in whatever mode it was in,
    # so calling this between training epochs does not silently freeze
    # dropout / batch-norm for the rest of training.
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for x, apartment, transport, label in data_loader:
                x = x.to(device)
                label = label.to(device)
                apartment = apartment.to(device)
                transport = transport.to(device)
                # Flatten explicitly: a bare squeeze() would also collapse the
                # batch dimension when the batch holds a single row.
                logits = model(x, apartment, transport).reshape(-1)
                preds = (logits > 0).to(label.dtype)
                correct += preds.eq(label.reshape(-1)).sum().item()
                total += x.shape[0]
    finally:
        model.train(was_training)

    return correct / total

In [515]:
# Shuffled training loaders.
# drop_last=True discards a trailing partial batch: the network contains
# BatchNorm1d layers, which raise in training mode when a batch holds a single
# row, and the training-set size is not guaranteed to avoid that remainder.
# Because the data is reshuffled every epoch, the dropped rows differ each time.
train_100k_loader = data.DataLoader(
    train_target100k_tensor, batch_size=BATCH_SIZE, shuffle=True, drop_last=True
)
train_350k_loader = data.DataLoader(
    train_target350k_tensor, batch_size=BATCH_SIZE, shuffle=True, drop_last=True
)

In [516]:
# Evaluation loaders: same batch size as training, rows kept in their original order.
eval_loader_options = {"batch_size": BATCH_SIZE, "shuffle": False}
test_100k_loader = data.DataLoader(test_target100k_tensor, **eval_loader_options)
test_350k_loader = data.DataLoader(test_target350k_tensor, **eval_loader_options)

In [517]:
# Unshuffled loader over the test rows labelled with the combined 0/1/2 target.
test_all_loader = data.DataLoader(
    dataset=test_target_all_tensor,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [None]:
# Fresh network for the "over 100k" task, placed on the selected device.
# Module.to() returns the module itself, so the bare name below shows the same repr.
model100k = NeuralNetwork().to(device)
model100k

In [519]:
optimizer = torch.optim.Adam(params=model100k.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# Class-imbalance weight for the positive class: (#negatives / #positives) on the
# training labels. Tensor.sum() is used instead of the builtin sum(), which would
# iterate the tensor one element at a time in Python. The weight is moved to the
# same device as the model outputs it is combined with.
positive_count = train_target100k.sum()
negative_count = train_target100k.numel() - positive_count
loss_module = nn.BCEWithLogitsLoss(pos_weight=(negative_count / positive_count).to(device))

In [None]:
# Train the "over 100k" model; loss_list collects the mean batch loss of each epoch.
model100k.train()
loss_list = []
for epoch in range(EPOCHS):
    batch_losses = []
    for batch in train_100k_loader:
        x, apartment, transport, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model100k(x, apartment, transport).squeeze(dim=1)
        loss = loss_module(logits, y)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    mean_loss = np.mean(batch_losses)
    loss_list.append(mean_loss)
    # Progress line every 10th epoch.
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss={mean_loss:.3}")

In [None]:
# loss_list receives one mean loss per epoch in the training loop, so the
# x axis counts epochs, not individual optimisation steps.
fig, ax = plt.subplots()
ax.plot(loss_list)
ax.set_title("Loss for 100k model")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()

In [None]:
# Held-out accuracy of the "over 100k" model.
accuracy_100k = get_accuracy(model100k, test_100k_loader)
print(f"Accuracy (100k): {accuracy_100k}")

In [None]:
# Fresh network for the "over 350k" task, placed on the selected device.
# Module.to() returns the module itself, so the bare name below shows the same repr.
model350k = NeuralNetwork().to(device)
model350k

In [524]:
# NOTE: `optimizer` and `loss_module` are rebound here for the 350k model, so the
# 100k training cell must not be re-run after this cell without re-running its
# own optimizer cell first.
optimizer = torch.optim.Adam(params=model350k.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# Class-imbalance weight for the positive class: (#negatives / #positives) on the
# training labels. Tensor.sum() is used instead of the builtin sum(), which would
# iterate the tensor one element at a time in Python. The weight is moved to the
# same device as the model outputs it is combined with.
positive_count = train_target350k.sum()
negative_count = train_target350k.numel() - positive_count
loss_module = nn.BCEWithLogitsLoss(pos_weight=(negative_count / positive_count).to(device))

In [None]:
# Train the "over 350k" model; loss_list collects the mean batch loss of each epoch.
model350k.train()
loss_list = []
for epoch in range(EPOCHS):
    batch_losses = []
    for batch in train_350k_loader:
        x, apartment, transport, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model350k(x, apartment, transport).squeeze(dim=1)
        loss = loss_module(logits, y)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    mean_loss = np.mean(batch_losses)
    loss_list.append(mean_loss)
    # Progress line every 10th epoch.
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss={mean_loss:.3}")

In [None]:
# loss_list receives one mean loss per epoch in the training loop, so the
# x axis counts epochs, not individual optimisation steps.
fig, ax = plt.subplots()
ax.plot(loss_list)
ax.set_title("Loss for 350k model")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()

In [None]:
# Held-out accuracy of the "over 350k" model.
accuracy_350k = get_accuracy(model350k, test_350k_loader)
print(f"Accuracy (350k): {accuracy_350k}")

In [None]:
# Accuracy of the combined 3-way prediction on the test rows.
# Predicted class = number of threshold models whose raw output is > 0, compared
# with the 0/1/2 label built the same way from the true flags.
model100k.eval()
model350k.eval()
correct = 0
total = 0
with torch.no_grad():
    for x, apartment, transport, label in test_all_loader:
        x = x.to(device)
        apartment = apartment.to(device)
        transport = transport.to(device)
        label = label.to(device)

        # squeeze(dim=1) keeps the batch dimension even when the last batch holds
        # a single row; a bare squeeze() would yield a 0-d result there, which the
        # previous pandas round trip could not turn into a DataFrame.
        over_100k = model100k(x, apartment, transport).squeeze(dim=1) > 0
        over_350k = model350k(x, apartment, transport).squeeze(dim=1) > 0

        preds = over_100k.long() + over_350k.long()
        correct += (preds == label.long()).sum().item()
        total += x.shape[0]

print(f"Accuracy: {correct / total}")

In [529]:
# Load the comma-separated evaluation table from the mounted Drive.
EVAL_CSV_PATH = '/content/drive/MyDrive/price_dataset/test_data.csv'
eval_df = pd.read_csv(EVAL_CSV_PATH, sep=",")

In [None]:
# Peek at the first rows of the evaluation table.
eval_df.head(n=5)

In [531]:
# Indicator-encode the evaluation categoricals and force them onto exactly the
# indicator columns (and column order) produced for the training data.
# get_dummies derives its columns from the categories present in the frame it is
# given, so an evaluation file that lacks a category or contains a new one would
# otherwise produce a different width or ordering than the network was trained on.
# Categories missing here become all-zero columns; categories never seen in
# training are dropped. dtype=float keeps every column numeric after reindexing.
eval_transportation_categorical_values = pd.get_dummies(
    eval_df[transport_category_columns], dtype=float
).reindex(columns=transport_category_values.columns, fill_value=0.0)

eval_apartment_categorical_values = pd.get_dummies(
    eval_df[apartment_category_columns], dtype=float
).reindex(columns=apartment_category_values.columns, fill_value=0.0)

In [532]:
# Remove the categorical columns from eval_df (in place), leaving the columns
# that are fed to the network as numerical features.
eval_categorical_columns = transport_category_columns + apartment_category_columns
eval_df.drop(columns=eval_categorical_columns, inplace=True)

In [533]:
def _frame_to_float_tensor(frame):
    """Convert a DataFrame's underlying array into a float32 torch tensor."""
    return torch.from_numpy(frame.values).float()


# One tensor per input of the network.
eval_numerical_data = _frame_to_float_tensor(eval_df)
eval_transportation_categorical_data = _frame_to_float_tensor(eval_transportation_categorical_values)
eval_apartment_categorical_data = _frame_to_float_tensor(eval_apartment_categorical_values)

In [534]:
# Unlabelled dataset; tensor order matches the network's forward(x, apartment, transport).
eval_feature_tensors = (
    eval_numerical_data,
    eval_apartment_categorical_data,
    eval_transportation_categorical_data,
)
eval_dataset = data.TensorDataset(*eval_feature_tensors)

In [535]:
# One row per batch, in file order (`data` is the torch.utils.data alias imported at the top).
eval_data_loader = data.DataLoader(dataset=eval_dataset, batch_size=1, shuffle=False)

In [None]:
import csv

# Write one predicted class (0, 1 or 2) per evaluation row to wyniki.csv and
# tally how often each class was predicted.
# Predicted class = number of threshold models whose raw output is > 0.
count = {0: 0, 1: 0, 2: 0}
model350k.eval()
model100k.eval()
# newline='' is what the csv module requires for files handed to csv.writer;
# without it some platforms emit an extra blank line after every row.
with open('wyniki.csv', 'w', newline='') as f, torch.no_grad():
    writer = csv.writer(f)
    for x, apartment, transport in eval_data_loader:
        x, apartment, transport = (
            x.to(device), apartment.to(device),
            transport.to(device)
        )
        over_100k = model100k(x, apartment, transport).squeeze(dim=1) > 0
        over_350k = model350k(x, apartment, transport).squeeze(dim=1) > 0
        # Plain Python ints, one per row of the batch, so every row is written
        # regardless of the loader's batch size.
        batch_preds = (over_100k.long() + over_350k.long()).tolist()
        for pred in batch_preds:
            count[pred] += 1
            writer.writerow([pred])
print(count)