In [1]:
import sqlalchemy as db
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data

In [2]:
engine = db.create_engine(f'postgresql://bogdanivanyuk:bogdanivanyuk@localhost:5431/flats_data')
connection = engine.connect()
metadata = db.MetaData()
flat_info = db.Table('flat_info', metadata, autoload=True, autoload_with=engine)
announcement_info = db.Table('announcement_info', metadata, autoload=True, autoload_with=engine)

In [3]:
#Equivalent to 'SELECT * FROM census'
query_flat_info = connection.execute(db.select([flat_info]))
df_flat_info = pd.DataFrame(query_flat_info)
df_flat_info.columns = query_flat_info.keys()

query_announcement_info = connection.execute(db.select([announcement_info]))
df_announcement_info = pd.DataFrame(query_announcement_info)
df_announcement_info.columns = query_announcement_info.keys()

data = pd.merge(df_announcement_info, df_flat_info, on='flat_id')
data.fillna(0)
data = data.drop(['page_url', 'image_urls', 'description', 'verified', 'title', 'street_name'], axis = 1)

data.head(4)

Unnamed: 0,flat_id,price_uah,price_usd,type_of_proposal,date_created,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,walls_type,latitude,longitude
0,0,2035623,80000,от собственника,2019-05-23 23:14:10,Винница,95.1,51.6,21.2,7,12,3,Сдача в 2017,индивидуальное,кирпич,0.0,0.0
1,1,966921,38000,от посредника,2019-08-13 17:33:39,Винница,63.0,40.0,8.0,5,9,3,,централизованное,панель,0.0,0.0
2,2,954198,37500,от посредника,2019-07-15 00:55:03,Винница,38.0,18.0,9.0,1,5,1,2014,индивидуальное,кирпич,0.0,0.0
3,3,1781170,70000,от представителя хозяина (без комиссионных),2019-04-20 16:19:47,Харьков,95.4,50.0,0.0,13,14,3,,индивидуальное,газоблок,50.013457,36.276238


In [4]:
# outlier detection
data = data.drop(data[(data['price_usd'] > 1000000) | (data['total_area'] > 600) | (data['living_area'] > 200) | (data['kitchen_area'] > 100) | (data['floor'] > 40) | 
                          (data['number_of_rooms'] > 6)].index)
# preprocessing steps
data['year_of_construction'] = data['year_of_construction'].apply(lambda x: re.findall(r'\b\d+\b',str(x))[0] 
                                                                  if len(re.findall(r'\b\d+\b',str(x))) != 0 else -1)
data['type_of_proposal'] = data['type_of_proposal'].replace(r'^\s*$', 'NA_proposal', regex=True)
data['heating_type'] = data['heating_type'].replace(r'^\s*$', 'NA_heating', regex=True)
data['year_of_construction'] = data['year_of_construction'].astype(int)

ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(data[['type_of_proposal','city_name', 'heating_type', 'walls_type']]).astype(int).toarray()
feature_labels = ohe.categories_
feature_labels = np.concatenate(feature_labels).ravel()
data = data.drop(['type_of_proposal', 'city_name', 'heating_type', 'walls_type'], axis=1)
data[feature_labels] = pd.DataFrame(feature_arr, columns=feature_labels)
data = data.fillna(0)

In [5]:
target = data['price_usd']
data = data.drop(['price_usd', 'price_uah', 'date_created', 'flat_id'], axis=1)

In [6]:
data.head()

Unnamed: 0,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,latitude,longitude,NA_proposal,...,монолитно-каркасный,монолитно-кирпичный,монолитный железобетон,облицовочный кирпич,панель,пеноблок,ракушечник (ракушняк),сборно-монолитная,сборный железобетон,силикатный кирпич
0,95.1,51.6,21.2,7,12,3,2017,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,63.0,40.0,8.0,5,9,3,-1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,38.0,18.0,9.0,1,5,1,2014,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,95.4,50.0,0.0,13,14,3,-1,50.013457,36.276238,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,77.0,40.0,14.0,3,5,2,2019,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
x_train, x_test,y_train, y_test = train_test_split(data, target, test_size=0.2, shuffle=True, random_state=42)

In [8]:
torch.manual_seed(1)    # reproducible

<torch._C.Generator at 0x1a2f7cf530>

In [9]:
x_train.head(1)

Unnamed: 0,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,latitude,longitude,NA_proposal,...,монолитно-каркасный,монолитно-кирпичный,монолитный железобетон,облицовочный кирпич,панель,пеноблок,ракушечник (ракушняк),сборно-монолитная,сборный железобетон,силикатный кирпич
25879,93.0,0.0,9.0,7,9,5,-1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
scaler_1 = StandardScaler()
scaler_2 = StandardScaler()
x_train = scaler_1.fit_transform(x_train)
x_test = scaler_1.transform(x_test)

In [11]:
x_train = torch.Tensor(x_train)
y_train = torch.Tensor(y_train.values).view(-1, 1)
x_test = torch.Tensor(x_test)
y_test = torch.Tensor(y_test.values).view(-1, 1)

#train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
#test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [27]:
y_train

tensor([[46500.],
        [  393.],
        [20034.],
        ...,
        [44500.],
        [27026.],
        [23112.]])

In [12]:
class NeuralNet(torch.nn.Module):
    def __init__(self, input_size, dimensions_hidden, number_output = 1):
        super(NeuralNet, self).__init__()
        self.input = torch.nn.Linear(input_size, dimensions_hidden)
        self.relu = torch.nn.ReLU()
        self.hidden = torch.nn.Linear(dimensions_hidden, dimensions_hidden)
        self.predict = torch.nn.Linear(dimensions_hidden, number_output)
        
    def forward(self, x):
        x = self.input(x)
        x = self.relu(x)
        x = self.hidden(x)
        x = self.relu(x)
        x = self.predict(x)
        return x

In [31]:
net = NeuralNet(input_size=x_train.shape[1], dimensions_hidden = 256)

optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
epochs = 10
loss_function = torch.nn.MSELoss()

In [32]:
plt.ion()
for i in range(epochs):
    preds = net(x_train)
    loss = loss_function(preds, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    preds_valid = net(x_test)
    loss_valid = loss_function(preds_valid, y_test)
    print(f'Epochs {i}: train - {torch.sqrt(loss)}; validation - {torch.sqrt(loss_valid)}')

Epochs 0: train - 73106.7890625; validation - 75315.8828125
Epochs 1: train - 73105.34375; validation - 75313.390625
Epochs 2: train - 73102.7578125; validation - 75309.0
Epochs 3: train - 73098.1796875; validation - 75302.0703125
Epochs 4: train - 73090.9609375; validation - 75291.8984375
Epochs 5: train - 73080.3671875; validation - 75277.7109375
Epochs 6: train - 73065.6015625; validation - 75258.65625
Epochs 7: train - 73045.7734375; validation - 75233.8046875
Epochs 8: train - 73019.9140625; validation - 75202.125
Epochs 9: train - 72986.9609375; validation - 75162.5078125


In [33]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

import numpy as np
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#X, y = load_boston(return_X_y=True)


# create train and test indices
train, test = train_test_split(list(range(data.shape[0])), test_size=.2, random_state=42)

input_size = data.shape[1]
hidden_layer_size = 256
learning_rate = 0.01
batch_size = 500
num_epochs = 10

class PrepareData(Dataset):

    def __init__(self, X, y, scale_X=True):
        if not torch.is_tensor(X):
            if scale_X:
                X = StandardScaler().fit_transform(X)
                self.X = torch.from_numpy(X)
        if not torch.is_tensor(y):
            self.y = torch.from_numpy(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

ds = PrepareData(data, y=np.array(target), scale_X=True)

train_set = DataLoader(ds, batch_size=batch_size,
                       sampler=SubsetRandomSampler(train))
test_set = DataLoader(ds, batch_size=batch_size,
                      sampler=SubsetRandomSampler(test))

class RegressionModel(nn.Module):

    def __init__(self, input_size, hidden_size):
        super(RegressionModel, self).__init__()
        self.dense_h1 = nn.Linear(input_size, hidden_size)
        self.relu_h1 = nn.ReLU()
        self.dense_h2 = nn.Linear(hidden_size, hidden_size)
        self.dense_out = nn.Linear(hidden_size, 1)

    def forward(self, X):
        out = self.dense_h1(X)
        out = self.relu_h1(out)
        out = self.dense_h2(out)
        out = self.dense_out(out)
        return out

model = RegressionModel(input_size=input_size, hidden_size=hidden_layer_size)

cost_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)

RegressionModel(
  (dense_h1): Linear(in_features=264, out_features=256, bias=True)
  (relu_h1): ReLU()
  (dense_h2): Linear(in_features=256, out_features=256, bias=True)
  (dense_out): Linear(in_features=256, out_features=1, bias=True)
)


In [34]:
all_losses = []
for e in range(num_epochs):
    batch_losses = []

    for ix, (Xb, yb) in enumerate(train_set):

        _X = Variable(Xb).float()
        _y = Variable(yb).float()
        #==========Forward pass===============
        preds = model(_X)
        loss = cost_func(preds, _y)

        #==========backward pass==============

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
        all_losses.append(loss.item())

    mbl = np.mean(np.sqrt(batch_losses)).round(3)

    if e % 2 == 0:
        print("Epoch [{}/{}], Batch loss: {}".format(e, num_epochs, mbl))

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [0/10], Batch loss: 66094.143
Epoch [2/10], Batch loss: 51553.103
Epoch [4/10], Batch loss: 51562.644
Epoch [6/10], Batch loss: 51712.897
Epoch [8/10], Batch loss: 51567.35


In [35]:

# prepares model for inference when trained with a dropout layer
print(model.training)
model.eval()
print(model.training)

test_batch_losses = []
for _X, _y in test_set:

    _X = Variable(_X).float()
    _y = Variable(_y).float()

    #apply model
    test_preds = model(_X)
    test_loss = cost_func(test_preds, _y)

    test_batch_losses.append(test_loss.item())
    print("Batch loss: {}".format(test_loss.item()))

True
False
Batch loss: 3930648832.0
Batch loss: 2650279936.0
Batch loss: 3212782336.0
Batch loss: 2978340096.0
Batch loss: 2938296832.0
Batch loss: 4163319808.0
Batch loss: 2622347264.0
Batch loss: 2147499264.0
Batch loss: 1571437056.0
Batch loss: 2230563072.0
Batch loss: 3238953216.0
Batch loss: 4103343104.0


  return F.mse_loss(input, target, reduction=self.reduction)
