In [1]:
import sqlalchemy as db
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pickle

import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data

In [2]:
engine = db.create_engine(f'postgresql://bogdanivanyuk:bogdanivanyuk@localhost:5431/flats_data')
connection = engine.connect()
metadata = db.MetaData()
flat_info = db.Table('flat_info', metadata, autoload=True, autoload_with=engine)
announcement_info = db.Table('announcement_info', metadata, autoload=True, autoload_with=engine)

In [3]:
#Equivalent to 'SELECT * FROM census'
query_flat_info = connection.execute(db.select([flat_info]))
df_flat_info = pd.DataFrame(query_flat_info)
df_flat_info.columns = query_flat_info.keys()

query_announcement_info = connection.execute(db.select([announcement_info]))
df_announcement_info = pd.DataFrame(query_announcement_info)
df_announcement_info.columns = query_announcement_info.keys()

data = pd.merge(df_announcement_info, df_flat_info, on='flat_id')
data = data.drop(['page_url', 'image_urls','verified', 'title', 'street_name'], axis = 1)

data.head(4)

Unnamed: 0,flat_id,price_uah,price_usd,description,type_of_proposal,date_created,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,walls_type,latitude,longitude
0,29239,1015228,40000,Продаётся однокомнатная квартира с евроремонто...,,2019-07-09 13:46:10,Одесса,42.0,30.0,8.0,12,13,1,,централизованное,кирпич,0.0,0.0
1,29240,812183,32000,"Предлагается 1 комнатная квартира, район Дома ...",,2019-07-09 13:46:16,Одесса,36.0,18.0,9.0,8,10,1,,централизованное,кирпич,0.0,0.0
2,29241,2461929,97000,Продам 2-х комнатную квартиру с ремонтом. в ЖК...,,2019-07-09 13:46:14,Одесса,68.0,27.0,24.0,7,9,2,,централизованное,кирпич,0.0,0.0
3,29242,989848,39000,Продам трехкомнатную квартиру общей площадью 6...,,2019-07-09 13:46:15,Одесса,62.0,35.0,6.0,1,5,3,,централизованное,кирпич,0.0,0.0


In [4]:
# outlier detection
data = data.drop(data[(data['price_usd'] > 1000000) | (data['total_area'] > 600) | (data['living_area'] > 200) | (data['kitchen_area'] > 100) | (data['floor'] > 40) | 
                          (data['number_of_rooms'] > 6)].index)
# preprocessing steps
data['year_of_construction'] = data['year_of_construction'].apply(lambda x: re.findall(r'\b\d+\b',str(x))[0] if len(re.findall(r'\b\d+\b',str(x))) != 0 else -1)
data['type_of_proposal'] = data['type_of_proposal'].replace(r'^\s*$', 'NA_proposal', regex=True)
data['heating_type'] = data['heating_type'].replace(r'^\s*$', 'NA_heating', regex=True)
data['year_of_construction'] = data['year_of_construction'].astype(int)


In [5]:
from stop_words import get_stop_words
stop_words_russian = get_stop_words('russian')
stop_words_ukr = get_stop_words('ukrainian')
from pymystem3 import Mystem
mystem = Mystem() 
data['description'] = data['description'].apply(lambda x: ' '.join([t for t in mystem.lemmatize(x.lower()) if 
                                                                    (t not in stop_words_russian and t not in stop_words_ukr and t.isalpha())]))

In [6]:
corpus_text = '\n'.join(data['description'])
sentences = corpus_text.split('\n')
sentences[:2]

['продаваться однокомнатный квартира евроремонт техника мебель ремонт выполнять качественный стройматериал ванный теплый пола кухня devi коридор кухня балкон пол испанский плитка комната немецкий ламинат большой балкон эркерный застекление панорамный вид номер обьект сайт ан премьер id продиктовать риелтор просьба посредник беспокоить',
 'предлагаться комнатный квартира район дом мебель состояние строитель устанавливать агва проводить электричество лоджия кв вид балкон площадь возле дом мебель отличный транспортный развязка номер обьект сайт ан премьер id продиктовать риелтор просьба посредник беспокоить']

In [7]:
from gensim.models import FastText
model_fastText = FastText(sentences, size=100, window=5, min_count=5, workers=4)

In [8]:
text = sentences[0]
np.mean(model_fastText[text])

  


4.9904775e-06

In [9]:
data['description'] = data['description'].apply(lambda x: np.mean(model_fastText[x], axis =0))
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,flat_id,price_uah,price_usd,description,type_of_proposal,date_created,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,walls_type,latitude,longitude
0,29239,1015228,40000,5e-06,NA_proposal,2019-07-09 13:46:10,Одесса,42.0,30.0,8.0,12,13,1,-1,централизованное,кирпич,0.0,0.0
1,29240,812183,32000,1.2e-05,NA_proposal,2019-07-09 13:46:16,Одесса,36.0,18.0,9.0,8,10,1,-1,централизованное,кирпич,0.0,0.0
2,29241,2461929,97000,-7e-06,NA_proposal,2019-07-09 13:46:14,Одесса,68.0,27.0,24.0,7,9,2,-1,централизованное,кирпич,0.0,0.0
3,29242,989848,39000,-1.3e-05,NA_proposal,2019-07-09 13:46:15,Одесса,62.0,35.0,6.0,1,5,3,-1,централизованное,кирпич,0.0,0.0
4,29243,1395939,55000,-8e-06,NA_proposal,2019-07-09 13:46:16,Одесса,72.0,0.0,18.0,3,5,3,-1,централизованное,кирпич,0.0,0.0


In [10]:
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(data[['type_of_proposal','city_name', 'heating_type', 'walls_type']]).astype(int).toarray()
feature_labels = ohe.categories_
feature_labels = np.concatenate(feature_labels).ravel()
data = data.drop(['type_of_proposal', 'city_name', 'heating_type', 'walls_type'], axis=1)
data[feature_labels] = pd.DataFrame(feature_arr, columns=feature_labels)

In [11]:
target = data['price_usd']
data = data.drop(['price_usd', 'price_uah', 'date_created', 'flat_id'], axis=1)

In [12]:
data = data.fillna(0)

In [13]:
x_train, x_test,y_train, y_test = train_test_split(data, target, test_size=0.2, shuffle=True, random_state=42)

In [14]:
torch.manual_seed(1)    # reproducible

<torch._C.Generator at 0x1a1b310590>

In [15]:
x_train.head(6)

Unnamed: 0,description,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,latitude,longitude,...,монолитно-каркасный,монолитно-кирпичный,монолитный железобетон,облицовочный кирпич,панель,пеноблок,ракушечник (ракушняк),сборно-монолитная,сборный железобетон,силикатный кирпич
25885,0.0,78.0,0.0,0.0,8,14,3,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7087,-1.8e-05,64.0,35.2,11.5,7,9,2,-1,49.451002,27.001467,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
29164,0.0,92.0,55.0,16.0,4,9,3,2016,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18289,0.0,22.0,0.0,5.0,3,5,1,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8252,1e-06,58.8,29.3,16.3,5,16,2,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8343,-4e-06,78.41,25.28,25.28,24,25,1,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
scaler = StandardScaler()
x_train = scaler.fit_transform(data)

In [17]:
x_train = torch.Tensor(data.values)
y_train = torch.Tensor(target)

#x_train = torch.Tensor(x_train.values)
#y_train = torch.Tensor(y_train)
#x_test = torch.Tensor(x_test.values)
#y_test = torch.Tensor(y_test.values)

In [91]:
class NeuralNet(torch.nn.Module):
    def __init__(self, number_features, dimensions_hidden, number_output = 1):
        super(NeuralNet, self).__init__()
        self.hidden_1 = torch.nn.Linear(number_features, dimensions_hidden)
        self.relu = torch.nn.ReLU()
        self.hidden_2 = torch.nn.Linear(dimensions_hidden, dimensions_hidden)
        self.relu_2 = torch.nn.ReLU()
        self.hidden_3 = torch.nn.Linear(dimensions_hidden, dimensions_hidden)
        self.relu_3 = torch.nn.ReLU()
        self.hidden_4 = torch.nn.Linear(dimensions_hidden, dimensions_hidden)
        self.relu_4 = torch.nn.ReLU()
        self.predict = torch.nn.Linear(dimensions_hidden, number_output)
        
    def forward(self, x):
        x = self.hidden_1(x)
        x = self.relu(x)
        x = self.hidden_2(x)
        x = self.relu_2(x)
        x = self.hidden_3(x)
        x = self.relu_3(x)
        x = self.hidden_4(x)
        x = self.relu_4(x)
        x = self.predict(x)
        return x

In [105]:
x_train.shape

torch.Size([29939, 265])

In [92]:
net = NeuralNet(number_features=x_train.shape[1], dimensions_hidden = 512)
print(net)
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
epochs = 30
loss_function = torch.nn.MSELoss()

NeuralNet(
  (hidden_1): Linear(in_features=265, out_features=512, bias=True)
  (relu): ReLU()
  (hidden_2): Linear(in_features=512, out_features=512, bias=True)
  (relu_2): ReLU()
  (hidden_3): Linear(in_features=512, out_features=512, bias=True)
  (relu_3): ReLU()
  (hidden_4): Linear(in_features=512, out_features=512, bias=True)
  (relu_4): ReLU()
  (predict): Linear(in_features=512, out_features=1, bias=True)
)


In [93]:
plt.ion()
for i in range(epochs):
    preds = net(x_train)
    loss = loss_function(preds, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    #preds_valid = net(x_test)
    #loss_valid = loss_function(preds_valid, y_test)
    print(f'Epochs RMSE {i}: train - {torch.sqrt(loss)}') #; validation - {torch.sqrt(loss_valid)}')
    #if i % 5 == 0:
    #    # plot and show learning process
    #    plt.cla()
    #    plt.scatter(x_train[:0].data.numpy(), y_train.data.numpy())
    #    plt.plot(x_train[:0].data.numpy(), preds.data.numpy(), 'r-', lw=5)
    #    plt.text(0.5, 0, 'Loss=%.4f' % loss.data.numpy(), fontdict={'size': 20, 'color':  'red'})
    #    plt.pause(0.1)

#plt.ioff()
#plt.show()

Epochs RMSE 0: train - 73554.0859375
Epochs RMSE 1: train - 73008.5078125
Epochs RMSE 2: train - 69845.53125
Epochs RMSE 3: train - 89235.6953125
Epochs RMSE 4: train - 66588.546875
Epochs RMSE 5: train - 67274.078125
Epochs RMSE 6: train - 68758.890625
Epochs RMSE 7: train - 69148.3046875
Epochs RMSE 8: train - 68669.4296875
Epochs RMSE 9: train - 67192.4296875
Epochs RMSE 10: train - 64257.42578125
Epochs RMSE 11: train - 59648.72265625
Epochs RMSE 12: train - 56962.14453125
Epochs RMSE 13: train - 63467.55859375
Epochs RMSE 14: train - 59899.0859375
Epochs RMSE 15: train - 56383.74609375
Epochs RMSE 16: train - 57873.578125
Epochs RMSE 17: train - 59561.83203125
Epochs RMSE 18: train - 59747.8671875
Epochs RMSE 19: train - 58432.921875
Epochs RMSE 20: train - 56439.78515625
Epochs RMSE 21: train - 55909.00390625
Epochs RMSE 22: train - 57864.0078125
Epochs RMSE 23: train - 57723.39453125
Epochs RMSE 24: train - 55908.94140625
Epochs RMSE 25: train - 55798.8515625
Epochs RMSE 26: tra

In [94]:
x_train.shape[1]

265

In [95]:
net = torch.save(net.state_dict(), 'neural_network_state_dict_with_nlp')