In [99]:
import sqlalchemy as db
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline

In [100]:
engine = db.create_engine(f'postgresql://bogdanivanyuk:bogdanivanyuk@localhost:5431/flats_data')
connection = engine.connect()
metadata = db.MetaData()
flat_info = db.Table('flat_info', metadata, autoload=True, autoload_with=engine)
announcement_info = db.Table('announcement_info', metadata, autoload=True, autoload_with=engine)

In [101]:
#Equivalent to 'SELECT * FROM census'
query_flat_info = connection.execute(db.select([flat_info]))
df_flat_info = pd.DataFrame(query_flat_info)
df_flat_info.columns = query_flat_info.keys()

query_announcement_info = connection.execute(db.select([announcement_info]))
df_announcement_info = pd.DataFrame(query_announcement_info)
df_announcement_info.columns = query_announcement_info.keys()

data = pd.merge(df_announcement_info, df_flat_info, on='flat_id')
data.fillna(0)
data = data.drop(['flat_id', 'page_url', 'image_urls', 'description', 'verified', 'title', 'street_name'], axis = 1)

data.head(4)

Unnamed: 0,price_uah,price_usd,type_of_proposal,date_created,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,walls_type,latitude,longitude
0,2035623,80000,от собственника,2019-05-23 23:14:10,Винница,95.1,51.6,21.2,7,12,3,Сдача в 2017,индивидуальное,кирпич,0.0,0.0
1,966921,38000,от посредника,2019-08-13 17:33:39,Винница,63.0,40.0,8.0,5,9,3,,централизованное,панель,0.0,0.0
2,954198,37500,от посредника,2019-07-15 00:55:03,Винница,38.0,18.0,9.0,1,5,1,2014,индивидуальное,кирпич,0.0,0.0
3,1781170,70000,от представителя хозяина (без комиссионных),2019-04-20 16:19:47,Харьков,95.4,50.0,0.0,13,14,3,,индивидуальное,газоблок,50.013457,36.276238


# Outlier detection

In [102]:
data = data.drop(data[(data['price_usd'] > 1000000) | (data['total_area'] > 600) | (data['living_area'] > 200) | (data['kitchen_area'] > 100) | (data['floor'] > 40) | 
                          (data['number_of_rooms'] > 6)].index)

In [103]:
# preprocessing steps
data['year_of_construction'] = data['year_of_construction'].apply(lambda x: re.findall(r'\b\d+\b',str(x))[0] 
                                                                  if len(re.findall(r'\b\d+\b',str(x))) != 0 else -1)

In [89]:
le = LabelEncoder()
# filter categorical columns using mask and turn it into a list
categorical_cols = ['type_of_proposal', 'city_name', 'heating_type', 'walls_type'] 
data['type_of_proposal'] = le.fit_transform(data['type_of_proposal'])
le.transform(['от посредника'])

array([2])

In [90]:
le = LabelEncoder()
# filter categorical columns using mask and turn it into a list
categorical_cols = ['type_of_proposal', 'city_name', 'heating_type', 'walls_type'] 
data[categorical_cols] = data[categorical_cols].apply(lambda col: le.fit_transform(col))

In [104]:
target = data['price_usd']
data = data.drop(['price_usd', 'price_uah', 'date_created'], axis=1)

In [105]:
x_train, x_test,y_train, y_test = train_test_split(data, target, test_size=0.2, shuffle=True, random_state=42)

In [106]:
data.head()

Unnamed: 0,type_of_proposal,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,walls_type,latitude,longitude
0,от собственника,Винница,95.1,51.6,21.2,7,12,3,2017,индивидуальное,кирпич,0.0,0.0
1,от посредника,Винница,63.0,40.0,8.0,5,9,3,-1,централизованное,панель,0.0,0.0
2,от посредника,Винница,38.0,18.0,9.0,1,5,1,2014,индивидуальное,кирпич,0.0,0.0
3,от представителя хозяина (без комиссионных),Харьков,95.4,50.0,0.0,13,14,3,-1,индивидуальное,газоблок,50.013457,36.276238
4,от представителя хозяина (без комиссионных),Винница,77.0,40.0,14.0,3,5,2,2019,без отопления,кирпич,0.0,0.0


# DECISIONTREE REGRESSOR

In [94]:
x_train.shape

(23951, 13)

In [95]:
param_grid = [{'max_depth':np.arange(1, 21),
              'min_samples_leaf':[1, 5, 10, 20, 50, 100]}]
dtr = DecisionTreeRegressor(random_state=42, max_depth=10, min_samples_leaf = 1)
#gs = GridSearchCV(estimator=dtr, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5,verbose=1)
dtr.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [96]:
#model = gs.best_estimator_
preds = dtr.predict(x_test)
preds_train = dtr.predict(x_train)
print('MSE test: {};  train: {}'.format(int(mean_squared_error(y_test, preds)), int(mean_squared_error(y_train, preds_train))))
print('MAE test: {}; train: {}'.format(int(mean_absolute_error(y_test, preds)), int(mean_absolute_error(y_train, preds_train))))

MSE test: 1005576577;  train: 360586640
MAE test: 13742; train: 10397


In [97]:
from joblib import dump, load
dump(dtr, 'decision_tree_model.joblib') 

['decision_tree_model.joblib']

In [15]:
clf = load('decision_tree_model.joblib') 

In [16]:
preds = clf.predict(x_test)
preds_train = clf.predict(x_train)
print(len(preds))
print(len(x_test))
print('MSE test: {};  train: {}'.format(int(mean_squared_error(y_test, preds)), int(mean_squared_error(y_train, preds_train))))
print('MAE test: {}; train: {}'.format(int(mean_absolute_error(y_test, preds)), int(mean_absolute_error(y_train, preds_train))))

5988
5988
MSE test: 1005576577;  train: 360586640
MAE test: 13742; train: 10397


# CatBoost

In [107]:
train_data = x_train
train_label = y_train
test_data = x_test
test_label = y_test
x_train.head()

Unnamed: 0,type_of_proposal,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,walls_type,latitude,longitude
25879,от посредника,Днепропетровск,93.0,0.0,9.0,7,9,5,-1,централизованное,панель,0.0,0.0
7079,от посредника,Винница,78.0,0.0,0.0,7,9,2,-1,индивидуальное,кирпич,0.0,0.0
29157,от посредника,Ирпень,37.1,15.0,8.9,4,5,1,-1,централизованное,кирпич,0.0,0.0
18283,,Киев,75.0,46.0,15.0,19,25,2,-1,централизованное,кирпич,50.395887,30.61572
8244,от посредника,Буча,46.95,18.61,14.62,7,16,1,-1,централизованное,пеноблок,0.0,0.0


In [109]:
import numpy as np
from catboost import Pool, CatBoostRegressor
# initialize data
#train_data = np.random.randint(0, 
#                               100, 
#                               size=(100, 10))
#train_label = np.random.randint(0, 
#                                1000, 
#                                size=(100))
#test_data = np.random.randint(0, 
#                              100, 
#                              size=(50, 10))
# initialize Pool
train_pool = Pool(train_data, 
                  train_label, 
                  #cat_features=['type_of_proposal', 'city_name', 'heating_type', 'walls_type'])
                  cat_features=[0, 1, 9, 10])

train_pool_preds = Pool(train_data,  
                  #cat_features=['type_of_proposal','city_name', 'heating_type', 'walls_type'])
                        cat_features=[0, 1, 9, 10])

test_pool = Pool(test_data, 
                 #cat_features=['type_of_proposal', 'city_name', 'heating_type', 'walls_type']) 
                 cat_features=[0, 1, 9, 10])

# specify the training parameters 
model = CatBoostRegressor(iterations=10, 
                          depth=16, 
                          learning_rate=1, 
                          loss_function='RMSE')
#train the model
model.fit(train_pool)
# make the prediction using the resulting model
preds = model.predict(test_pool)
model.save_model('cat_boost_model')
preds_train = model.predict(train_pool_preds)
import math
print(preds)
print('MSE test: {};  train: {}'.format(int(mean_squared_error(y_test, preds)), int(mean_squared_error(y_train, preds_train))))
print('MAE test: {}; train: {}'.format(int(mean_absolute_error(y_test, preds)), int(mean_absolute_error(y_train, preds_train))))

0:	learn: 33672.6497549	total: 1.58s	remaining: 14.3s
1:	learn: 29107.7266087	total: 3.07s	remaining: 12.3s
2:	learn: 25997.1930694	total: 4.62s	remaining: 10.8s
3:	learn: 22614.0688174	total: 6.05s	remaining: 9.08s
4:	learn: 21079.5840769	total: 7.51s	remaining: 7.51s
5:	learn: 19386.7249543	total: 8.97s	remaining: 5.98s
6:	learn: 18099.6546366	total: 10.6s	remaining: 4.54s
7:	learn: 17236.5482264	total: 12.1s	remaining: 3.01s
8:	learn: 16156.5930248	total: 13.6s	remaining: 1.51s
9:	learn: 15727.3073929	total: 15.2s	remaining: 0us
[15102.61101507 30436.46292387 13364.45701088 ... 22138.14367772
 85425.53046766 51599.17293176]
MSE test: 996668961;  train: 262532578
MAE test: 13835; train: 9418


In [12]:
from_file = CatBoostRegressor()

from_file.load_model("cat_boost_model")

preds = from_file.predict(test_pool)

In [13]:
print(preds)
print('MSE test: {};  train: {}'.format(int(mean_squared_error(y_test, preds))), int(mean_squared_error(y_train, preds_train)))
print('MAE test: {}; train: {}'.format(int(mean_absolute_error(y_test, preds)), int(mean_absolute_error(y_train, preds_train))))

[26040.55170496 32487.2836305  14515.7664526  ... 25885.53009348
 85178.97731012 60771.79941853]


IndexError: tuple index out of range