In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
import re


In [18]:
# Load the listings, discard the columns that are not used further on,
# and replace every missing value with the placeholder string 'NA'.
data = (
    pd.read_csv('nlp.csv')
    .drop(columns=['price_uah', 'flat_id', 'date_created'])
    .fillna('NA')
)


In [19]:
# Peek at the description text of the row labelled 1.
data.loc[1, 'description']

'срочно продаваться квартира хороший район окно пластиковый потолок стена выравнивать ванная санузел современный плитка квартира чистый ухоженный балкон застеклять стояк поменять пол ламинат ряд базар вокзал транспортный развязка'

In [20]:
from joblib import load, dump

# Restore the persisted text-representation objects from disk.
# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted source.
tfidf, svd = map(
    load,
    ('text_representation_tfidf.joblib', 'text_representation_svd.joblib'),
)

In [21]:
from stop_words import get_stop_words
from pymystem3 import Mystem

stop_words_russian = get_stop_words('russian')
stop_words_ukr = get_stop_words('ukrainian')
# Merge both stop-word collections once, so each token needs a single hashed
# membership test instead of being checked against two collections per token.
stop_words_all = frozenset(stop_words_russian) | frozenset(stop_words_ukr)
mystem = Mystem()


def _normalize_description(text):
    """Lowercase and lemmatize ``text`` and keep only the informative lemmas.

    A lemma is kept when it is not a Russian or Ukrainian stop word, consists
    solely of alphabetic characters, and is longer than two characters.

    Args:
        text: description string of a single listing.

    Returns:
        The kept lemmas joined by single spaces.
    """
    lemmas = mystem.lemmatize(text.lower())
    return ' '.join(
        lemma for lemma in lemmas
        if lemma not in stop_words_all and lemma.isalpha() and len(lemma) > 2
    )


data['description'] = data['description'].apply(_normalize_description)

# NOTE(review): fit_transform re-fits tfidf and svd on this data. If they are
# the already-fitted artifacts loaded via joblib, the features produced here
# may differ from what the saved files yield — confirm whether transform()
# was intended.
X = tfidf.fit_transform(data.description)
X = svd.fit_transform(X)

In [22]:
# Wrap the reduced text representation in a DataFrame (default integer index
# and column labels) and append its columns to the listing features.
X = pd.DataFrame(data=X)
data = pd.concat(objs=[data, X], axis='columns')

In [23]:
categorical_cols = ['type_of_proposal', 'city_name', 'heating_type', 'walls_type']

# Rows exceeding any of these thresholds are removed before training.
outlier_mask = (
    (data['price_usd'] > 1000000)
    | (data['total_area'] > 600)
    | (data['living_area'] > 200)
    | (data['kitchen_area'] > 100)
    | (data['floor'] > 40)
    | (data['number_of_rooms'] > 6)
)
data = data.drop(data[outlier_mask].index)

# Separate the regression target from the feature columns.
target = data['price_usd']
data = data.drop('price_usd', axis=1)


def _extract_year(raw_value):
    """Return the first standalone run of digits in ``raw_value`` as an int.

    The value is stringified first, so numbers and free-form text are handled
    alike. When no digits are present the sentinel -1 is returned.

    The result is always an ``int``; previously matches were left as ``str``
    while the sentinel was an ``int``, giving the column mixed types.
    """
    # One search replaces evaluating the same findall twice per value.
    match = re.search(r'\b\d+\b', str(raw_value))
    return int(match.group()) if match else -1


# preprocessing steps
data['year_of_construction'] = data['year_of_construction'].apply(_extract_year)


In [24]:
# The raw text column is no longer needed once its numeric representation
# has been appended; remove it and preview the resulting frame.
data = data.drop(columns=['description'])
data.head()

Unnamed: 0,type_of_proposal,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,...,90,91,92,93,94,95,96,97,98,99
0,от собственника,Винница,95.1,51.6,21.2,7,12,3,2017,индивидуальное,...,-0.013085,0.006626,-0.003804,-0.00989,-0.024501,-0.009227,-0.015793,-0.002021,0.014442,-0.004673
1,от посредника,Винница,63.0,40.0,8.0,5,9,3,-1,централизованное,...,0.038001,0.014788,-0.00659,0.032128,-0.00058,-0.010669,0.017705,0.010321,-0.000563,-0.009511
2,от посредника,Винница,38.0,18.0,9.0,1,5,1,2014,индивидуальное,...,0.00692,-0.015672,0.001417,-0.023127,0.001304,-0.015824,6.8e-05,-0.014232,-0.028762,0.01564
3,от представителя хозяина (без комиссионных),Харьков,95.4,50.0,0.0,13,14,3,-1,индивидуальное,...,-0.022783,-0.019228,0.029614,-0.025937,0.051994,0.033629,-0.016136,-0.017017,0.052564,0.034505
4,от представителя хозяина (без комиссионных),Винница,77.0,40.0,14.0,3,5,2,2019,без отопления,...,0.004494,-0.024551,0.036647,-0.014925,0.007343,-0.01909,0.021563,0.054567,-0.041486,0.052118


In [14]:
# Hold out 20% of the rows for testing; shuffling is seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [8]:
# Alternative names for the split; each alias refers to the same object.
train_data, train_label = x_train, y_train
test_data, test_label = x_test, y_test
x_train.head()

Unnamed: 0,type_of_proposal,city_name,total_area,living_area,kitchen_area,floor,total_number_of_floors,number_of_rooms,year_of_construction,heating_type,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
25879,от посредника,Днепропетровск,93.0,0.0,9.0,7,9,5,-1,централизованное,...,-0.005772,0.009538,0.004673,-0.003542,0.014862,-0.009815,-0.023816,0.01914,-0.010819,0.013913
7079,от посредника,Винница,78.0,0.0,0.0,7,9,2,-1,индивидуальное,...,-0.031146,-0.02524,0.01,-0.000772,-0.008911,-0.010001,-0.001201,0.002092,-0.0021,-0.02375
29157,от посредника,Ирпень,37.1,15.0,8.9,4,5,1,-1,централизованное,...,0.001506,-0.002538,0.002003,-0.00192,-0.000183,0.00177,-0.000552,0.000372,-0.001669,0.000929
18283,,Киев,75.0,46.0,15.0,19,25,2,-1,централизованное,...,0.003002,0.000308,-0.027143,-0.006431,0.025086,0.00196,0.004671,-0.002525,-0.009245,-0.008934
8244,от посредника,Буча,46.95,18.61,14.62,7,16,1,-1,централизованное,...,-0.000433,0.000163,0.002833,-0.001838,1e-06,-0.00145,0.001595,-0.000874,0.001581,-0.000457


In [27]:
import numpy as np
from catboost import Pool, CatBoostRegressor

# Build the training Pool from the feature frame and the target; the four
# listed columns are passed to CatBoost as categorical features.
# NOTE(review): the pool is built from all of `data`/`target`, not from the
# x_train/y_train split — confirm that this is intended.
train_pool = Pool(
    data=data,
    label=target,
    cat_features=['type_of_proposal', 'city_name', 'heating_type', 'walls_type'],
)

# Pools for the train/test split are currently disabled:
#train_pool_preds = Pool(train_data,
#                  cat_features=['type_of_proposal','city_name', 'heating_type', 'walls_type'])

#test_pool = Pool(test_data,
#                 cat_features=['type_of_proposal', 'city_name', 'heating_type', 'walls_type'])

# Training configuration: RMSE loss, 10 iterations, depth 16, learning rate 1.
model = CatBoostRegressor(
    loss_function='RMSE',
    learning_rate=1,
    depth=16,
    iterations=10,
)
# Fit on the pool, then write the trained model to disk.
model.fit(train_pool)
model.save_model('cat_boost_model_with_nlp')

0:	learn: 39191.8900014	total: 21.5s	remaining: 3m 13s
1:	learn: 28838.5182833	total: 42.3s	remaining: 2m 49s
2:	learn: 24051.3706985	total: 1m 3s	remaining: 2m 28s
3:	learn: 20725.6338938	total: 1m 25s	remaining: 2m 7s
4:	learn: 18238.1658053	total: 1m 46s	remaining: 1m 46s
5:	learn: 16620.1569109	total: 2m 7s	remaining: 1m 25s
6:	learn: 14632.4400159	total: 2m 29s	remaining: 1m 4s
7:	learn: 12476.9659552	total: 2m 52s	remaining: 43s
8:	learn: 11941.2475799	total: 3m 16s	remaining: 21.8s
9:	learn: 11439.0844219	total: 3m 40s	remaining: 0us
