In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import os

my_seed = 59

In [15]:
csvs_folder = '/kaggle/input/cian-big-three/cian_data'
csvs_paths = [os.path.join(csvs_folder, csv) for csv in os.listdir(csvs_folder)]
ds_full = pd.read_csv(csvs_paths[0], delimiter=';')

for csv in csvs_paths[1:]:
    ds_part = pd.read_csv(csv, delimiter=';')
    ds_full = pd.concat([ds_full, ds_part], axis=0, ignore_index=True)

print(len(ds_full))

17107


In [16]:
ds_full.reset_index()
ds_full.head()

Unnamed: 0,author,author_type,url,location,deal_type,accommodation_type,floor,floors_count,rooms_count,total_meters,price,district,street,house_number,underground,residential_complex
0,Форум-групп,developer,https://ekb.cian.ru/sale/flat/306151208/,Екатеринбург,sale,flat,25,29,5,128.0,34024538.0,Центр,Центральный жилрайон,,Площадь 1905 года,Форма ЖК
1,РЕПУТАЦИЯ,real_estate_agent,https://ekb.cian.ru/sale/flat/304987190/,Екатеринбург,sale,flat,2,10,5,164.0,13300000.0,Орджоникидзевский,переулок Замятина,28.0,Проспект Космонавтов,
2,PR FLAT,real_estate_agent,https://ekb.cian.ru/sale/flat/315286042/,Екатеринбург,sale,flat,10,10,5,132.6,16100790.0,Центр,Шейнкмана,110.0,Геологическая,
3,TEN девелопмент,developer,https://ekb.cian.ru/sale/flat/313517303/,Екатеринбург,sale,flat,11,16,5,159.7,69135000.0,Центр,Февральской Революции,21.0,Динамо,19/05
4,Кулиговская и Партнёры,real_estate_agent,https://ekb.cian.ru/sale/flat/314597466/,Екатеринбург,sale,flat,21,23,5,162.4,64960000.0,Октябрьский,Тверитина,46.0,Геологическая,


In [17]:
ds_full = ds_full.dropna(axis=0, subset=['price'])
# "location" is important if data contains different cities
redundant_columns = ['author', 'url', 'deal_type', 'accommodation_type']
ds_full = ds_full.drop(redundant_columns, axis=1)
print(len(ds_full))
print(ds_full.describe())

16575
              floor  floors_count   rooms_count  total_meters         price
count  16575.000000  16575.000000  16575.000000  16575.000000  1.657500e+04
mean       7.955053     15.892549      2.624314     87.304992  4.916935e+07
std        7.915672     10.653911      1.310068     60.377137  1.178342e+08
min       -1.000000     -1.000000     -1.000000     10.760000  4.000000e+05
25%        3.000000      8.000000      2.000000     48.000000  9.833106e+06
50%        5.000000     13.000000      3.000000     68.000000  1.520000e+07
75%       11.000000     23.000000      4.000000    104.100000  3.616831e+07
max       82.000000     95.000000      5.000000    979.500000  2.862950e+09


In [18]:
# drop rooms_count == -1
ds_full = ds_full[ds_full['rooms_count'] > 0]
print(len(ds_full))

16517


In [19]:
# fix NaN in cat. features
object_cols = [col for col in ds_full.columns if ds_full[col].dtype == "object"]
for col in object_cols:
    ds_full[col] = ds_full[col].fillna('undef')
    ds_full[col] = ds_full[col].str.strip().str.lower()

In [20]:
X, y = ds_full.drop(['price'], axis=1), ds_full['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=my_seed)
print(len(X_train), len(X_test), len(y_train), len(y_test))

13213 3304 13213 3304


In [21]:
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# high cardinality in some columns
sorted(d.items(), key=lambda x: x[1])

[('location', 3),
 ('author_type', 8),
 ('district', 178),
 ('underground', 364),
 ('residential_complex', 1291),
 ('house_number', 1397),
 ('street', 1915)]

In [22]:
# frequency encoding and ohe

low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 20]
high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))

print(low_cardinality_cols)
print(high_cardinality_cols)

ohe_encoding = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
for col in low_cardinality_cols:
    ohe_cats_array = ohe_encoding.fit_transform(X_train[[col]])
    ohe_cats_df = pd.DataFrame(ohe_cats_array, 
                               columns=[f"{col}_{cat}" for cat in ohe_encoding.categories_[0]],
                               index=X_train.index)
    X_train = pd.concat([X_train, ohe_cats_df], axis=1).drop([col], axis=1)

    ohe_test_cats_array = ohe_encoding.transform(X_test[[col]])
    ohe_test_cats_df = pd.DataFrame(ohe_test_cats_array, 
                                    columns=[f"{col}_{cat}" for cat in ohe_encoding.categories_[0]],
                                    index=X_test.index)
    X_test = pd.concat([X_test, ohe_test_cats_df], axis=1).drop([col], axis=1)

X_train.head()

['author_type', 'location']
['district', 'underground', 'residential_complex', 'house_number', 'street']


Unnamed: 0,floor,floors_count,rooms_count,total_meters,district,street,house_number,underground,residential_complex,author_type_developer,author_type_homeowner,author_type_official_representative,author_type_real_estate_agent,author_type_realtor,author_type_representative_developer,author_type_undef,author_type_unknown,location_екатеринбург,location_москва,location_санкт-петербург
9254,12,51,3,76.0,раменки,мичуринский проспект,56,раменки,"небо — небоскребы на мичуринском, 56",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13081,8,25,1,37.5,октябрьский,undef,5к5,чкаловская,хрустальные ключи жк,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7576,2,13,5,186.5,хамовники,усачева,15а,спортивная,садовые кварталы,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10292,5,33,3,59.6,даниловский,undef,undef,крестьянская застава,эра жк,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13697,8,27,1,35.49,чкаловский,походная,12,ботаническая,небосклоны,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [23]:
# maybe normalize in .value_counts()?..
for col in high_cardinality_cols:
    freq_map = X_train[col].value_counts(normalize=True).to_dict()
    
    X_train[col+'_freqs'] = X_train[col].map(freq_map)
    X_train = X_train.drop(col, axis=1)

    X_test[col+'_freqs'] = X_test[col].map(freq_map)
    X_test = X_test.drop(col, axis=1)

In [24]:
X_train.to_pickle('x_train.pkl') 
y_train.to_pickle('y_train.pkl') 
X_test.to_pickle('x_test.pkl') 
y_test.to_pickle('y_test.pkl') 

In [25]:
print(X_train.columns)
X_train.head()

Index(['floor', 'floors_count', 'rooms_count', 'total_meters',
       'author_type_developer', 'author_type_homeowner',
       'author_type_official_representative', 'author_type_real_estate_agent',
       'author_type_realtor', 'author_type_representative_developer',
       'author_type_undef', 'author_type_unknown', 'location_екатеринбург',
       'location_москва', 'location_санкт-петербург', 'district_freqs',
       'underground_freqs', 'residential_complex_freqs', 'house_number_freqs',
       'street_freqs'],
      dtype='object')


Unnamed: 0,floor,floors_count,rooms_count,total_meters,author_type_developer,author_type_homeowner,author_type_official_representative,author_type_real_estate_agent,author_type_realtor,author_type_representative_developer,author_type_undef,author_type_unknown,location_екатеринбург,location_москва,location_санкт-петербург,district_freqs,underground_freqs,residential_complex_freqs,house_number_freqs,street_freqs
9254,12,51,3,76.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.022705,0.002498,0.000605,0.001741,0.002195
13081,8,25,1,37.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.015591,0.077348,0.000151,0.000227,0.195565
7576,2,13,5,186.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.022251,0.011504,0.003708,0.002043,0.003633
10292,5,33,3,59.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.013169,0.00106,0.000908,0.243094,0.195565
13697,8,27,1,35.49,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.033225,0.043064,0.001211,0.013093,0.001287
