## Предобработка данных и обучение модели

In [119]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer,  KNNImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error, make_scorer, f1_score, accuracy_score
from transliterate import translit
import xgboost as xgb
import heapq
import geojson
import json
import pickle
from catboost import CatBoostRegressor
import catboost as cb
from similar_text import similar_text

In [27]:
train = pd.read_csv('data/train.csv', parse_dates=['timestamp'])
test = pd.read_csv('data/test.csv', parse_dates=['timestamp'])
macro = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])

In [28]:
id_test = test['id']

train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)
num_train = len(train)

In [29]:
#Очистка данных
print('Data Clean...')
bad_index = train[train.life_sq > train.full_sq].index
train.loc[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.loc[equal_index, "life_sq"] = test.loc[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.loc[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.loc[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.loc[kitch_is_build_year, "build_year"] = train.loc[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.loc[bad_index, "full_sq"] = np.NaN
bad_index = train[(train.kitch_sq / train.full_sq > 0.6)].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq / test.full_sq > 0.6)].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.loc[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.loc[bad_index, "build_year"] = np.NaN
bad_index = train[train.build_year > 2019].index
train.loc[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index
train.loc[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index
test.loc[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713, 29172]
train.loc[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.loc[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.loc[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.loc[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.loc[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.loc[bad_index, "max_floor"] = np.NaN
train.floor.describe(percentiles= [0.9999])
bad_index = [23584]
train.loc[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.loc[bad_index, "state"] = np.NaN
test.state.value_counts()

# Очитка выбросов. Отношение стоимости к общей площади недвижимости.
train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 40000]


Data Clean...


In [30]:
y = train['price_doc'].values
# train.drop(['price_doc'], axis=1, inplace=True)
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek
train['year'] = train.timestamp.dt.year

train['rel_floor'] = train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = train['kitch_sq'] / train['full_sq'].astype(float)

In [31]:
test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek
test['year'] = test.timestamp.dt.year

test['rel_floor'] = test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = test['kitch_sq'] / test['full_sq'].astype(float)

In [32]:
train['kindergarten_closely'] = train.kindergarten_km.apply(lambda x : 1 if x < 0.5 else 0)
test['kindergarten_closely'] = test.kindergarten_km.apply(lambda x : 1 if x < 0.5 else 0)

In [33]:
train['railroad_closely'] = train.railroad_km.apply(lambda x : 1 if x < 1.5 else 0)
test['railroad_closely'] = test.railroad_km.apply(lambda x : 1 if x < 1.5 else 0)

In [34]:
train['public_transport_station_closely'] = train.public_transport_station_km.apply(lambda x : 1 if x < 0.5 else 0)
test['public_transport_station_closely'] = test.public_transport_station_km.apply(lambda x : 1 if x < 0.5 else 0)

In [35]:
train['metro_closely'] = train.metro_min_walk.apply(lambda x : 1 if x < 17 else 0)
test['metro_closely'] = test.metro_min_walk.apply(lambda x : 1 if x < 17 else 0)

In [36]:
train['school_closely'] = train.school_km.apply(lambda x : 1 if x < 0.5 else 0)
test['school_closely'] = test.school_km.apply(lambda x : 1 if x < 0.5 else 0)

In [37]:
train['park_closely'] = train.park_km.apply(lambda x : 1 if x < 0.5 else 0)
test['park_closely'] = test.park_km.apply(lambda x : 1 if x < 0.5 else 0)

In [38]:
train['water_closely'] = train.water_km.apply(lambda x : 1 if x < 0.5 else 0)
test['water_closely'] = test.water_km.apply(lambda x : 1 if x < 0.5 else 0)

In [39]:
feature_forms = ['build_year', 'sub_area', 'full_sq', 'kitch_sq', 'life_sq', 'num_room', 'floor', 'max_floor',
                'month', 'dow', 'year', 'rel_floor', 'rel_kitch_sq', 'state', 'kindergarten_closely', 'railroad_closely',
                'public_transport_station_closely', 'metro_closely', 'school_closely', 'park_closely', 'water_closely']

In [40]:
feature_forms_classifier = ['build_year', 'full_sq', 'kitch_sq', 'life_sq', 'num_room', 'floor', 'max_floor',
                'rel_floor', 'rel_kitch_sq', 'state', 'kindergarten_closely', 'railroad_closely',
                'public_transport_station_closely', 'metro_closely', 'school_closely', 'park_closely', 'water_closely', 'price_doc']

In [41]:
num_train = len(train)
num_train

28827

In [42]:
df_all = pd.concat([train, test])
df_all = pd.merge_ordered(df_all, macro, on='timestamp', how='left')
df_all.drop(['timestamp'], axis=1, inplace=True)

##### Датасеты для регрессии и классификации

In [97]:
for c in df_all.columns:
    if df_all[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(df_all[c].values))
        df_all[c] = lbl.transform(list(df_all[c].values))

# Датасет для предсказания районов
df_all_classifier = df_all.copy() 

# Датасет со всеми признаками
df_train = df_all[:num_train]
df_test = df_all[num_train:]

# Датасет с отборными признаками
df_selected_features = df_all.copy()
df_selected_features = df_selected_features[feature_forms]
df_train_selected_features = df_selected_features[:num_train]
df_test_selected_features = df_selected_features[num_train:]


## Регрессия. Обучение на всех данных.

In [65]:
xgb_params = {
    'n_estimators' : 300,
    'eta': 0.05,
    'max_depth': 5,
    'objective': 'reg:linear',
    'eval_metric' : 'rmse',
    'silent': 1
}

In [66]:
num_boost_rounds = 422
# model = xgb.XGBRegressor(**xgb_params, )

In [67]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(columns=['price_doc']), y, test_size=0.2, shuffle=False)

In [68]:
train_matrix_all_features = xgb.DMatrix(X_train, np.log1p(y_train), feature_names=X_train.columns.values)
test_matrix_all_features = xgb.DMatrix(X_test, feature_names=X_test.columns.values)

In [70]:
model_reg_all_features = xgb.train(xgb_params, train_matrix_all_features, num_boost_round=num_boost_rounds)
pred_reg_all_features = np.exp(model_reg_all_features.predict(test_matrix_all_features))

Parameters: { "n_estimators", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [86]:
rmsle = np.sqrt(mean_squared_log_error(y_test, pred_reg_all_features))

print('RMSLE: {:.3f}'.format(rmsle))

RMSLE: 0.255


#### Обучим на всех данных и сделаем саббмит на Kaggle

In [90]:
columns_for_subbmit = df_train.drop(columns=['price_doc']).columns.values
all_df_matrix_all_features = xgb.DMatrix(df_train.drop(columns=['price_doc']), np.log1p(y), feature_names=columns_for_subbmit)
test_df_matrix_all_features = xgb.DMatrix(df_test.drop(columns=['price_doc']), feature_names=columns_for_subbmit)

In [92]:
model_for_subbmit_all_features = xgb.train(xgb_params, all_df_matrix_all_features, num_boost_round=num_boost_rounds)
pred_for_subbmit_all_features = np.exp(model_reg_all_features.predict(test_df_matrix_all_features))

Parameters: { "n_estimators", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [94]:
submission = pd.DataFrame({'id': id_test, 'price_doc': pred_for_subbmit_all_features})
submission.to_csv('XGB_all_features.csv', index=False)

##### Получили score на Kaggle

In [113]:
print('Public score: 0.32513, Private Score: 0.32491, Top 59%')

Public score: 0.32513, Private Score: 0.32491, Top 59%


### Попробуем Catboost и GridSearch

In [201]:
params_cat = {'iterations' : [800, 1000],
          'learning_rate' : np.arange(0.04, 0.07, 0.003),
          'depth' : [6]}

In [202]:
train_dataset_all_features = cb.Pool(X_train, np.log1p(y_train))
test_dataset_all_features = cb.Pool(X_test, np.log1p(y_test))

In [203]:
model_cat_all_features = CatBoostRegressor(loss_function='RMSE', silent=True)

In [None]:
model_cat_all_features.grid_search(params_cat, train_dataset_all_features, plot=True)

In [154]:
best_params_cat = model_cat_all_features.get_all_params

In [153]:
pred_reg_cat_all_features = np.exp(model_cat_all_features.predict(test_dataset_all_features))
rmsle_cat = np.sqrt(mean_squared_log_error(y_test, pred_reg_cat_all_features))

print('RMSLE: {:.3f}'.format(rmsle))

RMSLE: 0.254


#### Обучим на всех данных и сделаем subbmit

In [155]:
train_dataset_all_features_cat = cb.Pool(df_train.drop(columns=['price_doc']), np.log1p(y))
test_dataset_all_features_cat = cb.Pool(df_test.drop(columns=['price_doc']))

In [190]:
best_cat_params = {
    'iterations': 1000,
    'learning_rate': 0.03,
    'max_depth': 6,
    'loss_function': 'RMSE',
    'silent': True
}

In [191]:
best_cat_model = CatBoostRegressor(**best_cat_params)

In [192]:
best_cat_model.fit(train_dataset_all_features_cat)

<catboost.core.CatBoostRegressor at 0x226b487aeb0>

In [193]:
pred_reg_cat_all_features_for_subbmit = np.exp(best_cat_model.predict(test_dataset_all_features_cat))

In [194]:
submission = pd.DataFrame({'id': id_test, 'price_doc': pred_reg_cat_all_features_for_subbmit})
submission.to_csv('CatB_all_features.csv', index=False)

##### Получили score на Kaggle

In [206]:
print('Public score: 0.32349, Private Score: 0.32633, Top 59%')

Public score: 0.32349, Private Score: 0.32633, Top 59%


### Регрессия. Обучение на отборных признаках.

In [99]:
X_train_sf, X_test_sf, y_train_sf, y_test_sf = train_test_split(df_train_selected_features, y, 
                                                    test_size=0.2, shuffle=False)

In [100]:
train_matrix_selected_features = xgb.DMatrix(X_train_sf, np.log1p(y_train_sf), feature_names=feature_forms)
test_matrix_selected_features = xgb.DMatrix(X_test_sf, feature_names=feature_forms)

In [106]:
model_reg_selected_features = xgb.train(xgb_params, train_matrix_selected_features, num_boost_round=num_boost_rounds)
pred_reg_selected_features = np.exp(model_reg_selected_features.predict(test_matrix_selected_features))

Parameters: { "n_estimators", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [107]:
rmsle_sf = np.sqrt(mean_squared_log_error(y_test_sf, pred_reg_selected_features))
print('RMSLE с отобранными признаками: {:.3f}'.format(rmsle_sf))

RMSLE с отобранными признаками: 0.267


##### Опять обучим на всех данных и сделаем саббмит на Kaggle

In [108]:
df_matrix_selected_features = xgb.DMatrix(df_train_selected_features, np.log1p(y), feature_names=feature_forms)
test_df_matrix_selected_features = xgb.DMatrix(df_test_selected_features, feature_names=feature_forms)

In [109]:
model_for_subbmit_selected_features = xgb.train(xgb_params, df_matrix_selected_features, num_boost_round=num_boost_rounds)
pred_for_subbmit_selected_features = np.exp(model_for_subbmit_selected_features.predict(test_df_matrix_selected_features))

Parameters: { "n_estimators", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [110]:
submission_sf = pd.DataFrame({'id': id_test, 'price_doc': pred_for_subbmit_selected_features})
submission_sf.to_csv('XGB_selected_features.csv', index=False)

##### Получили score на Kaggle

In [112]:
print('Public score: 0.34180, Private Score: 0.34673, Top 78%')

Public score: 0.34180, Private Score: 0.34673, Top 78%


In [112]:
# Выгрузка модели
# pickle.dump(model_reg, open('model.pkl', "wb"))

## Классификация районов

##### Датасет для классификации

In [26]:
y_classifier = df_all_classifier['sub_area']
X_classifier = df_all_classifier[feature_forms_classifier]

In [27]:
X_classifier.loc[num_train:, 'price_doc'] = pred_reg

In [29]:
X_classifier.shape, y_classifier.shape

((36489, 18), (36489,))

In [30]:
xgb_params_cl = {
    'n_estimators' : 300,
    'learning_rate': 0.05,
    'max_depth': 5,
    'n_jobs' : -1,
    'silent': 1
}

In [33]:
classifier_model = xgb.XGBClassifier(**xgb_params_cl)

In [38]:
X_train_cl, X_test_cl, y_train_cl, y_test_cl = train_test_split(X_classifier, y_classifier, test_size=0.2)

In [143]:
classifier_model.fit(X_train_cl, y_train_cl.values)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=300,
              n_jobs=-1, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [45]:
# Датасет для классификации районов
# X_classifier.to_csv('data/classification_data.csv', index=False)

In [34]:
# Обучение на всех данных
# classifier_model.fit(X_classifier, y_classifier.values)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=300,
              n_jobs=-1, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [35]:
# Выгрузка модели для классификаци
# pickle.dump(classifier_model, open('model_cl.pkl', "wb"))

##### Вероятности классов для каждого объекта

In [40]:
pred_c_probs = classifier_model.predict_proba(X_test_cl)

In [39]:
pred_c = classifier_model.predict(X_test_cl)

In [122]:
f_el = pred_c_probs[147]
dict_probs = {idx : el for idx, el in enumerate(f_el)}

##### Топ 5 подходящих районов по указанным параметрам

In [123]:
top5_sub_areas = heapq.nlargest(5, dict_probs, key=dict_probs.get)
top5_sub_areas

[134, 123, 142, 50, 36]

In [124]:
lbl.inverse_transform(top5_sub_areas)

array(['Veshnjaki', 'Solncevo', 'Zapadnoe Degunino', 'Kuncevo',
       'Jaroslavskoe'], dtype='<U30')

##### F1 score

In [148]:
f1_score(y_test_cl, pred_c, average='weighted')

0.5108774134462816

In [149]:
res_c = pd.DataFrame([y_test_cl.values,pred_c]).T
res_c.rename({0 : 'test', 1 : 'pred'}, axis=1, inplace=True)

In [150]:
res_c.head(15)

Unnamed: 0,test,pred
0,110,20
1,84,101
2,9,136
3,133,133
4,109,15
5,17,17
6,100,100
7,38,38
8,108,127
9,27,27


### Сопоставление районов

In [25]:
with open('map/mo.geojson', encoding='utf-8') as f:
    gj = geojson.load(f)

In [26]:
names = []
for name in gj['features']:
    names.append(name['properties']['NAME'])
names.remove('"Мосрентген"')
names.append('Мосрентген')

In [27]:
names_areas = [lbl.inverse_transform([x])[0] for x in np.arange(0,145)]
names_areas = [translit(x, 'ru') for x in names_areas]
names_areas = pd.Series(names_areas)
names_areas = names_areas.apply(lambda x : x[10:] if x.startswith('Поселение') else x)
names_areas = list(names_areas)

In [28]:
len(names), len(names_areas)

(146, 145)

In [29]:
similar_text('Аэропорт', 'Айеропорт')

82

In [34]:
dict_similar_areas = {}
for area in names_areas:
    dict_similar_areas[area] = [similar_text(area, i) for i in names]

In [35]:
for k, v in dict_similar_areas.items():
    dict_similar_areas[k] = np.argmax(v)

In [36]:
for k, v in dict_similar_areas.items():
    dict_similar_areas[k] = names[v]

In [39]:
pd.set_option('display.max_rows', 150)

In [42]:
areas_df = pd.DataFrame(dict_similar_areas.items(), columns=['df_areas', 'json_areas'])
areas_df['label'] = np.arange(0,145)

In [43]:
set(areas_df.json_areas) ^ set(names)

{'Бабушкинский',
 'Войковский',
 'Клёновское',
 'Марьино',
 'Мещанский',
 'Хорошёвский'}

In [46]:
areas_df.loc[5 , 'json_areas'] = 'Бабушкинский'
areas_df.loc[31 , 'json_areas'] = 'Хорошёвский'
areas_df.loc[60 , 'json_areas'] = 'Марьино'
areas_df.loc[63 , 'json_areas'] = 'Мещанский'
areas_df.loc[136 , 'json_areas'] = 'Войковский'

In [69]:
lbl.inverse_transform([136])

array(['Vojkovskoe'], dtype='<U30')

In [71]:
areas_df.to_csv('data/areas_df.csv', index=False)

In [74]:
areas_df[areas_df.json_areas == 'Хорошёвский']['label'].values[0]

31

In [76]:
areas_df.loc[95 , 'json_areas']

'Новофёдоровское'