Используя исходные или очищенные данные, сформируйте предсказание класса объявления из множества exposition_test.tsv.gz

Обязательно нужно использовать одну или несколько моделей кластеризации. Дополнительно можно использовать решающие деревья, CatBoost, LightGBM и XGBoost.

Подсказка: для использования day_mean в классификации/кластеризации потребуется его сформировать для тестовых данных. Это можно сделать либо при помощи других моделей (два этапа классификации), либо построив линейную модель прогноза day_mean от count_day.

Данные:
* https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_train.tsv.gz
* https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_test.tsv.gz
* https://video.ittensive.com/machine-learning/hacktherealty/data/metro.utf8.json
* https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_sample_submisson.tsv

Итоговый файл с кодом (.py или .ipynb) выложите в github с портфолио.

# ------------------------------

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from transliterate import translit
from tqdm import notebook
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD
from sklearn.neighbors import LocalOutlierFactor

from catboost import Pool, CatBoostClassifier
from xgboost import XGBClassifier


In [2]:
def data_preproccesing (data):
# add total items per day
    data_day_count = data.groupby("day").count()["build_year"]
    data["day_count"] = data["day"].apply(lambda x:data_day_count.loc[x])
# approximate values (clean-up)
    data.loc[data.build_year == 0, 'build_year'] = np.NaN
    data['build_year'] = data['build_year'].fillna((data.groupby(['building_series_id'])['build_year'].transform('median')))
    data.loc[data['build_year'].isna(), 'build_year'] = data['build_year'].mean()
    data['build_year'] = data['build_year'].astype(np.uint16)
    if 'has_elevator' in data.columns:
# elevator for 6+ floors
        data.loc[(data.has_elevator==0) & (data.floor>5), 'has_elevator'] = 1
# fix living area
    data.loc[data.living_area == 0, 'living_area'] = np.NaN
    data['living_area'] = data['living_area'].fillna((data.groupby(['rooms'])['living_area'].transform('median')))
# fix price
    data.loc[data.price<100, 'price'] *= 1000
    data.loc[data.price<1000, 'price'] *= 60
    if 'floors_total' in data.columns:
# fix celing height
        data.loc[(data.ceiling_height<2) | (data.ceiling_height>5), 'ceiling_height'] = np.NaN
        data['ceiling_height'] = data['ceiling_height'].fillna(data.groupby(['building_series_id'])['ceiling_height'].transform('median'))
        data.loc[data['ceiling_height'].isna(), 'ceiling_height'] = data['ceiling_height'].mean()
# enrich data, % floor
        data['floor'] = data['floor'] / data["floors_total"]
# locality, village/region/moscow/metro
    if 'locality_name' in data.columns:
        data['loctype_village'] = (data['locality_name'].str.match(pat = 'городок|деревня|ДНП|поселок|посёлок|село|СНТ|товарищество|хутор')).astype(np.uint8)
        data['loctype_moscow'] = (data.locality_name == 'Москва').astype(np.uint8)
        data['loctype_region'] = ((data.loctype_village == 0) & (data.loctype_moscow == 0)).astype(np.uint8)
    if "site_id" in data.columns:
        data = data.drop(['site_id', 'main_image', 'area', 'building_id', 'unified_address'], axis=1)
    if 'target_string' in data.columns:
        data = data.drop(['target_string'], axis=1)
# processing date
    if 'day' in data.columns:
        data['day'] = pd.to_datetime(data['day'])
        data['year'] = data['day'].dt.year
        data['month'] = data['day'].dt.month
        data['week'] = data['day'].dt.week
        data['dow'] = data['day'].dt.dayofweek
        data['dom'] = data['day'].dt.day
        data['doy'] = data['day'].dt.dayofyear
        data = data.drop(["day"], axis=1)
# adding holydays, 1-7 Jan, 8 Mar, 1 May, 9 May, 12 Jun, 4 Nov
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2017/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2018/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2019/
# http://www.consultant.ru/law/ref/calendar/proizvodstvennye/2020/
        data['is_holyday'] = ((data['year'] == 2017 &
                                (((data['dom'] > 0) & (data['dom'] < 8) & data['month'] == 1) | 
                                (((data['dom'] == 23) | data['dom'] == 24)) & (data['month'] == 2)) |
                                ((data['dom'] == 8) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 8) | (data['dom'] == 9)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 6) & (data['month'] == 11))) |
                              ((data['year'] == 2018) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 23) & (data['month'] == 2)) |
                                (((data['dom'] == 8) | (data['dom'] == 9)) & (data['month'] == 3)) |
                                ((data['dom'] == 30) & (data['month'] == 4)) |
                                (((data['dom'] == 1) | (data['dom'] == 2) | (data['dom'] == 9)) & data['month'] == 5) |
                                (((data['dom'] == 11) | (data['dom'] == 12)) & (data['month'] == 6)) |
                                ((data['dom'] == 5) & (data['month'] == 11)) |
                                ((data['dom'] == 31) & (data['month'] == 12)))) |
                              ((data['year'] == 2019) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 8) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 2) | (data['dom'] == 3) | (data['dom'] == 9) | (data['dom'] == 10)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 4) & (data['month'] == 11)))) |
                              ((data['year'] == 2020) &
                                (((data['dom'] > 0) & (data['dom'] < 9) & data['month'] == 1) | 
                                ((data['dom'] == 24) & (data['month'] == 2)) |
                                ((data['dom'] == 9) & (data['month'] == 3)) |
                                (((data['dom'] == 1) | (data['dom'] == 4) | (data['dom'] == 5) | (data['dom'] == 11)) & data['month'] == 5) |
                                ((data['dom'] == 12) & (data['month'] == 6)) |
                                ((data['dom'] == 4) & (data['month'] == 11))))).astype(np.uint8)
# one-hot vectors
    if 'year' in data.columns:
        for label in ['year', 'month', 'week', 'dow', 'doy', 'dom', 'renovation',
                      'balcony', 'building_type', 'parking', 'floors_total', 'locality_name']:
            for l in data[label].unique():
                data[label + "_" + translit(str(l), "ru", reversed=True)] = (data[label] == l).astype(np.uint8)
# boolean -> int
    if 'studio' in data.columns:
        for label in ['studio', 'has_elevator', 'expect_demolition', 'is_apartment']:
            data[label] = data[label].astype(np.uint8)
# index (remove id from columns)
    if 'id' in data.columns:
        data = data.set_index(['id'])
    return data

In [3]:
def calc_price (data, group="", label=""):
    if data[group] in price_groups[group][label]:
        return data["price"] / price_groups[group][label][data[group]]
    else:
        return 1

In [4]:
train_data = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz')
train_data = train_data.drop(labels=["doy_108"], axis=1)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,1.904762,3


### Нормализация данных
Приведение всех значений к отрезку [0;1], удалим из данных target

In [5]:
target = train_data["target"]

In [6]:
scaler = StandardScaler()

In [7]:
train_data_mm = pd.DataFrame(scaler.fit_transform(train_data[train_data.columns[:-1]]))

In [8]:
train_data_mm.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.709793,1.115052,1.469889,1.025395,0.378181,-1.60709,0.278301
1,-0.483202,1.115052,-0.861151,-0.679471,-0.217544,0.337784,-0.132179
2,-0.564174,-0.645106,-2.026671,-0.690542,-0.209033,0.553112,-0.256098
3,0.866333,1.115052,1.469889,0.970042,-0.13244,0.583783,-0.050858
4,1.541101,1.115052,1.469889,0.970042,0.250526,-1.475943,0.162128


# K-means

In [9]:
kmeans = KMeans(n_clusters=100, random_state=0, max_iter=200, n_init=10).fit(train_data_mm)

In [10]:
train_data_mm["label"] = kmeans.labels_
train_data_mm["target"] = train_data["target"]

# Модели классификации

### XGBoost = градиентный бустинг деревьев решений
(Получим кластеры по всем данным)
Получим модель XGB

In [11]:
y = train_data_mm["target"]
x = train_data_mm.drop(labels=["label", "target"], axis=1)    

In [12]:
xgb_model = XGBClassifier(max_depth=17, max_features=27,
                          n_estimators=76, min_samples_leaf=20)

In [13]:
xgb_model.fit(x, y)



Parameters: { "max_features", "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=17, max_features=27,
              min_child_weight=1, min_samples_leaf=20, missing=nan,
              monotone_constraints='()', n_estimators=76, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1, ...)

# CatBoost

In [14]:
train_dataset = Pool(data=x, label=y)
catboost_model = CatBoostClassifier(iterations=10, learning_rate=0.57,
                 random_seed=17, depth=6, loss_function="MultiClass",
                 bootstrap_type="MVS", custom_metric="WKappa")

In [15]:
cb_params = {
    "depth": range(5,8),
    'learning_rate': np.arange(0.56,0.59,0.01),
    'l2_leaf_reg': range(1,5),
}

cb_grid = catboost_model.grid_search(cb_params, cv=5, X=x, y=y, verbose=True)

print (cb_grid["params"])

catboost_model= CatBoostClassifier(iterations=100,
                learning_rate=cb_grid["params"]["learning_rate"],
                depth=cb_grid["params"]["depth"],
                l2_leaf_reg=cb_grid["params"]["l2_leaf_reg"],
                random_seed=17, loss_function="MultiClass",
                bootstrap_type="MVS", custom_metric="WKappa")

0:	learn: 1.5479336	test: 1.5493856	best: 1.5493856 (0)	total: 269ms	remaining: 2.42s
1:	learn: 1.5303152	test: 1.5323604	best: 1.5323604 (1)	total: 353ms	remaining: 1.41s
2:	learn: 1.5200658	test: 1.5228151	best: 1.5228151 (2)	total: 444ms	remaining: 1.04s
3:	learn: 1.5117907	test: 1.5143011	best: 1.5143011 (3)	total: 548ms	remaining: 822ms
4:	learn: 1.5082575	test: 1.5109742	best: 1.5109742 (4)	total: 637ms	remaining: 637ms
5:	learn: 1.5060137	test: 1.5090561	best: 1.5090561 (5)	total: 739ms	remaining: 493ms
6:	learn: 1.5036498	test: 1.5064971	best: 1.5064971 (6)	total: 831ms	remaining: 356ms
7:	learn: 1.5022063	test: 1.5052227	best: 1.5052227 (7)	total: 938ms	remaining: 234ms
8:	learn: 1.5010828	test: 1.5042242	best: 1.5042242 (8)	total: 1.03s	remaining: 114ms
9:	learn: 1.5001524	test: 1.5034311	best: 1.5034311 (9)	total: 1.13s	remaining: 0us

bestTest = 1.503431094
bestIteration = 9

0:	loss: 1.5034311	best: 1.5034311 (0)	total: 1.44s	remaining: 50.2s
0:	learn: 1.5473617	test: 1.54

In [16]:
catboost_model.fit(train_dataset)

0:	learn: 1.5409125	total: 160ms	remaining: 15.9s
1:	learn: 1.5224255	total: 302ms	remaining: 14.8s
2:	learn: 1.5103595	total: 443ms	remaining: 14.3s
3:	learn: 1.5056707	total: 579ms	remaining: 13.9s
4:	learn: 1.5033266	total: 765ms	remaining: 14.5s
5:	learn: 1.5006863	total: 922ms	remaining: 14.4s
6:	learn: 1.4987258	total: 1.09s	remaining: 14.5s
7:	learn: 1.4965059	total: 1.27s	remaining: 14.7s
8:	learn: 1.4956740	total: 1.46s	remaining: 14.8s
9:	learn: 1.4948075	total: 1.65s	remaining: 14.9s
10:	learn: 1.4934014	total: 1.84s	remaining: 14.9s
11:	learn: 1.4922293	total: 2.02s	remaining: 14.8s
12:	learn: 1.4916478	total: 2.17s	remaining: 14.5s
13:	learn: 1.4903732	total: 2.32s	remaining: 14.3s
14:	learn: 1.4896515	total: 2.49s	remaining: 14.1s
15:	learn: 1.4889821	total: 2.65s	remaining: 13.9s
16:	learn: 1.4883128	total: 2.8s	remaining: 13.7s
17:	learn: 1.4874646	total: 2.99s	remaining: 13.6s
18:	learn: 1.4870891	total: 3.16s	remaining: 13.5s
19:	learn: 1.4865218	total: 3.34s	remainin

<catboost.core.CatBoostClassifier at 0x1793b121430>

### Обогащение тестовых данных

In [17]:
test_data = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_test.tsv.gz', sep='\t')
test_data.head()

Unnamed: 0,building_series_id,site_id,parking,build_year,expect_demolition,main_image,latitude,total_area,ceiling_height,rooms,...,kitchen_area,day,public,longitude,price,flats_count,building_type,balcony,locality_name,renovation
0,663294,0,UNKNOWN,1971,False,//avatars.mds.yandex.net/get-realty/1900763/ad...,55.795704,36.0,2.64,1,...,0.0,2020-01-25,True,37.602478,40000,80,PANEL,UNKNOWN,Москва,UNKNOWN
1,712125,0,UNKNOWN,1986,False,//avatars.mds.yandex.net/get-realty/1583116/ad...,55.605583,40.0,2.48,1,...,10.0,2019-11-19,True,37.743679,25000,222,PANEL,LOGGIA,Москва,COSMETIC_DONE
2,0,0,UNKNOWN,2014,False,//avatars.mds.yandex.net/get-realty/2124710/ad...,55.92556,25.0,0.0,0,...,0.0,2020-01-11,True,37.862965,19000,179,MONOLIT,LOGGIA,Королёв,COSMETIC_DONE
3,0,0,UNKNOWN,2001,False,//avatars.mds.yandex.net/get-realty/2958378/ad...,55.432522,42.0,0.0,1,...,10.0,2020-01-27,True,37.544224,20000,0,PANEL,LOGGIA,Подольск,COSMETIC_DONE
4,1564812,0,UNKNOWN,2019,False,//avatars.mds.yandex.net/get-realty/2732616/ad...,55.91753,73.300003,2.8,3,...,10.2,2020-03-04,False,37.411098,68000,0,MONOLIT,TWO_LOGGIA,Химки,EURO


Зависимость срока экспозиции от количества объявлений в день. Возможно, имеет смысл просто взять логарифм от числа объявлений в день - и использовать его для кластеризации/классификации

In [18]:
train = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_train.tsv.gz', sep='\t')
train_day_count = train.groupby("day").count()["target"]
train_day_mean = train.groupby("day").mean()["target"]
train["day_count"] = train["day"].apply(lambda x:train_day_count.loc[x])
train["day_mean"] = train["day"].apply(lambda x:train_day_mean.loc[x])

In [19]:
x = np.array(train[train["day_count"]>max(train_day_count)-1]["day_count"]).reshape(-1, 1)
y = train[train["day_count"]>max(train_day_count)-1]["day_mean"]
day_model = LinearRegression().fit(x, y)

In [20]:
test_day_count = test_data.groupby("day").count()["total_area"]
test_data["day_count"] = test_data["day"].apply(lambda x:test_day_count.loc[x])
test_data["day_mean"] = day_model.predict(np.array(test_data["day_count"]).reshape(-1, 1))
test_data.loc[test_data["day_count"]<max(test_data["day_count"]),"day_mean"] = train_day_mean.mean()

In [21]:
test_data.head(30)

Unnamed: 0,building_series_id,site_id,parking,build_year,expect_demolition,main_image,latitude,total_area,ceiling_height,rooms,...,public,longitude,price,flats_count,building_type,balcony,locality_name,renovation,day_count,day_mean
0,663294,0,UNKNOWN,1971,False,//avatars.mds.yandex.net/get-realty/1900763/ad...,55.795704,36.0,2.64,1,...,True,37.602478,40000,80,PANEL,UNKNOWN,Москва,UNKNOWN,352,2.931129
1,712125,0,UNKNOWN,1986,False,//avatars.mds.yandex.net/get-realty/1583116/ad...,55.605583,40.0,2.48,1,...,True,37.743679,25000,222,PANEL,LOGGIA,Москва,COSMETIC_DONE,553,2.931129
2,0,0,UNKNOWN,2014,False,//avatars.mds.yandex.net/get-realty/2124710/ad...,55.92556,25.0,0.0,0,...,True,37.862965,19000,179,MONOLIT,LOGGIA,Королёв,COSMETIC_DONE,381,2.931129
3,0,0,UNKNOWN,2001,False,//avatars.mds.yandex.net/get-realty/2958378/ad...,55.432522,42.0,0.0,1,...,True,37.544224,20000,0,PANEL,LOGGIA,Подольск,COSMETIC_DONE,501,2.931129
4,1564812,0,UNKNOWN,2019,False,//avatars.mds.yandex.net/get-realty/2732616/ad...,55.91753,73.300003,2.8,3,...,False,37.411098,68000,0,MONOLIT,TWO_LOGGIA,Химки,EURO,464,2.931129
5,1564812,0,UNKNOWN,1961,False,//avatars.mds.yandex.net/get-realty/1651606/ad...,55.677845,32.0,0.0,1,...,True,37.564484,40000,112,BRICK,BALCONY,Москва,COSMETIC_DONE,553,2.931129
6,1564812,0,UNKNOWN,2016,False,//avatars.mds.yandex.net/get-realty/2355710/ad...,55.842464,30.0,3.0,0,...,False,37.373302,45000,0,MONOLIT,UNKNOWN,Москва,DESIGNER_RENOVATION,5259,2.931129
7,1564812,0,UNKNOWN,1952,False,//avatars.mds.yandex.net/get-realty/2090636/ad...,55.779575,40.0,3.2,1,...,False,37.706863,40000,115,BRICK,LOGGIA,Москва,EURO,5259,2.931129
8,1564812,0,UNKNOWN,1966,False,//avatars.mds.yandex.net/get-realty/2353363/ad...,55.765087,48.0,2.7,2,...,True,37.657494,110000,68,PANEL,BALCONY,Москва,EURO,485,2.931129
9,663320,0,UNKNOWN,2005,False,//avatars.mds.yandex.net/get-realty/2771165/ad...,55.886742,52.0,2.74,2,...,True,37.647129,40000,235,PANEL,BALCONY,Москва,COSMETIC_DONE,6779,1.392246


In [22]:
test_data = data_preproccesing(test_data)
test_data.head()

  data['week'] = data['day'].dt.week


Unnamed: 0_level_0,building_series_id,parking,build_year,expect_demolition,latitude,total_area,ceiling_height,rooms,floors_total,living_area,...,locality_name_derevnja Kabanovo,locality_name_derevnja Ivojlovo,locality_name_selo Vozdvizhenskoe,locality_name_derevnja Dolgoe Ledovo,locality_name_derevnja Martem'janovo,locality_name_poselok Veshki,locality_name_poselok Radiotsentr,locality_name_derevnja Zhilino,locality_name_poselok Shuvoe,locality_name_derevnja Vorschikovo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13762887891614807236,663294,UNKNOWN,1971,0,55.795704,36.0,2.64,1,12,19.0,...,0,0,0,0,0,0,0,0,0,0
14654451946329972059,712125,UNKNOWN,1986,0,55.605583,40.0,2.48,1,16,20.0,...,0,0,0,0,0,0,0,0,0,0
17449292585625593873,0,UNKNOWN,2014,0,55.92556,25.0,2.7,0,16,12.0,...,0,0,0,0,0,0,0,0,0,0
15597282206699587329,0,UNKNOWN,2001,0,55.432522,42.0,2.7,1,10,20.0,...,0,0,0,0,0,0,0,0,0,0
3718201047023531068,1564812,UNKNOWN,2019,0,55.91753,73.300003,2.8,3,16,45.799999,...,0,0,0,0,0,0,0,0,0,0


Добавление средней цены

In [23]:
price_data = pd.DataFrame(test_data[["locality_name", "price"]])
price_groups = {"locality_name": {
    "median": price_data.groupby(["locality_name"]).median()["price"]
}}

In [24]:
for group in price_groups:
    print ("Processing:", group, end=" ")
    for label in price_groups[group]:
        print (label, end=" ")
        test_data["price_" + group + "_" + label] = test_data.apply(calc_price, axis=1,
                                                                      group=group, label=label)
    print ("")

Processing: locality_name median 


In [25]:
test_data = pd.DataFrame(test_data[['total_area', 'ceiling_height', 'rooms', 'living_area',
                                   'price', 'day_mean', 'price_locality_name_median']])

In [26]:
test_data.to_csv("exposition_test.basic.csv", index=False)

# Формирование предсказаний

In [27]:
test_data = pd.read_csv('exposition_test.basic.csv')

In [28]:
test_data_mm = pd.DataFrame(scaler.transform(test_data))
test_data_mm.head()

Unnamed: 0,0,1,2,3,4,5,6
0,-0.618155,-0.645106,-0.861151,-0.690542,-0.089888,0.005938,-0.168322
1,-0.483202,-1.427399,-0.861151,-0.635189,-0.217544,0.005938,-0.276751
2,-0.989277,-0.351746,-2.026671,-1.078012,-0.268606,0.005938,-0.199947
3,-0.415725,-0.351746,-0.861151,-0.635189,-0.260095,0.005938,-0.174608
4,0.640286,0.137186,1.469889,0.792913,0.148401,0.005938,0.277401


# -----------------------

In [29]:
clusters = train_data_mm.groupby("label").mean()["target"]
y = pd.DataFrame(kmeans.predict(test_data_mm))
y["pred"] = y[0].apply(lambda x:clusters.iloc[x])

In [30]:
y_xgboost = pd.DataFrame(xgb_model.predict(test_data_mm))
y_xgboost["pred"] = y_xgboost[0].apply(lambda x:clusters.iloc[x])

In [31]:
y_catboost = pd.DataFrame(catboost_model.predict(test_data_mm))
y_catboost["pred"] = y_catboost[0].apply(lambda x:clusters.iloc[x])

In [32]:
y = (y + y_xgboost + y_catboost) / 3

In [33]:
y[0] = y[0].apply(lambda n: int(n))

In [34]:
y.head(30)

Unnamed: 0,0,pred
0,31,2.935945
1,29,3.281559
2,18,3.346789
3,31,3.279982
4,30,2.954644
5,25,2.960657
6,29,3.791062
7,26,2.95473
8,28,3.680687
9,26,2.557729


### Загрузка решения

In [None]:
submission = pd.read_csv('https://video.ittensive.com/machine-learning/hacktherealty/E/exposition_sample_submission.tsv', sep='\t')
submission['target'] = np.around(y["pred"])
#submission['target'] = 3
#submission['target'] = np.around(test_data["day_mean"])
submission["target"] = submission["target"].apply(lambda x:max(1, min(x,5))).astype(np.uint8)
submission.to_csv('submission_last.tsv', sep='\t', index=False)