In [1]:
!pip install osmread -q

In [41]:
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import StratifiedKFold, KFold
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import typing
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
import random
from sklearn.cluster import KMeans
from tqdm import tqdm
from itertools import chain
from collections import Counter
import pickle
import math

# Relative deviation |pred - true| / true that the metric tolerates with zero penalty.
THRESHOLD = 0.15
# Multiplier applied to the penalty when the prediction is below the true value.
NEGATIVE_WEIGHT = 1.1
# Random seed shared by the CV splitters, KMeans and the boosting models in this notebook.
seed=47

In [3]:
def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """Penalty for a single prediction based on its relative deviation.

    The deviation is ``(y_pred - y_true) / max(1e-8, y_true)``. Deviations
    within ``THRESHOLD`` cost nothing; outside it the penalty grows
    quadratically and is capped at 9 (times ``NEGATIVE_WEIGHT`` for
    under-predictions) once the deviation reaches ``4 * THRESHOLD``.
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)

    # Inside the tolerance band: no penalty.
    if np.abs(deviation) <= THRESHOLD:
        return 0
    # Far too low: capped, weighted penalty.
    if deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    # Moderately too low: weighted quadratic penalty.
    if deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    # Moderately too high: quadratic penalty.
    if deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    # Far too high (also reached when the deviation is NaN): capped penalty.
    return 9
    
def deviation_metric_one_sample_abs(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """Penalty for a single prediction; same piecewise rule as ``deviation_metric_one_sample``.

    NOTE(review): the ``_abs`` suffix suggests a different variant was intended,
    but the implementation has always been identical to
    ``deviation_metric_one_sample``, so it simply delegates to it.
    """
    return deviation_metric_one_sample(y_true, y_pred)
    
def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """Mean per-sample deviation penalty over the first ``len(y_true)`` positions."""
    penalties = [
        deviation_metric_one_sample(y_true[pos], y_pred[pos])
        for pos in range(len(y_true))
    ]
    return np.array(penalties).mean()

def deviation_metric_arr(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Per-sample deviation penalties, one entry per position of ``y_true``.

    Unlike ``deviation_metric`` the penalties are not averaged: the function
    returns an array (the previous ``-> float`` annotation was wrong).
    Inputs are indexed positionally with ``[n]``, so pass lists or arrays.
    """
    return np.array([
        deviation_metric_one_sample(y_true[n], y_pred[n])
        for n in range(len(y_true))
    ])

In [4]:
# Read the competition tables; low_memory=False makes pandas infer dtypes from whole columns.
csv_options = {'low_memory': False}
train = pd.read_csv('data/train.csv', **csv_options)
test = pd.read_csv('data/test.csv', **csv_options)
sub = pd.read_csv('data/test_submission.csv', **csv_options)
print(train.shape)
train.head()

In [5]:
# Load the pickled node collection (iterated by get_geo_df as objects with .lat/.lon/.tags).
# NOTE(review): pickle.load executes arbitrary code from the file - only load files you produced yourself.
with open('data/concat_nodes.pickle', 'rb') as fin:
    tagged_nodes = pickle.load(fin)

In [6]:
# External point-of-interest tables with `lat` / `lon` coordinate columns.
prisons = pd.read_csv('data/Rus_prisons_coords.csv')
schools = pd.read_csv('data/Rus_schools_final.csv')

# Fit on plain arrays rather than DataFrames: the query frames name their
# longitude column `lng`, and scikit-learn rejects a query whose column names
# differ from the ones seen during fit.
prisons_knn = NearestNeighbors().fit(prisons[['lat', 'lon']].to_numpy())
schools_knn = NearestNeighbors().fit(schools[['lat', 'lon']].to_numpy())

# Preprocessing (Препроцессинг)

In [7]:
# Cities with at most 8 training rows are considered too rare to keep as their own category.
cities_values = pd.DataFrame(train['city'].value_counts()).reset_index()
# Derive the rare-city list from the Series itself: the column names produced by
# value_counts().reset_index() differ between pandas versions.
city_counts = train['city'].value_counts()
cities_values_other = city_counts[city_counts <= 8].index.tolist()

# Replace only rare (or missing) cities with the nearest OSM city name.
# The previous code indexed with `.isin(...).index`, which is the full index,
# so every row was overwritten regardless of the mask.
for frame in (train, test):
    rare_city = frame['city'].isin(cities_values_other) | frame['city'].isna()
    frame.loc[rare_city, 'city'] = frame.loc[rare_city, 'osm_city_nearest_name'].to_numpy()

In [8]:
def floor_recognition(s):
    """Convert a cleaned, lower-cased floor token into an integer floor number.

    Returns:
        * the integer itself for purely numeric tokens ("5" -> 5);
        * a non-positive value for tokens with a leading minus ("-2" -> -2);
        * -1 for tokens containing "подв" (basement);
        * 0 for tokens containing "цок" (ground/semi-basement floor);
        * -100 as the "unknown" sentinel for anything else, including the
          empty string.
    """
    unknown = -100

    # An empty token (e.g. the text consisted only of stop words) used to raise IndexError on s[0].
    if not s:
        return unknown

    if s[0] == "-":
        below = floor_recognition(s[1:])
        # Keep the sentinel intact: negating it used to yield floor 100.
        if below == unknown:
            return unknown
        # A leading minus always means "below ground"; never flip a basement (-1) to +1.
        return -abs(below)

    if s.isdigit():
        return int(s)
    elif "подв" in s:
        return -1
    elif "цок" in s:
        return 0
    else:
        return unknown
    
def floor_parser(row):
    """One-hot encode a raw floor description into five floor-range flags.

    The value is stringified, lower-cased, stripped of the stop words below,
    and only the part before the first "," / "." is passed to
    ``floor_recognition``. The returned ``pd.Series`` has five 0/1 entries for
    the ranges: below 1, 1-5, 6-10, 11-20, above 20. An unrecognised floor
    (sentinel below -99) yields all zeros.
    """
    text = str(row).lower()
    for stop_word in (".0", "этажа", "этаж", "эт", " ", "-й"):
        text = text.replace(stop_word, "")
    first_token = text.replace(".", ",").split(",")[0]
    floor = floor_recognition(first_token)

    # Pick the position of the single flag to raise (None = leave all zeros).
    if floor < -99:
        bucket = None
    elif floor < 1:
        bucket = 0
    elif floor <=5:
        bucket = 1
    elif floor <= 10:
        bucket = 2
    elif floor <= 20:
        bucket = 3
    else:
        bucket = 4

    flags = [0, 0, 0, 0, 0]
    if bucket is not None:
        flags[bucket] = 1
    return pd.Series(flags)

In [9]:
# Default values for missing raw categorical fields, applied to both splits.
for frame in (train, test):
    frame['floor'] = frame['floor'].fillna('1')
    frame['street'] = frame['street'].fillna('no street')

In [10]:
# Cap the area at 15 000 and floor it at 0 in both splits.
for frame in (train, test):
    frame['total_square'] = frame['total_square'].clip(lower=0, upper=15_000)

In [11]:
# Five floor-range indicator columns produced by floor_parser (one row -> five 0/1 flags).
floor_flag_cols = ["floor<0", "floor<=5", "floor<=10", "floor<=20", "floor>20"]
for frame in (train, test):
    frame[floor_flag_cols] = frame["floor"].apply(floor_parser)

In [12]:
# Calendar features, computed separately for each split:
#   month_year_cnt / week_year_cnt - number of rows of the same split that share
#                                    the row's (year, month) / (year, week);
#   month, dow                     - month number and day of week of the date.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

    month_year = (frame.date.dt.month + frame.date.dt.year * 100)
    month_year_cnt_map = month_year.value_counts().to_dict()
    frame['month_year_cnt'] = month_year.map(month_year_cnt_map)

    # `Series.dt.weekofyear` is deprecated and removed in pandas 2; isocalendar()
    # returns the same ISO week number. Cast to float so missing dates stay NaN.
    # NOTE(review): the ISO week is still combined with the calendar year, as before.
    iso_week = frame.date.dt.isocalendar().week.astype('float64')
    week_year = (iso_week + frame.date.dt.year * 100)
    week_year_cnt_map = week_year.value_counts().to_dict()
    frame['week_year_cnt'] = week_year.map(week_year_cnt_map)

    frame['month'] = frame.date.dt.month
    frame['dow'] = frame.date.dt.dayofweek

In [13]:
# Columns whose name contains 'year': the value 1 is treated as missing.
# NOTE(review): this also matches 'month_year_cnt' / 'week_year_cnt' created above - confirm that is intended.
year_like_cols = [name for name in train.columns if 'year' in name]
# Remaining columns whose name contains 'mean_floor': limited to the range [0, 70].
mean_floor_cols = [name for name in train.columns
                   if 'year' not in name and 'mean_floor' in name]

for frame in (train, test):
    for name in year_like_cols:
        frame[name] = frame[name].replace({1: np.nan})
    for name in mean_floor_cols:
        frame[name] = frame[name].clip(0, 70)

In [14]:
# Per-street averages learned on train only; streets absent from train map to NaN.
train_by_street = train.groupby('street')
sq_by_street = train_by_street['total_square'].mean().to_dict()
mean_floor_500_by_street = train_by_street['reform_mean_floor_count_500'].mean().to_dict()
mean_year_500_by_street = train_by_street['reform_mean_year_building_500'].mean().to_dict()

street_lookups = (
    ('square_by_street', sq_by_street),
    ('mean_floor_500_by_street', mean_floor_500_by_street),
    ('mean_year_500_by_street', mean_year_500_by_street),
)
for feature_name, lookup in street_lookups:
    for frame in (train, test):
        frame[feature_name] = frame['street'].map(lookup)

In [15]:
# Number of train rows (with a non-null city) per (street, month); unseen pairs give NaN.
count_order_by_street_month = train.groupby(['street', 'month'])['city'].count().to_dict()
for frame in (train, test):
    frame['count_order_by_street_month'] = pd.Series(
        [count_order_by_street_month.get(key, np.nan)
         for key in zip(frame['street'], frame['month'])],
        index=frame.index,
    )

In [16]:
# Reference point (latitude, longitude in degrees) used for the 'distance_to_kremlin' feature.
kremlin_lat, kremlin_lon = 55.753722, 37.620657

def dist_calc(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points given in degrees (haversine formula).

    The result is expressed in the units of the sphere radius used here
    (6373.0, i.e. roughly the Earth's radius in kilometres).
    """
    R = 6373.0

    phi1, lam1, phi2, lam2 = (math.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    lat_term = math.sin(delta_phi / 2)**2
    lon_term = math.cos(phi1) * math.cos(phi2) * math.sin(delta_lam / 2)**2
    a = lat_term + lon_term
    central_angle = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * central_angle


# Row-wise distance from each object to the Kremlin reference point.
for frame in (train, test):
    frame['distance_to_kremlin'] = frame.apply(
        lambda row: dist_calc(row.lat, row.lng, kremlin_lat, kremlin_lon), axis=1)

In [17]:
# Mean distance (in coordinate space) to the nearest prisons / schools returned by the fitted KNN indexes.
# Query with plain arrays: the indexes were fitted on `lat`/`lon` columns while these
# frames use `lat`/`lng`, and scikit-learn rejects mismatching feature names.
for frame in (train, test):
    coords = frame[['lat', 'lng']].to_numpy()
    frame['prisons_dist'] = prisons_knn.kneighbors(coords)[0].mean(axis=1)
    frame['schools_dist'] = schools_knn.kneighbors(coords)[0].mean(axis=1)

In [18]:
def get_geo_df(tagged_nodes: list,main_tag: str, subcat: str):
    """Collect the nodes that carry ``main_tag`` into a coordinate DataFrame.

    Args:
        tagged_nodes: iterable of objects exposing ``lat``, ``lon`` and a
            ``tags`` mapping.
        main_tag: tag key a node must have to be included.
        subcat: when truthy, keep only nodes whose ``main_tag`` value equals
            ``subcat``; when falsy, keep every node that has the tag.

    Returns:
        DataFrame with columns ``lat``, ``lng``, ``main_tag``, ``local_tag``.
    """
    df = pd.DataFrame([(nd.lat,nd.lon,main_tag,nd.tags[main_tag])
                            for nd in tagged_nodes
                            if main_tag in nd.tags],
                           columns=['lat','lng','main_tag','local_tag'])
    if subcat:
        # Filter by the requested sub-category (it used to be hard-coded to 'cemetery').
        df = df[df['local_tag']==subcat]
    return df
def get_fitted_knn(df):
    """Return a ``NearestNeighbors`` index (n_neighbors=1, radius=5) fitted on ``df``."""
    return NearestNeighbors(n_neighbors=1, radius=5).fit(df)

# Names of externally derived feature columns; appended to used_cols before modelling (currently none).
external_cols = []

In [19]:
# Cities flagged as "million" cities; the flag is 1 when the row's city is in this list.
cities_1m_people = [
    'Москва', 'Санкт-Петербург', 'Новосибирск', 'Екатеринбург', 'Казань',
    'Нижний Новгород', 'Челябинск', 'Самара', 'Омск', 'Ростов-на-Дону',
    'Уфа', 'Красноярск', 'Воронеж', 'Пермь', 'Волгоград',
]

is_million = []
is_not_million = []

for frame in (train, test):
    frame['is_million_city'] = frame['city'].isin(cities_1m_people).astype('int64')

In [20]:
def check_offices(x):
    """Return 1 when any office-count column of row ``x`` exceeds its limit, else 0."""
    limits = (
        ('osm_offices_points_in_0.001', 2),
        ('osm_offices_points_in_0.005', 10),
        ('osm_offices_points_in_0.0075', 15),
        ('osm_offices_points_in_0.01', 20),
    )
    # any() stops at the first exceeded limit, like the original `or` chain.
    return 1 if any(x[column] > limit for column, limit in limits) else 0

# Binary flag: the object is surrounded by many offices (see check_offices).
for frame in (train, test):
    frame['many_offices'] = frame.apply(check_offices, axis=1)

In [21]:
def check_financial_organizations(x):
    """Return 1 when any finance-count column of row ``x`` exceeds its limit, else 0."""
    limits = (
        ('osm_finance_points_in_0.001', 2),
        ('osm_finance_points_in_0.005', 10),
        ('osm_finance_points_in_0.0075', 15),
        ('osm_finance_points_in_0.01', 20),
    )
    # any() stops at the first exceeded limit, like the original `or` chain.
    return 1 if any(x[column] > limit for column, limit in limits) else 0

# Binary flag: the object is surrounded by many financial organisations.
for frame in (train, test):
    frame['many_financial_organizations'] = frame.apply(check_financial_organizations, axis=1)

In [22]:
# Imputation plan: (column, statistic used for filling, apply log(x + 1) first?).
# Train is processed before test for every column, so the statistic used for
# test is always taken from the already transformed and filled train column.
imputation_plan = (
    ('reform_house_population_500', 'median', False),
    ('reform_house_population_1000', 'median', False),
    ('reform_mean_floor_count_1000', 'mean', True),
    ('reform_mean_floor_count_500', 'median', False),
    ('reform_mean_year_building_1000', 'median', False),
    ('reform_mean_year_building_500', 'median', False),
)

for column, statistic, log_first in imputation_plan:
    for frame in (train, test):
        if log_first:
            frame[column] = frame[column].apply(lambda value: np.log(value+1))
        if statistic == 'mean':
            fill_value = train[column].mean()
        else:
            fill_value = train[column].median()
        frame[column] = frame[column].fillna(fill_value)

In [23]:
# Model feature list.
# Built from an ordered list (de-duplicated with dict.fromkeys) instead of a set:
# iterating a set of strings gives a different order on every interpreter run,
# which made the feature order - and therefore the fitted models - irreproducible.
used_cols = list(dict.fromkeys([
    # engineered features
    'distance_to_kremlin', 'is_million_city', 'many_offices', 'many_financial_organizations',
    "floor<0", "floor<=5", "floor<=10", "floor<=20", "floor>20",
    'square_by_street', 'mean_floor_500_by_street', 'mean_year_500_by_street',
    'month_year_cnt', 'week_year_cnt', 'month', 'dow',

    # raw dataset features
    'total_square', 'lng', 'lat', 'osm_city_closest_dist',
    'osm_subway_closest_dist', 'osm_city_nearest_population',
    'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
    'reform_mean_year_building_500', 'osm_transport_stop_closest_dist',
    'osm_crossing_closest_dist', 'osm_crossing_points_in_0.01',
    'reform_house_population_500', 'osm_train_stop_closest_dist',
    'reform_mean_year_building_1000', 'reform_house_population_1000',
    'reform_count_of_houses_1000', 'osm_catering_points_in_0.01',
    'reform_count_of_houses_500', 'osm_transport_stop_points_in_0.01',
    'osm_shops_points_in_0.005', 'osm_culture_points_in_0.01',
    'osm_catering_points_in_0.005', 'osm_transport_stop_points_in_0.0075',
    'osm_amenity_points_in_0.001', 'osm_finance_points_in_0.005',
    'osm_shops_points_in_0.01', 'osm_leisure_points_in_0.01',
    'osm_crossing_points_in_0.0075', 'osm_catering_points_in_0.001',
    'osm_finance_points_in_0.0075', 'osm_shops_points_in_0.0075',
    'osm_shops_points_in_0.001', 'osm_crossing_points_in_0.005',
    'osm_amenity_points_in_0.01', 'osm_finance_points_in_0.01',
    'osm_catering_points_in_0.0075', 'osm_offices_points_in_0.01',
    'osm_leisure_points_in_0.0075', 'osm_offices_points_in_0.0075',
    'osm_healthcare_points_in_0.01', 'osm_culture_points_in_0.0075',
    'osm_culture_points_in_0.005', 'osm_historic_points_in_0.01',
    'osm_building_points_in_0.0075', 'osm_historic_points_in_0.0075',
    'osm_amenity_points_in_0.005', 'osm_leisure_points_in_0.005',
    'osm_hotels_points_in_0.01', 'osm_healthcare_points_in_0.0075',
    'osm_offices_points_in_0.005', 'osm_crossing_points_in_0.001',
    'osm_healthcare_points_in_0.005', 'osm_transport_stop_points_in_0.005',
    'osm_building_points_in_0.01', 'osm_hotels_points_in_0.0075',
    'osm_amenity_points_in_0.0075', 'osm_historic_points_in_0.005',
    'osm_hotels_points_in_0.005', 'osm_finance_points_in_0.001',
    'osm_building_points_in_0.005', 'osm_offices_points_in_0.001',
    'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.005',
    'osm_train_stop_points_in_0.01', 'osm_culture_points_in_0.001',
    'osm_building_points_in_0.001',
])) + ['prisons_dist', 'schools_dist']

In [24]:
# Every categorical column visited by the encoding loop.
all_cat_cols = ['city', 'floor', 'osm_city_nearest_name', 'region', 'street', 'realty_type']
# Columns that get an ordinal-encoded copy (<col>ord_enc) and a pandas 'category' copy (<col>_cat).
cat_cols = ['region', 'city', 'realty_type', 'osm_city_nearest_name', 'street']
# Columns that get one-hot encoded (OHE_<col>_<i>); only membership in this list is checked.
# NOTE(review): 'osm_city_nearest_name' is listed twice - harmless for membership tests, but verify
# whether another column was intended.
cat_ohe = ['region', 'realty_type', 'osm_city_nearest_name', 'floor', 'osm_city_nearest_name']

In [25]:
# Remove one-hot columns left over from a previous execution so the cell can be re-run.
# Each frame is filtered by its OWN columns: the test frame used to be filtered with
# train's (already cleaned) column list, so its stale OHE columns were never dropped.
train = train.drop(columns=train.columns[train.columns.str.startswith('OHE')])
test = test.drop(columns=test.columns[test.columns.str.startswith('OHE')])
for col in all_cat_cols:
    print(col)
    if col in cat_cols:
        # Ordinal codes; fitted on train + test so every category seen in either split has a code.
        oe_temp = OrdinalEncoder()
        oe_temp.fit(pd.concat([train[[col]], test[[col]]]))
        train[col + 'ord_enc'] = oe_temp.transform(train[[col]])
        test[col + 'ord_enc'] = oe_temp.transform(test[[col]])

    if col in cat_ohe:
        # Dense one-hot encoding. scikit-learn renamed `sparse` to `sparse_output`
        # (and later removed `sparse`), so try the new name first.
        try:
            OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        OH_encoder.fit(pd.concat([train[[col]], test[[col]]]))
        train = pd.concat([train, pd.DataFrame(OH_encoder.transform(train[[col]]), index=train.index).add_prefix('OHE_' + col + '_')], axis=1)
        test = pd.concat([test, pd.DataFrame(OH_encoder.transform(test[[col]]), index=test.index).add_prefix('OHE_' + col + '_')], axis=1)

    if col in cat_cols:
        train[col + '_cat'] = train[col].astype('category')
        test[col + '_cat'] = test[col].astype('category')


# Add the one-hot columns to the feature list, skipping any that are already present
# (re-running the cell must not duplicate features).
ohe_cols = train.columns[train.columns.str.startswith('OHE')].tolist()
used_cols = used_cols + [name for name in ohe_cols if name not in used_cols]

# Modeling (Моделирование)

In [26]:
def calc(X, y, test, model, cv, used_cols, oof=False):
    """Cross-validate ``model`` on ``X[used_cols]`` and optionally predict ``test``.

    Args:
        X: feature DataFrame.
        y: targets in the same row order as ``X`` (Series, array or list).
        test: DataFrame to predict when ``oof`` is true.
        model: estimator exposing ``fit``, ``predict`` and ``feature_importances_``.
        cv: splitter exposing ``split(X, y)``.
        used_cols: feature column names.
        oof: when true, store the test predictions of every fold.

    Returns:
        ``(local_probs, feature_importances)``. ``local_probs`` holds one
        ``fold_<i>`` column per fold plus their row mean ``res`` (empty when
        ``oof`` is false); ``feature_importances`` holds per-fold importances
        and their ``average``, sorted in descending order.
    """
    # A set has no stable order; sort it so the feature order is reproducible.
    used_cols = sorted(used_cols) if isinstance(used_cols, (set, frozenset)) else list(used_cols)
    # The splitter yields positions, so re-key y by X's index and slice with .iloc;
    # `y[tdx]` on a Series is a label lookup and breaks for non-default indexes.
    y = pd.Series(np.asarray(y), index=X.index)

    res=[]
    local_probs=pd.DataFrame()
    feature_importances = pd.DataFrame({'feature': used_cols})
    for i, (tdx, vdx) in enumerate(cv.split(X, y)):
        X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
        y_train, y_valid = y.iloc[tdx], y.iloc[vdx]
        model.fit(X_train[used_cols], y_train)
        feature_importances[f'fold_{i}'] = model.feature_importances_
        preds = model.predict(X_valid[used_cols])

        metric = deviation_metric(y_valid.tolist(), preds.tolist())
        res.append(metric)
        print(f'fold_{i+1}: {metric:.4f}')

        if oof:
            local_probs['fold_%i'%i] = model.predict(test[used_cols])

    print('mean:', round(np.mean(res), 4))
    if oof:
        local_probs['res'] = local_probs.mean(axis=1)
    feature_importances['average'] = feature_importances[feature_importances.columns[1:]].mean(axis=1)
    feature_importances = feature_importances.sort_values(by='average', ascending=False)
    return local_probs, feature_importances

In [27]:
def calc_corr(X, y, test, model, cv, used_cols, sample_weight, oof=False):
    """Cross-validate a log-price model trained on ``price_type == 0`` rows with a
    multiplicative correction for ``price_type == 1`` rows.

    For every fold the model is fitted on ``log(y)`` of the training rows with
    ``price_type != 1``. The median relative error of that model on the
    training rows with ``price_type == 1`` becomes ``corr_coef``; predictions
    for ``price_type == 1`` validation rows (and for all ``test`` rows when
    ``oof`` is true) are multiplied by ``1 + corr_coef``.

    Args:
        X: feature DataFrame containing a ``price_type`` column (also used to
            stratify the split).
        y: targets in the same row order as ``X``.
        test: DataFrame to predict when ``oof`` is true.
        model: estimator exposing ``fit``, ``predict`` and ``feature_importances_``.
        cv: splitter exposing ``split(X, groups_or_labels)``.
        used_cols: feature column names.
        sample_weight: accepted for interface compatibility; currently unused.
        oof: when true, store the test predictions of every fold.

    Returns:
        ``(local_probs, feature_importances)`` with the same layout as ``calc``.
    """
    # A set has no stable order; sort it so the feature order is reproducible.
    used_cols = sorted(used_cols) if isinstance(used_cols, (set, frozenset)) else list(used_cols)
    # Re-key y by X's index so positional fold slices and boolean masks built
    # from X always line up with the targets.
    y = pd.Series(np.asarray(y), index=X.index)

    res=[]
    local_probs=pd.DataFrame()
    feature_importances = pd.DataFrame({'feature': used_cols})
    for i, (tdx, vdx) in enumerate(cv.split(X, X['price_type'])):
        X_train, X_valid = X.iloc[tdx].copy(), X.iloc[vdx].copy()
        y_train, y_valid = y.iloc[tdx].copy(), y.iloc[vdx].copy()

        mask_val = X_valid['price_type']==1
        mask_train = X_train['price_type']==1

        X_train_man,X_train_aut = X_train[mask_train], X_train[~mask_train]
        X_valid_man,X_valid_aut = X_valid[mask_val], X_valid[~mask_val]
        y_train_man,y_train_aut = y_train[mask_train], y_train[~mask_train]
        y_valid_man,y_valid_aut = y_valid[mask_val], y_valid[~mask_val]

        model.fit(X_train_aut[used_cols], np.log(y_train_aut))
        feature_importances[f'fold_{i}'] = model.feature_importances_
        # Median relative error of the model on the price_type == 1 training rows.
        correct_manual = model.predict(X_train_man[used_cols])
        corr_coef = ((y_train_man - np.exp(correct_manual))/np.exp(correct_manual)).median()

        preds_manual = np.exp(model.predict(X_valid_man[used_cols])) * (1 + corr_coef)
        preds_aut = np.exp(model.predict(X_valid_aut[used_cols]))

        full_valids = y_valid_man.tolist() + y_valid_aut.tolist()
        full_preds = preds_manual.tolist() + preds_aut.tolist()
        metric_man = deviation_metric(y_valid_man.tolist(), preds_manual.tolist())
        metric = deviation_metric(full_valids, full_preds)
        res.append(metric)
        print(f'corr_coef fold_{i+1}: {metric:.4f}, manual: {metric_man:.4f}')

        if oof:
            local_probs['fold_%i'%i] = np.exp(model.predict(test[used_cols])) * (1 + corr_coef)

    print('mean:', round(np.mean(res), 4))
    if oof:
        local_probs['res'] = local_probs.mean(axis=1)

    feature_importances['average'] = feature_importances[feature_importances.columns[1:]].mean(axis=1)
    feature_importances = feature_importances.sort_values(by='average', ascending=False)
    return local_probs, feature_importances

In [28]:
def calc_clusters(X, y, test, model, cv, used_cols, oof=False):
    """Cross-validate one log-price model per geographic KMeans cluster.

    In every fold the training rows are split into 10 KMeans clusters on
    (``lat``, ``lng``); ``model`` is re-fitted on ``log(y)`` of each cluster
    and used to predict the validation (and, when ``oof`` is true, test) rows
    assigned to the same cluster.

    Args:
        X: feature DataFrame containing ``lat``, ``lng`` and ``price_type``
            (the latter is used to stratify the split).
        y: targets in the same row order as ``X``.
        test: DataFrame to predict; its index is used for the returned frame.
        model: estimator exposing ``fit`` and ``predict``.
        cv: splitter exposing ``split(X, labels)``.
        used_cols: feature column names (any iterable, including a set).
        oof: when true, return test predictions instead of the CV score.

    Returns:
        When ``oof`` is true, a DataFrame indexed like ``test`` with one
        ``fold_<i>`` column per fold and their row mean ``res``; otherwise the
        mean deviation metric over the folds.

    Neither ``X`` nor ``test`` is modified: cluster labels and predictions are
    kept in local arrays instead of being written into the frames.
    """
    # A set is unordered and not accepted as a .loc indexer by recent pandas.
    if isinstance(used_cols, (set, frozenset)):
        feature_cols = sorted(used_cols)
    else:
        feature_cols = list(used_cols)
    # The splitter yields positions, so the targets are handled as a plain array.
    y_values = np.asarray(y)

    res=[]
    local_probs=pd.DataFrame(index=test.index)
    two = ['lat','lng']
    for i, (tdx, vdx) in enumerate(cv.split(X, X['price_type'])):
        X_train, X_valid = X.iloc[tdx], X.iloc[vdx]
        y_train, y_valid = y_values[tdx], y_values[vdx]

        kmeans = KMeans(n_clusters=10, random_state=seed)
        train_clusters = kmeans.fit_predict(X_train[two])
        valid_clusters = kmeans.predict(X_valid[two])
        if oof:
            test_clusters = kmeans.predict(test[two])
            fold_preds = np.full(len(test), np.nan)

        # Rows whose cluster never gets a model keep NaN, as before.
        valid_preds = np.full(len(X_valid), np.nan)

        for km in pd.unique(train_clusters):
            train_mask = train_clusters==km
            valid_mask = valid_clusters==km

            model.fit(X_train.loc[train_mask, feature_cols], np.log(y_train[train_mask]))
            # Skip empty slices: not every estimator accepts zero-row input.
            if valid_mask.any():
                valid_preds[valid_mask] = np.exp(model.predict(X_valid.loc[valid_mask, feature_cols]))

            if oof:
                test_mask = test_clusters==km
                if test_mask.any():
                    fold_preds[test_mask] = np.exp(model.predict(test.loc[test_mask, feature_cols]))

        if oof:
            local_probs['fold_%i'%i] = fold_preds

        metric = deviation_metric(y_valid.tolist(), valid_preds.tolist())
        res.append(metric)
        print(f'fold_{i+1}: {metric:.4f}')

    print('mean:', round(np.mean(res), 4))
    if oof:
        local_probs['res'] = local_probs.mean(axis=1)
        return local_probs
    else:
        return np.mean(res)

In [29]:
# Final feature list: the base features followed by any external feature columns.
used_cols = [*used_cols, *external_cols]

In [30]:
# Hyper-parameters for a LightGBM regressor (one entry per line for easier diffing).
params_lgbm = {
    'n_estimators': 898,
    'num_leaves': 452,
    'max_depth': 36,
    'learning_rate': 0.013166369544834123,
    'min_split_gain': 0.00416492412866083,
    'colsample_bytree': 0.8496546536706886,
    'subsample_freq': 8,
    'objective': 'l2',
    'random_seed': seed,
}

In [31]:
# Settings shared by both CatBoost models; they differ only in the number of iterations.
catboost_common = dict(random_state=seed, silent=True, task_type='GPU')

catb = CatBoostRegressor(iterations=700, **catboost_common)
lgbm = LGBMRegressor(random_seed = seed)
catb2 = CatBoostRegressor(iterations=1000, **catboost_common)

# Shuffled 5-fold splitters, with and without stratification.
splitter_options = dict(n_splits=5, shuffle=True, random_state=seed)
stkfold = StratifiedKFold(**splitter_options)
kfold = KFold(**splitter_options) #StratifiedKFold KFold

In [32]:
# Split train by price_type. Independent copies are taken because a column is
# later assigned into `auto`; assigning into a slice of `train` triggers
# SettingWithCopyWarning and may not behave as intended.
manual = train[train['price_type']==1].copy()
auto = train[train['price_type']==0].copy()

In [33]:
# Fit on the log price of the price_type == 1 rows, then score every price_type == 0 row
# with the deviation metric of that model's prediction.
manual_log_price = np.log(manual['per_square_meter_price'])
catb.fit(manual[used_cols], manual_log_price)
preds_aut = np.exp(catb.predict(auto[used_cols]))
auto['metric'] = deviation_metric_arr(
    auto['per_square_meter_price'].tolist(),
    preds_aut.tolist(),
)

In [34]:
# Training set: all price_type == 1 rows plus the price_type == 0 rows the model predicted within tolerance.
well_predicted_auto = auto.loc[auto['metric'] == 0]
manual_full = pd.concat([manual, well_predicted_auto], ignore_index=True)

In [35]:
# Target and feature frame for the final model.
y = manual_full['per_square_meter_price']
X = manual_full.drop(columns=['per_square_meter_price'])

In [36]:
pd.set_option('chained_assignment',None)
# Feature list without 'osm_train_stop_closest_dist'. Kept as an ordered, de-duplicated
# list: a set has no stable order and pandas no longer accepts sets as column indexers.
used_cols_1 = [col for col in dict.fromkeys(used_cols) if col != 'osm_train_stop_closest_dist']
probs = calc_clusters(X, y, test, catb2, stkfold, used_cols_1, oof=True)

In [37]:
def fast_drop(X,y,model,cv,alpha,n_loops,check_cols=None):
    """Greedy backward feature elimination scored with ``calc_clusters``.

    In each of ``n_loops`` rounds every candidate feature is removed in turn;
    the removal is kept when it lowers the CV metric by more than ``alpha``.
    ``alpha`` is halved after every round.

    Args:
        X, y, model, cv: forwarded to ``calc_clusters``.
        alpha: minimum metric improvement required to drop a feature.
        n_loops: number of elimination rounds.
        check_cols: features to try dropping. Defaults to every name of the
            notebook-level ``used_cols`` that is a column of ``X`` (the name
            was previously read from an undefined global).

    Returns:
        List of all dropped feature names, in the order they were dropped.

    Uses the notebook-level ``used_cols`` and ``test``. ``X`` itself is never
    modified: only the feature list passed to ``calc_clusters`` shrinks, so
    the ``lat`` / ``lng`` / ``price_type`` columns it needs stay available.
    The former column shuffles were removed: features are selected by name,
    so column order had no effect, and ``random.shuffle`` returns None anyway.
    """
    active_cols = [col for col in used_cols if col in X.columns]
    if check_cols is None:
        check_cols = list(active_cols)

    dropped_total = []
    for j in range(n_loops):
        drop_list=[]
        print(f"Number of columns:{len(active_cols)}, alpha:{alpha}")

        # Baseline on the features that are still active (not on the full original list).
        baseline = calc_clusters(X, y, test, model, cv,
                              active_cols, oof=False)
        for i, col in enumerate(check_cols, start=1):
            # Already dropped in an earlier step or round.
            if col not in active_cols:
                continue
            trial_cols = [name for name in active_cols if name != col]
            dropped_res = calc_clusters(X, y, test, model, cv,
                              trial_cols, oof=False)
            print(f"{i}. drop:{col}, res:{dropped_res}, baseline:{baseline}, delta:{round(baseline-dropped_res,8)}")
            if baseline-dropped_res>alpha:
                active_cols = trial_cols
                baseline = dropped_res
                drop_list.append(col)
                print('   !DROPPED!')
        print(f"\nDropped columns:{drop_list}\n")
        dropped_total.extend(drop_list)
        alpha = alpha/2
    return dropped_total

In [38]:
# Overlay the distribution of the predictions on the distribution of the training target.
for values, legend_label in ((probs['res'], 'preds'), (y, 'true')):
    sns.distplot(values, label=legend_label)
plt.legend()
plt.show()

In [39]:
# Store the fold-averaged predictions in the submission frame.
# NOTE(review): pandas aligns this assignment on index - assumes `sub` and `test` share the same index.
sub['per_square_meter_price'] = probs['res']

In [40]:
# Write the submission file without the index column.
sub.to_csv('catb_ot_no_mult_stkfold_10cl.csv', index=False)