In [52]:
import numpy as np
import pandas as pd 
import os
from sklearn.model_selection import StratifiedKFold, KFold
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import typing
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
pd.options.mode.chained_assignment = None

# Relative deviation |y_pred - y_true| / y_true up to which a prediction gets
# zero penalty in deviation_metric_one_sample.
THRESHOLD = 0.15
# Multiplier applied to the penalty when the prediction is below the true value.
NEGATIVE_WEIGHT = 1.1
# Random seed shared by the KMeans, KFold, CatBoost and LightGBM objects below.
seed=47

In [53]:
def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int],
                                threshold: typing.Optional[float] = None,
                                negative_weight: typing.Optional[float] = None) -> float:
    """Penalty for a single prediction based on its relative deviation.

    The relative deviation is ``(y_pred - y_true) / max(1e-8, y_true)``.

    * ``|deviation| <= threshold``            -> 0
    * ``deviation <= -4 * threshold``         -> ``9 * negative_weight``
    * ``-4 * threshold < deviation < -threshold``
                                              -> ``negative_weight * (deviation / threshold + 1) ** 2``
    * ``threshold < deviation < 4 * threshold``
                                              -> ``(deviation / threshold - 1) ** 2``
    * ``deviation >= 4 * threshold``          -> 9

    Args:
        y_true: Ground-truth value.
        y_pred: Predicted value.
        threshold: Tolerated relative deviation. ``None`` (default) uses the
            module-level ``THRESHOLD``.
        negative_weight: Multiplier for under-prediction penalties. ``None``
            (default) uses the module-level ``NEGATIVE_WEIGHT``.

    Returns:
        The penalty for this sample.
    """
    # Fall back to the notebook-wide constants so existing two-argument calls
    # behave exactly as before.
    if threshold is None:
        threshold = THRESHOLD
    if negative_weight is None:
        negative_weight = NEGATIVE_WEIGHT

    # The 1e-8 floor avoids division by zero for y_true == 0.
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= threshold:
        return 0
    elif deviation <= - 4 * threshold:
        return 9 * negative_weight
    elif deviation < -threshold:
        return negative_weight * ((deviation / threshold) + 1) ** 2
    elif deviation < 4 * threshold:
        return ((deviation / threshold) - 1) ** 2
    else:
        return 9
    
def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    """Mean of deviation_metric_one_sample over positions 0..len(y_true)-1.

    Both inputs are indexed with integer positions, so they must support
    ``obj[n]`` for every ``n`` in ``range(len(y_true))``.
    """
    penalties = []
    for position in range(len(y_true)):
        penalties.append(deviation_metric_one_sample(y_true[position], y_pred[position]))
    return np.array(penalties).mean()

def deviation_metric_arr(y_true: typing.Sequence[float], y_pred: typing.Sequence[float]) -> np.ndarray:
    """Per-sample penalties from deviation_metric_one_sample.

    Args:
        y_true: Ground-truth values, indexable by integer position.
        y_pred: Predictions, indexable by integer position; must have at least
            ``len(y_true)`` elements.

    Returns:
        A 1-D array with one penalty per position of ``y_true`` (not a single
        float, as the previous return annotation claimed).
    """
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))])

# Входные данные

In [54]:
# Load the competition files: training set, test set and the submission template.
train, test, sub = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/test_submission.csv')
)

# Доп. данные

In [55]:
# Load the external tables: prison and school coordinates, city populations
# and the salary table (merged on 'region' further down).
prisons, schools, city_population, zarplaty = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Rus_prisons_coords.csv',
        'data/Rus_schools_final.csv',
        'data/city_population.csv',
        'data/zarplaty.csv',
    )
)

In [56]:
# Nearest-neighbour indexes over the external coordinate tables.
# The external files name the longitude column 'lon', while train/test (which
# are used to query these indexes) name it 'lng'. scikit-learn remembers the
# column names seen during fit and complains (an error in recent versions)
# when the query frame uses different names, so the column is renamed to 'lng'
# before fitting. The coordinate values themselves are untouched.
prisons_knn = NearestNeighbors().fit(prisons[['lat', 'lon']].rename(columns={'lon': 'lng'}))
schools_knn = NearestNeighbors().fit(schools[['lat', 'lon']].rename(columns={'lon': 'lng'}))

# Препроцессинг

## Работа с городами: категории, непопулярные

In [57]:
def city_type(row):
    """Map a population figure to a size label.

    Returns "1Million" for values >= 1,000,000, "Medium" for values strictly
    between 200,000 and 1,000,000, "Small" for values <= 200,000, and None
    when no comparison holds (e.g. NaN).
    """
    if row >= 1000000:
        return "1Million"
    if 200000 < row < 1000000:
        return "Medium"
    if row <= 200000:
        return "Small"
    return None

In [58]:
# City names are lower-cased before comparison, so these must be lower-case
# too (the previous 'санкт-Петербург' literal could never match).
CAPITAL_CITIES = ['москва', 'санкт-петербург']

# Population per lower-cased city name. Lower-casing happens BEFORE the
# groupby so that names differing only by case collapse into one row and the
# left merge below cannot duplicate train/test rows.
city_population_clean = (
    city_population
    .assign(city=city_population['settlement'].str.lower())
    .groupby('city', as_index=False)['population'].sum()
    .rename(columns={'population': 'city_population'})
)


def _add_city_features(df, city_population_clean, zarplaty):
    """Return a copy of ``df`` with age, city size and salary features.

    Adds/updates the columns:
      * ``age``            - rounded ``2021 - reform_mean_year_building_500``;
      * ``city``           - lower-cased;
      * ``city_population``- from ``city_population_clean`` (left merge on city);
      * ``city_type``      - ``city_type(city_population)``, 'other' when the
                             population is unknown, "Capital" for CAPITAL_CITIES;
      * ``zarplata``       - from ``zarplaty`` (left merge on region), numeric.

    The merges reset the index to a fresh RangeIndex.
    """
    # A building year of 1 is treated as a placeholder by the later cleaning
    # step (it is replaced with NaN there); without masking it here the age
    # would come out as 2020.
    building_year = df['reform_mean_year_building_500'].replace({1: np.nan})
    df = df.assign(
        age=(2021 - building_year).round(),
        city=df['city'].str.lower(),
    )

    df = df.merge(city_population_clean, on='city', how='left')
    df['city_type'] = df['city_population'].apply(city_type).fillna('other')
    df.loc[df['city'].isin(CAPITAL_CITIES), 'city_type'] = "Capital"

    # NOTE(review): this assumes 'region' is unique in zarplaty; duplicated
    # regions would duplicate rows here - confirm against the file.
    df = df.merge(zarplaty, on='region', how='left')
    df['zarplata'] = pd.to_numeric(df['zarplata'], downcast='unsigned')
    return df


train = _add_city_features(train, city_population_clean, zarplaty)
test = _add_city_features(test, city_population_clean, zarplaty)

In [59]:
# A city counts as "rare" when it occurs at most this many times in train.
RARE_CITY_MAX_COUNT = 8

# Two-column frame: 'index' holds the city name, 'city' holds its row count.
# The names are set explicitly because the default names produced by
# value_counts().reset_index() differ between pandas versions.
cities_values = train['city'].value_counts().rename_axis('index').reset_index(name='city')
cities_values_other = cities_values.loc[cities_values['city'] <= RARE_CITY_MAX_COUNT, 'index'].tolist()

# Replace only the rare cities with the name of the nearest OSM city.
# The previous code indexed with `<mask>.index`, which is the full index of
# the frame, so every row was overwritten regardless of the mask.
for frame in (train, test):
    rare_city_mask = frame['city'].isin(cities_values_other)
    frame.loc[rare_city_mask, 'city'] = frame.loc[rare_city_mask, 'osm_city_nearest_name']

## Чистка этажей

In [60]:
def floor_recognition(s):
    """Convert a cleaned floor string to an integer floor number.

    Args:
        s: Floor description with noise words already removed (may be empty).

    Returns:
        * the integer value for a plain number, negated when prefixed by "-";
        * -1 when the text contains "подв" (basement);
        * 0 when the text contains "цок" (ground/plinth floor);
        * -100 as the "not recognised" sentinel, including for empty input.

    A leading "-" only negates plain numbers. For anything else the minus is
    ignored, so "-подвал" stays -1 and an unrecognised "-xyz" stays -100
    instead of being flipped to +1 / +100.
    """
    if not s:
        # Previously s[0] raised IndexError for "" (and for a bare "-").
        return -100
    if s[0] == "-":
        magnitude = s[1:]
        if magnitude.isdecimal():
            return -int(magnitude)
        return floor_recognition(magnitude)
    # isdecimal() (unlike isdigit()) rejects characters such as "²" that
    # int() cannot parse.
    if s.isdecimal():
        return int(s)
    elif "подв" in s:
        return -1
    elif "цок" in s:
        return 0
    else:
        return -100
    
def floor_parser(row):
    """Turn a raw floor value into five one-hot flags.

    The value is stringified, lower-cased, stripped of the listed noise tokens,
    cut at the first "," or "." and handed to floor_recognition. The resulting
    floor number selects one position of the returned 5-element Series:

        0: floor < 1, 1: floor <= 5, 2: floor <= 10, 3: floor <= 20, 4: above 20.

    When floor_recognition returns a value below -99 all flags stay 0.
    """
    text = str(row).lower()
    for token in (".0", "этажа", "этаж", "эт", " ", "-й"):
        text = text.replace(token, "")
    leading_part = text.replace(".", ",").split(",")[0]
    floor = floor_recognition(leading_part)

    flags = [0, 0, 0, 0, 0]
    if floor >= -99:
        # Number of upper bounds the floor exceeds == index of its bucket.
        bucket = sum(floor > upper_bound for upper_bound in (0, 5, 10, 20))
        flags[bucket] = 1
    return pd.Series(flags)

In [61]:
# Missing floors become the string '1', missing streets become 'no street'.
for frame in (train, test):
    frame['floor'] = frame['floor'].fillna('1')
    frame['street'] = frame['street'].fillna('no street')

In [62]:
# One-hot floor buckets produced by floor_parser, in the order it returns them.
floor_flag_cols = ["floor<0", "floor<=5", "floor<=10", "floor<=20", "floor>20"]
train[floor_flag_cols] = train["floor"].apply(floor_parser)
test[floor_flag_cols] = test["floor"].apply(floor_parser)

## Дейттайм фичи

In [63]:
def _add_date_features(df):
    """Parse ``df['date']`` and add calendar features in place.

    Columns written:
      * ``date``           - parsed to datetime;
      * ``month_year_cnt`` - number of rows of ``df`` sharing the same
                             year/month key (``month + year * 100``);
      * ``week_year_cnt``  - number of rows of ``df`` sharing the same
                             year/week key (``week + year * 100``);
      * ``month``, ``dow`` - month number and day of week.

    Counts are computed within ``df`` only (train and test are counted
    separately). Returns the same ``df`` object.
    """
    dates = pd.to_datetime(df['date'])
    df['date'] = dates

    year_part = dates.dt.year * 100
    month_key = dates.dt.month + year_part
    # `.dt.weekofyear` is deprecated and removed in pandas 2.0;
    # `.dt.isocalendar().week` is its replacement. It returns a nullable
    # UInt32 column, so cast to float to keep plain numpy dtypes downstream.
    week_key = dates.dt.isocalendar().week.astype('float64') + year_part

    df['month_year_cnt'] = month_key.map(month_key.value_counts())
    df['week_year_cnt'] = week_key.map(week_key.value_counts())
    df['month'] = dates.dt.month
    df['dow'] = dates.dt.dayofweek
    return df


train = _add_date_features(train)
test = _add_date_features(test)

## Небольшая обработка экстремальных и подозрительных значений

In [64]:
# Limit total_square to the range [0, 15000] in both frames.
for frame in (train, test):
    frame['total_square'] = np.clip(frame['total_square'], 0, 15_000)

In [65]:
# Engineered row-count features whose names happen to contain 'year'.
# For them a value of 1 is a legitimate count, not a placeholder, so they must
# not go through the "1 -> NaN" rule below.
_ENGINEERED_COUNT_COLS = {'month_year_cnt', 'week_year_cnt'}

for col in train.columns:
    if col in _ENGINEERED_COUNT_COLS:
        continue
    # Apply the rule to every frame that actually has the column, so a
    # train-only column does not raise KeyError on test.
    frames_with_col = [frame for frame in (train, test) if col in frame.columns]
    if 'year' in col:
        # A year equal to 1 is treated as "unknown".
        for frame in frames_with_col:
            frame[col] = frame[col].replace({1: np.nan})
    elif 'mean_floor' in col:
        # Keep mean floor counts within [0, 70].
        for frame in frames_with_col:
            frame[col] = np.clip(frame[col], 0, 70)

## Статистики относительно закодированных улиц

In [66]:
# Per-street means computed on train only; streets absent from train map to NaN.
train_by_street = train.groupby('street')
sq_by_street = train_by_street['total_square'].apply(np.mean).to_dict()
mean_floor_500_by_street = train_by_street['reform_mean_floor_count_500'].apply(np.mean).to_dict()
mean_year_500_by_street = train_by_street['reform_mean_year_building_500'].apply(np.mean).to_dict()

for frame in (train, test):
    frame['square_by_street'] = frame['street'].map(sq_by_street)
    frame['mean_floor_500_by_street'] = frame['street'].map(mean_floor_500_by_street)
    frame['mean_year_500_by_street'] = frame['street'].map(mean_year_500_by_street)

In [67]:
# Number of train rows (with a non-null city) per (street, month) pair;
# pairs not seen in train yield NaN.
count_order_by_street_month = train.groupby(['street', 'month'])['city'].count().to_dict()
for frame in (train, test):
    frame['count_order_by_street_month'] = [
        count_order_by_street_month.get(street_month, np.nan)
        for street_month in zip(frame['street'], frame['month'])
    ]

## Геофичи

In [68]:
import math

# Reference point (latitude, longitude in degrees) used for the
# 'distance_to_kremlin' feature - presumably the Moscow Kremlin.
kremlin_lat, kremlin_lon = 55.753722, 37.620657

def dist_calc(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points given in degrees.

    The sphere radius is fixed at 6373.0, so the result is in the same unit as
    that radius (kilometres, if it is read as the Earth's radius).
    """
    sphere_radius = 6373.0
    phi1, lam1, phi2, lam2 = (math.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    haversine = math.sin(half_dphi)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(half_dlam)**2
    central_angle = 2 * math.atan2(math.sqrt(haversine), math.sqrt(1 - haversine))

    return sphere_radius * central_angle


# Distance from every object to the reference point (kremlin_lat, kremlin_lon).
for frame in (train, test):
    frame['distance_to_kremlin'] = [
        dist_calc(obj_lat, obj_lng, kremlin_lat, kremlin_lon)
        for obj_lat, obj_lng in zip(frame['lat'], frame['lng'])
    ]

In [69]:
# Mean distance (in raw coordinate units) from each object to its nearest
# prisons / schools, as returned by the fitted NearestNeighbors indexes.
for frame in (train, test):
    object_coords = frame[['lat','lng']]
    prison_distances, _ = prisons_knn.kneighbors(object_coords)
    school_distances, _ = schools_knn.kneighbors(object_coords)
    frame['prisons_dist'] = prison_distances.mean(axis=1)
    frame['schools_dist'] = school_distances.mean(axis=1)

## Доп. фичи на основании входных

In [70]:
def check_offices(x):
    """Return 1 when any office-count column of row ``x`` exceeds its limit, else 0.

    Limits: more than 2 within 0.001, more than 10 within 0.005, more than 15
    within 0.0075, more than 20 within 0.01. Columns are checked in that order
    and evaluation stops at the first one that exceeds its limit.
    """
    limits = (
        ('osm_offices_points_in_0.001', 2),
        ('osm_offices_points_in_0.005', 10),
        ('osm_offices_points_in_0.0075', 15),
        ('osm_offices_points_in_0.01', 20),
    )
    return 1 if any(x[column] > limit for column, limit in limits) else 0

# Row-wise flag: 1 when check_offices finds many offices around the object.
for frame in (train, test):
    frame['many_offices'] = frame.apply(check_offices, axis=1)

In [71]:
def check_financial_organizations(x):
    """Return 1 when any finance-count column of row ``x`` exceeds its limit, else 0.

    Limits: more than 2 within 0.001, more than 10 within 0.005, more than 15
    within 0.0075, more than 20 within 0.01. Columns are checked in that order
    and evaluation stops at the first one that exceeds its limit.
    """
    limits = (
        ('osm_finance_points_in_0.001', 2),
        ('osm_finance_points_in_0.005', 10),
        ('osm_finance_points_in_0.0075', 15),
        ('osm_finance_points_in_0.01', 20),
    )
    return 1 if any(x[column] > limit for column, limit in limits) else 0

# Row-wise flag: 1 when check_financial_organizations finds many finance points.
for frame in (train, test):
    frame['many_financial_organizations'] = frame.apply(check_financial_organizations, axis=1)

## Работа с пропущенными значениями

In [72]:
# Missing-value handling. All fill values come from train, and test is filled
# with the statistic of the already-filled train column.
# NOTE: the log transform below is applied on every execution of this cell,
# so the cell must not be re-run without rebuilding train/test.
log_mean_col = 'reform_mean_floor_count_1000'
median_cols = [
    'reform_house_population_500',
    'reform_house_population_1000',
    'reform_mean_floor_count_500',
    'reform_mean_year_building_1000',
    'reform_mean_year_building_500',
]

# log(x + 1) transform, then fill gaps with the train mean of the transformed column.
for frame in (train, test):
    frame[log_mean_col] = frame[log_mean_col].apply(lambda value: np.log(value+1))
train[log_mean_col] = train[log_mean_col].fillna(train[log_mean_col].mean())
test[log_mean_col] = test[log_mean_col].fillna(train[log_mean_col].mean())

# Remaining columns: fill gaps with the train median.
for col in median_cols:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())

## Используемые фичи

In [73]:
# Feature columns passed to the models.
# The names used to be collected in a set literal, whose iteration order
# depends on string hashing and therefore changes between kernel restarts;
# that made the feature order (and potentially the fitted models) differ from
# run to run despite the fixed seed. dict.fromkeys keeps the de-duplication
# of the set while preserving the order written here.
used_cols = list(dict.fromkeys([
    'distance_to_kremlin', 'many_offices', 'many_financial_organizations',
    # 'is_million_city',
    "floor<0", "floor<=5", "floor<=10", "floor<=20", "floor>20",
    'square_by_street', 'mean_floor_500_by_street', 'mean_year_500_by_street',
    'month_year_cnt', 'week_year_cnt', 'month', 'dow',

    'total_square', 'lng', 'lat', 'osm_city_closest_dist',
    'osm_subway_closest_dist', 'osm_city_nearest_population',
    'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
    'reform_mean_year_building_500', 'osm_transport_stop_closest_dist',
    'osm_crossing_closest_dist', 'osm_crossing_points_in_0.01',
    'reform_house_population_500', 'osm_train_stop_closest_dist',
    'reform_mean_year_building_1000', 'reform_house_population_1000',
    'reform_count_of_houses_1000', 'osm_catering_points_in_0.01',
    'reform_count_of_houses_500', 'osm_transport_stop_points_in_0.01',
    'osm_shops_points_in_0.005', 'osm_culture_points_in_0.01',
    'osm_catering_points_in_0.005', 'osm_transport_stop_points_in_0.0075',
    'osm_amenity_points_in_0.001', 'osm_finance_points_in_0.005',
    'osm_shops_points_in_0.01', 'osm_leisure_points_in_0.01',
    'osm_crossing_points_in_0.0075', 'osm_catering_points_in_0.001',
    'osm_finance_points_in_0.0075', 'osm_shops_points_in_0.0075',
    'osm_shops_points_in_0.001', 'osm_crossing_points_in_0.005',
    'osm_amenity_points_in_0.01', 'osm_finance_points_in_0.01',
    'osm_catering_points_in_0.0075', 'osm_offices_points_in_0.01',
    'osm_leisure_points_in_0.0075', 'osm_offices_points_in_0.0075',
    'osm_healthcare_points_in_0.01', 'osm_culture_points_in_0.0075',
    'osm_culture_points_in_0.005', 'osm_historic_points_in_0.01',
    'osm_building_points_in_0.0075', 'osm_historic_points_in_0.0075',
    'osm_amenity_points_in_0.005', 'osm_leisure_points_in_0.005',
    'osm_hotels_points_in_0.01', 'osm_healthcare_points_in_0.0075',
    'osm_offices_points_in_0.005', 'osm_crossing_points_in_0.001',
    'osm_healthcare_points_in_0.005', 'osm_transport_stop_points_in_0.005',
    'osm_building_points_in_0.01', 'osm_hotels_points_in_0.0075',
    'osm_amenity_points_in_0.0075', 'osm_historic_points_in_0.005',
    'osm_hotels_points_in_0.005', 'osm_finance_points_in_0.001',
    'osm_building_points_in_0.005', 'osm_offices_points_in_0.001',
    'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.005',
    'osm_train_stop_points_in_0.01', 'osm_culture_points_in_0.001',
    'osm_building_points_in_0.001',
])) + ['prisons_dist', 'schools_dist'] + ['age', 'zarplata']

## Енкодинг категориальных переменных

In [74]:
# Every categorical column visited by the encoding loop.
all_cat_cols = ['city', 'floor', 'osm_city_nearest_name', 'region', 'street', 'realty_type','city_type']
# Columns that get an ordinal-encoded copy and a pandas 'category' copy.
cat_cols = ['region', 'city', 'realty_type', 'osm_city_nearest_name', 'street','city_type']
# Columns that get one-hot encoded ('osm_city_nearest_name' used to be listed twice).
cat_ohe = ['region', 'realty_type', 'osm_city_nearest_name', 'floor']

In [75]:
# Drop one-hot columns left over from a previous run of this cell.
# Each frame is cleaned using ITS OWN column list: the old code derived the
# list for test from train after train had already been cleaned, so stale
# OHE columns were never removed from test.
train = train.drop(columns=train.columns[train.columns.str.startswith('OHE')])
test = test.drop(columns=test.columns[test.columns.str.startswith('OHE')])

for col in all_cat_cols:
    print(col)
    if col in cat_cols:
        # Ordinal codes; fitted on train + test so every value has a code.
        oe_temp = OrdinalEncoder()
        oe_temp.fit(pd.concat([train[[col]], test[[col]]]))
        train[col + 'ord_enc'] = oe_temp.transform(train[[col]])
        test[col + 'ord_enc'] = oe_temp.transform(test[[col]])

    if col in cat_ohe:
        # One-hot columns named 'OHE_<col>_<k>'. The encoder is left at its
        # default sparse output and densified with .toarray(), which avoids
        # the `sparse=` constructor argument that newer scikit-learn releases
        # renamed to `sparse_output=`.
        OH_encoder = OneHotEncoder(handle_unknown='ignore')
        OH_encoder.fit(pd.concat([train[[col]], test[[col]]]))
        train = pd.concat([train, pd.DataFrame(OH_encoder.transform(train[[col]]).toarray(), index=train.index).add_prefix('OHE_' + col + '_')], axis=1)
        test = pd.concat([test, pd.DataFrame(OH_encoder.transform(test[[col]]).toarray(), index=test.index).add_prefix('OHE_' + col + '_')], axis=1)

    if col in cat_cols:
        train[col + '_cat'] = train[col].astype('category')
        test[col + '_cat'] = test[col].astype('category')


# Register the one-hot columns as model features. Previously registered OHE
# names are removed first so that re-running the cell does not append them twice.
ohe_cols = train.columns[train.columns.str.startswith('OHE')].tolist()
used_cols = [c for c in used_cols if not str(c).startswith('OHE')] + ohe_cols

# Моделинг

In [76]:
from sklearn.cluster import KMeans
from tqdm import tqdm
def calc_clusters(X, y, test, model, cv, used_cols, oof=False, n_clusters=8):
    """Cross-validate one model per geographic KMeans cluster.

    For every fold of ``cv`` a KMeans model is fitted on the ('lat', 'lng')
    columns of the training part. ``model`` is then refitted separately on the
    rows of each cluster (target = log of ``y``) and used to predict the
    validation rows of the same cluster. The fold score is
    ``deviation_metric`` over the validation part; per-fold scores and their
    mean are printed.

    Args:
        X: Feature frame; must contain 'lat', 'lng', 'price_type' and ``used_cols``.
        y: Target values, positionally aligned with the rows of ``X``.
        test: Frame to predict when ``oof`` is True. A 'kmeans' column is
            written into it on every fold (side effect kept from the original).
        model: Estimator with ``fit`` / ``predict``; refitted for every cluster.
        cv: Splitter with ``split(X, y)``; ``X['price_type']`` is passed as ``y``.
        used_cols: Feature columns given to ``model``.
        oof: When True, also predict ``test`` with every fold's cluster models.
        n_clusters: Number of KMeans clusters (default 8, the previous fixed value).

    Returns:
        DataFrame indexed like ``test``. With ``oof=True`` it has one
        'fold_<i>' column per fold plus 'res', their row-wise mean; otherwise
        it has no columns.
    """
    res = []
    local_probs = pd.DataFrame(index=test.index)
    two = ['lat', 'lng']
    # Work with a plain array so the target is always selected by position,
    # whatever the index of `y` looks like.
    y_values = np.asarray(y)

    for i, (tdx, vdx) in enumerate(cv.split(X, X['price_type'])):
        # Explicit copies: columns are added to these frames below.
        X_train, X_valid = X.iloc[tdx].copy(), X.iloc[vdx].copy()
        y_train, y_valid = y_values[tdx], y_values[vdx]

        kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
        X_train['kmeans'] = kmeans.fit_predict(X_train[two])
        X_valid['kmeans'] = kmeans.predict(X_valid[two])
        X_valid['preds_km'] = np.nan

        if oof:
            test['kmeans'] = kmeans.predict(test[two])

        for km in X_train['kmeans'].unique():
            train_mask = (X_train['kmeans'] == km).to_numpy()
            valid_mask = (X_valid['kmeans'] == km).to_numpy()

            model.fit(X_train.loc[train_mask, used_cols], np.log(y_train[train_mask]))
            # A cluster may have no validation rows in this fold; skip it
            # instead of calling predict on an empty frame.
            if valid_mask.any():
                X_valid.loc[valid_mask, 'preds_km'] = np.exp(model.predict(X_valid.loc[valid_mask, used_cols]))

            if oof:
                test_mask = (test['kmeans'] == km).to_numpy()
                if test_mask.any():
                    local_probs.loc[test_mask, 'fold_%i'%i] = np.exp(model.predict(test.loc[test_mask, used_cols]))

        metric = deviation_metric(y_valid.tolist(), X_valid['preds_km'].tolist())
        res.append(metric)
        print(f'fold_{i+1}: {metric:.4f}')

    print('mean:', round(np.mean(res), 4))
    if oof:
        local_probs['res'] = local_probs.mean(axis=1)
    return local_probs

In [77]:
# Model used for the first step (scoring price_type == 0 rows); task_type='GPU'
# is requested, so this needs a GPU-enabled CatBoost setup.
catboost_params = {
    'iterations': 1000,
    'random_state': seed,
    'silent': True,
    'task_type': 'GPU',
}
catb_1lvl = CatBoostRegressor(**catboost_params)

# Model used for the per-cluster predictions.
lgbm_params = {
    'n_estimators': 1000,
    'random_seed': seed,
}
lgbm_2lvl = LGBMRegressor(**lgbm_params)

# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

## Наращиваем размер датасета с price_type==1. Ищем семплы с наименьшей ошибкой по предикту

In [78]:
# Split train by price_type: 1 -> `manual`, 0 -> `auto`.
is_manual = train['price_type'] == 1
is_auto = train['price_type'] == 0
manual = train.loc[is_manual]
auto = train.loc[is_auto]

In [79]:
# Fit on the price_type == 1 rows (log target), predict the price_type == 0
# rows and store the per-row deviation penalty of those predictions.
manual_log_price = np.log(manual['per_square_meter_price'])
catb_1lvl.fit(manual[used_cols], manual_log_price)

auto_log_pred = catb_1lvl.predict(auto[used_cols])
preds_aut = np.exp(auto_log_pred)

auto_true_prices = auto['per_square_meter_price'].tolist()
auto['metric'] = deviation_metric_arr(auto_true_prices, preds_aut.tolist())

In [80]:
# Extend the price_type == 1 rows with the price_type == 0 rows whose
# prediction received zero penalty; the index is rebuilt from 0.
zero_penalty_auto = auto.loc[auto['metric'] == 0]
manual_full = pd.concat([manual, zero_penalty_auto], ignore_index=True)

In [81]:
# Target and feature frame for the per-cluster models.
y = manual_full['per_square_meter_price']
X = manual_full.drop(columns=['per_square_meter_price'])

## OOF предикты отдельно по каждому кластеру

In [82]:
# Cross-validate the per-cluster LightGBM models and collect test predictions
# ('fold_<i>' columns plus their mean in 'res').
probs = calc_clusters(
    X=X,
    y=y,
    test=test,
    model=lgbm_2lvl,
    cv=kfold,
    used_cols=used_cols,
    oof=True,
)

In [83]:
# Fold-averaged prediction becomes the submitted price (aligned on the index).
sub = sub.assign(per_square_meter_price=probs['res'])

In [84]:
# Compare the train target distribution with the predicted prices (raw and
# scaled by 0.9). Labels, a legend and axis titles are added so the three
# overlaid histograms can be told apart.
fig, ax = plt.subplots(figsize=(16, 8))
hist_kwargs = dict(bins=40, density=True, alpha=0.5, ax=ax)
train['per_square_meter_price'].hist(label='train: per_square_meter_price', **hist_kwargs)
sub['per_square_meter_price'].hist(label='submission', **hist_kwargs)
(sub['per_square_meter_price'] * 0.9).hist(label='submission * 0.9', **hist_kwargs)
ax.set(title='per_square_meter_price: train vs. submission',
       xlabel='per_square_meter_price', ylabel='density')
ax.legend()

In [85]:
# Write the submission file without the index column.
submission_path = 'submit_full_featurs_with_external_data_lgbm.csv'
sub.to_csv(submission_path, index=False)