In [None]:
# !pip install catboost
# !pip install shap

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import metrics

from catboost import CatBoostRegressor, CatBoostClassifier

import shap

pd.set_option('display.max_columns', 60)

In [None]:
def get_dif(largest_iso, least_iso):
  """Subtract the point-of-interest counts of `least_iso` from `largest_iso`, in place.

  Only the point-of-interest columns listed below are touched; every other
  column of `largest_iso` is left as is. `least_iso` is not modified.
  Rows are matched by index label (regular pandas alignment).

  Args:
    largest_iso: DataFrame that is updated in place.
    least_iso: DataFrame whose values are subtracted.

  Returns:
    None; the result is written into `largest_iso`.
  """
  poi_columns = [
      'bus_stop', 'house', 'kiosk', 'retail', 'station', 'subway_entrance',
      'tram_stop', 'bar', 'cafe', 'fast_food', 'food_court', 'pub',
      'restaurant', 'college', 'driving_school', 'language_school', 'school',
      'kindergarten', 'university', 'car_wash', 'fuel', 'atm', 'bank',
      'clinic', 'dentist', 'doctors', 'hospital', 'pharmacy', 'veterinary',
      'theatre', 'cinema', 'hostel', 'hotel', 'office', 'shop',
  ]
  # One frame-level subtraction instead of a Python loop over the columns.
  largest_iso[poi_columns] = largest_iso[poi_columns] - least_iso[poi_columns]


def add_weights(columns, weight):
  """Multiply, in place, every column of the global `scaled_data` whose name contains `columns`.

  Args:
    columns: substring looked for in each column name (plain `in` test, so
      any column whose name contains it anywhere is affected).
    weight: factor the matching columns are multiplied by.

  Returns:
    None; `scaled_data` is modified in place.
  """
  matching_names = [name for name in scaled_data.columns if columns in name]
  for name in matching_names:
    scaled_data[name] = scaled_data[name] * weight


def make_classes_for_num_checks(q):
  """Bin the "Чеки шт/мес" column of the global `df_target` into `q` quantile classes.

  Adds or overwrites three columns of `df_target`:
    'interval'   - the quantile interval each row falls into (pd.qcut),
    'makr_class' - the integer code of that interval (0 .. q-1),
    'mean'       - the mean "Чеки шт/мес" of all rows in the same class.

  Args:
    q: number of quantile bins passed to pd.qcut.

  Returns:
    None; `df_target` is modified in place.
  """
  checks = df_target["Чеки шт/мес"]
  df_target['interval'] = pd.qcut(checks, q=q)
  df_target['makr_class'] = pd.qcut(checks, q=q, labels=False)

  for class_id in range(q):
    in_class = df_target['makr_class'] == class_id
    df_target.loc[in_class, 'mean'] = checks[in_class].mean()

In [None]:
# Targets and per-store attributes.
df_target = pd.read_csv('../../data/vkusvill_target.csv')
data = pd.read_csv('../../data/vkusvill_data_without_target.csv')

# One point-of-interest table per isochrone file (10 .. 30).
iso_10 = pd.read_csv('../../data/vkusvill_isochrone_10.csv')
iso_15 = pd.read_csv('../../data/vkusvill_isochrone_15.csv')
iso_20 = pd.read_csv('../../data/vkusvill_isochrone_20.csv')
iso_25 = pd.read_csv('../../data/vkusvill_isochrone_25.csv')
iso_30 = pd.read_csv('../../data/vkusvill_isochrone_30.csv')

dist_df = pd.read_csv('../../data/vkusvill_dist.csv')

# Replace each table's counts by the difference to the next smaller isochrone.
# get_dif mutates its first argument, so the pairs must be processed from the
# largest isochrone down: iso_25 has to be untouched when it is subtracted
# from iso_30, and so on.
# NOTE(review): presumably the raw counts are cumulative per isochrone - confirm.
for outer_iso, inner_iso in (
    (iso_30, iso_25),
    (iso_25, iso_20),
    (iso_20, iso_15),
    (iso_15, iso_10),
):
  get_dif(outer_iso, inner_iso)

# revenue = df_target['Выручка р/мес'].values
# avg_check = df_target["Выручка р/мес"] / df_target["Чеки шт/мес"]
# num_checks = df_target["Чеки шт/мес"].values

In [None]:
# Lookup table that turns the free-text opening hours in data["График"] into
# numeric features. Each row is:
#   [schedule text, weekday open hour, weekday close hour,
#    weekend open hour, weekend close hour, 1 if the schedule varies by day else 0]
# Hours are whole numbers; minutes in the text are dropped (e.g. '23-45' -> 23).
work_schedule = [
    ['с 9-00 до 22-00', 9, 22, 9, 22, 0],
    ['с 8-00 до 22-00', 8, 22, 8, 22, 0],
    ['пн-чт с 7-00 - 23-45, пт-сб с 8-00 - 23-00, вс с 7-00 - 23-45', 7, 23, 7, 23, 1],
    ['Откл. с 9:00 до 22:00', 9, 22, 9, 22, 0],
    ['с 7-00 до 22-00', 7, 22, 7, 22, 0],
    ['будни с 8-00 до 22-00; вых. с 9-00 до 22-00', 8, 22, 9, 22, 1],
    ['с 10-00 до 22-00', 10, 22, 10, 22, 0],
    ['будни с 9-00 до 23-00, выходные с 9-00 до 22-00', 9, 23, 9, 22, 1],
    ['с 9-00 до 21-00', 9, 21, 9, 21, 0],
    ['Будни с 8 до 23, вых с 8 до 22', 8, 23, 8, 22, 1],
    ['с 7:00 до 23:45', 7, 23, 7, 23, 0],
    ['Откл. с 11:00 до 21:00', 11, 21, 11, 21, 0],
    ['Откл. с 8:00 до 22:00', 8, 22, 8, 22, 0],
    ['с 8-00 до 23-45', 8, 23, 8, 23, 0],
    ['с 8-00 до 23-00', 8, 23, 8, 23, 0],
    ['Будни с8:00 до 23:00 вых с 8:00 до 22:00', 8, 23, 8, 22, 1],
    ['будни с 8-00 до 23-00, вых. 9-00 до 23-00', 8, 23, 9, 23, 1],
    ['пн-пт с 8-00 до 23-00, сб-вс с 9-00 до 22-00', 8, 23, 9, 22, 1],
    ['вс-чт. с 10:00 до 23:00, пт-сб. с 10:00 по 24:00', 10, 23, 10, 24, 1],
    ['пн-чт с 7-00 - 23-45, пт-сб с 7-00 - 23-00, вс с 7-00 - 23-45', 7, 23, 7, 23, 1],
    ['с 8-30 до 23-00', 8, 23, 8, 23, 0],
    ['с 7-00 до 23-45', 7, 23, 7, 23, 0],
    ['пн-чт с 7-00 - 23-45, пт-сб с 7-00 - 22-00, вс с 8-00 - 23-45', 7, 23, 8, 23, 1],
    ['с пн по чт с 7 до 23:45, пт с 7 до 23:00, сб с 8 до 22:00, вс 8 до 23:45', 7, 23, 8, 23, 1],
    ['бд 7-00 до23-00 вх 8-00 до 22-00', 7, 23, 8, 22, 1],
    ['с 9:00 до 23:00', 9, 23, 9, 23, 0],
    ['с 7-00 до 23-00', 7, 23, 7, 23, 0]
]

work_schedule = pd.DataFrame(
    work_schedule,
    columns=["График", "Будни начало", "Будни конец", "Выходные начало", "Выходные конец", "Разные графики"]
    )

# Left join keeps every store row; a schedule text missing from the table
# yields NaN in the new columns. validate="many_to_one" makes pandas raise if
# a schedule text is ever listed twice in the table, which would otherwise
# silently duplicate store rows and break the positional concatenation with
# the isochrone tables further down.
data = pd.merge(data, work_schedule, on="График", how="left", validate="many_to_one")
data['Рабочие часы в будни'] = data['Будни конец'] - data['Будни начало']
data['Рабочие часы в выходные'] = data['Выходные конец'] - data['Выходные начало']
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify `data` once pandas Copy-on-Write is active.
data['Ночной магазин'] = data['Ночной магазин'].replace({'Нет': 0, 'Да': 1})

In [None]:
def add_iso(new_features, iso):
  """Aggregate point-of-interest columns into eight grouped features.

  Each output column is the element-wise sum of the source columns of its
  group; its name is the group name followed by `iso`. Source columns that
  belong to no group (e.g. 'house', 'school') are not used.

  Args:
    new_features: DataFrame holding the individual point-of-interest columns.
    iso: string suffix appended to every output column name.

  Returns:
    A new DataFrame with the columns, in this order: medicine, stations,
    housing, shops, atms+banks, office, food, for_motorists (each + `iso`).
  """
  # (output prefix, source columns) in output column order; the members are
  # summed left to right.
  groups = (
      ('medicine', ('clinic', 'dentist', 'doctors', 'hospital', 'pharmacy')),
      ('stations', ('bus_stop', 'station', 'subway_entrance', 'tram_stop')),
      ('housing', ('hotel', 'hostel')),
      ('shops', ('kiosk', 'retail', 'shop')),
      ('atms+banks', ('atm', 'bank')),
      ('office', ('office',)),
      ('food', ('food_court', 'pub', 'restaurant', 'cafe', 'bar', 'fast_food')),
      ('for_motorists', ('car_wash', 'fuel')),
  )

  aggregated = {}
  for prefix, members in groups:
    total = new_features[members[0]]
    for member in members[1:]:
      total = total + new_features[member]
    aggregated[prefix + iso] = total

  return pd.DataFrame(aggregated)

In [None]:
# Append the aggregated point-of-interest features of every isochrone, then
# the distance table. The frames are glued side by side (axis=1), so rows are
# matched by index label.
for iso_suffix, iso_frame in (
    ('10', iso_10),
    ('15', iso_15),
    ('20', iso_20),
    ('25', iso_25),
    ('30', iso_30),
):
  data = pd.concat([data, add_iso(iso_frame, iso_suffix)], axis=1)

data = pd.concat([data, dist_df], axis=1)

In [None]:
# Keep the two categorical columns aside; they are dropped from `data` before
# scaling and passed to CatBoost as cat_features later.
data_cat = data.loc[:, ['Регион', 'Город']]

In [None]:
# Columns removed before the remaining frame is handed to the MinMaxScaler.
columns_to_drop = [
    'Дата открытия', 'Наименование', 'Номерточки', 'Регион', 'Город',
    'Адрес', 'Широта', 'Долгота', 'График',
]
data = data.drop(columns=columns_to_drop)

In [None]:
# Rescale every remaining column with a MinMaxScaler fitted on the full frame.
scaler = preprocessing.MinMaxScaler()
scaler.fit(data)
d = scaler.transform(data)

# Wrap the scaled array back into a DataFrame with the original column names.
scaled_data = pd.DataFrame(data=d, columns=data.columns)

In [None]:
# Scale down the columns whose name contains the given isochrone suffix.
# add_weights multiplies in place, so re-running this cell applies the
# factors a second time.
for name_fragment, factor in (('15', 0.9), ('20', 0.8), ('25', 0.7), ('30', 0.6)):
  add_weights(name_fragment, factor)

In [None]:
# Number of (shuffled) rows used for training; the rest form the test split.
num_in_train = 240
# Revenue MAPE per run: direct model, direct + product blend, and the blend
# that also uses the classifier-based receipt count.
solo, mul, with_classes = [], [], []

In [None]:
# Repeated random-split evaluation of three ways to predict monthly revenue:
#   solo         - a CatBoost regressor trained on revenue directly,
#   mul          - average of the direct prediction and
#                  (predicted average check * predicted number of receipts),
#   with_classes - like `mul`, but the number of receipts is the average of
#                  the regressor output and the mean of the quantile class
#                  predicted by a CatBoost classifier.

# The quantile classes of "Чеки шт/мес" and the per-class means depend only on
# df_target, not on the shuffle, so they are computed once before the runs.
make_classes_for_num_checks(17)
classes_dict = dict(zip(df_target["makr_class"], df_target["mean"]))
# NOTE(review): class boundaries and class means are derived from all rows,
# including those that end up in the test split of each run.

mape = metrics.mean_absolute_percentage_error

for run in range(20):
  all_data = pd.concat([scaled_data, data_cat], axis=1)
  all_data_with_targets = pd.concat([all_data, df_target[["Выручка р/мес","Чеки шт/мес"]]], axis=1)

  # Seeded shuffle so every run can be reproduced. The permuted index is kept
  # before it is reset: it is the only link between the shuffled rows and
  # df_target, and is needed to put the class labels in the same order.
  shuffled = all_data_with_targets.sample(frac=1, random_state=run)
  row_order = shuffled.index
  all_data_with_targets = shuffled.reset_index(drop=True)

  revenue = all_data_with_targets['Выручка р/мес'].values
  avg_check = all_data_with_targets["Выручка р/мес"] / all_data_with_targets["Чеки шт/мес"]
  num_checks = all_data_with_targets["Чеки шт/мес"].values
  # Class labels reordered exactly like the features. Taking
  # df_target["makr_class"].values directly would pair each shuffled row with
  # the label of a different store.
  # Assumes df_target shares its index with the feature frame, which the
  # column-wise concat above already relies on.
  num_checks_class = df_target.loc[row_order, "makr_class"].values
  all_data = all_data_with_targets.drop(columns=["Выручка р/мес","Чеки шт/мес"])

  # The feature split is the same for all four models.
  X_train, X_test = all_data[:num_in_train], all_data[num_in_train:]

  # Direct revenue regressor.
  y_train, y_test = revenue[:num_in_train], revenue[num_in_train:]
  revenue_test = revenue[num_in_train:]

  model = CatBoostRegressor(cat_features=['Регион', 'Город'], logging_level='Silent')
  model.fit(X_train, y_train)
  y_pred_test = model.predict(X_test)
  print('CatBoostRegressor revenue MAPE: %.3f' % mape(y_test, y_pred_test))
  revenue_pred = y_pred_test.copy()

  # Average-check regressor.
  y_train, y_test = avg_check[:num_in_train], avg_check[num_in_train:]
  avg_check_test = avg_check[num_in_train:]

  model = CatBoostRegressor(cat_features=['Регион', 'Город'], logging_level='Silent')
  model.fit(X_train, y_train)
  y_pred_test = model.predict(X_test)
  print('CatBoostRegressor avg_check MAPE: %.3f' % mape(y_test, y_pred_test))
  avg_check_pred = y_pred_test.copy()

  # Number-of-receipts regressor.
  y_train, y_test = num_checks[:num_in_train], num_checks[num_in_train:]
  num_checks_test = num_checks[num_in_train:]

  model = CatBoostRegressor(cat_features=['Регион', 'Город'], logging_level='Silent')
  model.fit(X_train, y_train)
  y_pred_test = model.predict(X_test)
  print('CatBoostRegressor num_checks MAPE: %.3f' % mape(y_test, y_pred_test))
  num_checks_pred = y_pred_test.copy()

  # Number-of-receipts classifier over the quantile classes.
  y_train, y_test = num_checks_class[:num_in_train], num_checks_class[num_in_train:]
  num_checks_class_test = num_checks_class[num_in_train:]

  classifier = CatBoostClassifier(cat_features=['Регион', 'Город'], logging_level='Silent')
  classifier.fit(X_train, y_train)
  y_pred_test = classifier.predict(X_test)
  num_checks_class_pred = y_pred_test

  # Map every predicted class to the mean number of receipts of that class.
  # np.ravel flattens the (n, 1) label array returned by predict.
  result_class_classification = [classes_dict[label] for label in np.ravel(y_pred_test)]

  # Blend the predictions and score each variant once.
  blended_num_checks = (np.asarray(result_class_classification) + num_checks_pred) / 2
  solo_mape = mape(revenue_test, revenue_pred)
  mul_mape = mape(revenue_test, (avg_check_pred*num_checks_pred + revenue_pred)/2)
  with_classes_mape = mape(revenue_test, (avg_check_pred*blended_num_checks + revenue_pred)/2)

  solo.append(solo_mape)
  mul.append(mul_mape)
  with_classes.append(with_classes_mape)

  print(solo_mape, mul_mape, with_classes_mape)
  print('-----------')

CatBoostRegressor revenue MAPE: 0.164
CatBoostRegressor avg_check MAPE: 0.130
CatBoostRegressor num_checks MAPE: 0.180
0.16437155980703927 0.1657879588871661 0.18736351280374228
-----------
CatBoostRegressor revenue MAPE: 0.172
CatBoostRegressor avg_check MAPE: 0.118
CatBoostRegressor num_checks MAPE: 0.159
0.1715848585037211 0.17450235494762048 0.21165709355590875
-----------
CatBoostRegressor revenue MAPE: 0.144
CatBoostRegressor avg_check MAPE: 0.096
CatBoostRegressor num_checks MAPE: 0.150
0.14379495560351008 0.14831726064724943 0.16294379179405868
-----------
CatBoostRegressor revenue MAPE: 0.192
CatBoostRegressor avg_check MAPE: 0.100
CatBoostRegressor num_checks MAPE: 0.200
0.19201758701160546 0.19137501627059106 0.18187891390896738
-----------
CatBoostRegressor revenue MAPE: 0.173
CatBoostRegressor avg_check MAPE: 0.110
CatBoostRegressor num_checks MAPE: 0.201
0.1728528137341998 0.17608183351430381 0.21925817378876056
-----------
CatBoostRegressor revenue MAPE: 0.200
CatBoostRe

In [None]:
from statistics import mean

# Average MAPE over all runs, in the order: solo, mul, with_classes.
for run_scores in (solo, mul, with_classes):
  print(mean(run_scores))

0.1791836437094589
0.1801313355013972
0.2031982502473391
