In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

In [2]:
# Read the store attributes and the target table, then join them on the
# 'Номерточки' key. The inner join keeps only stores present in both files.
df_data_without_target = pd.read_csv('data/vkusvill_data_without_target.csv')
df_target = pd.read_csv('data/vkusvill_target.csv')
df = pd.merge(df_data_without_target, df_target, on='Номерточки', how='inner')

# Second target: "Средний чек" = "Выручка р/мес" divided by "Чеки шт/мес".
df["Средний чек"] = df["Выручка р/мес"].div(df["Чеки шт/мес"])

In [3]:
# Hand-written lookup table for the free-text "График" column.
# Each row: raw schedule text, weekday open/close hour, weekend open/close hour,
# and a 0/1 flag ("Разные графики") set for texts that list more than one schedule.
# Hours are whole numbers; minutes in the text (e.g. 23-45, 8-30) are not kept.
work_schedule = pd.DataFrame(
    data=[
        ['с 9-00 до 22-00', 9, 22, 9, 22, 0],
        ['с 8-00 до 22-00', 8, 22, 8, 22, 0],
        ['пн-чт с 7-00 - 23-45, пт-сб с 8-00 - 23-00, вс с 7-00 - 23-45', 7, 23, 7, 23, 1],
        ['Откл. с 9:00 до 22:00', 9, 22, 9, 22, 0],
        ['с 7-00 до 22-00', 7, 22, 7, 22, 0],
        ['будни с 8-00 до 22-00; вых. с 9-00 до 22-00', 8, 22, 9, 22, 1],
        ['с 10-00 до 22-00', 10, 22, 10, 22, 0],
        ['будни с 9-00 до 23-00, выходные с 9-00 до 22-00', 9, 23, 9, 22, 1],
        ['с 9-00 до 21-00', 9, 21, 9, 21, 0],
        ['Будни с 8 до 23, вых с 8 до 22', 8, 23, 8, 22, 1],
        ['с 7:00 до 23:45', 7, 23, 7, 23, 0],
        ['Откл. с 11:00 до 21:00', 11, 21, 11, 21, 0],
        ['Откл. с 8:00 до 22:00', 8, 22, 8, 22, 0],
        ['с 8-00 до 23-45', 8, 23, 8, 23, 0],
        ['с 8-00 до 23-00', 8, 23, 8, 23, 0],
        ['Будни с8:00 до 23:00 вых с 8:00 до 22:00', 8, 23, 8, 22, 1],
        ['будни с 8-00 до 23-00, вых. 9-00 до 23-00', 8, 23, 9, 23, 1],
        ['пн-пт с 8-00 до 23-00, сб-вс с 9-00 до 22-00', 8, 23, 9, 22, 1],
        ['вс-чт. с 10:00 до 23:00, пт-сб. с 10:00 по 24:00', 10, 23, 10, 24, 1],
        ['пн-чт с 7-00 - 23-45, пт-сб с 7-00 - 23-00, вс с 7-00 - 23-45', 7, 23, 7, 23, 1],
        ['с 8-30 до 23-00', 8, 23, 8, 23, 0],
        ['с 7-00 до 23-45', 7, 23, 7, 23, 0],
        ['пн-чт с 7-00 - 23-45, пт-сб с 7-00 - 22-00, вс с 8-00 - 23-45', 7, 23, 8, 23, 1],
        ['с пн по чт с 7 до 23:45, пт с 7 до 23:00, сб с 8 до 22:00, вс 8 до 23:45', 7, 23, 8, 23, 1],
        ['бд 7-00 до23-00 вх 8-00 до 22-00', 7, 23, 8, 22, 1],
        ['с 9:00 до 23:00', 9, 23, 9, 23, 0],
        ['с 7-00 до 23-00', 7, 23, 7, 23, 0],
    ],
    columns=[
        "График",
        "Будни начало",
        "Будни конец",
        "Выходные начало",
        "Выходные конец",
        "Разные графики",
    ],
)

# Attach the parsed opening hours to every store. The left join keeps all rows
# of `df`; a schedule text that is missing from `work_schedule` yields NaN hours.
df = pd.merge(df, work_schedule, on="График", how="left")
df['Рабочие часы в будни'] = df['Будни конец'] - df['Будни начало']
df['Рабочие часы в выходные'] = df['Выходные конец'] - df['Выходные начало']

# Encode the night-store flag as 0/1. The result is assigned back to the column:
# `df[col].replace(..., inplace=True)` is a chained in-place call on an
# intermediate Series, which warns in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active. `infer_objects()` makes the column numeric even when
# `replace` itself no longer downcasts the object column.
df['Ночной магазин'] = (
    df['Ночной магазин'].replace({'Нет': 0, 'Да': 1}).infer_objects()
)

# One-hot encode the city: one indicator column per distinct value of 'Город'.
df = pd.concat([df, pd.get_dummies(df['Город'])], axis=1)

# Keep only the model inputs and the targets; the raw text columns
# ('График', 'Город') and the store key are dropped here.
df = df[['Дата открытия', 'Торговая площадь, м2',
       'Ночной магазин', 'Выручка р/мес', 'Чеки шт/мес', 'Средний чек',
       'Будни начало', 'Будни конец', 'Выходные начало', 'Выходные конец',
       'Разные графики', 'Рабочие часы в будни', 'Рабочие часы в выходные',
       'Волоколамск', 'Дедовск', 'Дмитров', 'Долгопрудный', 'Дубна',
       'Зеленоград', 'Ивантеевка', 'Истра', 'Клин', 'Королев', 'Красногорск',
       'Лобня', 'Москва', 'Мытищи', 'Нахабино', 'Некрасовский', 'Новинки',
       'Павловская Слобода', 'Путилково', 'Пушкино', 'Сабурово',
       'Сергиев Посад', 'Солнечногорск', 'Солнечногорский р-он, д. Подолино',
       'Химки', 'Хотьково', 'Юбилейный', 'Яхрома']]

# Positional hold-out split: the first 260 rows are the training part, the
# remainder is the test part (rows are taken in file order, no shuffling).
train = df.iloc[:260]
test = df.iloc[260:]

In [4]:
# Preview the first three rows of the training split.
train.head(3)

Unnamed: 0,Дата открытия,"Торговая площадь, м2",Ночной магазин,Выручка р/мес,Чеки шт/мес,Средний чек,Будни начало,Будни конец,Выходные начало,Выходные конец,...,Путилково,Пушкино,Сабурово,Сергиев Посад,Солнечногорск,"Солнечногорский р-он, д. Подолино",Химки,Хотьково,Юбилейный,Яхрома
0,44491,52.6,0,39098.035556,204.773333,190.933238,9,22,9,22,...,False,False,False,False,False,False,False,False,False,False
1,43799,78.0,0,30782.08,117.173333,262.705508,9,22,9,22,...,False,False,False,False,False,False,False,False,False,False
2,42600,73.0,0,28921.148889,201.533333,143.505535,8,22,8,22,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Columns that are targets, or are used to compute one, and so must not be features.
target_columns = ['Выручка р/мес', 'Чеки шт/мес', 'Средний чек']

# Whole dataset (used by the cross-validation cells). Despite the "test" in
# their names, the two `all_data_test_*` series hold the targets for ALL rows.
all_data_train = df.drop(columns=target_columns)
all_data_test_revenue = df['Выручка р/мес']
all_data_test_avg_check = df['Средний чек']

# Hold-out part: take the targets first, then strip them from the features.
test_revenue, test_avg_check = test['Выручка р/мес'], test['Средний чек']
test = test.drop(columns=target_columns)

# Training part: same treatment.
train_revenue, train_avg_check = train['Выручка р/мес'], train['Средний чек']
train = train.drop(columns=target_columns)

In [6]:
# Accumulators of (model name, mean MAPE) tuples, one independent list per target.
results_revenue, results_avg_check = [], []

In [7]:
# Single hold-out check for the revenue target: fit on the training split and
# report MAPE on the test split.
X_train, y_train = train, train_revenue.to_numpy()
X_test, y_test = test, test_revenue.to_numpy()

# The forest is seeded so the printed score is reproducible on a fresh run;
# random_state=1 matches the seed used for the other models in this notebook.
forest = RandomForestRegressor(random_state=1)
forest.fit(X_train, y_train)
y_pred_test = forest.predict(X_test)
print('RandomForestRegressor revenue MAPE: %.3f' % metrics.mean_absolute_percentage_error(y_test, y_pred_test))

RandomForestRegressor revenue MAPE: 0.223


In [8]:
# RandomForestRegressor

# Repeated 10-fold cross-validation (3 repeats). With an integer seed the
# splitter produces the same folds on every use, so one instance is shared.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Seeded for reproducible scores (the other models already use random_state=1).
# cross_val_score fits clones, so the same unfitted estimator serves both targets.
forest = RandomForestRegressor(random_state=1)

# NOTE: re-running this cell appends a second entry to the result lists.
for target_name, target, results in (
    ('revenue', all_data_test_revenue, results_revenue),
    ('avg_check', all_data_test_avg_check, results_avg_check),
):
    X_train, y_train = all_data_train, target.to_numpy()
    scores = cross_val_score(forest, X_train, y_train, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
    # The scorer returns negated MAPE; take absolute values to get MAPE itself.
    scores = np.absolute(scores)
    print('RandomForestRegressor %s mean MAPE: %.3f' % (target_name, scores.mean()))
    results.append(('RandomForestRegressor', scores.mean()))

RandomForestRegressor revenue mean MAPE: 0.191
RandomForestRegressor avg_check mean MAPE: 0.128


In [9]:
# AdaBoostRegressor

# Cross-validate the same model configuration on each target in turn:
# (report line, target series, list collecting the leaderboard entries).
for message, target, leaderboard in (
    ('AdaBoostRegressor revenue mean MAPE: %.3f', all_data_test_revenue, results_revenue),
    ('AdaBoostRegressor avg_check mean MAPE: %.3f', all_data_test_avg_check, results_avg_check),
):
    X_train, y_train = all_data_train, target.to_numpy()
    AdaBoostReg = AdaBoostRegressor(n_estimators=100, random_state=1)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # The scorer returns negated MAPE, hence the absolute value.
    scores = np.absolute(
        cross_val_score(AdaBoostReg, X_train, y_train, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
    )
    print(message % scores.mean())
    leaderboard.append(('AdaBoostRegressor', scores.mean()))

AdaBoostRegressor revenue mean MAPE: 0.220
AdaBoostRegressor avg_check mean MAPE: 0.134


In [10]:
# GradientBoostingRegressor

# Cross-validate the same model configuration on each target in turn:
# (report line, target series, list collecting the leaderboard entries).
for message, target, leaderboard in (
    ('GradientBoostingRegressor revenue mean MAPE: %.3f', all_data_test_revenue, results_revenue),
    ('GradientBoostingRegressor avg_check mean MAPE: %.3f', all_data_test_avg_check, results_avg_check),
):
    X_train, y_train = all_data_train, target.to_numpy()
    GBReg = GradientBoostingRegressor(n_estimators=100, random_state=1)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # The scorer returns negated MAPE, hence the absolute value.
    scores = np.absolute(
        cross_val_score(GBReg, X_train, y_train, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
    )
    print(message % scores.mean())
    leaderboard.append(('GradientBoostingRegressor', scores.mean()))

GradientBoostingRegressor revenue mean MAPE: 0.190
GradientBoostingRegressor avg_check mean MAPE: 0.130


In [11]:
# XGBRegressor

# Cross-validate the same model configuration on each target in turn:
# (report line, target series, list collecting the leaderboard entries).
for message, target, leaderboard in (
    ('XGBRegressor revenue mean MAPE: %.3f', all_data_test_revenue, results_revenue),
    ('XGBRegressor avg_check mean MAPE: %.3f', all_data_test_avg_check, results_avg_check),
):
    X_train, y_train = all_data_train, target.to_numpy()
    XGBReg = XGBRegressor(n_estimators=100, random_state=1)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # The scorer returns negated MAPE, hence the absolute value.
    scores = np.absolute(
        cross_val_score(XGBReg, X_train, y_train, scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1)
    )
    print(message % scores.mean())
    leaderboard.append(('XGBRegressor', scores.mean()))

XGBRegressor revenue mean MAPE: 0.214
XGBRegressor avg_check mean MAPE: 0.143


In [12]:
# Rank the models per target from lowest to highest mean MAPE (second tuple
# item) and turn each ranking into a two-column table.
results_revenue = sorted(results_revenue, key=lambda entry: entry[1])
df_results_revenue = pd.DataFrame(data=results_revenue, columns=['model', 'result'])

results_avg_check = sorted(results_avg_check, key=lambda entry: entry[1])
df_results_avg_check = pd.DataFrame(data=results_avg_check, columns=['model', 'result'])

In [13]:
# Revenue target: models ranked by mean cross-validated MAPE (lowest first).
df_results_revenue

Unnamed: 0,model,result
0,GradientBoostingRegressor,0.189692
1,RandomForestRegressor,0.190544
2,XGBRegressor,0.213857
3,AdaBoostRegressor,0.219906


In [14]:
# Average-check target: models ranked by mean cross-validated MAPE (lowest first).
df_results_avg_check

Unnamed: 0,model,result
0,RandomForestRegressor,0.127949
1,GradientBoostingRegressor,0.129848
2,AdaBoostRegressor,0.134268
3,XGBRegressor,0.142824


## Добавляем дополнительные признаки из OSM

In [15]:
# Load the additional per-row features (the section header describes them as coming from OSM).
new_features = pd.read_csv('data/vkusvill_new_features.csv')

In [16]:
# Columns taken from `new_features` and appended to the base feature matrix.
osm_feature_columns = [
    'fuel', 'bus_stop', 'house', 'kiosk', 'retail',
    'station', 'subway_entrance', 'tram_stop', 'bar', 'cafe', 'fast_food',
    'food_court', 'pub', 'restaurant', 'college', 'driving_school',
    'language_school', 'school', 'kindergarten', 'university', 'car_wash',
    'atm', 'bank', 'clinic', 'dentist', 'doctors', 'hospital', 'pharmacy',
    'veterinary', 'theatre', 'cinema', 'hostel', 'hotel', 'office', 'shop',
]

# Column-wise concat aligns rows by index label, not by a store key.
# NOTE(review): this assumes `new_features` has the same rows, in the same
# order, as `all_data_train` — confirm against how the CSV was produced.
all_data_train_new_features = pd.concat(
    [all_data_train, new_features[osm_feature_columns]],
    axis=1,
)

In [17]:
# Preview the extended feature matrix (base features followed by the added columns).
all_data_train_new_features.head()

Unnamed: 0,Дата открытия,"Торговая площадь, м2",Ночной магазин,Будни начало,Будни конец,Выходные начало,Выходные конец,Разные графики,Рабочие часы в будни,Рабочие часы в выходные,...,doctors,hospital,pharmacy,veterinary,theatre,cinema,hostel,hotel,office,shop
0,44491,52.6,0,9,22,9,22,0,13,13,...,1,0,5,0,0,0,0,0,0,91
1,43799,78.0,0,9,22,9,22,0,13,13,...,0,0,0,0,0,0,0,0,1,9
2,42600,73.0,0,8,22,8,22,0,14,14,...,6,2,25,3,2,0,5,7,46,390
3,43661,85.5,0,8,22,8,22,0,14,14,...,5,2,14,0,0,1,1,1,17,184
4,43201,100.0,0,8,22,8,22,0,14,14,...,6,0,30,2,0,0,0,1,22,229


In [18]:
# Accumulators of (model name, mean MAPE) tuples for the runs on the extended
# feature set, one independent list per target.
results_revenue_with_new_features, results_avg_check_with_new_features = [], []

In [19]:
# Repeat the four-model comparison on the extended feature matrix
# (`all_data_train_new_features`), for both targets.
#
# Every model/target pair is scored with the same protocol: repeated 10-fold
# cross-validation (3 repeats) and mean absolute percentage error.

# With an integer seed the splitter yields identical folds on every use, so a
# single instance is shared by all runs.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# cross_val_score fits clones of the estimator, so each unfitted instance can
# be reused for both targets. The forest is seeded like the other models so
# that its scores are reproducible on a fresh run.
forest = RandomForestRegressor(random_state=1)
AdaBoostReg = AdaBoostRegressor(n_estimators=100, random_state=1)
GBReg = GradientBoostingRegressor(n_estimators=100, random_state=1)
XGBReg = XGBRegressor(n_estimators=100, random_state=1)

models = [
    ('RandomForestRegressor', forest),
    ('AdaBoostRegressor', AdaBoostReg),
    ('GradientBoostingRegressor', GBReg),
    ('XGBRegressor', XGBReg),
]

# (label used in the report line, target series, list collecting the results)
targets = [
    ('revenue', all_data_test_revenue, results_revenue_with_new_features),
    ('avg_check', all_data_test_avg_check, results_avg_check_with_new_features),
]

# NOTE: re-running this cell appends duplicate entries to the result lists.
for model_name, model in models:
    for target_name, target, results in targets:
        X_train, y_train = all_data_train_new_features, target.to_numpy()
        scores = cross_val_score(
            model, X_train, y_train,
            scoring='neg_mean_absolute_percentage_error', cv=cv, n_jobs=-1,
        )
        # The scorer returns negated MAPE; take absolute values to get MAPE.
        scores = np.absolute(scores)
        print('%s %s mean MAPE: %.3f' % (model_name, target_name, scores.mean()))
        results.append((model_name, scores.mean()))

RandomForestRegressor revenue mean MAPE: 0.193
RandomForestRegressor avg_check mean MAPE: 0.118
AdaBoostRegressor revenue mean MAPE: 0.206
AdaBoostRegressor avg_check mean MAPE: 0.123
GradientBoostingRegressor revenue mean MAPE: 0.198
GradientBoostingRegressor avg_check mean MAPE: 0.122
XGBRegressor revenue mean MAPE: 0.199
XGBRegressor avg_check mean MAPE: 0.126


In [20]:
# Rank the extended-feature runs per target from lowest to highest mean MAPE
# (second tuple item) and turn each ranking into a two-column table.
results_revenue_with_new_features = sorted(
    results_revenue_with_new_features, key=lambda entry: entry[1]
)
df_results_revenue_with_new_features = pd.DataFrame(
    data=results_revenue_with_new_features, columns=['model', 'result']
)

results_avg_check_with_new_features = sorted(
    results_avg_check_with_new_features, key=lambda entry: entry[1]
)
df_results_avg_check_with_new_features = pd.DataFrame(
    data=results_avg_check_with_new_features, columns=['model', 'result']
)

In [21]:
# Revenue target, base features only — shown again for side-by-side comparison.
df_results_revenue

Unnamed: 0,model,result
0,GradientBoostingRegressor,0.189692
1,RandomForestRegressor,0.190544
2,XGBRegressor,0.213857
3,AdaBoostRegressor,0.219906


In [22]:
# Revenue target, base features plus the added columns.
df_results_revenue_with_new_features

Unnamed: 0,model,result
0,RandomForestRegressor,0.192676
1,GradientBoostingRegressor,0.197922
2,XGBRegressor,0.198622
3,AdaBoostRegressor,0.206188


In [23]:
# Average-check target, base features only — shown again for side-by-side comparison.
df_results_avg_check

Unnamed: 0,model,result
0,RandomForestRegressor,0.127949
1,GradientBoostingRegressor,0.129848
2,AdaBoostRegressor,0.134268
3,XGBRegressor,0.142824


In [24]:
# Average-check target, base features plus the added columns.
df_results_avg_check_with_new_features

Unnamed: 0,model,result
0,RandomForestRegressor,0.117795
1,GradientBoostingRegressor,0.121754
2,AdaBoostRegressor,0.123372
3,XGBRegressor,0.126001
