# Предсказание оценки для ресторанов

_Выполнил_: Подцепко Игорь Сергеевич (M3335)

## Загрузка данных и предобработка

In [3]:
import pandas as pd

df = pd.read_csv('data/raw/train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73995 entries, 0 to 73994
Data columns (total 72 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         73995 non-null  uint64 
 1   org_id                          73995 non-null  uint64 
 2   rating                          73995 non-null  float64
 3   ts                              73995 non-null  int64  
 4   user_city                       73995 non-null  object 
 5   org_city                        73995 non-null  object 
 6   average_bill                    45433 non-null  float64
 7   rating_org                      73995 non-null  float64
 8   rubrics                         73995 non-null  object 
 9   food_delivery                   73995 non-null  int64  
 10  breakfast                       73995 non-null  int64  
 11  takeaway                        73995 non-null  int64  
 12  summer_terrace                  

Проверим, в каких колонках есть `null`'ы.

In [4]:
for column in df.columns[df.isnull().any()]:
    print('-', column)

- average_bill


In [5]:
average_bills_by_city = df.groupby('org_id')[['org_city', 'average_bill']].first().groupby('org_city').average_bill.mean().rename('average_bill_by_city')
average_bills_by_city

org_city
msk    1104.885787
spb     797.595920
Name: average_bill_by_city, dtype: float64

Средний чек для ресторана можно приблизить средним чеком среди всех ресторанов с разумными значениями (например, до 10'000).

In [6]:
df = df.join(average_bills_by_city, on='org_city')
df.average_bill = df.average_bill.fillna(df.average_bill_by_city)

Таким образом, были устранены все "пробелы" в исходном наборе данных. Теперь добавим новые признаки, например:

1. Средняя оценка от пользователя `mean_rating_by_user`;

In [7]:
rubrics = df.rubrics.unique()
rubric_codes = {}
for i in range(len(rubrics)):
    rubric_codes[rubrics[i]] = i
df.rubrics = df.rubrics.map(rubric_codes)

In [8]:
count_of_orgs_by_city = df.groupby('org_city').org_id.count().rename('count_of_orgs_by_city')
df = df.join(count_of_orgs_by_city, on='org_city')
mean_rating_by_city = df.groupby('org_city').rating.mean().rename('mean_rating_by_city')
df = df.join(mean_rating_by_city, on='org_city')
df

Unnamed: 0,user_id,org_id,rating,ts,user_city,org_city,average_bill,rating_org,rubrics,food_delivery,...,vinotheque,cakes_for_different_occasions,accepted_credit_cards,kalyan,teahouse,bread_from_tandoor,handmade_goods,average_bill_by_city,count_of_orgs_by_city,mean_rating_by_city
0,12182789761588196310,11946401375069641937,4.0,677,msk,msk,1500.000000,3.883212,0,1,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
1,2796544982129273287,5545859981678495568,5.0,577,msk,msk,1000.000000,4.495379,0,1,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
2,10546824832088319728,3451485117888180206,4.0,716,msk,msk,500.000000,4.576063,1,1,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
3,9967742981458207920,9490041040735762889,3.0,639,spb,spb,1000.000000,3.775701,1,1,...,0,0,0,0,0,0,0,797.595920,22777,4.329850
4,11046819495825038093,5234254278902729372,5.0,669,msk,msk,500.000000,4.187500,2,1,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73990,12526944272743210441,2948132663120061820,2.0,517,msk,msk,500.000000,4.280632,7,1,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
73991,17792383534781948527,13225228120465186798,5.0,455,msk,msk,1104.885787,4.446033,0,0,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
73992,7452326377200652192,1135274990644701922,5.0,1032,msk,msk,1000.000000,4.420231,4,1,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634
73993,368322320223775667,2498968754330575170,5.0,27,msk,msk,1104.885787,4.694307,1,0,...,0,0,0,0,0,0,0,1104.885787,51218,4.319634


In [9]:
df = pd.get_dummies(df, prefix=['rubrics'], columns = ['rubrics'])
df = pd.get_dummies(df, prefix=['user_city'], columns=['user_city'], drop_first=True)
df = pd.get_dummies(df, prefix=['org_city'], columns=['org_city'], drop_first=True)
df

Unnamed: 0,user_id,org_id,rating,ts,average_bill,rating_org,food_delivery,breakfast,takeaway,summer_terrace,...,rubrics_6,rubrics_7,rubrics_8,rubrics_9,rubrics_10,rubrics_11,rubrics_12,rubrics_13,user_city_spb,org_city_spb
0,12182789761588196310,11946401375069641937,4.0,677,1500.000000,3.883212,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2796544982129273287,5545859981678495568,5.0,577,1000.000000,4.495379,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,10546824832088319728,3451485117888180206,4.0,716,500.000000,4.576063,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,9967742981458207920,9490041040735762889,3.0,639,1000.000000,3.775701,1,1,1,1,...,0,0,0,0,0,0,0,0,1,1
4,11046819495825038093,5234254278902729372,5.0,669,500.000000,4.187500,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73990,12526944272743210441,2948132663120061820,2.0,517,500.000000,4.280632,1,1,0,1,...,0,1,0,0,0,0,0,0,0,0
73991,17792383534781948527,13225228120465186798,5.0,455,1104.885787,4.446033,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73992,7452326377200652192,1135274990644701922,5.0,1032,1000.000000,4.420231,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
73993,368322320223775667,2498968754330575170,5.0,27,1104.885787,4.694307,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [197]:
# from sklearn.preprocessing import StandardScaler
#
# scaler = StandardScaler()
# features = df.drop(['rating_org'], axis=1)
# scaled = pd.DataFrame(scaler.fit_transform(features.to_numpy()), columns=features.columns)
# scaled['rating_org'] = df.rating_org
# df = scaled
# df

Unnamed: 0,user_id,org_id,rating,ts,average_bill,food_delivery,breakfast,takeaway,summer_terrace,wi_fi,...,rubrics_7,rubrics_8,rubrics_9,rubrics_10,rubrics_11,rubrics_12,rubrics_13,user_city_spb,org_city_spb,rating_org
0,0.562067,0.551223,-0.285553,-0.127911,0.208790,0.566389,0.856904,0.735080,-1.231287,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,3.883212
1,-1.204647,-0.661967,0.599119,-0.468638,-0.024434,0.566389,-1.166992,0.735080,0.812159,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.495379
2,0.254139,-1.058944,-0.285553,0.004972,-0.257657,0.566389,0.856904,0.735080,0.812159,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.576063
3,0.145142,0.085632,-1.170226,-0.257388,-0.024434,0.566389,0.856904,0.735080,0.812159,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,1.609533,1.499557,3.775701
4,0.348250,-0.721030,0.599119,-0.155170,-0.257657,0.566389,0.856904,0.735080,0.812159,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.187500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73990,0.626845,-1.154352,-2.054898,-0.673074,-0.257657,0.566389,0.856904,-1.360396,0.812159,0.402606,...,6.038694,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.280632
73991,1.617926,0.793618,0.599119,-0.884325,0.024490,-1.765570,-1.166992,-1.360396,-1.231287,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.446033
73992,-0.328319,-1.497970,0.599119,1.081669,-0.024434,0.566389,0.856904,0.735080,0.812159,0.402606,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.420231
73993,-1.661697,-1.239489,0.599119,-2.342636,0.024490,-1.765570,-1.166992,-1.360396,-1.231287,-2.483820,...,-0.165599,-0.174476,-0.137805,-0.133203,-0.066111,-0.106314,-0.041136,-0.621298,-0.666864,4.694307


Для определения подходящих для обучения модели признаков рассмотрим их корреляцию и дисперсию.

In [10]:
rows_list = []
for column in df.columns:
    try:
        corr = df.rating_org.corr(df[column])
        var = df[column].var()
        rows_list.append({
            'feature': column,
            'corr': corr,
            'var': var
        })
    except TypeError:
        continue
pd.DataFrame(rows_list).sort_values('corr')

Unnamed: 0,feature,corr,var
75,rubrics_3,-0.183910,0.098854
6,food_delivery,-0.105047,0.183892
81,rubrics_9,-0.084240,0.018289
84,rubrics_12,-0.078229,0.011052
32,free_delivery,-0.072332,0.050509
...,...,...,...
27,parking_disabled,0.167170,0.107226
28,toilet_for_disabled,0.173958,0.110697
30,cafe,0.181062,0.075462
2,rating,0.269386,1.277735


 ## Предсказание

Для предсказания будем использовать `sklearn.neighbors.KNeighborsRegression` и `KFold` (cross validation) для оценки качества модели. Загрузим тестовый набор данных и заполним пропуски "среднего чека" средним чеком по всем заведениям аналогично тому, как это было сделано для тренировочного набора.

In [11]:
test_x = pd.read_csv('data/raw/test_x.csv')
test_x = test_x.join(average_bills_by_city, on='org_city')
test_x.average_bill.fillna(test_x['average_bill_by_city'], inplace=True)
test_x.rubrics = test_x.rubrics.map(rubric_codes)
test_x = test_x.join(count_of_orgs_by_city, on='org_city')
test_x = test_x.join(mean_rating_by_city, on='org_city')
test_x = pd.get_dummies(test_x, prefix=['rubrics'], columns = ['rubrics'])
test_x = pd.get_dummies(test_x, prefix=['user_city'], columns=['user_city'], drop_first=True)
test_x = pd.get_dummies(test_x, prefix=['org_city'], columns=['org_city'], drop_first=True)
# test_x = pd.DataFrame(scaler.transform(test_x.to_numpy()), columns=test_x.columns)

Определим пару функций, которые инкапсулируют в себе алгоритм создания, обучения модели и предсказания `rating_org` для тестового набора данных.

In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold


def try_fit_with(features_list, n_neighbors: int = 250):
    features = df[features_list]
    rating_org = df.rating_org

    k_fold = KFold(n_splits=3, shuffle=True)

    total_errors = {'MSE': 0, 'MAPE': 0}
    for train, test in k_fold.split(features, rating_org):
        regressor = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance', n_jobs=-1)
        regressor.fit(features.loc[train], rating_org.loc[train])
        predicted = regressor.predict(features.loc[test])
        total_errors['MSE'] += mean_squared_error(rating_org.loc[test], predicted)
        total_errors['MAPE'] += mean_absolute_percentage_error(rating_org.loc[test], predicted)
    return total_errors


def try_fit_several_times(features_list, n_neighbors: int = 250, n: int = 5):
    total_errors = {'MSE': 0, 'MAPE': 0}
    for i in range(n):
        current_result = try_fit_with(features_list)
        total_errors['MSE'] += current_result['MSE'] / n
        total_errors['MAPE'] += current_result['MAPE'] / n
    regressor = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance', n_jobs=-1)
    regressor = regressor.fit(df[features_list], df.rating_org)
    result = regressor.predict(test_x[features_list])
    result_df = pd.DataFrame(result, columns=['rating_org'])
    result_df.index.name = 'id'
    return total_errors, result_df


Попробуем обучить модель на разных наборах признаков.

In [None]:
errors, prediction = try_fit_several_times(['org_id', 'user_id', 'ts', 'average_bill', 'wheelchair_access'],
                                           n_neighbors=15000,
                                           n=0)
print(errors)
prediction.to_csv('data/submission.v1.csv')

In [None]:
errors, prediction = try_fit_several_times(['org_id', 'user_id', 'ts', 'rating', 'average_bill', 'wheelchair_access'])
print(errors)
prediction.to_csv('data/submission.v2.csv')

In [None]:
errors, prediction = try_fit_several_times(
    ['org_id', 'user_id', 'ts', 'rating', 'cafe', 'average_bill', 'wheelchair_access'])
print(errors)
prediction.to_csv('data/submission.v3.csv')

In [None]:
errors, prediction = try_fit_several_times(
    ['rating', 'cafe', 'toilet_for_disabled', 'parking_disabled', 'payment_by_credit_card'])
print(errors)
prediction.to_csv('data/submission.v4.csv')

In [None]:
errors, prediction = try_fit_several_times(
    ['rating', 'cafe', 'toilet_for_disabled', 'parking_disabled', 'payment_by_credit_card', 'automatic_door'])
print(errors)
prediction.to_csv('data/submission.v5.csv')

In [None]:
errors, prediction = try_fit_several_times(
    ['org_id', 'rating', 'average_bill', 'food_delivery', 'free_delivery', 'delivery'])
print(errors)
prediction.to_csv('data/submission.v6.csv')

In [None]:
errors, prediction = try_fit_several_times(
    ['org_id', 'average_bill'])
print(errors)
prediction.to_csv('data/submission.v7.csv')

In [None]:
numeric_features = df.drop(['user_city', 'org_city', 'rubrics', 'rating_org', 'mean_rating_by_user'],
                           axis=1).columns.to_list()
errors, prediction = try_fit_several_times(numeric_features)
print(errors)
prediction.to_csv('data/submission.v8.csv')

In [163]:
errors, prediction = try_fit_several_times(test_x.columns)
print(errors)
prediction.to_csv('data/submission.v9.csv')

{'MSE': 0.2016477243349685, 'MAPE': 0.12506835960248625}


In [202]:
rubrics = ["rubrics_" + str(i) for i in range(1, 14)]
errors, prediction = try_fit_several_times(['org_id', 'user_id', 'ts', 'average_bill', 'wheelchair_access'] + rubrics)
print(errors)
prediction.to_csv('data/submission.v10.csv')

{'MSE': 0.25609977747093426, 'MAPE': 0.15167818030657673}
