In [39]:
import pandas as pd
import json
import numpy as np

In [244]:
data = pd.read_parquet('../data/data_made_restaurants.parquet')

In [245]:
data.head()

Unnamed: 0,customer_id,order_id,user_latitude,user_longitude,date,city_id,chain_id,vendor_id,target,total_value,...,vendor_latitude,vendor_longitude,online_payment,accepting_cash,min_delivery_value,takeaway_support,citymobil_support,default_product_group_id,product_group_ids,cuisine_ids
0,15955880,207845807,55.7815,37.5307,2020-08-01,1,140718,343852,1,575,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
1,62512097,207855295,55.6472,37.4682,2020-08-01,1,140718,343852,1,1360,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
2,64977556,207871966,55.8649,37.5014,2020-08-01,1,140718,343852,1,560,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
3,81281415,207960541,55.8711,37.5105,2020-08-01,1,140718,343852,1,1130,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
4,72045218,208028305,55.8166,37.5899,2020-08-01,1,140718,343852,1,745,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],


In [248]:
data = data[['chain_id', 'cuisine_ids', 'product_group_ids']].drop_duplicates()

In [249]:
data['products'] = data['product_group_ids'].apply(lambda x: json.loads(x) if x and pd.notna(x) else [])

In [250]:
data['cuisines'] = data['cuisine_ids'].apply(lambda x: json.loads(x) if x and pd.notna(x) else [])

In [251]:
data['products_len'] = data['products'].apply(lambda x: len(x))
data['cuisines_len'] = data['cuisines'].apply(lambda x: len(x))

# Для модели

In [252]:
data_for_model = data[['cuisines', 'products', 'products_len', 'cuisines_len']]

## Модель на продуктах

In [253]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [254]:
#отбираем только те наблюдения, где есть продукты и кухни
products_model_data = data_for_model.loc[(data_for_model['cuisines_len']>0) & (data_for_model['products_len']>0), ['cuisines', 'products']]

In [316]:
# список кухонь и продуктов - пригодится
unique_products = np.unique(np.concatenate(data['products'].values)).astype(int)
unique_cuisines = np.unique(np.concatenate(data['cuisines'].values)).astype(int)

In [317]:
max(unique_products)

52

In [318]:
# разворачиваем кухни в строки
cuisine_to_products = products_model_data.explode('cuisines')
cuisine_to_products.head()

Unnamed: 0,cuisines,products
1013,1,"[1, 2, 18, 26]"
1013,2,"[1, 2, 18, 26]"
1013,3,"[1, 2, 18, 26]"
1013,5,"[1, 2, 18, 26]"
3939,1,"[1, 2, 18]"


In [319]:
def preprocess_data(df):
    def array_to_sparse(arr):
        result = np.zeros(max(unique_products)+1, dtype=int)
        result[arr]=1
        return result
    
    return np.stack( df['products'].apply(array_to_sparse))

In [320]:
X = preprocess_data(cuisine_to_products)

In [321]:
y = cuisine_to_products['cuisines'].astype(int).values

In [322]:
## Базовый RF

In [323]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [324]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [325]:
predicted_probas = rf.predict_proba(X_test)
predicted = rf.predict(X_test)

In [326]:
def predictions_to_cuicins(pred_proba, th=0.2):
    t = pred_proba>th
    return np.where(t == True)[0]+1

In [327]:
cuisins_predicted = list()
for p in predicted_probas:
    cuisins_predicted.append(predictions_to_cuicins(p, 0.1))

In [328]:
print('Настоящая кухня есть в списке предсказанных, доля:')
np.mean([c in cp for cp, c in zip(cuisins_predicted, y_test)])

Настоящая кухня есть в списке предсказанных, доля:


0.7482758620689656

# Предсказываю на полном датасете

In [333]:
X_full = preprocess_data(data)

In [334]:
full_predict_probas = rf.predict_proba(X_full)

In [335]:
full_cuisins_predicted = list()
for p in full_predict_probas:
    full_cuisins_predicted.append(predictions_to_cuicins(p, 0.1))

In [337]:
data['cuisines_predicted'] = full_cuisins_predicted

In [339]:
data[data['cuisines_len']>0]

Unnamed: 0,chain_id,cuisine_ids,product_group_ids,products,cuisines,products_len,cuisines_len,cuisines_predicted
1013,52874,"[1, 2, 3, 5]","[1, 2, 18, 26]","[1, 2, 18, 26]","[1, 2, 3, 5]",4,4,"[1, 2, 3, 5]"
3939,109886,"[1, 2]","[1, 2, 18]","[1, 2, 18]","[1, 2]",3,2,"[1, 2]"
4185,106646,"[1, 2]","[1, 2, 23]","[1, 2, 23]","[1, 2]",3,2,"[1, 2, 5]"
4187,29982,[1],"[1, 23]","[1, 23]",[1],2,1,[1]
4440,17802,"[1, 2]","[2, 23, 26]","[2, 23, 26]","[1, 2]",3,2,"[1, 2]"
...,...,...,...,...,...,...,...,...
2778726,109014,[3],[43],[43],[3],1,1,"[3, 4]"
2778727,100226,[3],"[20, 24, 25, 43, 51]","[20, 24, 25, 43, 51]",[3],5,1,[3]
2778968,109198,[5],"[6, 26]","[6, 26]",[5],2,1,[5]
2779307,67862,"[1, 3, 4]","[1, 3, 51]","[1, 3, 51]","[1, 3, 4]",3,3,"[1, 3, 4]"


In [340]:
data.to_csv('chain_cuisines_restored.csv')