hist_data.csv - исторические данные о покупках
- buyer_id - id покупателя
- pav_order_id - id заказа
- created - время добавления в заказ
- item_id - id item'а
- count - количество выбранного товара в заказе
- price_sold - цена за 1 item
- flag_weight_goods - бинарный флаг того, является ли товар весовым
- weight - вес заказа

test.csv - текущее состояние корзины (последний заказ пользователя)
- buyer_id - id покупателя
- pav_order_id - id заказа
- created - время добавления в заказ
- item_id - id item'а (товара)
- count - количество выбранного товара в заказе
- price_sold - цена за 1 item
- flag_weight_goods - бинарный флаг того, является ли товар весовым

In [51]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
from scipy import sparse as sp
from collections import defaultdict
import implicit

hist_data = pd.read_csv('data/hist_data.csv')
test = pd.read_csv('data/test.csv')

In [52]:
def dcg(y_relevance: np.ndarray) -> float:    
    return np.sum([(2**i - 1) / np.log2(k + 1) for (k, i) in enumerate(y_relevance, start=1)])

def ndcg(y_relevance: np.ndarray, k: int) -> float:
    if y_relevance.sum() == 0:
        return 0.0
    DCG = dcg(y_relevance[:k])
    IDCG = dcg(-np.sort(-y_relevance)[:k])
    return DCG / IDCG

def apply_relevance(x):
    return [int(item in x['basket']) for item in x['preds']]

def create_relevance(pred):
    d = pred.copy()
    d['basket'] = d['basket'].apply(set)
    d = d.apply(apply_relevance, axis=1)
    return d

def ndcg_full_dataset(d):
    dd = pd.DataFrame(d.to_list()).fillna(0).to_numpy()
    k = dd.shape[1]
    scores = [ndcg(dd[i], k) for i in range(len(dd))]
    return np.mean(scores)

def compute_ndcg_score(pred):
    relevance = create_relevance(pred)
    return ndcg_full_dataset(relevance)

In [53]:
hist_data.head(3)

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods,weight
0,95203091,98506637863,2021-07-01 00:03:44,202808329,1.0,79.99,False,11.14
1,95203091,98506637863,2021-07-01 00:03:44,202953905,1.072,44.945,True,11.14
2,95203091,98506637863,2021-07-01 00:03:44,203566452,1.0,69.99,False,11.14


In [54]:
test.head(3)

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods
0,94640077,98519243164,2021-08-30 17:56:31,203053459,1.0,67.62,False
1,95865222,98512083628,2021-07-26 16:17:21,202967705,1.14,406.8,True
2,95147155,98519972197,2021-09-02 21:54:18,203551512,1.0,52.77,False


In [55]:
prod_his = set(hist_data.item_id)
prod_test = set(test.item_id)
len(prod_test - prod_his)

2124

In [56]:
full_df = pd.concat([hist_data.iloc[:, :-1], test])
full_df = full_df.assign(sum_price = full_df['count'] * full_df.price_sold)
full_df.head(3)


Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods,sum_price
0,95203091,98506637863,2021-07-01 00:03:44,202808329,1.0,79.99,False,79.99
1,95203091,98506637863,2021-07-01 00:03:44,202953905,1.072,44.945,True,48.18104
2,95203091,98506637863,2021-07-01 00:03:44,203566452,1.0,69.99,False,69.99


In [57]:
users_data = full_df.copy()
users_data = users_data.groupby('buyer_id').agg({
    'pav_order_id': 'unique',
    'item_id': np.size,
    'price_sold': np.sum
    })


users_data.pav_order_id = users_data.pav_order_id.apply(len)
users_data = users_data.assign(
    avg_items_order = round(users_data.item_id / users_data.pav_order_id, 2),
    avg_price_order = round(users_data.price_sold / users_data.pav_order_id, 2)
    )
users_data.rename(columns = {'pav_order_id' : 'count_order', 'item_id' : 'count_items', 'price_sold' : 'sum_expense'}, inplace = True)

In [58]:
users_data.sort_values('avg_items_order', ascending=False).head(3)

Unnamed: 0_level_0,count_order,count_items,sum_expense,avg_items_order,avg_price_order
buyer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
95681486,2,186,13254.892,93.0,6627.45
95314110,2,160,25081.463,80.0,12540.73
94715958,7,537,40147.51,76.71,5735.36


In [59]:
users_data.describe()

Unnamed: 0,count_order,count_items,sum_expense,avg_items_order,avg_price_order
count,80244.0,80244.0,80244.0,80244.0,80244.0
mean,3.882945,69.928082,7323.801233,16.830866,1770.703268
std,3.083818,66.709305,9181.896459,6.632906,1549.847322
min,1.0,7.0,0.0,7.0,0.0
25%,2.0,24.0,2462.435,12.33,1205.53
50%,3.0,50.0,5070.565,15.57,1614.745
75%,5.0,92.0,9514.4315,20.0,2135.1125
max,20.0,723.0,771062.551,93.0,301756.77


In [60]:
product_data = full_df.copy()
product_data['count_mean'] = product_data['count'].values
product_data = product_data.groupby('item_id').agg({
    'price_sold': np.median,
    'sum_price': np.median,
    'count_mean': np.mean,
    'count': np.size,
    'flag_weight_goods': lambda x: list(x)[0]  
    })


In [61]:
product_data.sort_values('count_mean', ascending=False).head(3)



Unnamed: 0_level_0,price_sold,sum_price,count_mean,count,flag_weight_goods
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
204217182,9.98,998.0,92.410256,39,False
204217183,9.98,998.0,85.407407,27,False
204217144,9.98,998.0,78.785714,14,False


In [62]:
product_data.describe()

Unnamed: 0,price_sold,sum_price,count_mean,count
count,54596.0,54596.0,54596.0,54596.0
mean,187.660403,204.536365,1.329832,102.778757
std,310.67895,470.801163,1.027269,655.999594
min,0.0,0.0,0.001,1.0
25%,54.99,64.6925,1.0,3.0
50%,106.99,114.99,1.090909,10.0
75%,210.49,214.247799,1.363636,47.0
max,11989.99,34297.9524,92.410256,79760.0


In [63]:
# Общии приготовления
product_lite_df = pd.DataFrame({'item_id': product_data.index, 'count': product_data['count'].values})
top20prod = frozenset(product_data.nlargest(20, columns="count").index)

order_user = test.groupby('pav_order_id').agg({'buyer_id': lambda x: list(x)[0]}).to_dict()
order_user = order_user['buyer_id']

## 1 решение через старые покупки и популярные товары 

In [64]:
# Считаем сколько сколько клиент купил товаров 
users = full_df.buyer_id.unique()
user_count_dict = {}
for user in users:
    list_ = full_df[full_df.buyer_id == user]['item_id'].to_list()
    res = sorted(Counter(list_).items(), key=lambda x: x[1], reverse=True)
    user_count_dict.update({user : res})

In [1]:
def top_20_items(items: list) -> list:
    t_df = pd.DataFrame(items, columns=['item_id', 'count_user'])
    new_df = pd.merge(t_df, product_lite_df, on="item_id")
    res = new_df.sort_values(by=["count_user", 'count'], ascending=False).head(20)['item_id'].to_list()
    l = len(res)
    if l != 20:
        res += list(top20prod - set(res))[:20 - l]
    return res

In [None]:
result_list = []

for order_id, user_id in order_user.items():
    i_list = user_count_dict[user_id]
    prod_list = top_20_items(i_list)
    result_list.append([order_id, prod_list])
pred = pd.DataFrame(result_list, columns=['pav_order_id', 'preds'])

In [None]:
pred.to_csv('pred.csv', index=False)

## 1.1 

In [None]:
test.head(1)

Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods
0,94640077,98519243164,2021-08-30 17:56:31,203053459,1.0,67.62,False


In [None]:
pred_ome_dict = {order:pr for order, pr in zip(pred.pav_order_id.values, pred.preds.values)}
pav_order_id = []
basket = []
preds = []
for order_id, user_id in order_user.items():
    order_df = test[test.pav_order_id == order_id][['buyer_id', 'item_id']]
    on_df = pd.merge(order_df, product_lite_df, on="item_id").nlargest(20, columns="count")
    res = on_df.item_id.to_list()
    basket.append(res)

    top20 = frozenset(pred_ome_dict[order_id])
    l = len(res)
    if l != 20:
        res += list(top20prod - set(res))[:20 - l]
        
    pav_order_id.append(order_id)
    preds.append(res)

data = {'pav_order_id': pav_order_id, 'basket': basket, 'preds': preds}
pred_1_1 = pd.DataFrame(data=data).set_index('pav_order_id')

In [None]:
pred_1_1['preds'].to_csv('pred.csv')
compute_ndcg_score(pred_1_1)


1.0

# Матрица

In [65]:
import x5.src.utils as utils
product_encoder = utils.ProductEncoder(product_lite_df.item_id.to_list())
user_encoder = utils.ProductEncoder(order_user.values())

In [66]:
def make_coo_row(items, product_encoder):
    idx = []
    values = []

    n_items = len(items)

    for pid, count in items:
        idx.append(product_encoder.toIdx(pid))
        values.append(count)

    return sp.coo_matrix(
        (np.array(values).astype(np.float32), ([0] * len(idx), idx)), shape=(1, product_encoder.num_products)
    )

In [68]:
rows = []
for user_id in order_user.values():
    rows.append(make_coo_row(
        user_count_dict[user_id],
        product_encoder
        ))
X_sparse = sp.vstack(rows).tocsr()

In [45]:
rows = []
for user_id in order_user.values():
    rows.append(utils.make_coo_row(
        full_df[full_df.buyer_id == user_id].item_id.to_list(),
        product_encoder
        ))
X_sparse = sp.vstack(rows).tocsr()

## 2 решение item2item
0.16

In [75]:
model = implicit.nearest_neighbours.CosineRecommender(K=10)
model.fit(X_sparse)

100%|██████████| 54596/54596 [00:01<00:00, 37010.55it/s]


In [76]:
result_list = []

for order_id, user_id in order_user.items():
    row_sparse = utils.make_coo_row(full_df[full_df.buyer_id == user_id].item_id.to_list(), product_encoder).tocsr()
    raw_recs = model.recommend(user_encoder.toIdx(user_id), row_sparse,  N=20, filter_already_liked_items=False, recalculate_user=True)
    prod_list = product_encoder.toPid(raw_recs[0])
    result_list.append([order_id, prod_list])

pred2 = pd.DataFrame(result_list, columns=['pav_order_id', 'preds'])
pred2.to_csv('pred.csv', index=False)

## 2.1 LogisticMatrixFactorization
0.04

In [93]:
model2 = implicit.lmf.LogisticMatrixFactorization()
model2.fit(X_sparse)

100%|██████████| 30/30 [00:22<00:00,  1.33it/s]


In [103]:
result = model2.recommend_all(X_sparse, N=20, filter_already_liked_items=False)
result_lmf = [product_encoder.toPid(line) for line in result]
result_lmf_list = []
for order_id, user_id in order_user.items():
    ind = user_encoder.toIdx(user_id)
    result_lmf_list.append([order_id, result_lmf[ind]])

pred3 = pd.DataFrame(result_lmf_list, columns=['pav_order_id', 'preds'])
# pred3.to_csv('pred.csv', index=False)

In [None]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(X_sparse)
distances, indices = knn.kneighbors(X_sparse, n_neighbors=5)

In [None]:
def recommend_movie(title):

  index_user_likes = matrix_df.index.tolist().index(title) # get an index for a movie
  sim_movies = indices[index_user_likes].tolist() # make list for similar movies
  movie_distances = distances[index_user_likes].tolist() # the list for distances of similar movies

  id_movie = sim_movies.index(index_user_likes) # get the position of the movie itself in indices and distances

  print('Similar Movies to '+str(matrix_df.index[index_user_likes])+': \n')

  sim_movies.remove(index_user_likes) # remove the movie itself in indices
  movie_distances.pop(id_movie) # remove the movie itself in distances

  j = 1
    
  for i in sim_movies:
    print(str(j)+': '+str(matrix_df.index[i])+', the distance with '+str(title)+': '+str(movie_distances[j-1]))
    j = j + 1

In [None]:
recommend_movie(202782406)

ValueError: 202782406 is not in list

## 2 решение item2item