In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

In [14]:
df = pd.read_csv('123.csv')
df2 = pd.read_csv('pred.csv')
data = {'pav_order_id': df2.pav_order_id.values, 'preds': df.preds.values}
df3 = pd.DataFrame(data)
df3.to_csv('pred.csv', index=False)

In [35]:
class ProductEncoder:
    def __init__(self, id_list):
        self.product_idx = {}
        self.product_pid = {}
        for idx, pid in enumerate(id_list):
            self.product_idx[pid] = idx
            self.product_pid[idx] = pid

    def toIdx(self, x):
        if type(x) == int:
            pid = x
            return self.product_idx[pid]
        return [self.product_idx[pid] for pid in x]

    def toPid(self, x):
        if type(x) == int:
            idx = x
            return self.product_pid[idx]
        return [self.product_pid[idx] for idx in x]

    @property
    def num_products(self):
        return len(self.product_idx)

In [2]:
hist_data = pd.read_csv('data/hist_data.csv')
test = pd.read_csv('data/test.csv')
full_df = pd.concat([hist_data.iloc[:, :-1], test])
full_df = full_df.assign(sum_price = full_df['count'] * full_df.price_sold)

In [42]:
items_list = list(full_df.item_id.unique())
buyer_list = list(full_df.buyer_id.unique())
product_encoder = ProductEncoder(items_list)
user_encoder = ProductEncoder(buyer_list)

In [44]:
df = full_df.groupby(['buyer_id', 'item_id']).agg({'count': np.size})
data = [(user_encoder.toIdx(ui[0]), product_encoder.toIdx(ui[1]), c) for ui, c in zip(df.index, df['count'].to_list())]
new_df = pd.DataFrame(data, columns=['buyer_id', 'item_id', 'count']).sort_values('buyer_id')

In [121]:
rows, r_pos = np.unique(new_df.values[:,0], return_inverse=True)
cols, c_pos = np.unique(new_df.values[:,1], return_inverse=True)
matrix = sparse.csr_matrix((new_df.values[:,2], (r_pos, c_pos)))

In [122]:
Pui = normalize(matrix, norm='l2', axis=1)
sim = Pui.T * Pui

In [None]:
transpose_matrix = matrix.transpose(copy=True)
Piu = normalize(transpose_matrix, norm='l2', axis=1)
fit = Pui * Piu * Pui

In [132]:
import sys
 
sys.getsizeof(Piu)

48

In [103]:
user_order = test.groupby('buyer_id').agg({'pav_order_id': lambda x: list(x)[0], 'item_id': lambda x: list(x)})
user_list = user_order.index
order_list = user_order.pav_order_id.values
basket_list = user_order.item_id.values

In [None]:
preds_list = []
for user, order, basket in zip(user_list, order_list, basket_list):
    pred = product_encoder.toPid(sim[user_encoder.toIdx(user)].toarray().argsort()[0][:-20])
    preds_list.append((order, basket, pred))

In [93]:
s = list(sim[10].toarray().argsort()[0][:-20])
product_encoder.toPid(s)

[204115783,
 204047571,
 203280811,
 204098294,
 207177449,
 204056791,
 206000181,
 204146262,
 204083080,
 205757712,
 206000267,
 204050619,
 203455460,
 203532312,
 205803587,
 204039799,
 203381290,
 203514220,
 203409292,
 203088413,
 205951725,
 202794614,
 204101443,
 202975630,
 205755451,
 203434942,
 203527921,
 203438040,
 203438042,
 205737854,
 202811985,
 203379513,
 203235369,
 205909781,
 203480463,
 204090525,
 204090501,
 205090660,
 203401712,
 203532819,
 204367785,
 204007632,
 203391252,
 204058101,
 204088868,
 204074952,
 204074954,
 205755514,
 203334268,
 207545151,
 204350500,
 204146348,
 203432354,
 202810851,
 210909609,
 204071870,
 203513898,
 203513895,
 203513901,
 203513907,
 204086064,
 204380145,
 203404752,
 202985135,
 203501787,
 202942492,
 203235379,
 203513514,
 204044833,
 202824248,
 205097816,
 205916827,
 202808189,
 203455756,
 203564456,
 203378352,
 204415285,
 202816280,
 214038285,
 211000643,
 203442227,
 214014105,
 202957563,
 204

In [92]:
s = list(sim[10].toarray().argsort()[0][:-20])
for i in s:
    print(i)
    print(product_encoder.toPid(int(i)))

27297
204115783
36104
204047571
36105
203280811
36106
204098294
36107
207177449
36108
204056791
36109
206000181
36110
204146262
36111
204083080
36112
205757712
36113
206000267
36114
204050619
36115
203455460
36117
203532312
36118
205803587
36121
204039799
36122
203381290
36123
203514220
36124
203409292
36125
203088413
36126
205951725
36127
202794614
36103
204101443
36128
202975630
36102
205755451
36100
203434942
36079
203527921
36080
203438040
36081
203438042
36082
205737854
36083
202811985
36084
203379513
36085
203235369
36086
205909781
36087
203480463
36088
204090525
36089
204090501
36090
205090660
36091
203401712
36092
203532819
36093
204367785
36094
204007632
36095
203391252
36096
204058101
36097
204088868
36098
204074952
36099
204074954
36101
205755514
36078
203334268
36129
207545151
36131
204350500
36159
204146348
36160
203432354
36161
202810851
36162
210909609
36163
204071870
36164
203513898
36165
203513895
36166
203513901
36167
203513907
36168
204086064
36169
204380145
36170
20