In [1]:
import pandas as pd
import numpy as np

# Load the preprocessed order lines; the first CSV column is used as the row index.
data = pd.read_csv(
    'data_with_n_basket_Preprocessed_02_181002.csv',
    index_col=0,
    engine='c',
)

# Quick sanity check on how many distinct users and products the file contains.
n_unique_users = len(set(data['user_id'].values))
n_unique_items = len(set(data['product_id'].values))
print('unique users:', n_unique_users)
print('unique items:', n_unique_items)

unique users: 7716
unique items: 9073


In [2]:
# Preview the first five rows of the loaded order lines.
data.head(n=5)

Unnamed: 0,user_id,order_id,order_number,product_id,reordered
10356,76,3045366,1,246,0
10357,76,3045366,1,36107,0
10358,76,1522904,2,36107,1
10362,76,1934930,3,11984,0
10363,76,1934930,3,34058,0


In [3]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Distinct product ids in ascending order, kept both as a list and as an array.
unique_item = sorted(data['product_id'].unique())
item_id = np.array(unique_item)
print('item_id', item_id)

# Encode every product id as an integer label.
label_encoder = LabelEncoder()
item_idx = label_encoder.fit_transform(item_id)
print('item_idx', item_idx)

item_id [    1    10    23 ... 49655 49667 49680]
item_idx [   0    1    2 ... 9070 9071 9072]


In [5]:
print('making item2idx, idx2item and item_one_hot')
# Two-way lookup between a product id and its position in the sorted item_id array.
item2idx = {product: position for position, product in enumerate(item_id)}
idx2item = dict(enumerate(item_id))

# One-hot encode the integer item labels into a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4 removed
# the old spelling, so try the current keyword first and fall back for older
# releases, where the unknown keyword raises TypeError at construction time.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
# The encoder expects a 2-D input: one column holding one label per row.
integer_encoded = item_idx.reshape(len(item_idx), 1)
item_onehot = onehot_encoder.fit_transform(integer_encoded)

# Collapse the order lines into one row per (user, order) whose last column holds
# the set of product ids bought in that order, then sort each user's orders by
# order_number and renumber the rows from zero.
basket_keys = ['user_id', 'order_id', 'order_number']
list_product = (
    data.groupby(basket_keys)
    .agg({'product_id': lambda products: set(products)})
    .reset_index()
    .rename(columns={'product_id': 'list_item_basket'})
    .sort_values(['user_id', 'order_number'])
    .reset_index(drop=True)
)
list_product.head()

making item2idx, idx2item and item_one_hot


Unnamed: 0,user_id,order_id,order_number,list_item_basket
0,76,3045366,1,"{36107, 246}"
1,76,1522904,2,{36107}
2,76,1934930,3,"{11984, 34058, 46413}"
3,76,1491175,4,"{25706, 6631}"
4,76,2102068,5,{36107}


In [7]:
import pickle
# Write the item2idx mapping to disk using the newest pickle protocol available.
with open('item2idx.pickle', mode='wb') as sink:
    pickle.dump(item2idx, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Read the mapping back from the file that was just written.
with open('item2idx.pickle', mode='rb') as source:
    item2idx = pickle.load(source)

In [6]:
# Keep only the orders numbered 10 or lower.
# The boolean filter returns a slice that still carries list_product's row labels.
# Later cells add columns to DF and look rows up by 0..n-1, so take an explicit
# copy (no SettingWithCopyWarning on assignment) and rebuild the index so that
# row labels and row positions coincide.
DF = (
    list_product[list_product.order_number <= 10]
    .copy()
    .reset_index(drop=True)
)
print('User_id, Order_id, Order_number(1~10), bakset`s item list', DF.shape)

User_id, Order_id, Order_number(1~10), bakset`s item list (77160, 4)


In [7]:
# Build one multi-hot vector per basket: 1.0 at the index of every product in the
# basket, 0.0 elsewhere.
basket_multihot_list = []
n_items = len(item2idx)

# Iterate over the column values themselves rather than `DF.list_item_basket[i]`:
# that form is a label lookup, and DF's labels are not guaranteed to be 0..n-1
# after the order_number filter, so it could raise KeyError or pick the wrong row.
for basket in DF.list_item_basket:
    session2hot = np.zeros(n_items)
    # Each basket is a set, so every product occurs once; writing 1.0 at its index
    # gives the same vector as summing the products' one-hot rows, without
    # allocating a fresh full-length array per product.
    session2hot[[item2idx[j] for j in basket]] = 1.0
    basket_multihot_list.append(session2hot)
print('basket_multihot_list 개수', len(basket_multihot_list))
# A list is assigned to the column positionally, in the same order it was built.
DF['basket2vec'] = basket_multihot_list
print(DF.columns)

basket_multihot_list 개수 77160
Index(['user_id', 'order_id', 'order_number', 'list_item_basket',
       'basket2vec'],
      dtype='object')


In [8]:
# Add lagged copies of the basket vector within each user: input_k is the vector
# from k orders earlier, and shift(0) is the current order's own vector.
for lag in range(5):
    DF['input_{}'.format(lag)] = DF.groupby(['user_id'])['basket2vec'].shift(lag)

# The four earlier baskets are the model inputs; the current basket is the output.
history_to_name = {
    'input_4': 'input_t-4',
    'input_3': 'input_t-3',
    'input_2': 'input_t-2',
    'input_1': 'input_t-1',
    'input_0': 'output',
}
df = (
    DF[['user_id', 'order_id', 'order_number'] + list(history_to_name)]
    # Rows lacking four earlier orders for the same user hold NaN and are dropped.
    .dropna()
    .reset_index(drop=True)
    .rename(columns=history_to_name)
    # Number the remaining rows per user starting at 1 (order 5 -> 1).
    .assign(order_cnt=lambda frame: frame['order_number'] - 4)
)
df = df[['user_id', 'order_id', 'input_t-4', 'input_t-3', 'input_t-2', 'input_t-1', 'output', 'order_cnt']]
df[df['user_id'] == 76]

Unnamed: 0,user_id,order_id,input_t-4,input_t-3,input_t-2,input_t-1,output,order_cnt
0,76,2102068,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,76,2763506,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
2,76,2792895,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
3,76,1474394,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4
4,76,380972,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5
5,76,3294399,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6


In [32]:
import pickle
import gzip
# Write `tmp` to a gzip-compressed pickle file.
# NOTE(review): `tmp` is not assigned in any cell visible in this notebook, so this
# cell fails with NameError on a fresh kernel - confirm what it is meant to hold.
with gzip.open('df_01.pickle', mode='wb') as sink:
    pickle.dump(tmp, sink)

In [36]:
import pickle
import gzip
# Write `tmp` to a second gzip-compressed pickle file.
# NOTE(review): `tmp` is not assigned in any cell visible in this notebook, and it
# must have been rebound since 'df_01.pickle' was written for the two files to
# differ - confirm what it is meant to hold here.
with gzip.open('df_02.pickle', mode='wb') as sink:
    pickle.dump(tmp, sink)

In [37]:
# Load the first gzip-compressed pickle back into memory.
# Unpickling can execute arbitrary code, so only load files produced by this notebook.
with gzip.open('df_01.pickle', mode='rb') as source:
    df01 = pickle.load(source)

In [38]:
# Load the second gzip-compressed pickle back into memory.
# Unpickling can execute arbitrary code, so only load files produced by this notebook.
with gzip.open('df_02.pickle', mode='rb') as source:
    df02 = pickle.load(source)

In [45]:
# Stack the two loaded parts row-wise. Each part keeps its own index labels, so
# labels repeat if both parts use the same ones. This rebinds the name `df`.
df = pd.concat([df01, df02], axis=0, ignore_index=False)