In [5]:
import pickle

import pandas as pd
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm_notebook as tqdm

from numba import jit, njit

기존의 pickle data 불러오기

In [8]:
with open('online_retail.bin','rb') as f_in :
    df = pickle.load(f_in)

In [14]:
df.columns = df.columns.str.lower()
df = df[~df.invoiceno.astype('str').str.startswith('C')].reset_index(drop=True)
df.customerid = df.customerid.fillna(-1).astype('int32')

sequece 처리를 위한 전처리

In [26]:
class LabelEncoder:
    def fit(self, seq):
        self.vocab = sorted(set(seq))
        self.idx = {c: i+1 for i, c in enumerate(self.vocab)}
    
    def vocab_size(self):
        return len(self.vocab) + 1
    
    def transfrom(self,seq):
        n = len(seq)
        result = np.zeros(n, dtype='int32')
        
        for i in range(n):
            result[i] = self.idx.get(seq[i],0)
            
        return result
    
    def fit_transform(self, seq):
        self.fit(seq)
        return self.transfrom(seq)

In [20]:
item_enc = LabelEncoder()
df.stockcode = item_enc.fit_transform(df.stockcode.astype('str'))
df.stockcode = df.stockcode.astype('int32')

훈련, 테스트 set 분리

In [40]:
df_train = df[df.invoicedate < '2011-10-09'].reset_index(drop=True)
df_val = df[(df.invoicedate >= '2011-10-09') & (df.invoicedate 
                                               <= '2011-11-09')].reset_index(drop=True)
df_test = df[df.invoicedate >= '2011-11-09'].reset_index(drop=True)
#reset_index : 기존의 0~ 정수형 인덱스를 다시 넣어줌.
#drop=True는 리셋하기 전의 인덱스를 버리고 새로이 하는 것

In [41]:
df_train.shape, df_val.shape, df_test.shape

((378470, 8), (64460, 8), (89691, 8))

In [30]:
user_enc = LabelEncoder()
user_enc.fit(df_train[df_train.customerid != -1].customerid)

df_train.customerid = user_enc.transfrom(df_train.customerid)
df_val.customerid = user_enc.transfrom(df_val.customerid)

In [46]:
uid_train = df_train.drop_duplicates(subset='invoiceno').customerid.values
uid_val = df_val.drop_duplicates(subset='invoiceno').customerid.values
#drop_duplicates(subset=None, keep=first) : 중복값 제거
#subse= 중복 제거할 columns 특정 (원래는 모두 다)
#keep=first : 중복값 있을 시 첫 번째 것만 사용

In [48]:
def group_indptr(df):
    indptr, = np.where(df.invoiceno != df.invoiceno.shift())
    #shift(n) : n칸 미는
    indptr = np.append(indptr, len(df)).astype('int32')
    return indptr

indptr_train = group_indptr(df_train)
indptr_val = group_indptr(df_val)

In [59]:
from collections import Counter
top_train = Counter(df_train.stockcode) #히스토그램처럼 세주는

simple baseline

In [73]:
def baseline(uid, indptr, items, top, k=5):
    n_groups = len(uid)
    n_items = len(items)
    
    pred_all = np.zeros((n_items,k), dtype='int32')
    
    for g in range(n_groups):
        t = top.copy()
        
        start = indptr[g]
        end = indptr[g+1]
        
        for i in range(start, end):
            pred = [k for (k, c) in t.most_common(5)]
            pred_all[i] = pred
            
            actual = items[i]
            if actual in t:
                del t[actual]
            #추천된 항목 중 하나를 사용자가 구매했다면 제거하는 코드    
                
                
    return pred_all

In [74]:
iid_val = df_val.stockcode.values
pred_baseline = baseline(uid_val, indptr_val, iid_val, top_train, k=5)

In [76]:
@njit

def accuracy_k(y_true, y_pred):
    n, k = y_pred.shape
    
    acc = 0
    
    for i in range(n):
        for j in range(k):
            if y_pred[i,j] == y_true[i]:
                acc = acc + 1
                break
                
    return acc/n

In [77]:
accuracy_k(iid_val, pred_baseline)

0.012705553831833695

개선해보자! RNN naive model

데이터 준비

In [78]:
def pack_items(users, items_indptr, items_vals):
    n = len(items_indptr) - 1
    
    result = []
    for i in range(n):
        start = items_indptr[i]
        end = items_indptr[i+1]
        result.append(items_vals[start:end])
    return result

In [89]:
train_items = pack_items(indptr_train, indptr_train, df_train.stockcode.values)

df_train_wrap = pd.DataFrame()
df_train_wrap['customerid'] = uid_train
df_train_wrap['items'] = train_items

In [90]:
df_train_wrap.head()

Unnamed: 0,customerid,items
0,17850,"[3528, 2792, 3041, 2982, 2981, 1662, 800]"
1,17850,"[1547, 1546]"
2,13047,"[3301, 1655, 1658, 1659, 1247, 3368, 1537, 153..."
3,13047,"[1862, 1816, 1815, 1817]"
4,13047,[818]


시퀀스 길이가 다름, 패딩을 통해 고정 길이로 만들어주는 동시에
RNN에서 쓰일 X,Y 쌍도 만들어 줄 것임

In [92]:
def pad_seq(data, num_steps):
    data = np.pad(data, pad_width =(1,0), mode='constant')
    
    n = len(data)
    
    if n <= num_steps:
        pad_right = num_steps - n + 1
        data = np.pad(data, pad_width=(0,pad_right),mode='constant')
        
    return data

def prepare_train_data(data, num_steps):
    data = pad_seq(data, num_steps)
    
    X = []
    Y = []
    
    for i in range(num_steps, len(data)):
        start = i - num_steps
        X.append(data[start:i])
        Y.append(data[start+1:i+1])
        
    return X, Y

In [93]:
import tensorflow as tf

In [94]:
class Config:
    num_steps = 5
    
    num_items = item_enc.vocab_size()
    num_users = user_enc.vocab_size()
    
    learning_rate = 1.0
    hidden_size = 200
    embedding_Size = 200
    batch_size = 20
    
config = Config()

In [99]:
config.num_items

4060

In [95]:
train_items = df_train_wrap['items']

X_train = []
Y_train = []

for i in range(len(train_items)):
    X, Y = prepare_train_data(train_items[i], config.num_steps)
    
    X_train.extend(X)
    Y_train.extend(Y)
    
X_train = np.array(X_train, dtype='int32')
Y_train = np.array(Y_train, dtype='int32')

In [98]:
model = tf.keras.Sequential()

In [100]:
model.add(tf.keras.layers.Embedding(5000, 200, input_length=5))
model.add(tf.keras.layers.LSTM(200))
model.add(tf.keras.layers.LSTM(200))
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(config.num_items, 
                                                               activation='softmax')))
model.summary()

ValueError: Input 0 of layer lstm_1 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 200)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=tf.kers.optimizers2)