### 패키지 다운 

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm.notebook import tqdm as tqdm

import pickle

### 데이터셋 다운 및 전처리 후 나누기 

In [2]:
# df = pd.read_excel('online_retail.xlsx')

In [3]:
# with open('online_retail.bin', 'wb') as f_out:
#     pickle.dump(df, f_out)

pickled version 재사용 가능 

In [4]:
# Reload the DataFrame from the pickle cache so the Excel file does not have
# to be parsed again on every run.
# NOTE: pickle.load can execute arbitrary code; only load a file you created yourself.
with open('online_retail.bin', mode='rb') as pickled_file:
    df = pickle.load(pickled_file)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


#### df 설명
- InvoiceNo: 각 transaction을 고유하게 식별하는 송장 번호. Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with the letter 'C', it indicates a cancellation.
- StockCode: 구매 항목의 코드. Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
- Description: 상품 이름. Product (item) name. Nominal.
- Quantity: transaction에서 해당 항목이 구매된 횟수. The quantities of each product (item) per transaction. Numeric.
- InvoiceDate: 송장 날짜 및 시간. Invoice date and time. Numeric, the day and time when each transaction was generated.
- UnitPrice: 단가. Unit price. Numeric, Product price per unit in sterling.
- CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
- Country: Country name. Nominal, the name of the country where each customer resides.

#### 고쳐야 할 문제
- 열 이름의 대문자를 소문자로 바꾼다.
- 일부 거래는 실제 구매가 아닌 반품(취소)이므로, 필터링하여 제외해야 한다.
- 알 수 없는 사용자가 NaN으로 처리되어 있어 CustomerID가 float 형태이다.
- 알 수 없는 사용자에게 공통 ID를 할당하고, 해당 열을 정수형(int)으로 바꿔야 한다.

In [6]:
# Normalise the column names to lowercase.
df.columns = df.columns.str.lower()

# Drop every row whose invoice number starts with 'C'.
# reset_index(drop=True) renumbers the surviving rows 0..n-1 and discards the
# old index instead of keeping it as an extra column.
is_c_invoice = df['invoiceno'].astype('str').str.startswith('C')
df = df.loc[~is_c_invoice].reset_index(drop=True)

# Missing customer ids (NaN) get the placeholder -1, which lets the column be
# stored as int32 instead of float.
df['customerid'] = df['customerid'].fillna(-1).astype('int32')

In [7]:
# Preview the frame again after the column renaming, filtering and customerid cast.
df.head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [8]:
# stockcode values are handled as strings (some carry a letter suffix), so each
# distinct code is replaced by a dense integer id assigned in ascending sorted
# order of the code string.
stockcode_values = df.stockcode.astype('str')

stockcodes = {
    code: idx
    for idx, code in enumerate(sorted(set(stockcode_values)))
}

df.stockcode = stockcode_values.map(stockcodes).astype('int32')

In [9]:
# Chronological split on invoicedate into three non-overlapping half-open windows:
#   train: [start,      2011-10-09)
#   val:   [2011-10-09, 2011-11-09)
#   test:  [2011-11-09, end)
# The validation upper bound is exclusive so that a row stamped exactly
# 2011-11-09 00:00:00 cannot end up in both the validation and the test split.
df_train = df[df.invoicedate < '2011-10-09']
df_train = df_train.reset_index(drop=True)
df_val = df[(df.invoicedate >= '2011-10-09') &
            (df.invoicedate < '2011-11-09')]
df_val = df_val.reset_index(drop=True)
df_test = df[df.invoicedate >= '2011-11-09']
df_test = df_test.reset_index(drop=True)

In [10]:
# Row counts of the train, validation and test splits, one per line.
print(len(df_train), len(df_val), len(df_test), sep='\n')

378470
64460
89691


### 추천 방법1 - 인기 상품 기준선(baseline), precision으로 평가

In [11]:
# The five most frequent stockcode ids in the training split, as an array of ids.
top = df_train.stockcode.value_counts().head(5).index.values
top

array([3527, 3506, 1347, 2730,  180])

In [12]:
def group_indptr(df):
    """Return the boundaries of consecutive runs of equal ``invoiceno``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``invoiceno`` column; rows of the same invoice are
        expected to be adjacent, because only neighbouring rows are compared.

    Returns
    -------
    numpy.ndarray of int32
        Start position of every run followed by ``len(df)``, so run ``i``
        covers rows ``indptr[i]:indptr[i + 1]``.
    """
    invoice = df.invoiceno
    # A row opens a new run when its invoice differs from the previous row's;
    # the first row always does because shift() puts a missing value in front of it.
    starts_new_run = (invoice != invoice.shift()).to_numpy()
    boundaries = np.flatnonzero(starts_new_run)
    return np.append(boundaries, len(df)).astype('int32')

In [13]:
# Boundaries of the invoice groups in the validation split; there is one group
# per pair of neighbouring boundaries.
val_indptr = group_indptr(df_val)
num_groups = len(val_indptr) - 1
# Baseline recommendations: the `top` items repeated once per group and laid
# out as rows of 5 columns.
baseline = np.tile(top, num_groups).reshape(-1,5)

In [14]:
from numba import njit

@njit
def precision(group_indptr, true_items, predicted_items):
    """Compute the precision of fixed-width per-group recommendations.

    Parameters
    ----------
    group_indptr : 1-D integer array
        Group boundaries: group ``i`` owns
        ``true_items[group_indptr[i]:group_indptr[i + 1]]``.
    true_items : 1-D array
        Items actually present in every group, concatenated in group order.
    predicted_items : 2-D array of shape (n, m)
        Row ``i`` holds the ``m`` items recommended for group ``i``.

    Returns
    -------
    float
        Number of true items that appear among their group's recommendations,
        divided by the total number of recommendations ``n * m``.
    """
    tp = 0

    n, m = predicted_items.shape

    for i in range(n):
        group_start = group_indptr[i]
        group_end = group_indptr[i+1]
        group_true_items = true_items[group_start:group_end]

        for item in group_true_items:
            for j in range(m):
                if item == predicted_items[i, j]:
                    tp = tp + 1
                    # Stop at the first hit: the original `continue` was a
                    # no-op here, so a true item could be counted several
                    # times when a recommendation row contained duplicates.
                    break

    return tp / (n * m)

In [15]:
# Score the baseline recommendations against the stockcode ids that actually
# occur in each validation invoice group.
val_items = df_val.stockcode.values
precision(val_indptr, val_items, baseline)

0.0642299794661191

### 추천 방법2 - 행렬 분해 

In [16]:
# Keep only the training rows whose customerid is not the -1 placeholder.
df_train_user = df_train[df_train.customerid != -1].reset_index(drop=True)

# Map every remaining customer id to a dense 0-based index, assigned in
# ascending order of the id.
customers = {
    customer: idx
    for idx, customer in enumerate(sorted(set(df_train_user.customerid)))
}

df_train_user.customerid = df_train_user.customerid.map(customers)

In [17]:
# Row (user index) and column (item id) coordinates, one pair per training row.
uid = df_train_user.customerid.values.astype('int32')
iid = df_train_user.stockcode.values.astype('int32')
# One uint8 "1" per training row, used as the matrix entries.
ones = np.ones_like(uid, dtype='uint8')

# Sparse user x item matrix built from (value, (row, col)) triplets; the shape
# is not given, so it is inferred from the largest indices.
# NOTE(review): SciPy documents that repeated (row, col) pairs are summed; with
# uint8 data a pair occurring more than 255 times would presumably wrap — confirm.
X_train = sp.csr_matrix((ones, (uid, iid)))

In [18]:
# Translate the validation customer ids into the index space of `customers`;
# ids that are not keys of `customers` become -1.
# NOTE: this overwrites df_val.customerid in place, so running the cell a second
# time would look up the already-translated indices and give wrong results.
df_val.customerid = df_val.customerid.map(lambda cid: customers.get(cid, -1))

# One customer index per distinct invoice (taken from the invoice's first row).
# NOTE(review): assumes this lines up one-to-one with the groups in val_indptr,
# i.e. rows of an invoice are contiguous in df_val — confirm.
first_row_per_invoice = df_val.drop_duplicates(subset='invoiceno')
uid_val = first_row_per_invoice.customerid.values
known_mask = uid_val != -1
uid_val = uid_val[known_mask]

In [19]:
# Start from a copy of the baseline; the rows selected by known_mask are
# overwritten with model recommendations below.
imp_baseline = baseline.copy()

from implicit.als import AlternatingLeastSquares

# Transpose the user x item matrix into item x user form before fitting.
# NOTE(review): which orientation fit() expects depends on the installed
# implicit version — confirm against the version in use.
item_user = X_train.T.tocsr()
als = AlternatingLeastSquares(factors=128, regularization=0.000001,)
als.fit(item_user)

als_U = als.user_factors
als_I = als.item_factors

# Score all items for every entry of uid_val, then keep the ids of the 5
# highest-scoring items per row (argsort of the negated scores).
pred_all = als_U[uid_val].dot(als_I.T)
top_val = (-pred_all).argsort(axis=1)[:,:5]
imp_baseline[known_mask] =  top_val

precision(val_indptr, val_items, imp_baseline)

  0%|          | 0/15 [00:00<?, ?it/s]

0.14127310061601642

In [20]:
# Inspect the container type of the fitted item factors.
type(als_I)

numpy.ndarray

### 방법3-tensorflow를 활용한 행렬분해 

In [21]:
# Use the TensorFlow 1.x compatibility API (placeholders, sessions, explicit
# graphs) and switch off TF2 behaviour for the rest of the notebook.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [22]:
def prepare_batches(seq, step):
    """Cut ``seq`` into consecutive slices of at most ``step`` elements.

    Parameters
    ----------
    seq : sliceable sequence supporting ``len``
        The items to split.
    step : int
        Maximum slice length; the last slice may be shorter.

    Returns
    -------
    list
        The slices in order; an empty list when ``seq`` is empty.
    """
    return [seq[start:start + step] for start in range(0, len(seq), step)]

In [23]:
# Sizes of the embedding tables: one row per user index / item id seen in the
# training arrays (ids are 0-based, hence max + 1).
num_users = uid.max() + 1
num_items = iid.max() + 1

num_factors = 128        # dimensionality of the user/item factor vectors
lambda_user = 0.0000001  # weight of the user-factor penalty in the loss
lambda_item = 0.0000001  # weight of the item-factor penalty in the loss
K = 5                    # random negative samples drawn per positive sample in training
lr = 0.005               # learning rate passed to the Adam optimizer

In [24]:
def embed(inputs, size, dim, name=None):
    """Create a ``size`` x ``dim`` embedding variable and look up ``inputs`` in it.

    The variable is initialised uniformly in ``[-b, b]`` with
    ``b = sqrt(2 / dim)`` and is registered in the graph under ``name``.
    Returns the looked-up rows, not the variable itself.
    """
    bound = np.sqrt(2 / dim)
    table = tf.Variable(tf.random_uniform([size, dim], -bound, bound), name=name)
    return tf.nn.embedding_lookup(table, inputs)

# Build the graph for a logistic matrix-factorisation model: the prediction is
# sigmoid(global bias + user bias + item bias + <user factors, item factors>).
graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    # Inputs: one (user, item, label) triple per row, each as a column vector.
    place_user = tf.placeholder(tf.int32, shape=(None, 1))
    place_item = tf.placeholder(tf.int32, shape=(None, 1))
    place_y = tf.placeholder(tf.float32, shape=(None, 1))

    # Lookups on (None, 1) indices yield (None, 1, dim) tensors; the 1-wide
    # bias lookups are flattened to (None, 1).
    user_factors = embed(place_user, num_users, num_factors, "user_factors")
    user_bias = embed(place_user, num_users, 1, "user_bias")
    user_bias = tf.reshape(user_bias, [-1, 1])

    item_factors = embed(place_item, num_items, num_factors, "item_factors")
    item_bias = embed(place_item, num_items, 1, "item_bias")
    item_bias = tf.reshape(item_bias, [-1, 1])

    global_bias = tf.Variable(0.0, name='global_bias')

    # Dot product over the factor axis (axis 2), then squash into (0, 1).
    pred = tf.reduce_sum(user_factors * item_factors, axis=2)
    pred = tf.sigmoid(global_bias + user_bias + item_bias + pred)

    # Squared-norm penalty on the factors looked up for this batch only
    # (summed over the batch, not averaged).
    reg = lambda_user * tf.reduce_sum(user_factors * user_factors) + \
          lambda_item * tf.reduce_sum(item_factors * item_factors)

    loss = tf.losses.log_loss(place_y, pred)
    loss_total = loss + reg

    opt = tf.train.AdamOptimizer(learning_rate=lr)
    step = opt.minimize(loss_total)

    init = tf.global_variables_initializer()

In [25]:
def get_variable(graph, session, name):
    """Evaluate the first output of the graph operation called ``name``.

    Parameters
    ----------
    graph : object exposing ``get_operation_by_name``
        The graph that owns the operation.
    session : session in which the output is evaluated
    name : str
        Name of the operation to look up.

    Returns
    -------
    The value produced by evaluating the operation's first output in ``session``.
    """
    operation = graph.get_operation_by_name(name)
    first_output = operation.values()[0]
    return first_output.eval(session=session)

def calculate_validation_precision(graph, session, uid):
    """Score the model currently held in ``session`` on the validation split.

    Parameters
    ----------
    graph, session
        Graph and session holding the variables named ``user_factors``,
        ``item_factors`` and ``item_bias``.
    uid : 1-D integer array
        User indices to recommend for, one per row selected by the
        notebook-level ``known_mask`` (the notebook passes ``uid_val``).

    Returns
    -------
    float
        Result of ``precision`` on the notebook-level ``val_indptr`` /
        ``val_items``, where the rows of ``baseline`` selected by
        ``known_mask`` are replaced with the model's top-scoring items.
    """
    U = get_variable(graph, session, 'user_factors')
    I = get_variable(graph, session, 'item_factors')
    bi = get_variable(graph, session, 'item_bias').reshape(-1)

    # Use the `uid` argument; the previous body ignored it and always read the
    # global `uid_val`.
    pred_all = U[uid].dot(I.T) + bi
    # Keep as many top-scoring items per user as the baseline has columns.
    top_k = baseline.shape[1]
    top_val = (-pred_all).argsort(axis=1)[:, :top_k]

    imp_baseline = baseline.copy()
    imp_baseline[known_mask] = top_val

    return precision(val_indptr, val_items, imp_baseline)

In [26]:
# Train the logistic matrix-factorisation graph for 10 epochs and report the
# validation precision after each one.
session = tf.Session(config=None, graph=graph)
session.run(init)

# Seed NumPy so the shuffling and negative sampling below are repeatable.
np.random.seed(0)

for i in range(10):
    # Visit the training rows in a fresh random order each epoch, 5000 at a time.
    train_idx_shuffle = np.arange(uid.shape[0])
    np.random.shuffle(train_idx_shuffle)
    batches = prepare_batches(train_idx_shuffle, 5000)

    progress = tqdm(total=len(batches))
    for idx in batches:
        pos_samples = len(idx)
        neg_samples = pos_samples * K 

        # Labels: 1 for the observed (user, item) rows, 0 for the sampled ones.
        label = np.concatenate([
                    np.ones(pos_samples, dtype='float32'), 
                    np.zeros(neg_samples, dtype='float32')
                ]).reshape(-1, 1)

        # Negatives are uniformly random (user, item) pairs; they are not
        # checked against the observed pairs, so some may be real purchases.
        neg_users = np.random.randint(low=0, high=num_users, 
                                      size=neg_samples, dtype='int32')
        neg_items = np.random.randint(low=0, high=num_items,
                                      size=neg_samples, dtype='int32')

        batch_uid = np.concatenate([uid[idx], neg_users]).reshape(-1, 1)
        batch_iid = np.concatenate([iid[idx], neg_items]).reshape(-1, 1)

        feed_dict = {
            place_user: batch_uid,
            place_item: batch_iid,
            place_y: label,
        }
        # One optimizer step; `l` is the log loss without the penalty term.
        _, l = session.run([step, loss], feed_dict)
        
        progress.update(1)
        progress.set_description('%.3f' % l)
    progress.close()

    val_precision = calculate_validation_precision(graph, session, uid_val)
    print('epoch %02d: precision: %.3f' % (i+1, val_precision))

2021-11-09 18:49:42.700227: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes)
2021-11-09 18:49:42.702552: W tensorflow/core/platform/profile_utils/cpu_utils.cc:126] Failed to get CPU frequency: 0 Hz


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 01: precision: 0.057


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 02: precision: 0.082


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 03: precision: 0.108


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 04: precision: 0.126


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 05: precision: 0.138


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 06: precision: 0.146


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 07: precision: 0.152


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 08: precision: 0.152


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 09: precision: 0.152


  0%|          | 0/56 [00:00<?, ?it/s]

epoch 10: precision: 0.151


In [27]:

    # NOTE(review): these statements repeat the body of
    # calculate_validation_precision and keep its 4-space indentation although
    # they are not inside any function; as plain Python (outside a notebook
    # cell) this indentation would be a syntax error — consider dedenting.
    # Fetch the trained factor matrices and the flattened item bias.
    U = get_variable(graph, session, 'user_factors')
    I = get_variable(graph, session, 'item_factors')
    bi = get_variable(graph, session, 'item_bias').reshape(-1)

    # Score all items for every entry of uid_val and keep the 5 best per row.
    pred_all = U[uid_val].dot(I.T) + bi
    top_val = (-pred_all).argsort(axis=1)[:, :5]

    # Overwrite the baseline rows selected by known_mask with the model's picks.
    imp_baseline = baseline.copy()
    imp_baseline[known_mask] = top_val

    # The value is stored in `result` but not displayed by this cell.
    result = precision(val_indptr, val_items, imp_baseline)

In [30]:
# from tensorflow.python.summary.writer import writer

# with tf.Graph().as_default():
#     writer = writer.FileWriter('./mygraph',session.graph)

### 방법4 - BPR(Bayesian Personalized Ranking)을 이용

In [31]:
# Hyperparameters for the pairwise-ranking model below; these overwrite the
# values of the same names set for the previous model.
num_factors = 128        # dimensionality of the user/item factor vectors
lambda_user = 0.0000001  # weight of the user-factor penalty
lambda_item = 0.0000001  # weight of the item-factor penalty
lambda_bias = 0.0000001  # weight of the item-bias penalty
lr = 0.0005              # learning rate passed to the Adam optimizer

In [32]:
def init_variable(size, dim, name=None):
    """Create a ``size`` x ``dim`` variable initialised uniformly in ``[-b, b]``.

    ``b`` is ``sqrt(2 / dim)``; the variable is registered under ``name``.
    """
    bound = np.sqrt(2 / dim)
    initial_value = tf.random_uniform([size, dim], -bound, bound)
    return tf.Variable(initial_value, name=name)

def embed(inputs, size, dim, name=None):
    """Create an embedding table via ``init_variable`` and look up ``inputs`` in it.

    NOTE: this redefines the ``embed`` helper declared earlier in the notebook
    with the same signature; from here on this version is the one in effect.
    Returns the looked-up rows, not the table variable.
    """
    table = init_variable(size, dim, name)
    return tf.nn.embedding_lookup(table, inputs)

# Build the graph for a pairwise-ranking (BPR-style) model: for each row the
# score of the observed ("pos") item should exceed the score of a sampled
# ("neg") item for the same user.
graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    # Inputs: one (user, positive item, negative item) triple per row.
    place_user = tf.placeholder(tf.int32, shape=(None, 1))
    place_item_pos = tf.placeholder(tf.int32, shape=(None, 1))
    place_item_neg = tf.placeholder(tf.int32, shape=(None, 1))

    user_factors = embed(place_user, num_users, num_factors, "user_factors")
    # A single item table is shared by the positive and the negative lookup.
    item_factors = init_variable(num_items, num_factors, "item_factors")
    item_factors_pos = tf.nn.embedding_lookup(item_factors, place_item_pos)
    item_factors_neg = tf.nn.embedding_lookup(item_factors, place_item_neg)

    item_bias = init_variable(num_items, 1, "item_bias")
    item_bias_pos = tf.nn.embedding_lookup(item_bias, place_item_pos)
    item_bias_pos = tf.reshape(item_bias_pos, [-1, 1])
    item_bias_neg = tf.nn.embedding_lookup(item_bias, place_item_neg)
    item_bias_neg = tf.reshape(item_bias_neg, [-1, 1])

    # Score = item bias + dot product over the factor axis (axis 2).
    pred_pos = item_bias_pos + tf.reduce_sum(user_factors * item_factors_pos, axis=2)
    pred_neg = item_bias_neg + tf.reduce_sum(user_factors * item_factors_neg, axis=2)

    pred_diff = pred_pos - pred_neg

    # log_sigmoid computes log(sigmoid(x)) in a numerically stable way; the
    # two-step form yields -inf once sigmoid underflows to 0.
    loss_bpr = - tf.reduce_mean(tf.math.log_sigmoid(pred_diff))
    # Squared-norm penalties. The bias terms are squared as well: summing the
    # raw biases (as before) is not a norm and rewards arbitrarily negative biases.
    loss_reg = lambda_user * tf.reduce_sum(user_factors * user_factors) + \
          lambda_item * tf.reduce_sum(item_factors_pos * item_factors_pos) + \
          lambda_item * tf.reduce_sum(item_factors_neg * item_factors_neg) + \
          lambda_bias * tf.reduce_sum(item_bias_pos * item_bias_pos) + \
          lambda_bias * tf.reduce_sum(item_bias_neg * item_bias_neg)

    loss_total = loss_bpr + loss_reg

    opt = tf.train.AdamOptimizer(learning_rate=lr)
    step = opt.minimize(loss_total)

    init = tf.global_variables_initializer()

In [33]:
# Train the pairwise-ranking graph: 100 rounds of 30 optimizer steps each,
# reporting the validation precision after every round.
session = tf.Session(config=None, graph=graph)
session.run(init)

size_total = uid.shape[0]
size_sample = 15000

# Seed NumPy so the sampling below is repeatable.
np.random.seed(0)

for i in range(100):
    for k in range(30):
        # Sample training rows with replacement as the positive (user, item) pairs.
        idx = np.random.randint(low=0, high=size_total, size=size_sample)

        batch_uid = uid[idx].reshape(-1, 1)
        batch_iid_pos = iid[idx].reshape(-1, 1)
        # Negatives are uniformly random item ids; they are not checked against
        # the user's observed items.
        batch_iid_neg = np.random.randint(
            low=0, high=num_items, size=(size_sample, 1), dtype='int32')

        feed_dict = {
            place_user: batch_uid,
            place_item_pos: batch_iid_pos,
            place_item_neg: batch_iid_neg,
        }
        _, l = session.run([step, loss_bpr], feed_dict)

    val_precision = calculate_validation_precision(graph, session, uid_val)
    # The reported metric is precision (see calculate_validation_precision);
    # the message previously mislabelled it as recall.
    print('epoch %02d: precision: %.3f' % (i+1, val_precision))

2021-11-09 18:50:03.560437: I tensorflow/compiler/tf2mlcompute/kernels/mlc_subgraph_op.cc:326] Compute: Failed in processing TensorFlow graph gradients/MLCSubgraphOp_1_4 with frame_id = 0 and iter_id = 0 with error: Internal: PerformGradientPassNodeRoutine: Failed to find forward-pass output for node: add_5 (error will be reported 5 times unless TF_MLC_LOGGING=1).
2021-11-09 18:50:03.589633: I tensorflow/compiler/tf2mlcompute/kernels/mlc_subgraph_op.cc:326] Compute: Failed in processing TensorFlow graph gradients/MLCSubgraphOp_1_4 with frame_id = 0 and iter_id = 0 with error: Internal: PerformGradientPassNodeRoutine: Failed to find forward-pass output for node: add_5 (error will be reported 5 times unless TF_MLC_LOGGING=1).
2021-11-09 18:50:03.613109: I tensorflow/compiler/tf2mlcompute/kernels/mlc_subgraph_op.cc:326] Compute: Failed in processing TensorFlow graph gradients/MLCSubgraphOp_1_4 with frame_id = 0 and iter_id = 0 with error: Internal: PerformGradientPassNodeRoutine: Failed t

epoch 01: recall: 0.025
epoch 02: recall: 0.029
epoch 03: recall: 0.032
epoch 04: recall: 0.040
epoch 05: recall: 0.046
epoch 06: recall: 0.051
epoch 07: recall: 0.054
epoch 08: recall: 0.058
epoch 09: recall: 0.059
epoch 10: recall: 0.061
epoch 11: recall: 0.061
epoch 12: recall: 0.063
epoch 13: recall: 0.065
epoch 14: recall: 0.066
epoch 15: recall: 0.067
epoch 16: recall: 0.070
epoch 17: recall: 0.072
epoch 18: recall: 0.074
epoch 19: recall: 0.075
epoch 20: recall: 0.078
epoch 21: recall: 0.079
epoch 22: recall: 0.080
epoch 23: recall: 0.082
epoch 24: recall: 0.083
epoch 25: recall: 0.084
epoch 26: recall: 0.087
epoch 27: recall: 0.088
epoch 28: recall: 0.089
epoch 29: recall: 0.091
epoch 30: recall: 0.093
epoch 31: recall: 0.095
epoch 32: recall: 0.097
epoch 33: recall: 0.099
epoch 34: recall: 0.102
epoch 35: recall: 0.104
epoch 36: recall: 0.105
epoch 37: recall: 0.107
epoch 38: recall: 0.108
epoch 39: recall: 0.109
epoch 40: recall: 0.112
epoch 41: recall: 0.113
epoch 42: recall