In [1]:
import numpy as np
import pandas as pd
import logging
from scipy import sparse

import random
import re

In [2]:
trans = pd.read_csv('02거래정보.csv')
trans = trans.query('pd_c != "unknown"').sort_values(by = 'pd_c').reset_index()
trans = trans.drop(['index'], axis = 1)
trans = trans[trans.pd_c != "0667"]
trans = trans[trans.pd_c != "0196"]
trans = trans[trans.pd_c != "0524"]
item = pd.read_csv('04상품분류정보.csv')
item = item.dropna()
df = pd.read_csv('final.csv')
df = df.query('trans != "[]"')

### Preprocessing

- Co-occuerence matrix를 만들기 위해 clnt_id와 pd_c의 범위를 0~고유값의 개수 로 변환함
- count, jaccard, lift 방법을 사용할 수 있지만 평점 정보가 없기 때문에 count만 사용

In [3]:
col_user_id = 'clnt_id'
col_item_id = 'pd_c'
n_users = trans.clnt_id.nunique()
n_items = trans.pd_c.nunique()
col_rating = 'rating'

#### encoding

In [4]:
vocab_users = {}
num_users = 0
for i in np.hstack([trans[col_user_id]]):
    if vocab_users.get(i) != None:
        continue
    vocab_users[i] = num_users
    num_users += 1
    
vocab_items = {}
num_items = 0
for i in np.hstack([trans[col_item_id]]):
    if vocab_items.get(i) != None:
        continue
    vocab_items[i] = num_items
    num_items += 1
    
encoded_users = [vocab_users[i] for i in trans[col_user_id]]
encoded_items = [vocab_items[i] for i in trans[col_item_id]]

In [6]:
df2 = pd.DataFrame({'clnt_id':encoded_users, col_item_id:encoded_items})
df2[col_rating] = 1

#### Make co-occurence matrix

In [7]:
user_item_hits = sparse.coo_matrix((np.repeat(1, df2.shape[0]),
                                    (df2[col_user_id], df2[col_item_id])),shape=(n_users, n_items),).tocsr()

item_cooccurrence = user_item_hits.transpose().dot(user_item_hits)
#item_cooccurrence = item_cooccurrence.multiply(item_cooccurrence >= 1)

In [8]:
item_cooccurrence_count = item_cooccurrence.toarray()

In [9]:
item_cooccurrence_count

array([[  52,    2,    2, ...,    0,    3,    3],
       [   2,    5,    1, ...,    0,    0,    0],
       [   2,    1,    7, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,    6,    2,    0],
       [   3,    0,    0, ...,    2, 3103,  148],
       [   3,    0,    0, ...,    0,  148,  564]], dtype=int32)

In [13]:
item_cooccurrence_count.shape

(1664, 1664)

In [14]:
i_n = 195

a = item_cooccurrence_count[i_n].copy()

b = a.argsort()[-10:]

display(b)

display(item.iloc[i_n,:])

c = item.iloc[b,]
display(c)

array([ 562,  352, 1391,  113,  219, 1613,  345,  960, 1209,  195],
      dtype=int64)

pd_c                         197
clac_nm1           Chilled Foods
clac_nm2    Packaged Side Dishes
clac_nm3          Packged Kimchi
Name: 196, dtype: object

Unnamed: 0,pd_c,clac_nm1,clac_nm2,clac_nm3
564,565,Fruits,Imported Fruits,Bananas
353,354,Dairy Products,Yogurt,Spoon Type Yogurts
1393,1394,Substitute Foods,Instant Noodles,Bibim Ramens
113,114,Beverages,Water,Water
220,221,Cleaning / Laundry / Bathroom Accessories,Cleaning Accessories,Trash Bags
1615,1616,Vegetables,Tofu / Bean Sprouts,Soybean Sprouts
346,347,Dairy Products,Milk,Fresh Milk
962,963,Meats,Domestic Porks,Domestic Porks - Variety Meats
1211,1212,Snack Foods,Snacks,Fruit Snacks
196,197,Chilled Foods,Packaged Side Dishes,Packged Kimchi


### Make Top-K list

In [15]:
pattern = '([0-9]+)'

In [16]:
pc = []
for i in range(df.shape[0]):
    a = df.iloc[i,1]
    pd_c = re.findall(pattern, a[1:-1])
    for j in range(len(pd_c)):
        pd_c[j] = int(pd_c[j])
    pc.append(pd_c)

In [17]:
for i, pcc in enumerate(pc):
    for j in range(len(pcc)):
        if pcc[j] < 195:
            pc[i][j] = pc[i][j]-1
        elif 195 < pcc[j] < 523:
            pc[i][j] = pc[i][j]-2
        elif pcc[j] > 666:
            pc[i][j] = pc[i][j]-3

In [18]:
f = []
for i, p in enumerate(pc):
    c = []
    g = []
    for j in range(len(p)):
        a = item_cooccurrence_count[p[j]].copy()
        b = a.argsort()[-10:]
        c.append(list(b))

        answer = sum(c, [])
        d = []
        e = list(set(answer))

        for k in range(10):
            randomIndex = random.randrange(0,len(e))
            d.append(e[randomIndex])
            del e[randomIndex]
            

    f.append(d)

In [19]:
reco_df = pd.DataFrame([],columns=['clnt_id', 'pd_c', 'clac_nm1', 'clac_nm2', 'clac_nm3'])
for i, clnt in enumerate(df.clnt_id):
    c = []
    for h in range(10):
        c.append(clnt)
    data = pd.concat([pd.DataFrame(c, columns = ['clnt_id']),
                      item.iloc[f[i],:].reset_index().drop(['index'], axis = 1)], axis = 1, sort=False)
    reco_df = reco_df.append(data)

In [20]:
reco_df.to_csv('top-10.csv', index = False)

In [21]:
reco_df[reco_df.clnt_id == list(set(reco_df.clnt_id))[115]]

Unnamed: 0,clnt_id,pd_c,clac_nm1,clac_nm2,clac_nm3
0,65991,333,Cosmetics / Beauty Care,Skin Care,Beauty Soaps
1,65991,114,Beverages,Water,Water
2,65991,727,Health Foods,Red Ginsengs / Dried Ginseng Processed Foods,Dried Ginseng Processed Foods
3,65991,1616,Vegetables,Tofu / Bean Sprouts,Soybean Sprouts
4,65991,892,Liquors / Alcoholic Beverages,Alcoholic Beverage Sets,Wine Sets
5,65991,1661,Women's Clothing,Women's Upper Bodywear / Tops,Women's Blouses
6,65991,963,Meats,Domestic Porks,Domestic Porks - Variety Meats
7,65991,903,Liquors / Alcoholic Beverages,Traditional Liquors,Chinese Liquor
8,65991,841,Kids' Clothing,Preschoolers' Lower Bodywear / Bottoms,Infant / Toddlers' Leggings
9,65991,1315,Sport Fashion,Women's Golf Clothing,Women's Golf T-shirts / Tops
