In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.1-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[K     |████████████████████████████████| 18.6 MB 453 kB/s 
Installing collected packages: implicit
Successfully installed implicit-0.6.1


In [2]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import random

In [3]:
df = pd.read_parquet('/content/drive/MyDrive/ecommerce/2019-Oct-EDA.parquet')

In [5]:
# df = pd.read_parquet('/content/drive/MyDrive/ecommerce/2019-Oct-EDA-plus.parquet')

In [4]:
df = df[['user_id', 'product_id', 'rating']]

In [7]:
df2 = df.copy()

In [5]:
data = pd.read_parquet('/content/drive/MyDrive/ecommerce/2019-Oct-product.parquet')

In [9]:
df['user_id'].nunique()

2962411

In [10]:
df['product_id'].nunique()

166084

# Sparse Matrix

* 데이터를 새로 인덱싱 (램의 과부화를 줄이기 위함)

* Train set / Test set 으로 분리 (matrix 에 마스크를 씌우기 위함)

* Matrix 생성

In [6]:
user_unique = df['user_id'].unique()
product_unique = df['product_id'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
product_to_idx = {v:k for k,v in enumerate(product_unique)}

In [7]:
temp_user_data = df['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(df): 
    print('user_id column indexing OK!!')
    df['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')

temp_product_data = df['product_id'].map(product_to_idx.get).dropna()
if len(temp_product_data) == len(df):
    print('product_id column indexing OK!!')
    df['product_id'] = temp_product_data
else:
    print('product_id column indexing Fail!!')

df

user_id column indexing OK!!
product_id column indexing OK!!


Unnamed: 0,user_id,product_id,rating
0,0,0,0.031052
1,0,0,0.191958
2,0,0,0.282290
3,1,0,0.031052
4,1,0,0.031052
...,...,...,...
41202558,855452,166081,0.019030
41202559,855452,166081,0.019030
41202560,855452,166081,0.019030
41202561,736844,166082,0.019030


In [8]:
from scipy.sparse import csr_matrix

In [9]:
shape = (len(user_to_idx), len(product_to_idx))

In [9]:
num_user = df['user_id'].nunique()
num_product = df['product_id'].nunique()

csr_data = csr_matrix((df.rating, (df.user_id, df.product_id)), shape = (num_user, num_product))
csr_data

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 22625546 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
df_train , df_test = train_test_split(df, test_size = 0.3)

In [12]:
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [13]:
csr_train = csr_matrix((df_train.rating, (df_train.user_id, df_train.product_id)))
csr_train

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 17750103 stored elements in Compressed Sparse Row format>

In [14]:
csr_test = csr_matrix((df_test.rating, (df_test.user_id, df_test.product_id)))
csr_test

<2962410x166083 sparse matrix of type '<class 'numpy.float64'>'
	with 9312756 stored elements in Compressed Sparse Row format>

In [15]:
csr_test1 = csr_matrix((df_test.rating, (df_test.user_id, df_test.product_id)), shape = shape)
csr_test1

<2962411x166084 sparse matrix of type '<class 'numpy.float64'>'
	with 9312756 stored elements in Compressed Sparse Row format>

# ALS

* Implicit ALS 모델 제작

* p@k를 사용한 성능평가

* 모델을 사용한 추천 시스템 알고리즘 제작

In [22]:
from implicit.als import AlternatingLeastSquares
from implicit import evaluation
import os

In [23]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [24]:
als_model = AlternatingLeastSquares(factors = 20, regularization = 0.05, use_gpu = False, iterations = 3, dtype = np.float32)

In [25]:
# csr_data_transpose = csr_data.T
# csr_data_transpose

In [26]:
als_model.fit(csr_train)

  0%|          | 0/3 [00:00<?, ?it/s]

성능평가

In [27]:
evaluation.mean_average_precision_at_k(als_model, csr_train, csr_test, K = 3, show_progress = False, num_threads = 0)

0.010880683174413633

추천 시스템 알고리즘

In [28]:
user_n = 555774146
user_id = user_to_idx[user_n]

In [29]:
%%time
recommendations = als_model.recommend(user_id, csr_data[user_id])
recom = recommendations[0]

recom

CPU times: user 4.5 ms, sys: 6.99 ms, total: 11.5 ms
Wall time: 5 ms


array([  28,   48,  281,  157, 1158, 1760,  369,  691, 1691, 1579],
      dtype=int32)

In [30]:
idx_to_product = {v:k for k,v in product_to_idx.items()}
rec_sys = [idx_to_product[i] for i in recom]

In [31]:
rec_sys

[1004750,
 1004258,
 1801690,
 1002633,
 1004209,
 1004858,
 1003306,
 1003317,
 1004565,
 1004246]

In [32]:
def rec_mod(user_n):
  user_id = user_to_idx[user_n]
  recommendations = als_model.recommend(user_id, csr_data[user_id])
  recom = recommendations[0]
  idx_to_product = {v : k for k, v in product_to_idx.items()}
  reco = [idx_to_product[i] for i in recom]
  
  return reco

In [33]:
%%time
rec_mod(545248845)

CPU times: user 27.3 ms, sys: 57.2 ms, total: 84.6 ms
Wall time: 21.2 ms


[1004258,
 1801690,
 1004750,
 1002633,
 1004873,
 4804295,
 1003317,
 1003306,
 1005100,
 1004858]

In [34]:
rec1 = data[data['product_id'] == rec_sys[0]][['category_code', 'brand', 'price']][:1]
rec2 = data[data['product_id'] == rec_sys[1]][['category_code', 'brand', 'price']][:1]
rec3 = data[data['product_id'] == rec_sys[2]][['category_code', 'brand', 'price']][:1]
rec4 = data[data['product_id'] == rec_sys[3]][['category_code', 'brand', 'price']][:1]
rec5 = data[data['product_id'] == rec_sys[4]][['category_code', 'brand', 'price']][:1]

rec_product = pd.concat([rec1, rec2, rec3, rec4, rec5]).reset_index(drop = True)

In [35]:
rec_product

Unnamed: 0,category_code,brand,price
0,electronics.smartphone,samsung,197.43
1,electronics.smartphone,apple,735.05
2,electronics.video.tv,samsung,368.04
3,electronics.smartphone,apple,360.08
4,electronics.smartphone,samsung,92.64


In [36]:
%%time
favorite_product = 1200947
product_id = product_to_idx[favorite_product]
similar_product = als_model.similar_items(product_id, N = 5)[0]
similar_product

CPU times: user 12 ms, sys: 2.32 ms, total: 14.3 ms
Wall time: 9.9 ms


array([    0, 19612,  5290,  1633, 44824], dtype=int32)

In [37]:
idx_to_product = {v:k for k,v in product_to_idx.items()}
rec_sim_pro = [idx_to_product[i] for i in similar_product]

In [38]:
rec_sim_pro

[1200947, 1801906, 1800930, 1801805, 22500121]

In [39]:
rec_pro1 = data[data['product_id'] == rec_sim_pro[0]][['category_code', 'brand', 'price']][:1]
rec_pro2 = data[data['product_id'] == rec_sim_pro[1]][['category_code', 'brand', 'price']][:1]
rec_pro3 = data[data['product_id'] == rec_sim_pro[2]][['category_code', 'brand', 'price']][:1]
rec_pro4 = data[data['product_id'] == rec_sim_pro[3]][['category_code', 'brand', 'price']][:1]
rec_pro5 = data[data['product_id'] == rec_sim_pro[4]][['category_code', 'brand', 'price']][:1]

rec_similar = pd.concat([rec_pro1, rec_pro2, rec_pro3, rec_pro4, rec_pro5]).reset_index(drop = True)

In [40]:
rec_similar

Unnamed: 0,category_code,brand,price
0,electronics.tablet,samsung,115.41
1,electronics.video.tv,tcl,246.78
2,electronics.video.tv,sony,329.22
3,electronics.video.tv,artel,256.89
4,,elenberg,23.14


# LightFM

* lightfm 모델을 사용한 모델링

* p@k를 이용한 성능 평가

* 모델을 이용한 추천 시스템 알고리즘

In [16]:
!pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 9.8 MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp37-cp37m-linux_x86_64.whl size=705378 sha256=e6c066103a6e922b4538efb007debf96fc500e02bf7e41c1a8675bb51ee53f79
  Stored in directory: /root/.cache/pip/wheels/f8/56/28/5772a3bd3413d65f03aa452190b00898b680b10028a1021914
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [17]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

In [18]:
light_model = LightFM(loss = 'warp', no_components = 20)

In [19]:
light_model.fit(csr_train, epochs = 3, num_threads = 2)

<lightfm.lightfm.LightFM at 0x7f5a45d32050>

성능평가

In [45]:
# k = 3
# print('Train precision at k={}:\t{:.4f}'.format(k, precision_at_k(light_model, csr_train, k=k).mean()))

In [46]:
k = 3
print('Test precision at k={}:\t{:.4f}'.format(k, precision_at_k(light_model, csr_test1, k=k).mean()))

Test precision at k=3:	0.0498


추천 시스템 알고리즘

In [20]:
%%time
scores = pd.Series(light_model.predict(0, np.arange(166084)))

CPU times: user 16.5 ms, sys: 2.75 ms, total: 19.3 ms
Wall time: 22.5 ms


In [21]:
scores = list(pd.Series(scores.sort_values(ascending=False).index))

In [22]:
scores1 = scores[0:5]

In [23]:
idx_to_product = {v:k for k,v in product_to_idx.items()}
lfm_rec = [idx_to_product[i] for i in scores1]

In [24]:
lfm_rec

[1005115, 1004856, 1004767, 1005105, 1002544]

In [26]:
lfm_reco1 = data[data['product_id'] == lfm_rec[0]][['category_code', 'brand', 'price']][:1]
lfm_reco2 = data[data['product_id'] == lfm_rec[1]][['category_code', 'brand', 'price']][:1]
lfm_reco3 = data[data['product_id'] == lfm_rec[2]][['category_code', 'brand', 'price']][:1]
lfm_reco4 = data[data['product_id'] == lfm_rec[3]][['category_code', 'brand', 'price']][:1]
lfm_reco5 = data[data['product_id'] == lfm_rec[4]][['category_code', 'brand', 'price']][:1]

lfm_rec_product = pd.concat([lfm_reco1, lfm_reco2, lfm_reco3, lfm_reco4, lfm_reco5]).reset_index(drop = True)

In [27]:
lfm_rec_product

Unnamed: 0,category_code,brand,price
0,electronics.smartphone,apple,975.57
1,electronics.smartphone,samsung,130.76
2,electronics.smartphone,samsung,254.82
3,electronics.smartphone,apple,1415.48
4,electronics.smartphone,apple,464.13
