In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.decomposition import NMF, TruncatedSVD

In [2]:
# ALS class 정의
# http://ethen8181.github.io/machine-learning/recsys/1_ALSWR.html

class ALS:
    # 하이퍼 파라미터 지정
    def __init__(self, factors=10, iterations=20, reg=0.01):
        self.factors = factors
        self.iterations = iterations
        self.reg = reg
    # 모델 적합 -> 평점 행렬 입력
    def fit(self, ratings):
        # 랜덤으로 user 수 * latent factor 형태의 행렬 생성
        self.user_factors = np.random.random((ratings.shape[0], self.factors))
        # 랜덤으로 item 수 * latent factor 형태의 행렬 생성
        self.item_factors = np.random.random((ratings.shape[1], self.factors))
        
        # 사전에 지정한 iteration 수에 걸쳐서, 교차로 als_step 진행
        for _ in range(self.iterations):
            # user_factors 먼저 업데이트 
            self.user_factors = self.als_step(ratings, self.user_factors, self.item_factors)
            # 이어서 item_factors 업데이트
            self.item_factors = self.als_step(ratings.T, self.item_factors, self.user_factors)
    
    # 교차로 업데이트하는 스텝 메서드
    def als_step(self, ratings, solve_vecs, fixed_vecs):
        # normal equation - 업데이트 되지 않을 user/item feature의 공분산 matrix
        # feature가 주어진(고정된) 상태에서 최적의 해를 찾아 그 행렬을 새로운 factors로 사용
        # 가령, user_factors가 고정되어 있을 때는 최적의 item_factors를 구하고, 반대도 마찬가지
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.factors) * self.reg
        b = ratings.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        solve_vecs = b.dot(A_inv)
        return solve_vecs

    def predict(self):
        pred = self.user_factors.dot(self.item_factors.T)
        return pred

In [5]:
# 1. 데이터 로드
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx'
df = pd.read_excel(url)

In [6]:
df['Customer ID'] = df['Customer ID'].astype('category')
df['StockCode'] = df['StockCode'].astype('category')
df = df.rename({"Customer ID":"CustomerID"}, axis=1)

In [7]:
# 2. 피벗 테이블 만들기 -> 너무 적은 인터렉션을 갖는 유저/아이템은 배제
interaction_counts = df.groupby('CustomerID').StockCode.count()
df = df[df.CustomerID.isin(interaction_counts[interaction_counts > 10].index)]

item_counts = df.StockCode.value_counts()
df = df[df.StockCode.isin(item_counts[item_counts > 10].index)]

In [8]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,CustomerID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [9]:
pivot = df.pivot_table(index='CustomerID', columns='StockCode', fill_value=0, aggfunc='size')

In [10]:
# 3. implicit data로 변경 (binary화)
pivot = (pivot > 0).astype(int)

In [11]:
pivot.head()

StockCode,10002,10080,10109,10120,10125,10133,10134,10135,10138,11001,...,gift_0001_10,gift_0001_20,gift_0001_30,gift_0001_40,gift_0001_50,gift_0001_60,gift_0001_70,gift_0001_80,gift_0001_90,m
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12347.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12349.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12351.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# 4. train/test split -> MF에서는 다른 방식!
# masking 을 해서 해당 값을 test set 의 역할을 하도록 정의함
test_ratio = 0.2
train = pivot.copy()
test = np.zeros(pivot.shape)

for user in range(pivot.shape[0]):
    test_interactions = np.random.choice(pivot.values[user, :].nonzero()[0], 
                                         size=int(test_ratio*np.sum(pivot.values[user, :])),
                                         replace=False)
    train.values[user, test_interactions] = 0.
    test[user, test_interactions] = pivot.values[user, test_interactions]

In [13]:
# Convert train and test matrix into sparse matrix
train_csr = coo_matrix(train.values)
test_csr = coo_matrix(test)

In [14]:
n_latent_factors = 20

In [15]:
# SVD

# using sklearn Truncated SVD
svd = TruncatedSVD(n_components=n_latent_factors, random_state=42)
train_svd = svd.fit_transform(train_csr)
svd_pred = svd.inverse_transform(svd.transform(test_csr))

In [16]:
# using svds from scipy
u, sigma, vt = svds(train_csr.astype(float), n_latent_factors)
svd_pred = np.dot(u, np.dot(np.diag(sigma), vt))

In [17]:
print(f'shapes of the matrices: {u.shape, sigma.shape, vt.shape}')

shapes of the matrices: ((4383, 20), (20,), (20, 4632))


In [18]:
# NMF
model = NMF(n_components=n_latent_factors, init='random', random_state=0)

W = model.fit_transform(train_csr)
H = model.components_
nmf_pred = np.dot(W, H)



In [19]:
# train ALS model
als = ALS(factors = n_latent_factors, iterations=100, reg=0.01)
als.fit(train_csr)

# predict
als_pred = als.predict()

In [20]:
at_k = 10

# Make sure the predicted scores are in the range [0, 1]
predicted_svd = (svd_pred - svd_pred.min()) / (svd_pred.max() - svd_pred.min())
predicted_nmf = (nmf_pred - nmf_pred.min()) / (nmf_pred.max() - nmf_pred.min())
predicted_als = (als_pred - als_pred.min()) / (als_pred.max() - als_pred.min())

In [21]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE for SVD
svd_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_svd))
print('SVD RMSE: ', svd_rmse)

# Calculate RMSE for NMF
nmf_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_nmf))
print('NMF RMSE: ', nmf_rmse)

# Calculate RMSE for ALS
als_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_als))
print('ALS RMSE: ', als_rmse)


SVD RMSE:  0.45326041191122307
NMF RMSE:  0.050503834456889246
ALS RMSE:  0.44147397078103323
