In [1]:
import numpy as np
y_pred = [1,2,3,4,5]
y_true = [2,7,3]
k = 3
r = np.in1d(y_pred[:k], y_true)

In [2]:
r

array([False,  True,  True])

In [3]:
np.cumsum(r)

array([0, 1, 2])

In [4]:
(np.arange(len(r))+1)*r

array([0, 2, 3])

In [5]:
np.sum(np.cumsum(r) / (np.arange(len(r))+1)*r)

1.1666666666666665

In [6]:
np.sum(r)

2

In [7]:
def precision_at_k(y_true, y_pred, k):
    return float(np.in1d(y_pred[:k], y_true).mean())

def recall_at_k(y_true, y_pred, k):
    return float(np.in1d(y_true, y_pred[:k]).mean())

def ndcg_at_k(y_true, y_pred, k):
    r = np.inid(y_pred[:k], y_true)
    discount = 1 / (np.log2(np.arange(r.shape[0]) + 2))
    dcg = np.sum(r * discount)
    idcg = np.sum(np.ones_list(r) * discount)
    return float(dcg / idcg)

def average_precision_at_k(y_true, y_pred, k):
    r = np.in1d(y_pred[:k], y_true)
    if 0 < np.sum(r):
        return float(np.sum(np.cumsum(r) / (np.arange(len(r))+1)*r)/np.sum(r))
    else:
        return 0

def rr_at_k(y_true, y_pred, k):
    r = np.in1d(y_pred[:k], y_true)
    return max(r / len(r))

# 임의의 추천 생성

In [8]:
import numpy as np

np.random.seed(42)

num_users = 5
num_items = 10
interactions_per_user = 5
recommended_items_per_user = 5

# 샘플 데이터 생성
interaction_data = [np.random.choice(num_items, size=interactions_per_user, replace=False) for _ in range(num_users)]

# 샘플 예측 데이터 생성
prediction_data = [np.random.choice(num_items, size=recommended_items_per_user, replace=False) for _ in range(num_users)]

# 예시 출력
print("Interacted items for the first user:", interaction_data[0])
print("Recommended items for the first user:", prediction_data[0])

Interacted items for the first user: [8 1 5 0 7]
Recommended items for the first user: [8 2 5 7 3]


# 추천 메트릭 정의

In [9]:
def precision_at_k(y_true, y_pred, k):
    # 예측 된 아이템 중 상위 k개만
    y_pred = y_pred[:k]
    # top-k item 중 유관 상품의 비율
    return len(set(y_true) & set(y_pred)) / k

def recall_at_k(y_true, y_pred, k):
    # 예측 된 아이템 중 상위 k개만
    y_pred = y_pred[:k]
    # 실제 유관 상품 중 상위 k개에 포함된 비율
    return len(set(y_true) & set(y_pred)) / len(y_true)

def mrr_at_k(y_true, y_pred, k):
    # 예측 된 아이템 중 상위 k개만
    y_pred = y_pred[:k]
    for i, p in enumerate(y_pred):
        if p in y_true:
            return 1 / (i+1)
    return 0

def average_precision_at_k(y_true, y_pred, k):
    # 예측 된 아이템 중 상위 k개만
    y_pred = y_pred[:k]
    # average precision at k
    score = 0.0
    num_hits = 0
    for i ,p in enumerate(y_pred):
        if p in y_true:
            num_hits += 1
            score += num_hits / (i+1)
    return score / min(len(y_true), k)

def ndcg_at_k(y_true, y_pred, k):
    # 예측 된 아이템 중 상위 k개만
    y_pred = y_pred[:k]
    # DCG at k
    dcg = sum([int(p in y_true) / np.log2(i+2) for i, p in enumerate(y_pred)])
    # IDCG at k
    idcg = sum([1 / np.log2(i+2) for i in range(min(len(y_true), k))])
    # NDCG at k
    return dcg / idcg

In [10]:
k = 3  # We will compute the metrics at 3

# For each user, compute and print the metrics
for i in range(num_users):
    y_true = interaction_data[i]
    y_pred = prediction_data[i]

    precision = precision_at_k(y_true, y_pred, k)
    recall = recall_at_k(y_true, y_pred, k)
    map_ = average_precision_at_k(y_true, y_pred, k)
    ndcg = ndcg_at_k(y_true, y_pred, k)
    mrr = mrr_at_k(y_true, y_pred, k)

    print(f"Metrics for user {i+1}:")
    print(f'true items of user {i+1}: {y_true}')
    print(f'reco items of user {i+1}: {y_pred}')
    print(f"Precision@{k} = {precision}")
    print(f"Recall@{k} = {recall}")
    print(f"MAP@{k} = {map_}")
    print(f"NDCG@{k} = {ndcg}")
    print(f"MRR@{k} = {mrr}")
    print()


Metrics for user 1:
true items of user 1: [8 1 5 0 7]
reco items of user 1: [8 2 5 7 3]
Precision@3 = 0.6666666666666666
Recall@3 = 0.4
MAP@3 = 0.5555555555555555
NDCG@3 = 0.7039180890341347
MRR@3 = 1.0

Metrics for user 2:
true items of user 2: [0 1 8 5 3]
reco items of user 2: [0 5 2 6 3]
Precision@3 = 0.6666666666666666
Recall@3 = 0.4
MAP@3 = 0.6666666666666666
NDCG@3 = 0.7653606369886217
MRR@3 = 1.0

Metrics for user 3:
true items of user 3: [9 2 0 6 8]
reco items of user 3: [4 8 1 3 0]
Precision@3 = 0.3333333333333333
Recall@3 = 0.2
MAP@3 = 0.16666666666666666
NDCG@3 = 0.2960819109658652
MRR@3 = 0.5

Metrics for user 4:
true items of user 4: [1 7 6 2 8]
reco items of user 4: [2 0 4 9 8]
Precision@3 = 0.3333333333333333
Recall@3 = 0.2
MAP@3 = 0.3333333333333333
NDCG@3 = 0.46927872602275644
MRR@3 = 1.0

Metrics for user 5:
true items of user 5: [1 5 4 8 0]
reco items of user 5: [4 2 7 0 6]
Precision@3 = 0.3333333333333333
Recall@3 = 0.2
MAP@3 = 0.3333333333333333
NDCG@3 = 0.46927872

In [15]:
# sample 추천 점수

y_true = interaction_data[0]
y_pred = prediction_data[0]

precision = precision_at_k(y_true, y_pred, k)
recall = recall_at_k(y_true, y_pred, k)
map_ = average_precision_at_k(y_true, y_pred, k)
ndcg = ndcg_at_k(y_true, y_pred, k)
mrr = mrr_at_k(y_true, y_pred, k)

print(f"Metrics for user {i+1}:")
print(f'true items of user {i+1}: {y_true}')
print(f'reco items of user {i+1}: {y_pred}')
print(f"Precision@{k} = {precision}")
print(f"Recall@{k} = {recall}")
print(f"MAP@{k} = {map_}")
print(f"NDCG@{k} = {ndcg}")
print(f"MRR@{k} = {mrr}")
print()

Metrics for user 5:
true items of user 5: [8 1 5 0 7]
reco items of user 5: [8 2 5 7 3]
Precision@3 = 0.6666666666666666
Recall@3 = 0.4
MAP@3 = 0.5555555555555555
NDCG@3 = 0.7039180890341347
MRR@3 = 1.0



In [11]:
# 이상적인 추천 점수

y_true = [1,2,3]
y_pred = [1,2,3,4,5]

precision = precision_at_k(y_true, y_pred, k)
recall = recall_at_k(y_true, y_pred, k)
map_ = average_precision_at_k(y_true, y_pred, k)
ndcg = ndcg_at_k(y_true, y_pred, k)
mrr = mrr_at_k(y_true, y_pred, k)

print(f"Metrics for user {i+1}:")
print(f'true items of user {i+1}: {y_true}')
print(f'reco items of user {i+1}: {y_pred}')
print(f"Precision@{k} = {precision}")
print(f"Recall@{k} = {recall}")
print(f"MAP@{k} = {map_}")
print(f"NDCG@{k} = {ndcg}")
print(f"MRR@{k} = {mrr}")
print()

Metrics for user 5:
true items of user 5: [1, 2, 3]
reco items of user 5: [1, 2, 3, 4, 5]
Precision@3 = 1.0
Recall@3 = 1.0
MAP@3 = 1.0
NDCG@3 = 1.0
MRR@3 = 1.0



In [12]:
# 약간 아쉬운 추천

y_true = [1,2,3]
y_pred = [1,2,4,3,5]

precision = precision_at_k(y_true, y_pred, k)
recall = recall_at_k(y_true, y_pred, k)
map_ = average_precision_at_k(y_true, y_pred, k)
ndcg = ndcg_at_k(y_true, y_pred, k)
mrr = mrr_at_k(y_true, y_pred, k)

print(f"Metrics for user {i+1}:")
print(f'true items of user {i+1}: {y_true}')
print(f'reco items of user {i+1}: {y_pred}')
print(f"Precision@{k} = {precision}")
print(f"Recall@{k} = {recall}")
print(f"MAP@{k} = {map_}")
print(f"NDCG@{k} = {ndcg}")
print(f"MRR@{k} = {mrr}")
print()

Metrics for user 5:
true items of user 5: [1, 2, 3]
reco items of user 5: [1, 2, 4, 3, 5]
Precision@3 = 0.6666666666666666
Recall@3 = 0.6666666666666666
MAP@3 = 0.6666666666666666
NDCG@3 = 0.7653606369886217
MRR@3 = 1.0



# MF 모델 예제 다시 가져오기

In [16]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds
from sklearn.decomposition import NMF
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.decomposition import NMF, TruncatedSVD

In [17]:
# ALS class 정의
# http://ethen8181.github.io/machine-learning/recsys/1_ALSWR.html

class ALS:
    # 하이퍼 파라미터 지정
    def __init__(self, factors=10, iterations=20, reg=0.01):
        self.factors = factors
        self.iterations = iterations
        self.reg = reg
    # 모델 적합 -> 평점 행렬 입력
    def fit(self, ratings):
        # 랜덤으로 user 수 * latent factor 형태의 행렬 생성
        self.user_factors = np.random.random((ratings.shape[0], self.factors))
        # 랜덤으로 item 수 * latent factor 형태의 행렬 생성
        self.item_factors = np.random.random((ratings.shape[1], self.factors))

        # 사전에 지정한 iteration 수에 걸쳐서, 교차로 als_step 진행
        for _ in range(self.iterations):
            # user_factors 먼저 업데이트
            self.user_factors = self.als_step(ratings, self.user_factors, self.item_factors)
            # 이어서 item_factors 업데이트
            self.item_factors = self.als_step(ratings.T, self.item_factors, self.user_factors)

    # 교차로 업데이트하는 스텝 메서드
    def als_step(self, ratings, solve_vecs, fixed_vecs):
        # normal equation - 업데이트 되지 않을 user/item feature의 공분산 matrix
        # feature가 주어진(고정된) 상태에서 최적의 해를 찾아 그 행렬을 새로운 factors로 사용
        # 가령, user_factors가 고정되어 있을 때는 최적의 item_factors를 구하고, 반대도 마찬가지
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.factors) * self.reg
        b = ratings.dot(fixed_vecs)
        A_inv = np.linalg.inv(A)
        solve_vecs = b.dot(A_inv)
        return solve_vecs

    def predict(self):
        pred = self.user_factors.dot(self.item_factors.T)
        return pred

In [18]:
import pandas as pd

In [19]:
# 1. Load data
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx'
df = pd.read_excel(url)
df['Customer ID'] = df['Customer ID'].astype('category')
df['StockCode'] = df['StockCode'].astype('category')
df = df.rename({"Customer ID":"CustomerID"}, axis=1)

In [20]:
# 2. Make pivot table and remove users and items with too few interactions
interaction_counts = df.groupby('CustomerID').StockCode.count()
df = df[df.CustomerID.isin(interaction_counts[interaction_counts > 10].index)]
item_counts = df.StockCode.value_counts()
df = df[df.StockCode.isin(item_counts[item_counts > 10].index)]

In [21]:
pivot = df.pivot_table(index='CustomerID', columns='StockCode', fill_value=0, aggfunc='size')

In [22]:
# 3. Make y label as a implicit feedback, with binary value
pivot = (pivot > 0).astype(int)

In [23]:
# 4. Create train/test split
test_ratio = 0.2
train = pivot.copy()
test = np.zeros(pivot.shape)

for user in range(pivot.shape[0]):
    test_interactions = np.random.choice(pivot.values[user, :].nonzero()[0],
                                         size=int(test_ratio*np.sum(pivot.values[user, :])),
                                         replace=False)
    train.values[user, test_interactions] = 0.
    test[user, test_interactions] = pivot.values[user, test_interactions]

In [24]:
# Convert train and test matrix into sparse matrix
train_csr = coo_matrix(train.values)
test_csr = coo_matrix(test)

In [25]:
n_latent_factors = 20

In [26]:
# SVD

# using sklearn Truncated SVD
svd = TruncatedSVD(n_components=n_latent_factors, random_state=42)
train_svd = svd.fit_transform(train_csr)
svd_pred = svd.inverse_transform(svd.transform(test_csr))

# using svds from scipy
u, sigma, vt = svds(train_csr.astype(float), n_latent_factors)
svd_pred = np.dot(u, np.dot(np.diag(sigma), vt))

In [27]:
# NMF
model = NMF(n_components=n_latent_factors, init='random', random_state=0)

W = model.fit_transform(train_csr)
H = model.components_
nmf_pred = np.dot(W, H)



In [28]:
# train ALS model
als = ALS(factors=n_latent_factors, iterations=100, reg=0.01)
als.fit(train_csr)

# predict
als_pred = als.predict()

In [29]:
at_k = 10

# Make sure the predicted scores are in the range [0, 1]
predicted_svd = (svd_pred - svd_pred.min()) / (svd_pred.max() - svd_pred.min())
predicted_nmf = (nmf_pred - nmf_pred.min()) / (nmf_pred.max() - nmf_pred.min())
predicted_als = (als_pred - als_pred.min()) / (als_pred.max() - als_pred.min())

In [30]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE for SVD
svd_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_svd))
print('SVD RMSE: ', svd_rmse)

# Calculate RMSE for NMF
nmf_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_nmf))
print('NMF RMSE: ', nmf_rmse)

# Calculate RMSE for ALS
als_rmse = np.sqrt(mean_squared_error(test_csr.toarray(), predicted_als))
print('ALS RMSE: ', als_rmse)


SVD RMSE:  0.3586206450433661
NMF RMSE:  0.050224067115796654
ALS RMSE:  0.371092768706744


In [31]:
predicted_svd = [np.argsort(row)[-at_k:] for row in predicted_svd]
predicted_nmf = [np.argsort(row)[-at_k:] for row in predicted_nmf]
predicted_als = [np.argsort(row)[-at_k:] for row in predicted_als]

In [32]:
predicted_svd[:10]

[array([ 827, 2325, 2333, 2334, 2336,  828, 2335,  112, 2338, 2328]),
 array([1731, 1870, 1895, 1732, 1804, 1523, 1728, 1805, 1521, 1608]),
 array([ 544,  431, 1211, 1602,  547, 1210, 2722, 1212, 2721,  546]),
 array([2336, 2333,  838,  836,  830, 2328,  828, 2335, 2338,  112]),
 array([1109,  300, 1608,  809,  842, 1108,  834,  836,  838,  830]),
 array([2803, 4149,  511, 2474, 1166, 1598,  504, 2805,  512,  516]),
 array([3364, 3337, 1823, 1272, 1819, 1820, 1263, 1821,  563,  564]),
 array([1269, 1819, 1820, 1272, 1823, 3337, 1263, 1821,  563,  564]),
 array([ 563, 1608,  564,  547, 1210, 2722, 1602,  546, 1212, 2721]),
 array([3614, 2030, 2328,  112, 1320, 2031,  793,  796, 1322, 1319])]

In [33]:
rows = test_csr.tocsr()
true_interactions = [rows.getrow(i).indices for i in range(rows.shape[0])]

In [34]:
for i in range(rows.shape[0]):
    rows.getrow(i).indices
    break

In [35]:
rows.getrow(10)

<1x4632 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [36]:
true_interactions[:10]

[array([ 109,  827,  828, 1205, 1860], dtype=int32),
 array([ 420, 1019, 1253, 1401, 1416, 1677, 1898, 2407, 2721, 3377, 3387,
        3791, 4199, 4267], dtype=int32),
 array([2113, 2114, 2718, 2722], dtype=int32),
 array([ 350,  564,  683,  828,  834,  836, 1059, 1267, 1373, 1386, 1545,
        1627, 1641, 1730, 1732, 1876, 2013, 4051], dtype=int32),
 array([1259, 1499, 1882, 2130], dtype=int32),
 array([1031, 1402, 1403], dtype=int32),
 array([  80, 1272, 1315, 1317], dtype=int32),
 array([1060, 1061, 1604, 3770], dtype=int32),
 array([1207, 1254, 1267, 1279, 1728, 1788, 1804, 1905, 2070, 2258, 2718,
        3425, 3750], dtype=int32),
 array([ 327,  331,  424,  564,  793,  799,  980, 1245, 1322, 1506, 1507,
        1509, 1547, 1567, 1569, 1587, 1591, 1592, 1599, 1806, 1833, 1896,
        1897, 2006, 2009, 2030, 2135, 2243, 2333, 2338, 2383, 3626, 3936],
       dtype=int32)]

In [37]:
metrics = {'Precision@K': precision_at_k,
           'Recall@K': recall_at_k,
           'MAP@K': average_precision_at_k,
           'NDCG@K': ndcg_at_k,
           'MRR@K': mrr_at_k}

predictions = {'SVD': predicted_svd,
               'NMF': predicted_nmf,
               'ALS': predicted_als}

In [38]:
K = 10

average_metrics = {model: {metric: 0 for metric in metrics.keys()} for model in predictions.keys()}

for model, predicted in predictions.items():
    for y_true, y_pred in zip(true_interactions, predicted):
        if len(y_true) > 0:
            for metric_name, metric_fn in metrics.items():
                average_metrics[model][metric_name] += metric_fn(y_true, y_pred, K)

    for metric_name in metrics.keys():
        average_metrics[model][metric_name] /= len(true_interactions)

for model, model_metrics in average_metrics.items():
    print(f"{model}:")
    for metric, value in model_metrics.items():
        print(f"    {metric}: {value}")


SVD:
    Precision@K: 0.05660506502395478
    Recall@K: 0.06642109159862024
    MAP@K: 0.02766157806274923
    NDCG@K: 0.06943103476560814
    MRR@K: 0.14086532019454662
NMF:
    Precision@K: 0.051403148528404066
    Recall@K: 0.05501968106002313
    MAP@K: 0.0234305272015432
    NDCG@K: 0.06047878918998058
    MRR@K: 0.12413998529672698
ALS:
    Precision@K: 0.05651380333105039
    Recall@K: 0.06621301322885632
    MAP@K: 0.027092655415484126
    NDCG@K: 0.06872879878209151
    MRR@K: 0.13851840987364578
