# SVD로 recommendation model 만들기

In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../src")

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

## Data Preparation

In [152]:
X_train = pd.read_csv("../data/user_item_matrix_X_0.5.csv", index_col=0)
Y_train = pd.read_csv("../data/user_item_matrix_Y_0.5.csv", index_col=0)

X_train.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,186587,187541,187593,187595,188301,189111,193565,193567,193571,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
Y_train.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SVD를 이용한 예측

In [154]:

def rmse_loss(X, Y):
    return ((X - Y) ** 2 ** .5).mean().mean()

def precision(user_item_matrix, user_id, preds):
    y = user_item_matrix.loc[user_id, :]
    y = y[y > 0.5]
    return y.index.isin(preds.index).mean()

def get_svd_prediction(user_item_matrix, k):
    u, s, vh = scipy.sparse.linalg.svds(user_item_matrix.to_numpy(), k=k)
    preds = np.dot(np.dot(u, np.diag(s)), vh)

    preds = pd.DataFrame(preds, columns=user_item_matrix.columns, index=user_item_matrix.index)
    preds = (preds - preds.min()) / (preds.max() - preds.min())
    return preds
    
class SVDRecommender:
    def __init__(self, user_item_matrix, k = 256):
        self.user_item_matrix = user_item_matrix
        self.preds = get_svd_prediction(user_item_matrix, k)
        
    def predict(self, user_id, k, filter_seen=True):
        p = self.preds.loc[user_id, :]
        
        if filter_seen:
            seen = self.user_item_matrix.loc[user_id, :]
            p[seen > 0.5] = 0
            
        p_top_k = p.sort_values(ascending=False)
            
        return p_top_k.head(k)
        
    def compute_rmse_loss(self, Y):
        return rmse_loss(self.preds, Y)
    
    def compute_metrics(self, Y, k=100):
        metrics = []
        X = self.user_item_matrix
        for uid in self.preds.index:
            x = X.loc[uid, :]
            x = x[x > 0.5]
            y = Y.loc[uid, :]
            y = y[y > 0.5]
            
            dy = set(y.index) - set(x.index)
            p = self.predict(uid, k)
            
            top_k_acc = p.index.isin(y.index).mean()
            prec = len(y[y.index.isin(p.index)])
            dy_prec = len(p[p.index.isin(dy)])
            
            metrics.append((prec, len(y), dy_prec, len(dy), top_k_acc, dy_prec / len(dy) if len(dy) > 0 else 0))
        
        columns=["precision", "Y count", "dy_correct", "DY count", "top_{0}_accuracy".format(k), "dy_precision"]
        return pd.DataFrame(metrics, index=self.preds.index, columns=columns)

In [174]:

for k in [6, 12, 32, 64, 128, 256, 512]:
    rec = SVDRecommender(X_train, k=k)
    print(k, rec.compute_rmse_loss(Y_train))
    metrics = rec.compute_metrics(Y_train, k=100)
    print(k, metrics.mean())

6 0.18067690139257436
6 precision             0.777049
Y count             165.304918
dy_correct            0.777049
DY count             81.398361
top_100_accuracy      0.007770
dy_precision          0.005430
dtype: float64
12 0.1542091530542808
12 precision             0.891803
Y count             165.304918
dy_correct            0.891803
DY count             81.398361
top_100_accuracy      0.008918
dy_precision          0.007835
dtype: float64
32 0.1141425168100875
32 precision             0.742623
Y count             165.304918
dy_correct            0.742623
DY count             81.398361
top_100_accuracy      0.007426
dy_precision          0.008987
dtype: float64
64 0.0770662856885011
64 precision             0.806557
Y count             165.304918
dy_correct            0.806557
DY count             81.398361
top_100_accuracy      0.008066
dy_precision          0.012675
dtype: float64
128 0.046714562392820076
128 precision             0.893443
Y count             165.304918
dy_cor

In [172]:
# rec.predict(1, 10)

precision             0.891803
Y count             165.304918
dy_correct            0.891803
DY count             81.398361
top_100_accuracy      0.008918
dy_precision          0.007835
dtype: float64

Unnamed: 0_level_0,precision,Y count,dy_correct,DY count,top_100_accuracy,dy_precision
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,1,39,1,19,0.01,0.052632
4,1,216,1,107,0.01,0.009346
6,9,314,9,154,0.09,0.058442
7,1,152,1,75,0.01,0.013333
10,1,140,1,69,0.01,0.014493
...,...,...,...,...,...,...
604,2,100,2,48,0.02,0.041667
606,11,1115,11,557,0.11,0.019749
608,5,831,5,415,0.05,0.012048
609,1,37,1,15,0.01,0.066667
