In [91]:
# pip install plotnine

In [1]:
import pickle
import pandas as pd
import numpy as np
import os, sys, gc 
from plotnine import *
import plotnine

from tqdm.notebook import tqdm as tqdm_notebook
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl
from matplotlib import rc
import re
from matplotlib.ticker import PercentFormatter
import datetime
from math import log # IDF 계산을 위해

## Data

In [2]:
movie = pd.read_csv("/Users/hyunji/2022-2 Recommendation-Tutorial/Data/ratings.csv")

In [8]:
movie

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


## 추천모델이 원하는 데이터셋 형태로 바꿔주기

In [85]:
user2idx = {}
for i, l in enumerate(movie['userId'].unique()):
    user2idx[l] = i

movie2idx = {}
for i, l in enumerate(movie['movieId'].unique()):
    movie2idx[l] = i

idx2user = {i: user for user, i in user2idx.items()}
idx2movie = {i: item for item, i in movie2idx.items()}

In [14]:
useridx = movie['useridx'] = movie['userId'].apply(lambda x: user2idx[x]).values
movieidx = movie['movieidx'] = movie['movieId'].apply(lambda x: movie2idx[x]).values
rating = movie['rating'].values

movie

Unnamed: 0,userId,movieId,rating,timestamp,useridx,movieidx
0,1,31,2.5,1260759144,0,0
1,1,1029,3.0,1260759179,0,1
2,1,1061,3.0,1260759182,0,2
3,1,1129,2.0,1260759185,0,3
4,1,1172,4.0,1260759205,0,4
...,...,...,...,...,...,...
99999,671,6268,2.5,1065579370,670,7005
100000,671,6269,4.0,1065149201,670,4771
100001,671,6365,4.0,1070940363,670,1329
100002,671,6385,2.5,1070979663,670,1331


In [15]:
n_users = movie['userId'].nunique()
n_items = movie['movieId'].nunique()

In [21]:
useridx

array([  0,   0,   0, ..., 670, 670, 670])

In [16]:
import scipy
#compressed sparse row
# csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
ratings = scipy.sparse.csr_matrix((rating, (useridx, movieidx)), shape=(len(set(useridx)), len(set(movieidx))))

## Model

In [23]:
import torch
import torch.nn.functional as F
from torch import nn
import torch.nn.init as weight_init

class MatrixFactorization(nn.Module):
    def __init__(self,R, n_users, n_items, n_factors=20): #random initialize함
        super().__init__() # 부모 클래스(torch.nn.Module)의 init을 불러옴 
        #임베딩 저장공간확보
        self.user_factors = nn.Embedding(n_users, n_factors)#임베딩할 단어의개수, 임베딩할 벡터의 차원
        self.item_factors = nn.Embedding(n_items, n_factors)
        
        # weight 초기화 
        weight_init.xavier_uniform_(self.user_factors.weight)
        weight_init.xavier_uniform_(self.item_factors.weight)
       
        # original Matrix 
        self.R = R
        
    def forward(self, user, item):
        #user와 item값에 대한 embedding 구하기 , 
        pred = (self.user_factors(user) * self.item_factors(item)).sum(1)
        return pred
    
    def complete_matrix(self):
        return torch.matmul(self.user_factors.weight, self.item_factors.weight.T)

In [26]:
model = MatrixFactorization(ratings, n_users, n_items, n_factors=20)

### 참고 : nn.Embedding

In [42]:
embedding = nn.Embedding(10,3)
embedding.weight

Parameter containing:
tensor([[ 1.3258,  1.9836, -1.5465],
        [ 0.9144, -0.4143,  0.1636],
        [-0.4903,  0.5278, -0.3591],
        [-0.0447, -2.7401, -0.6674],
        [ 0.2361, -0.2082, -1.2536],
        [ 0.2651,  0.6224, -0.8440],
        [-0.2546,  0.3500, -0.3714],
        [-0.5804, -0.7231,  0.3429],
        [ 0.5060,  0.2968, -0.6070],
        [-0.3959,  2.4972,  0.4987]], requires_grad=True)

In [43]:
from torch import LongTensor
#[1,2,4,5], [4,3,2,9]에 대한 임베딩 값 구하기.

#방법1
sentences = LongTensor([[1,2,4,5],[4,3,2,9]])
embedding(sentences)

#방법2
nn.Embedding(10,3)(LongTensor([[1,2,4,5],[4,3,2,9]]))

tensor([[[ 0.9144, -0.4143,  0.1636],
         [-0.4903,  0.5278, -0.3591],
         [ 0.2361, -0.2082, -1.2536],
         [ 0.2651,  0.6224, -0.8440]],

        [[ 0.2361, -0.2082, -1.2536],
         [-0.0447, -2.7401, -0.6674],
         [-0.4903,  0.5278, -0.3591],
         [-0.3959,  2.4972,  0.4987]]], grad_fn=<EmbeddingBackward0>)

Batch를 사용하지 않은 Matrix Factorization

In [77]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
optimizer = torch.optim.SGD(model.parameters(), lr=5e-3)  # learning rate
loss_func = torch.nn.MSELoss()

### 참고 :  zip함수

In [79]:
numbers = [1, 2, 3]
letters = ["A", "B", "C"]
for pair in zip(*(numbers, letters)):
    print(pair)

(1, 'A')
(2, 'B')
(3, 'C')


In [80]:
rows, cols = ratings.nonzero() #실제 rating이 있는 부분과 학습진행ㅁ

nb_epochs = 10
for epoch in tqdm_notebook(range(nb_epochs)):
    train_loss = 0
    for row, col in zip(*(rows, cols)):
        # gradient 값을 0으로 설정 
        optimizer.zero_grad()

        # 데이터를 Tensor형태로 변환 
        rating = torch.FloatTensor([ratings[row, col]]) #ratings행렬의 행과열 지정
        row = torch.LongTensor([row])
        col = torch.LongTensor([col])

        # 예측값을 만들고 Loss를 계산 
        prediction = model(row, col)
        loss = loss_func(prediction, rating)
        train_loss += loss.item()
        
        # 역전파 
        loss.backward()

        # 파라미터를 갱신
        optimizer.step()
    print('Epoch {:4d}/{} Loss: {:.6f}'.format(epoch+1, nb_epochs, train_loss/len(rows)))

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch    1/10 Loss: 1.558807
Epoch    2/10 Loss: 1.129767
Epoch    3/10 Loss: 0.944863
Epoch    4/10 Loss: 0.841397
Epoch    5/10 Loss: 0.772120
Epoch    6/10 Loss: 0.718886
Epoch    7/10 Loss: 0.674500
Epoch    8/10 Loss: 0.635672
Epoch    9/10 Loss: 0.600477
Epoch   10/10 Loss: 0.567820


## Recommend 
1. User tensor와 item tensor의 행렬곱을 통해 원 매트릭스 복원
2. 평점이 높은 상위 200개의 값을 뽑고 그 중에서 이미 본 내역은 제외

In [None]:
#1번 사용자가 모든 영화에 대해 남긴 평점의 예측값
torch.matmul(model.user_factors.weight[user2idx[1]], model.item_factors.weight.T)

In [87]:
idx2rec = {}
for u in user2idx.keys():
    #argsort는 가장 작은 값을 갖는 인덱스를 뽑는데 - 를 곱해줘서 가장 큰 값을 갖는 인덱스 뽑음
    item_rec = np.argsort(-torch.matmul(model.user_factors.weight[user2idx[u]], model.item_factors.weight.T).detach().numpy())[0:200]
    # 추천에서 제외해야할 항목
    item_rec = [idx2movie[x] for x in item_rec if x not in movie[movie['useridx']==u]['movieidx'].unique()][0:100]
    idx2rec[u] = item_rec   
    break

In [90]:
#첫번째 사용자에게 추천된 항목 확인
idx2rec[1]

[2318,
 5114,
 73290,
 858,
 1276,
 1221,
 745,
 1192,
 899,
 390,
 80,
 116,
 608,
 309,
 905,
 7063,
 318,
 8132,
 2804,
 162,
 111,
 2563,
 260,
 1259,
 4914,
 1148,
 2542,
 5995,
 1172,
 6269,
 7502,
 911,
 912,
 913,
 1223,
 2064,
 93040,
 904,
 4427,
 97,
 1250,
 908,
 73344,
 1394,
 1233,
 1212,
 6954,
 1219,
 1211,
 1228,
 307,
 1136,
 6669,
 1252,
 926,
 246,
 1243,
 9010,
 308,
 1674,
 2959,
 2938,
 903,
 759,
 969,
 1198,
 1203,
 3462,
 2920,
 1227,
 7075,
 2599,
 1247,
 2019,
 922,
 2329,
 1288,
 26587,
 2186,
 28,
 907,
 951,
 534,
 1060,
 1934,
 898,
 131724,
 58559,
 116897,
 3435,
 4226,
 2924,
 104913,
 1945,
 27803,
 4103,
 6918,
 6993,
 1230,
 6016]