# Latent Factor Collaborative Filtering Using Matrix Factorizations

## SGD 기반 행렬 분해 기법 사용
행렬 분해 잠재 요인 협업 필터링은 SVD나 NMF 등을 적용할 수 있는데, 일반적으로 행렬 분해에는 SVD가 자주 사용된다.
하지만 User-Item 평점 행렬의 경우 NaN 값의 데이터가 많은 Sparse Matrix이기 때문에 주로 SGD나 ALS 방식 기반의 행렬 분해를 이용한다.
여기서는 SGD 기반의 행렬 분해를 구현하고 이를 기반으로 사용자에게 영화를 추천해보도록 하자.

----

In [1]:
# Query CPU / interpreter details of the machine running this notebook
# (the resulting dict is shown as the cell output).
import cpuinfo
cpuinfo.get_cpu_info()

{'python_version': '3.9.7.final.0 (64 bit)',
 'cpuinfo_version': [8, 0, 0],
 'cpuinfo_version_string': '8.0.0',
 'arch': 'ARM_8',
 'bits': 64,
 'count': 8,
 'arch_string_raw': 'arm64',
 'brand_raw': 'Apple M1'}

---

## Data Loading and Processing 

In [2]:
import pandas as pd
import numpy as np
import warnings
# Suppress all warnings for the remainder of the session to keep cell output clean.
warnings.filterwarnings('ignore')

In [3]:
# Load the MovieLens "ml-latest-small" movie metadata and rating tables.
movies = pd.read_csv('./data/MovieLens_Dataset/ml-latest-small/movies.csv')
ratings = pd.read_csv('./data/MovieLens_Dataset/ml-latest-small/ratings.csv')

# Print (rows, columns) of each table on its own line, movies first.
print(movies.shape, ratings.shape, sep='\n')

(9742, 3)
(100836, 4)


In [4]:
# Preview the first three rows of the movie table.
movies.head(n=3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [5]:
# Preview the first three rows of the rating table.
ratings.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [6]:
# Keep only the user / movie / rating columns (this drops the timestamp column).
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]
# User x movieId rating matrix; user/movie pairs without a rating stay NaN.
ratings_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')

In [7]:
# Attach movie metadata (including the title) to every rating row.
rating_movies = ratings.merge(movies, on='movieId')
# User x title rating matrix, with missing ratings replaced by 0.
ratings_matrix = (
    rating_movies
    .pivot_table(values='rating', index='userId', columns='title')
    .fillna(0)
)

---

## RMSE

In [8]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their factorized predictions.

    Only the positions listed in ``non_zeros`` are evaluated, so entries of
    ``R`` that are not listed do not contribute to the error.

    Args:
        R: 2-D NumPy array holding the actual ratings (rows x columns).
        P: 2-D NumPy array of row (user) latent factors, one row per row of R.
        Q: 2-D NumPy array of column (item) latent factors, one row per
            column of R.
        non_zeros: Sequence of tuples whose first two elements are the row
            and column index of an observed rating. Any further tuple
            elements are ignored.

    Returns:
        The root mean squared error over the listed positions.

    Raises:
        ValueError: If ``non_zeros`` is empty (an RMSE over zero samples is
            undefined).
    """
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]
    if not row_idx:
        raise ValueError("non_zeros must contain at least one observed rating")

    actual = R[row_idx, col_idx]
    # Predict only the observed cells (row-wise dot products of the paired
    # factor vectors) instead of materializing the dense P @ Q.T matrix.
    predicted = np.sum(P[row_idx, :] * Q[col_idx, :], axis=1)

    return np.sqrt(np.mean((actual - predicted) ** 2))

---

## Matrix Factorization Logic

In [15]:
def factorize_matrix(R, K, steps=200, lr=0.01, r_lambda=0.01, verbose=True):
    """Factorize a rating matrix into latent factors with regularized SGD.

    Finds P and Q such that ``np.dot(P, Q.T)`` approximates ``R`` on the
    entries where ``R > 0``; all other entries are treated as missing and
    never used for training.

    Args:
        R: 2-D array-like of ratings (users x items).
        K: Number of latent factor dimensions.
        steps: Number of SGD passes over the observed ratings.
        lr: Learning rate.
        r_lambda: L2 regularization coefficient.
        verbose: When True (default), print the training RMSE on every 10th
            step (computed with ``get_rmse``). When False, nothing is
            printed and the RMSE is not computed.

    Returns:
        Tuple ``(P, Q)`` of arrays with shapes (num_users, K) and
        (num_items, K).
    """
    R = np.asarray(R)
    num_users, num_items = R.shape
    # Initialize P and Q with normally distributed random values.
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, column, value) for every entry with R > 0. np.nonzero
    # yields indices in row-major order, i.e. the same visiting order as a
    # nested "for row / for column" loop, without the Python-level scan.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    for step in range(steps):
        for i, j, r in non_zeros:
            # Snapshot both factor vectors so the two updates below are
            # computed from the same pre-update values.
            p_i = P[i, :].copy()
            q_j = Q[j, :].copy()
            # Error between the actual and the predicted rating.
            eij = r - np.dot(p_i, q_j)
            # SGD update with L2 regularization.
            P[i, :] = p_i + lr * (eij * q_j - r_lambda * p_i)
            Q[j, :] = q_j + lr * (eij * p_i - r_lambda * q_j)

        # Report every 10th step; the RMSE is only computed when it is shown.
        if verbose and (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration_stpe: ", step, "rmse: ", rmse)

    return P, Q

---

In [17]:
# Learn 50-dimensional user (P) and item (Q) latent factors from the rating matrix.
P, Q = factorize_matrix(ratings_matrix.to_numpy(), K=50)

### iteration_stpe:  0 rmse:  2.907469480812062
### iteration_stpe:  10 rmse:  0.7340433499766933
### iteration_stpe:  20 rmse:  0.5128595402647591
### iteration_stpe:  30 rmse:  0.3727746943336636
### iteration_stpe:  40 rmse:  0.29546455244435
### iteration_stpe:  50 rmse:  0.2513536429962661
### iteration_stpe:  60 rmse:  0.2243935563592549
### iteration_stpe:  70 rmse:  0.2065542418228327
### iteration_stpe:  80 rmse:  0.19393901833847627
### iteration_stpe:  90 rmse:  0.18454630758986915
### iteration_stpe:  100 rmse:  0.17727068337830257
### iteration_stpe:  110 rmse:  0.1714606074856677
### iteration_stpe:  120 rmse:  0.16670944463323076
### iteration_stpe:  130 rmse:  0.1627503914205024
### iteration_stpe:  140 rmse:  0.1594007413770971
### iteration_stpe:  150 rmse:  0.1565307988374228
### iteration_stpe:  160 rmse:  0.15404569022305012
### iteration_stpe:  170 rmse:  0.1518742391439788
### iteration_stpe:  180 rmse:  0.1499618879524455
### iteration_stpe:  190 rmse:  0.148266

In [18]:
# Reconstruct the full predicted rating matrix from the two factor matrices.
pred_matrix = P @ Q.T

In [19]:
# Wrap the predictions in a DataFrame that reuses the row and column labels
# of the original rating matrix, then preview the first three rows.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix.head(n=3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.198094,4.27052,3.701009,4.599037,4.448094,1.168205,4.066741,2.23118,3.728871,3.976023,...,1.255367,4.31772,4.014838,2.762811,2.682578,4.05358,2.857436,2.148007,3.998101,0.839473
2,2.98791,3.714054,3.135606,4.220052,4.103663,1.183167,3.48847,2.185392,2.472071,3.523472,...,1.082495,3.991854,3.091928,2.581265,2.434808,3.831928,2.469249,1.703216,4.339736,0.796198
3,2.17909,1.845131,1.586443,2.01082,1.799193,0.566939,0.830027,1.358729,0.48038,1.988084,...,0.719984,3.027712,1.870793,2.151041,1.456367,2.476212,0.465158,0.899399,2.705391,0.460979


이제 이렇게 만들어진 User-Item 평점 행렬 정보를 이용해서 개인화된 추천을 진행해보도록 하자.
관람하지 않은 영화를 추출하는 get_unseen_movies()와 추천 함수 recomm_movie_by_userid()는 2. Item-Based Nearest Neighbor Collaborative Filtering Practice - MovieLens Dataset에서 사용한 함수를 그대로 사용하도록 하겠다.

In [20]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the columns (movies) the given user has not rated.

    Args:
        ratings_matrix: DataFrame indexed by user id with one column per
            movie; a value greater than 0 means the user rated that movie.
        userId: Row label of the user to look up.

    Returns:
        List of column labels whose rating for ``userId`` is not greater
        than 0, in the column order of ``ratings_matrix``.

    Raises:
        KeyError: If ``userId`` is not in the index of ``ratings_matrix``.
    """
    # All ratings of this user as a Series indexed by column label.
    user_rating = ratings_matrix.loc[userId, :]

    # A rating above 0 marks a movie as already seen. A set keeps the
    # membership test below O(1) per movie instead of scanning a list.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    return [
        movie
        for movie in ratings_matrix.columns.tolist()
        if movie not in already_seen
    ]

def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n = 10):
    """Return the ``top_n`` highest predicted ratings among ``unseen_list``.

    Args:
        pred_df: DataFrame of predicted ratings (rows: users, columns: movies).
        userId: Row label of the user to recommend for.
        unseen_list: Column labels to choose the recommendations from.
        top_n: Number of entries to keep.

    Returns:
        Series of predicted ratings sorted in descending order, indexed by
        column label.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked.iloc[:top_n]

In [23]:
# 사용자가 관람하지 않은 영화명 추출
unseen_list = get_unseen_movies(ratings_matrix, 9)

# 잠재 요인 협업 필터링 이용 영화 추천
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# 평점 데이터를 DataFrame으로 생성
recomm_movies_df = pd.DataFrame(data=recomm_movies, index=recomm_movies.index, columns=['pred_score'])

In [24]:
# Display the recommended titles with their predicted ratings (a Series indexed by title).
recomm_movies

title
Lord of the Rings: The Return of the King, The (2003)    5.360419
Monty Python's Life of Brian (1979)                      5.269466
Graduate, The (1967)                                     5.266218
Shawshank Redemption, The (1994)                         5.249894
Annie Hall (1977)                                        5.177673
Monty Python and the Holy Grail (1975)                   5.164635
Player, The (1992)                                       5.145355
Chinatown (1974)                                         5.124131
Spider-Man 2 (2004)                                      5.123359
Shaun of the Dead (2004)                                 5.001922
Name: 9, dtype: float64

## End