# Matrix Factorization
- MovieLens 1M 데이터
- SVD 구현
- 파이썬 라이브러리 사용

In [2]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# 데이터 로드

In [4]:
path = '../data/ml-1m/'
header_list = ['userId', 'movieId', 'rating', 'timestamp']
ratings_df = pd.read_csv(path + 'ratings.csv', sep='\t',encoding='utf-8', names=header_list)

print(ratings_df.shape)
print(ratings_df.head())

(1000209, 4)
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [5]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)

(800167, 4)
(200042, 4)


# Sparse Matrix

In [6]:
sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
sparse_matrix.index.name = 'movieId'

sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,6001,6002,6003,6004,6005,6006,6007,6008,6009,6010,6011,6012,6013,6014,6015,6016,6017,6018,6019,6020,6021,6022,6023,6024,6025,6026,6027,6028,6029,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,,,,,4.0,,,5.0,5.0,,,,,,,,4.0,,,,,4.0,,,3.0,,,,,,,,5.0,,5.0,,5.0,,,...,,,,,,2.0,,,,5.0,,,5.0,,5.0,4.0,,,,,3.0,5.0,,,5.0,,,,,,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,,,,,,,,,2.0,,,,,2.0,,,,1.0,,,,,,,,,,,,,5.0,...,3.0,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,2.0,,,,,,,,,3.0,,,,,,,,,,1.0,,,,,
4,,,,,,,,3.0,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,3.0,4.0,,,,,3.0,,,,4.0,,,,,,,,,,,,,,,,,,,4.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3949,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3950,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3951,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# sparse matrix 결측치 영화 평균 평점으로 채우기
sparse_matrix_withmovie = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=1)

# sparse matrix 결측치 유저 평균 평점으로 채우기
sparse_matrix_withuser = sparse_matrix.apply(lambda x: x.fillna(x.mean()), axis=0)

In [8]:
sparse_matrix_withmovie.head()

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,6001,6002,6003,6004,6005,6006,6007,6008,6009,6010,6011,6012,6013,6014,6015,6016,6017,6018,6019,6020,6021,6022,6023,6024,6025,6026,6027,6028,6029,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,4.139606,4.139606,4.139606,4.139606,4.0,4.139606,4.139606,5.0,5.0,4.139606,4.139606,4.139606,4.139606,4.139606,4.139606,4.139606,4.0,4.139606,4.139606,4.139606,4.139606,4.0,4.139606,4.139606,3.0,4.139606,4.139606,4.139606,4.139606,4.139606,4.139606,4.139606,5.0,4.139606,5.0,4.139606,5.0,4.139606,4.139606,...,4.139606,4.139606,4.139606,4.139606,4.139606,2.0,4.139606,4.139606,4.139606,5.0,4.139606,4.139606,5.0,4.139606,5.0,4.0,4.139606,4.139606,4.139606,4.139606,3.0,5.0,4.139606,4.139606,5.0,4.139606,4.139606,4.139606,4.139606,4.139606,4.139606,4.0,4.139606,4.139606,4.0,4.139606,4.139606,4.139606,4.139606,3.0
2,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,2.0,3.190217,3.190217,3.190217,3.190217,2.0,3.190217,3.190217,3.190217,1.0,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,5.0,...,3.0,3.190217,3.190217,3.190217,3.190217,2.0,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217,3.190217
3,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,2.0,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,...,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,2.0,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.0,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,3.035354,1.0,3.035354,3.035354,3.035354,3.035354,3.035354
4,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,3.0,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,3.0,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,...,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.726562,2.0,2.726562,2.726562,2.726562,2.726562,2.726562
5,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,5.0,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,...,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388,3.050388


In [9]:
sparse_matrix_withuser.head()

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,6001,6002,6003,6004,6005,6006,6007,6008,6009,6010,6011,6012,6013,6014,6015,6016,6017,6018,6019,6020,6021,6022,6023,6024,6025,6026,6027,6028,6029,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,3.733945,3.853659,4.210526,3.104575,4.0,4.37037,3.886957,5.0,5.0,3.294643,3.823529,3.420455,3.352941,3.2875,3.04,4.042683,4.0,3.567164,4.111111,2.75,3.123932,4.0,3.925926,3.764706,3.0,4.240741,3.827586,3.611765,3.542857,3.656566,3.628571,3.538206,5.0,3.54717,5.0,3.76087,5.0,3.518519,3.486486,...,3.607735,4.047431,4.13198,3.363636,3.706667,2.0,3.120968,3.482759,3.3,5.0,3.90625,3.0,5.0,3.886364,5.0,4.0,3.52,3.627907,3.405797,4.486486,3.0,5.0,3.737288,4.125,5.0,3.676923,4.269231,3.394737,3.892857,4.01087,3.704545,4.0,3.87037,3.941176,4.0,3.322812,3.675,3.9375,3.90566,3.0
2,4.146341,3.733945,3.853659,4.210526,3.104575,3.982143,4.37037,3.886957,3.717647,4.100313,3.294643,3.823529,3.420455,3.352941,3.2875,3.04,4.042683,2.0,3.567164,4.111111,2.75,3.123932,2.0,3.925926,3.764706,2.94081,1.0,3.827586,3.611765,3.542857,3.656566,3.628571,3.538206,3.912698,3.54717,4.175439,3.76087,3.576923,3.518519,5.0,...,3.0,4.047431,4.13198,3.363636,3.706667,2.0,3.120968,3.482759,3.3,3.939516,3.90625,3.0,4.0,3.886364,3.790698,3.226721,3.52,3.627907,3.405797,4.486486,3.54,3.871795,3.737288,4.125,3.281106,3.676923,4.269231,3.394737,3.892857,4.01087,3.704545,4.092308,3.87037,3.941176,2.646552,3.322812,3.675,3.9375,3.90566,3.582418
3,4.146341,3.733945,3.853659,4.210526,3.104575,3.982143,4.37037,3.886957,3.717647,4.100313,3.294643,3.823529,3.420455,3.352941,3.2875,3.04,4.042683,3.691406,3.567164,4.111111,2.75,3.123932,3.281124,3.925926,3.764706,2.0,4.240741,3.827586,3.611765,3.542857,3.656566,3.628571,3.538206,3.912698,3.54717,4.175439,3.76087,3.576923,3.518519,3.486486,...,3.607735,4.047431,4.13198,3.363636,3.706667,3.156863,3.120968,3.482759,3.3,3.939516,3.90625,3.0,4.0,3.886364,3.790698,2.0,3.52,3.627907,3.405797,4.486486,3.54,3.871795,3.737288,4.125,3.0,3.676923,4.269231,3.394737,3.892857,4.01087,3.704545,4.092308,3.87037,3.941176,1.0,3.322812,3.675,3.9375,3.90566,3.582418
4,4.146341,3.733945,3.853659,4.210526,3.104575,3.982143,4.37037,3.0,3.717647,4.100313,3.294643,3.823529,3.420455,3.352941,3.2875,3.04,4.042683,3.691406,3.567164,4.111111,2.75,3.123932,3.281124,3.925926,3.764706,3.0,4.240741,3.827586,3.611765,3.542857,3.656566,3.628571,3.538206,3.912698,3.54717,4.175439,3.76087,3.576923,3.518519,3.486486,...,3.607735,4.047431,4.13198,3.363636,3.706667,3.156863,3.120968,3.482759,3.3,3.939516,3.90625,3.0,4.0,3.886364,3.790698,3.226721,3.52,3.627907,3.405797,4.486486,3.54,3.871795,3.737288,4.125,3.281106,3.676923,4.269231,3.394737,3.892857,4.01087,3.704545,4.092308,3.87037,3.941176,2.0,3.322812,3.675,3.9375,3.90566,3.582418
5,4.146341,3.733945,3.853659,4.210526,3.104575,3.982143,4.37037,3.886957,3.717647,4.100313,3.294643,3.823529,3.420455,3.352941,3.2875,3.04,4.042683,3.691406,3.567164,4.111111,2.75,3.123932,3.281124,3.925926,3.764706,5.0,4.240741,3.827586,3.611765,3.542857,3.656566,3.628571,3.538206,3.912698,3.54717,4.175439,3.76087,3.576923,3.518519,3.486486,...,3.607735,4.047431,4.13198,3.363636,3.706667,3.156863,3.120968,3.482759,3.3,3.939516,3.90625,3.0,4.0,3.886364,3.790698,3.226721,3.52,3.627907,3.405797,4.486486,3.54,3.871795,3.737288,4.125,3.281106,3.676923,4.269231,3.394737,3.892857,4.01087,3.704545,4.092308,3.87037,3.941176,2.646552,3.322812,3.675,3.9375,3.90566,3.582418


# MF with SVD

In [10]:
def get_svd(s_matrix, k=300):
    u, s, vh = np.linalg.svd(s_matrix.transpose())
    S = s[:k] * np.identity(k, np.float)
    T = u[:, :k]
    Dt = vh[:k, :]

    item_factors = np.transpose(np.matmul(S, Dt))
    user_factors = np.transpose(T)

    return item_factors, user_factors

## 1. 영화 평균 평점

In [11]:
item_factors, user_factors = get_svd(sparse_matrix_withmovie)
prediction_result_df = pd.DataFrame(
    np.matmul(item_factors, user_factors),
    columns=sparse_matrix_withmovie.columns.values, index=sparse_matrix_withmovie.index.values
    )
movie_prediction_result_df = prediction_result_df.transpose()

In [None]:
print(item_factors.shape)
print(user_factors.shape)

(3683, 300)
(300, 6040)


In [None]:
movie_prediction_result_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
1,4.919958,3.091968,3.055648,2.705615,3.004578,3.920799,3.402009,3.01811,2.660275,3.504534,...,3.077097,2.116579,1.306975,2.130903,3.4998,3.608687,4.176089,3.691454,3.867573,3.789352
2,4.312839,3.152566,3.158968,2.767492,3.094605,3.84484,3.27962,3.000049,2.634677,3.534925,...,3.075172,2.127702,1.300832,2.191039,3.442631,3.542241,4.123214,3.73015,3.870283,3.751452
3,4.100369,3.287029,2.980007,2.714903,3.06614,4.053251,3.370253,3.020698,2.630165,3.51985,...,3.088508,2.11209,1.293136,2.136459,3.454934,3.55048,4.072457,3.717087,3.859387,3.783811
4,4.165418,3.251084,3.028892,2.742238,3.065472,3.903026,3.375582,3.038665,2.656261,3.577462,...,3.119255,2.113122,1.29418,2.182635,3.507168,3.615169,4.186814,3.668465,3.865516,3.765034
5,4.425782,3.176031,3.003269,2.653419,2.965521,3.54311,3.442262,3.066303,2.662975,3.169431,...,3.21954,2.097952,1.319847,2.151091,3.461029,3.572832,4.169884,3.660572,3.845115,3.689088


## 2. 유저 평균 평점

In [None]:
item_factors, user_factors = get_svd(sparse_matrix_withuser)
prediction_result_df = pd.DataFrame(
    np.matmul(item_factors, user_factors),
    columns=sparse_matrix_withuser.columns.values, index=sparse_matrix_withuser.index.values
    )
user_prediction_result_df = prediction_result_df.transpose()

In [None]:
user_prediction_result_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
1,4.808445,4.065008,4.142195,4.108244,4.097384,4.160094,4.131998,4.134432,4.154092,4.119212,...,4.11297,4.152238,4.130765,4.134107,4.152561,4.222269,4.18424,4.152912,4.158937,4.175014
2,3.712083,3.679458,3.856136,3.690607,3.72141,3.77149,3.602022,3.713896,3.755855,3.628131,...,3.762807,3.754855,3.74997,3.762314,3.699256,3.546223,3.757094,3.795708,3.703697,3.699706
3,3.752472,3.927985,3.921959,3.851888,3.92284,3.955127,3.892428,3.853202,3.831974,3.873135,...,3.843571,3.836454,3.829269,3.851797,3.856822,3.903836,3.895329,3.868619,3.840778,3.777915
4,4.207334,4.300731,4.239999,4.161639,4.247599,4.2415,4.164158,4.232434,4.199839,4.311588,...,4.233388,4.226526,4.222325,4.227623,4.226878,4.30038,4.247092,4.178408,4.204902,4.191337
5,3.160029,3.190759,3.300137,3.013819,3.049395,3.001498,2.943668,3.119666,3.184597,2.746506,...,3.161041,3.080309,3.165159,3.129787,3.061931,3.151018,3.165481,3.099113,3.086703,3.007867


In [None]:
print(item_factors.shape)
print(user_factors.shape)

(3683, 300)
(300, 6040)


# 두 모델 비교

In [None]:
def evaluate(test_df, prediction_result_df):
    groups_with_movie_ids = test_df.groupby('movieId')
    groups_with_user_ids = test_df.groupby('userId')
    intersection_movie_ids = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_ids.indices.keys())))))
    intersection_user_ids = sorted(list(set(list(prediction_result_df.index)).intersection(set(list(groups_with_user_ids.indices.keys())))))

    print(len(intersection_movie_ids))
    print(len(intersection_user_ids))

    compressed_prediction_df = prediction_result_df.loc[intersection_user_ids][intersection_movie_ids]

    # test_df에 대해 RMSE 계산
    grouped = test_df.groupby('userId')
    rmse_df = pd.DataFrame(columns=['rmse'])
    for userId, group in tqdm(grouped):
        pred_ratings = compressed_prediction_df.loc[userId][compressed_prediction_df.loc[userId].index.intersection(list(group['movieId'].values))]
        pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId', 'rating':'pred_rating'})
        actual_ratings = group[['rating', 'movieId']].rename(columns={'rating':'actual_rating'})

        final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['movieId'])
        final_df = final_df.round(4)

        if not final_df.empty:
            rmse = sqrt(mean_squared_error(final_df['actual_rating'], final_df['pred_rating']))
            rmse_df.loc[userId] = rmse

    return final_df, rmse_df

In [None]:
result_df, _ = evaluate(test_df, user_prediction_result_df)
print(result_df)
print('For User Matrix')
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

3421
6038


100%|██████████| 6038/6038 [00:19<00:00, 317.42it/s]

    actual_rating  movieId  pred_rating
0               1     2378       3.1821
1               5     1111       3.6047
2               4     1295       3.5849
3               5      916       3.6764
4               5      562       3.5405
..            ...      ...          ...
63              2     1210       3.7554
64              4     1617       3.4871
65              4     1273       3.8256
66              5     1900       3.6335
67              2      495       3.6234

[68 rows x 3 columns]
For User Matrix
RMSE: 1.0822095890842411





In [None]:
result_df, _ = evaluate(test_df, movie_prediction_result_df)
print(result_df)
print('For Movie Matrix')
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

3421
6038


100%|██████████| 6038/6038 [00:18<00:00, 331.02it/s]

    actual_rating  movieId  pred_rating
0               1     2378       2.8502
1               5     1111       4.0666
2               4     1295       3.3020
3               5      916       4.4100
4               5      562       3.7226
..            ...      ...          ...
63              2     1210       4.0122
64              4     1617       4.1459
65              4     1273       4.1259
66              5     1900       4.1544
67              2      495       3.3377

[68 rows x 3 columns]
For Movie Matrix
RMSE: 0.9638399916022186





# k 값 튜닝

In [20]:
def find_best_k(sparse_matrix, maximum_k=100):
    k_candidates = np.arange(50, maximum_k, 10)
    final_df = pd.DataFrame(columns=['rmse'], index=k_candidates)
    for k in tqdm(k_candidates):
        item_factors, user_factors = get_svd(sparse_matrix, k=k)
        each_result_df = pd.DataFrame(
            np.matmul(item_factors, user_factors),
            columns=sparse_matrix.columns.values, 
            index=sparse_matrix.index.values
        )
        each_result_df = each_result_df.transpose()
        result_df, _ = evaluate(test_df, each_result_df)
        each_rmse = sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))
        final_df.loc[k]['rmse'] = each_rmse

    return final_df

In [21]:
res = find_best_k(sparse_matrix_withmovie, 200)

  0%|          | 0/15 [00:00<?, ?it/s]

3421
6038


100%|██████████| 6038/6038 [00:18<00:00, 331.98it/s]
  7%|▋         | 1/15 [00:34<08:01, 34.41s/it]

3421
6038


100%|██████████| 6038/6038 [00:18<00:00, 320.07it/s]
 13%|█▎        | 2/15 [01:09<07:34, 34.99s/it]

3421
6038


100%|██████████| 6038/6038 [00:18<00:00, 323.05it/s]
 20%|██        | 3/15 [01:45<07:01, 35.14s/it]

3421
6038


100%|██████████| 6038/6038 [00:18<00:00, 323.68it/s]
 27%|██▋       | 4/15 [02:20<06:25, 35.07s/it]

# 파이썬 라이브러리를 사용해 MF 구현

In [13]:
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split

In [6]:
path = '../data/ml-1m/'
headers = ['userId', 'movieId', 'rating', 'timestamp']
ratings_df = pd.read_csv(path + 'ratings.csv', encoding='utf-8', sep='\t', names=headers)
print(ratings_df.shape)
ratings_df.head()

(1000209, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)

(800167, 4)
(200042, 4)


In [8]:
train_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [9]:
new_train_df = train_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
new_train_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
416292,2507,3035,2,974076680
683230,4087,2840,4,965431652
2434,19,457,3,978146863
688533,4118,2804,4,965804599
472584,2907,805,4,971838472


In [14]:
(
    X_train_initial,
    y_train_initial,
    X_train_update,
    y_train_update,
    X_test_update,
    y_test_update
) = train_update_test_split(new_train_df, frac_new_users=0.2)

In [19]:
# Initial training
matrix_fact = KernelMF(n_epochs=20, n_factors=100, verbose=1, lr=0.001, reg=0.005)
matrix_fact.fit(X_train_initial, y_train_initial)

Epoch  1 / 20  -  train_rmse: 1.0224704620689298
Epoch  2 / 20  -  train_rmse: 0.982575316264969
Epoch  3 / 20  -  train_rmse: 0.9605071783076413
Epoch  4 / 20  -  train_rmse: 0.9461536460818
Epoch  5 / 20  -  train_rmse: 0.9358009772707679
Epoch  6 / 20  -  train_rmse: 0.9278116958565178
Epoch  7 / 20  -  train_rmse: 0.9213282751379415
Epoch  8 / 20  -  train_rmse: 0.915876071122117
Epoch  9 / 20  -  train_rmse: 0.9111575720901837
Epoch  10 / 20  -  train_rmse: 0.9069746250057811
Epoch  11 / 20  -  train_rmse: 0.9032008579939916
Epoch  12 / 20  -  train_rmse: 0.8997395116265631
Epoch  13 / 20  -  train_rmse: 0.896516470704088
Epoch  14 / 20  -  train_rmse: 0.8934795606481318
Epoch  15 / 20  -  train_rmse: 0.890585995085644
Epoch  16 / 20  -  train_rmse: 0.8878063283663612
Epoch  17 / 20  -  train_rmse: 0.8851059035658996
Epoch  18 / 20  -  train_rmse: 0.8824649897807187
Epoch  19 / 20  -  train_rmse: 0.8798634836877841
Epoch  20 / 20  -  train_rmse: 0.877280482377595


KernelMF(gamma=0.01, lr=0.001, n_epochs=20, reg=0.005)

In [20]:
# 새로운 유저로 모델 업데이트
matrix_fact.update_users(
    X_train_update, y_train_update, lr=0.001, n_epochs=20, verbose=1
)

Epoch  1 / 20  -  train_rmse: 0.9615652462757442
Epoch  2 / 20  -  train_rmse: 0.9453891232058429
Epoch  3 / 20  -  train_rmse: 0.9340971039889769
Epoch  4 / 20  -  train_rmse: 0.9257887455599654
Epoch  5 / 20  -  train_rmse: 0.9193467863355828
Epoch  6 / 20  -  train_rmse: 0.9141611065932782
Epoch  7 / 20  -  train_rmse: 0.9098528888875821
Epoch  8 / 20  -  train_rmse: 0.9061672952140996
Epoch  9 / 20  -  train_rmse: 0.9029466832408304
Epoch  10 / 20  -  train_rmse: 0.9000857208526521
Epoch  11 / 20  -  train_rmse: 0.8975035753506425
Epoch  12 / 20  -  train_rmse: 0.8951477440867699
Epoch  13 / 20  -  train_rmse: 0.8929752841606713
Epoch  14 / 20  -  train_rmse: 0.8909541578866962
Epoch  15 / 20  -  train_rmse: 0.8890625595955197
Epoch  16 / 20  -  train_rmse: 0.8872802519257019
Epoch  17 / 20  -  train_rmse: 0.885588906973192
Epoch  18 / 20  -  train_rmse: 0.8839785858447629
Epoch  19 / 20  -  train_rmse: 0.8824392979610114
Epoch  20 / 20  -  train_rmse: 0.8809618219686906


In [21]:
pred = matrix_fact.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)
print(f"\nTest RMSE: {rmse:.4f}")


Test RMSE: 0.9077


In [22]:
# 추천
user = 200
items_known = X_train_initial.query('user_id == @user')['item_id']
matrix_fact.recommend(user=user, items_known=items_known)

Unnamed: 0,user_id,item_id,rating_pred
33,200,527,4.722429
666,200,922,4.703825
1876,200,3429,4.685443
499,200,318,4.675897
162,200,1198,4.639924
1341,200,904,4.639419
1631,200,1212,4.614023
1572,200,1148,4.612565
182,200,2019,4.609939
81,200,858,4.595394


## SGD

In [23]:
baseline_model = BaselineModel(method='sgd', n_epochs=20, reg=0.005, lr=0.01, verbose=1)
baseline_model.fit(X_train_initial, y_train_initial)

pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)

print(f"\nTest RMSE: {rmse:.4f}")

Epoch  1 / 20  -  train_rmse: 0.9211941466640128
Epoch  2 / 20  -  train_rmse: 0.90923526998352
Epoch  3 / 20  -  train_rmse: 0.9054006985279348
Epoch  4 / 20  -  train_rmse: 0.9036146883481199
Epoch  5 / 20  -  train_rmse: 0.9025300698318737
Epoch  6 / 20  -  train_rmse: 0.9019552213134174
Epoch  7 / 20  -  train_rmse: 0.9015450831897645
Epoch  8 / 20  -  train_rmse: 0.9014382068989485
Epoch  9 / 20  -  train_rmse: 0.9012109667106188
Epoch  10 / 20  -  train_rmse: 0.9011557733343935
Epoch  11 / 20  -  train_rmse: 0.9009996507333966
Epoch  12 / 20  -  train_rmse: 0.9009398369836692
Epoch  13 / 20  -  train_rmse: 0.9008414208105531
Epoch  14 / 20  -  train_rmse: 0.9007513889903448
Epoch  15 / 20  -  train_rmse: 0.9007929501247615
Epoch  16 / 20  -  train_rmse: 0.900645232813894
Epoch  17 / 20  -  train_rmse: 0.900650652590341
Epoch  18 / 20  -  train_rmse: 0.9007840614400578
Epoch  19 / 20  -  train_rmse: 0.9006294383570314
Epoch  20 / 20  -  train_rmse: 0.9006296000566372

Test RMSE: 0

In [25]:
baseline_model.update_users(X_train_update, y_train_update, lr=0.001, verbose=1)
pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)

print(f"\nTest RMSE: {rmse:.4f}")

Epoch  1 / 20  -  train_rmse: 0.9546454573145846
Epoch  2 / 20  -  train_rmse: 0.939795760514356
Epoch  3 / 20  -  train_rmse: 0.9296977566180108
Epoch  4 / 20  -  train_rmse: 0.9224568638874655
Epoch  5 / 20  -  train_rmse: 0.9170412075548258
Epoch  6 / 20  -  train_rmse: 0.9128421326437819
Epoch  7 / 20  -  train_rmse: 0.9094991123412739
Epoch  8 / 20  -  train_rmse: 0.9067697359235587
Epoch  9 / 20  -  train_rmse: 0.9044949802464968
Epoch  10 / 20  -  train_rmse: 0.9025732107224059
Epoch  11 / 20  -  train_rmse: 0.9009240458462553
Epoch  12 / 20  -  train_rmse: 0.899495564495978
Epoch  13 / 20  -  train_rmse: 0.8982478203339495
Epoch  14 / 20  -  train_rmse: 0.8971485232629989
Epoch  15 / 20  -  train_rmse: 0.8961739521815912
Epoch  16 / 20  -  train_rmse: 0.8953048463782304
Epoch  17 / 20  -  train_rmse: 0.8945265525816108
Epoch  18 / 20  -  train_rmse: 0.8938244242652567
Epoch  19 / 20  -  train_rmse: 0.8931888850242713
Epoch  20 / 20  -  train_rmse: 0.8926121996243459

Test RMSE:

## ALS

In [26]:
baseline_model = BaselineModel(method='als', n_epochs=20, reg=0.5, verbose=1)
baseline_model.fit(X_train_initial, y_train_initial)

pred = baseline_model.predict(X_test_update)
rmse = mean_squared_error(y_test_update, pred, squared=False)

print(f"\nTest RMSE: {rmse:.4f}")

Epoch  1 / 20  -  train_rmse: 0.9200000512818065
Epoch  2 / 20  -  train_rmse: 0.8995417371060176
Epoch  3 / 20  -  train_rmse: 0.8985969793064607
Epoch  4 / 20  -  train_rmse: 0.898540641237458
Epoch  5 / 20  -  train_rmse: 0.8985359711679622
Epoch  6 / 20  -  train_rmse: 0.8985353188485163
Epoch  7 / 20  -  train_rmse: 0.8985351614394271
Epoch  8 / 20  -  train_rmse: 0.8985350952242304
Epoch  9 / 20  -  train_rmse: 0.898535049340028
Epoch  10 / 20  -  train_rmse: 0.8985350084537914
Epoch  11 / 20  -  train_rmse: 0.8985349690134986
Epoch  12 / 20  -  train_rmse: 0.8985349301779487
Epoch  13 / 20  -  train_rmse: 0.8985348917448258
Epoch  14 / 20  -  train_rmse: 0.8985348536634203
Epoch  15 / 20  -  train_rmse: 0.8985348159190761
Epoch  16 / 20  -  train_rmse: 0.8985347785058574
Epoch  17 / 20  -  train_rmse: 0.8985347414198628
Epoch  18 / 20  -  train_rmse: 0.8985347046577326
Epoch  19 / 20  -  train_rmse: 0.8985346682162506
Epoch  20 / 20  -  train_rmse: 0.8985346320922333

Test RMSE: