In [2]:
import numpy as np
 
# Build the toy rating matrix R (rows = users, columns = items); NaN marks a
# missing rating. The latent-factor dimension K is set to 3.
# `np.nan` is used instead of `np.NaN`: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
R = np.array([[4, np.nan, np.nan, 2, np.nan ],
              [np.nan, 5, np.nan, 3, 1 ],
              [np.nan, np.nan, 3, 4, 4 ],
              [5, 2, 1, 2, np.nan ]])
num_users, num_items = R.shape
K=3
 
# Size the factor matrices P (users x K) and Q (items x K) and fill them with
# normally distributed random values (std = 1/K). The global seed is fixed so
# that the run is reproducible.
np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_users, K))
Q = np.random.normal(scale=1./K, size=(num_items, K))
print('실제 행렬:\n', R)

실제 행렬:
 [[ 4. nan nan  2. nan]
 [nan  5. nan  3.  1.]
 [nan nan  3.  4.  4.]
 [ 5.  2.  1.  2. nan]]


In [3]:
from sklearn.metrics import mean_squared_error
 
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and its reconstruction P @ Q.T on the observed cells.

    Parameters
    ----------
    R : 2-D array of actual ratings, indexed as R[row, col].
    P, Q : factor matrices; the prediction matrix is np.dot(P, Q.T).
    non_zeros : re-iterable of tuples whose first two elements are the
        (row, col) position of an observed rating. Any further elements
        (e.g. the rating value) are ignored here.

    Returns
    -------
    The root of the mean squared difference between R and the prediction,
    evaluated only at the positions listed in ``non_zeros``.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty (an RMSE over zero samples is undefined).
    """
    # Split the (row, col, ...) tuples into parallel index lists for fancy indexing.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]
    if not row_idx:
        raise ValueError("non_zeros is empty: RMSE needs at least one observed rating")

    # Predicted rating matrix from the two factor matrices.
    full_pred_matrix = np.dot(P, Q.T)

    # Residuals only at the observed positions; unobserved (NaN) cells never enter.
    residuals = R[row_idx, col_idx] - full_pred_matrix[row_idx, col_idx]
    return np.sqrt(np.mean(residuals ** 2))
 
 
# Collect (row, col, rating) for every cell with R > 0. NaN cells fail the
# comparison, so missing ratings are skipped.
non_zeros = []
for i in range(num_users):
    for j in range(num_items):
        if R[i, j] > 0:
            non_zeros.append((i, j, R[i, j]))

steps = 1000
learning_rate = 0.01
r_lambda = 0.01

# Keep updating P and Q with stochastic gradient descent.
for step in range(steps):
    for i, j, r in non_zeros:
        # Error between the actual rating and the current prediction.
        eij = r - np.dot(P[i], Q[j])
        # SGD step with L2 regularisation. P[i] is updated first and the
        # Q[j] update then reads the already-updated P[i]; keep this order.
        P[i] += learning_rate * (eij * Q[j] - r_lambda * P[i])
        Q[j] += learning_rate * (eij * P[i] - r_lambda * Q[j])

    rmse = get_rmse(R, P, Q, non_zeros)
    if step % 50 == 0:
        print("### iteration step : ", step," rmse : ", rmse)

### iteration step :  0  rmse :  3.2388050277987723
### iteration step :  50  rmse :  0.4876723101369647
### iteration step :  100  rmse :  0.15643403848192475
### iteration step :  150  rmse :  0.07455141311978038
### iteration step :  200  rmse :  0.04325226798579314
### iteration step :  250  rmse :  0.029248328780878977
### iteration step :  300  rmse :  0.022621116143829396
### iteration step :  350  rmse :  0.01949363619652524
### iteration step :  400  rmse :  0.018022719092132586
### iteration step :  450  rmse :  0.017319685953442663
### iteration step :  500  rmse :  0.016973657887570895
### iteration step :  550  rmse :  0.016796804595895595
### iteration step :  600  rmse :  0.016701322901884613
### iteration step :  650  rmse :  0.01664473691247672
### iteration step :  700  rmse :  0.016605910068210078
### iteration step :  750  rmse :  0.016574200475704973
### iteration step :  800  rmse :  0.01654431582921599
### iteration step :  850  rmse :  0.016513751774735196
### i

In [4]:
# Reconstruct the full rating matrix from the learned factors and show it
# next to the original; predictions are rounded to 3 decimals for display.
pred_matrix = np.dot(P, Q.T)
print('실제 행렬:\n', R)
print('\n예측 행렬:\n', pred_matrix.round(3))

실제 행렬:
 [[ 4. nan nan  2. nan]
 [nan  5. nan  3.  1.]
 [nan nan  3.  4.  4.]
 [ 5.  2.  1.  2. nan]]

예측 행렬:
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


In [14]:
#call data
import pandas as pd
import numpy as np

# Load the preprocessed webtoon table (first CSV column used as the index)
# and the survey ratings.
webtoons = pd.read_csv(
    '../../csv/after_pre/webtoon_preprocessing_221219.csv',
    index_col=0,
)
ratings = pd.read_csv("../../csv/after_pre/total_survey_221129_2.csv")

print("webtoon_shape : {}".format(webtoons.shape))
print("rating_shape : {}".format(ratings.shape))

# Rename the webtoon id column to 'webtoon_num', the name of the id column
# shown in the ratings table output.
webtoons = webtoons.rename(columns={'no': 'webtoon_num'})

# Preview the first two rows of each table.
for frame in (webtoons, ratings):
    display(frame.head(2))

webtoon_shape : (2044, 15)
rating_shape : (1175, 4)


Unnamed: 0,webtoon_num,title,link,status,author,fake_intro,real_intro,likes,episodes,first_register_date,last_register_date,age,rate,genre1_pre,genre2_pre
0,1,가난을 등에 업은 소녀,https://comic.naver.com/webtoon/list?titleId=7...,1,B급달궁 / 오은지,흔하디 흔한 재벌후계자와 캔디도 울고 갈 박복한 가난소녀의 파란만장 동거기!인기작 ...,흔하디 흔한 재벌후계자와 캔디도 울고 갈 박복한 가난소녀의 파란만장 동거기!인기작 ...,344,28.0,2020.10.19,2020.10.19,0,9.6,0,0
1,2,가담항설,https://comic.naver.com/webtoon/list?titleId=6...,1,랑또,이번 주인공은 돌이다!돌이지만 동료도 모으고 악당도 물리친다!랑또 작가표 동양 판타...,이번 주인공은 돌이다!돌이지만 동료도 모으고 악당도 물리친다!랑또 작가표 동양 판타...,100000,247.0,2016.06.01,2016.08.03,1,9.9,0,2


Unnamed: 0,user,webtoon_title,webtoon_num,webtoon_ratings
0,1,리트리츠,494,6.0
1,1,마루는 강쥐,505,5.0


In [11]:
# Keep only the three columns needed for the user x webtoon rating matrix.
ratings = ratings.loc[:, ['user', 'webtoon_num', 'webtoon_ratings']]

# Rows = users, columns = webtoon ids, values = ratings.
# Unrated combinations stay NaN.
rating_matrix = ratings.pivot_table(
    values='webtoon_ratings',
    index='user',
    columns='webtoon_num',
)
rating_matrix.head(3)

webtoon_num,2,4,7,19,23,26,27,31,32,35,...,2004,2005,2008,2014,2019,2024,2028,2034,2038,2043
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Item (webtoon) oriented view of the ratings - not collaborative filtering.

# Join the ratings with the webtoon table to obtain the 'title' column.
ratings_webtoons = ratings.merge(webtoons, on="webtoon_num")

# Pivot with columns='title' so each column is a webtoon title.
rating_matrix = ratings_webtoons.pivot_table(
    values="webtoon_ratings",
    index='user',
    columns='title',
)

# Replacing every NaN with 0 is currently disabled, so NaN values remain:
#rating_matrix = rating_matrix.fillna(0)
rating_matrix.head(3)

title,100,10월 28일,12차원 소년들,"17살, 그 여름날의 기적",1초,1학년 9반,2011 미스테리 단편,2012 지구가 멸망한다면?,2013 전설의고향,2015 사이,...,호랑이형님,호랭총각,호러전파상,화산귀환,화이트멜로우,회춘,후레자식,후유증,후유증 2,히어로 킬러
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# Dimensions of the user x title rating matrix.
num_user, num_items = rating_matrix.shape
# NOTE(review): in the visible notebook the factorization call passes K=50
# explicitly, so this K = 3 does not appear to affect that run - confirm.
K = 3

# Echo the shape as the cell result so that re-running the cell reproduces
# the recorded output.
rating_matrix.shape

(95, 571)


In [27]:
import numpy as np
from sklearn.metrics import mean_squared_error
 
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and its reconstruction P @ Q.T on the observed cells.

    Parameters
    ----------
    R : 2-D array of actual ratings, indexed as R[row, col].
    P, Q : factor matrices; the prediction matrix is np.dot(P, Q.T).
    non_zeros : re-iterable of tuples whose first two elements are the
        (row, col) position of an observed rating. Any further elements
        (e.g. the rating value) are ignored here.

    Returns
    -------
    The root of the mean squared difference between R and the prediction,
    evaluated only at the positions listed in ``non_zeros``.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty (an RMSE over zero samples is undefined).
    """
    # Split the (row, col, ...) tuples into parallel index lists for fancy indexing.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]
    if not row_idx:
        raise ValueError("non_zeros is empty: RMSE needs at least one observed rating")

    # Predicted rating matrix from the two factor matrices.
    full_pred_matrix = np.dot(P, Q.T)

    # Residuals only at the observed positions; unobserved (NaN) cells never enter.
    residuals = R[row_idx, col_idx] - full_pred_matrix[row_idx, col_idx]
    return np.sqrt(np.mean(residuals ** 2))
 
 
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    """Factorize R into P (users x K) and Q (items x K) with regularised SGD.

    Parameters
    ----------
    R : 2-D array of ratings. Only cells with R[i, j] > 0 are used for
        training; NaN cells fail that comparison and are skipped.
    K : number of latent factors.
    steps : number of full passes over the observed ratings.
    learning_rate : SGD step size.
    r_lambda : L2 regularisation strength.

    Returns
    -------
    (P, Q) : the learned factor matrices; np.dot(P, Q.T) is the predicted
        rating matrix.

    Notes
    -----
    The function reseeds NumPy's global random generator with 1 on every
    call, so the result is deterministic for given arguments. Every 10th
    step (starting at step 0) the training RMSE is printed.
    """
    num_users, num_items = R.shape
    # Size P and Q and fill them with normally distributed random values (std = 1/K).
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, col, rating) for every observed cell, in row-major order.
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]

    # Keep updating P and Q with stochastic gradient descent.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # SGD step with L2 regularisation. P[i] is updated first and the
            # Q[j] update deliberately reads the already-updated P[i].
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

        # The RMSE is only reported every 10th step, so only evaluate it then
        # instead of on every pass.
        if (step % 10) == 0 :
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q
    
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    # Factorize the user x title rating matrix. K=50 latent factors are
    # passed explicitly here.
    P, Q = matrix_factorization(
        rating_matrix.values,
        K=50,
        steps=200,
        learning_rate=0.01,
        r_lambda=0.01,
    )
    # Full predicted rating matrix (users x titles).
    pred_matrix = np.dot(P, Q.T)

### iteration step :  0  rmse :  4.293309621000512
### iteration step :  10  rmse :  2.5290943538558475
### iteration step :  20  rmse :  1.1500747416833
### iteration step :  30  rmse :  0.6698033333564843
### iteration step :  40  rmse :  0.41312682130062284
### iteration step :  50  rmse :  0.27680342210988457
### iteration step :  60  rmse :  0.18911012070163918
### iteration step :  70  rmse :  0.13387744146946423
### iteration step :  80  rmse :  0.10019100102397749
### iteration step :  90  rmse :  0.07857862014119309
### iteration step :  100  rmse :  0.06427890396798817
### iteration step :  110  rmse :  0.054570455303509874
### iteration step :  120  rmse :  0.04778831076036095
### iteration step :  130  rmse :  0.04291720557498592
### iteration step :  140  rmse :  0.03933158973246197
### iteration step :  150  rmse :  0.03663486450750749
### iteration step :  160  rmse :  0.03456691004514395
### iteration step :  170  rmse :  0.0329518394569504
### iteration step :  180  rm

In [28]:
# Wrap the predicted ratings in a DataFrame that reuses the row (user) and
# column (title) labels of the original rating matrix.
rating_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=rating_matrix.index,
    columns=rating_matrix.columns,
)

rating_pred_matrix.head(3)

title,100,10월 28일,12차원 소년들,"17살, 그 여름날의 기적",1초,1학년 9반,2011 미스테리 단편,2012 지구가 멸망한다면?,2013 전설의고향,2015 사이,...,호랑이형님,호랭총각,호러전파상,화산귀환,화이트멜로우,회춘,후레자식,후유증,후유증 2,히어로 킬러
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.54318,1.589137,1.367873,1.629806,4.442588,1.04622,1.604706,1.682391,1.585438,1.094184,...,3.736772,1.932792,1.562607,4.465042,1.555641,1.517317,3.426381,1.573044,1.593545,3.712088
2,0.738121,2.295019,1.858702,1.929631,5.284878,1.524718,2.236472,2.370251,2.287137,1.519216,...,5.01743,2.416305,2.36198,5.761294,2.111926,2.275312,4.499408,2.356265,2.285641,4.957016
3,-0.082025,0.042269,0.131944,0.033651,1.371942,0.086155,0.077092,-0.010406,0.038589,0.043455,...,0.67074,-0.004218,0.198579,1.183964,0.124858,-0.007122,0.459492,0.050589,0.117392,-0.020618


In [33]:
# Predicted rating of every user for the single title '1초'.
print(rating_pred_matrix['1초'])

user
1     4.442588
2     5.284878
3     1.371942
4     3.740858
5     3.771076
        ...   
95    4.661143
96    5.150009
97    5.655310
98    4.618622
99    4.872143
Name: 1초, Length: 95, dtype: float64
