In [1]:
import pandas as pd
import numpy as np
import pickle

# Load the review table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
ratings = pd.read_pickle('users_text.pkl')

# Print the (rows, columns) shape of the loaded table.
print(ratings.shape)

(175539, 6)


In [2]:
# User review data: preview the first rows.
ratings.head()

Unnamed: 0,isbn,rank_mean,review_title,review_id,review_rank,review_text
0,9791190710343,9.4,자기만의 방 - 버지니아 울프의 여성에세이,s*****0,5.0,\n\n \n시대를 뛰어넘어 진정한 페미니즘 비평의 장을 연 고전 <자기만의 방>은...
1,9791190710343,9.4,#여성에세이#자기만의 방,녹**다,5.0,\n\n \n자기만의 방 버지니아 울프의 작품이다.한시대를 살다간 그녀의 삶은 기발...
2,9791190710343,9.4,자기만의 방,m*******n,4.0,"\n\n \n시대를 넘어 여전한 감동을 들려주는 작가, 버지니아 울프의 대표적인 작..."
3,9791190710350,10.0,오만과 편견,할**장,5.0,\n\n \n오만과 편견은 1813년 출판된 이래 한국에도 여러 출판사를 통해 번역...
4,9791190710367,10.0,프랑켄슈타인,책*레,5.0,"\n\n신이여,\n내가 그대에게 진흙으로 빚어달라 청했습니까?\n나를 어둠에서 끌어..."


In [2]:
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their factorized predictions.

    Only the cells listed in ``non_zeros`` are scored; every other cell of
    ``R`` is ignored. Predictions are computed for those cells alone, so the
    full ``P @ Q.T`` matrix is never materialized.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix, indexed as ``R[row, col]``.
    P : numpy.ndarray
        Row-factor matrix; ``P[i]`` is the latent vector for row ``i`` of ``R``.
    Q : numpy.ndarray
        Column-factor matrix; ``Q[j]`` is the latent vector for column ``j`` of ``R``.
    non_zeros : sequence
        Entries whose first two items are the row and column index of an
        observed rating. Any further items (e.g. the rating value) are ignored.

    Returns
    -------
    float
        Root mean squared error over the listed cells, or ``nan`` when
        ``non_zeros`` is empty (the error is undefined without observations).
    """
    if len(non_zeros) == 0:
        return float("nan")

    row_idx = np.array([entry[0] for entry in non_zeros], dtype=np.intp)
    col_idx = np.array([entry[1] for entry in non_zeros], dtype=np.intp)

    observed = R[row_idx, col_idx]
    # Row-wise dot product P[i] . Q[j] for each observed (i, j) pair; this is
    # the same value np.dot(P, Q.T)[i, j] would hold.
    predicted = np.einsum("ik,ik->i", P[row_idx], Q[col_idx])

    return np.sqrt(np.mean((observed - predicted) ** 2))

In [3]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    """Factorize a rating matrix into two low-rank factors with regularized SGD.

    Cells of ``R`` that are greater than 0 are treated as observed ratings;
    all other cells are skipped during training and scoring.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix of shape ``(num_users, num_items)``.
    K : int
        Number of latent factors (columns of ``P`` and ``Q``). Also sets the
        standard deviation ``1 / K`` of the random initialization.
    steps : int, default 200
        Number of full passes over the observed ratings.
    learning_rate : float, default 0.01
        Step size of each SGD update.
    r_lambda : float, default 0.01
        L2 regularization coefficient.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P`` has shape ``(num_users, K)`` and ``Q`` has shape
        ``(num_items, K)``; ``np.dot(P, Q.T)`` is the predicted rating matrix.

    Notes
    -----
    Calls ``np.random.seed(1)``, which reseeds NumPy's global generator.
    Prints the training RMSE on every 10th step and shows a tqdm progress bar.
    """
    num_users, num_items = R.shape
    # Initialize P and Q with normally distributed values; the fixed seed makes
    # the starting point reproducible.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every cell with R > 0. np.nonzero scans in
    # row-major order, i.e. the same order as a nested "for i / for j" loop,
    # but without visiting every cell from Python.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Repeatedly update P and Q with stochastic gradient descent.
    for step in tqdm(range(steps)):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :])
            # Regularized SGD update. The Q update reads the P row that was
            # just updated on the previous line; this order is kept as-is so
            # results match the existing runs.
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # RMSE is only reported every 10th step, so only compute it then.
        if (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

## 사용자-아이템 평점 행렬로 변환

In [4]:
# Keep only the columns needed downstream: reviewer id, ISBN, and the rating.
ratings = ratings.loc[:, ['review_id', 'isbn', 'review_rank']]
ratings

Unnamed: 0,review_id,isbn,review_rank
0,s*****0,9791190710343,5.0
1,녹**다,9791190710343,5.0
2,m*******n,9791190710343,4.0
3,할**장,9791190710350,5.0
4,책*레,9791190710367,5.0
...,...,...,...
175534,귀****면,9788995696972,4.0
175535,q******o,9788995696972,3.0
175536,c*******x,9788995696972,3.0
175537,북****구,9788995696972,3.0


In [5]:
# Reshape the long rating table into a wide matrix with pivot_table:
# one row per review_id, one column per isbn, cells taken from review_rank
# (repeated id/isbn pairs are combined by pivot_table's default aggregation).
ratings_matrix = ratings.pivot_table(
    values='review_rank',
    index='review_id',
    columns='isbn',
)

print(ratings_matrix.shape)
ratings_matrix

(15626, 28994)


isbn,8809254942867,8809332972588,8809416540344,8809448100080,8809448100097,8809478300054,8809487620181,8809501981243,8809501981250,8809501981267,...,9791197732713,9791197736308,9791197742408,9791197766602,9791197770708,9791197770906,9791197780103,9791197784200,9791197798603,9798981631603
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
'***,,,,,,,,,,,...,,,,,,,,,,
'******아,,,,,,,,,,,...,,,,,,,,,,
(********),,,,,,,,,,,...,,,,,,,,,,
****,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
힙*터,,,,,,,,,,,...,,,,,,,,,,
힝*,,,,,,,,,,,...,,,,,,,,,,
不**哉,,,,,,,,,,,...,,,,,,,,,,
陸*,,,,,,,,,,,...,,,,,,,,,,


# 1. 유저_아이템기반 협업 필터링

In [4]:
# Load the second pickle and keep only the ISBN and the two rating-summary columns.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
books = pd.read_pickle('users_final.pkl').loc[:, ['isbn', 'rank_mean', 'rank_percentage']]

In [5]:
# Load the table from 'recommend_cat_new.pkl' and list its column names.
# NOTE(review): the variable name `re` would shadow the standard-library `re`
# module if that module is ever imported in this notebook.
re = pd.read_pickle('recommend_cat_new.pkl')
re.columns

Index(['isbn', 'title', 'categoryName', 'salesPoint', 'adult', 'rank',
       'genres', 'mood', 'interest'],
      dtype='object')

In [6]:
# Join the rating summary with the metadata table on ISBN
# (merge uses an inner join unless told otherwise).
# NOTE(review): despite its name, `movies` holds the rows produced from `books`.
movies = books.merge(re, on='isbn')
movies

Unnamed: 0,isbn,rank_mean,rank_percentage,title,categoryName,salesPoint,adult,rank,genres,mood,interest
0,9791190710343,[9.4],"[67%, 33%, 0%, 0%, 0%]",자기만의 방,에세이,0,False,0,"[시대, 동화, 예술]","[자유, 행복, 위로, 열망]","[사랑, 생각, 사회, 페미니즘, 인생]"
1,9791190710350,[10.0],"[100%, 0%, 0%, 0%, 0%]",오만과 편견,소설,0,False,0,"[시대, 동화]","[도전, 열정, 행복]","[사랑, 심리, 인간, 인생, 청춘, 관계, 페미니즘, 생각, 일상]"
2,9791190710367,[10.0],"[100%, 0%, 0%, 0%, 0%]",프랑켄슈타인,소설,0,False,0,"[과학, 동화, 판타지, 예술]","[열망, 열정, 자유, 희망, 고민]","[심리, 동물, 인간, 관계]"
3,9791128858338,[],[],[큰글씨책] 원서발췌 신선전,소설,0,False,0,"[과학, 예술, 철학]",[],"[종교, 자연]"
4,9788954685603,[],[],집착,소설,350,False,0,"[예술, 자서전, 역사]","[고통, 슬픔, 비극, 절망, 불행, 행복, 열정, 고민]","[심리, 관계, 인간, 생각, 사랑, 인생, 청춘, 일상]"
...,...,...,...,...,...,...,...,...,...,...,...
47487,9791185020136,[8.7],"[48%, 45%, 7%, 0%, 0%]",뇌력혁명 - 뇌피로가 풀려야 인생이 풀린다!,자기계발,281,False,9,"[과학, 시대]","[행복, 우울, 불안, 고민, 긍정]","[사회, 일상, 인간, 인생, 청춘, 시간]"
47488,9788997838127,[8.4],"[53%, 35%, 12%, 0%, 0%]",남자의 교과서 - 여자는 전혀 모르고 남자는 차마 말 못하는 것들,자기계발,281,False,9,[동화],"[슬픔, 행복, 고민, 후회]","[정치, 관계, 생각, 인생, 가족, 친구, 시간]"
47489,9788956591858,[],[],그리스 신전에서 인간의 길을 묻다 - 내 인생을 바꾸는 4천 년의 지혜,자기계발,281,False,8,"[예술, 시대, 판타지, 동화]","[열정, 도전, 용기, 행복]","[성공, 인간, 인생, 청춘, 일상]"
47490,9788987794938,[8.0],"[0%, 100%, 0%, 0%, 0%]",성공을 부르는 긍정의 힘,자기계발,281,False,7,[],[],[성공]


In [43]:
#movies = movies[['isbn', 'rank_percentage','title', 'categoryName', 'salesPoint',
#        'adult', 'rank', 'genres', 'mood', 'interest']]

In [7]:
# Attach the merged book columns to every review row through the shared ISBN.
rating_books = ratings.merge(movies, on='isbn')
rating_books

Unnamed: 0,review_id,isbn,review_rank,rank_mean,rank_percentage,title,categoryName,salesPoint,adult,rank,genres,mood,interest
0,s*****0,9791190710343,5.0,[9.4],"[67%, 33%, 0%, 0%, 0%]",자기만의 방,에세이,0,False,0,"[시대, 동화, 예술]","[자유, 행복, 위로, 열망]","[사랑, 생각, 사회, 페미니즘, 인생]"
1,녹**다,9791190710343,5.0,[9.4],"[67%, 33%, 0%, 0%, 0%]",자기만의 방,에세이,0,False,0,"[시대, 동화, 예술]","[자유, 행복, 위로, 열망]","[사랑, 생각, 사회, 페미니즘, 인생]"
2,m*******n,9791190710343,4.0,[9.4],"[67%, 33%, 0%, 0%, 0%]",자기만의 방,에세이,0,False,0,"[시대, 동화, 예술]","[자유, 행복, 위로, 열망]","[사랑, 생각, 사회, 페미니즘, 인생]"
3,할**장,9791190710350,5.0,[10.0],"[100%, 0%, 0%, 0%, 0%]",오만과 편견,소설,0,False,0,"[시대, 동화]","[도전, 열정, 행복]","[사랑, 심리, 인간, 인생, 청춘, 관계, 페미니즘, 생각, 일상]"
4,책*레,9791190710367,5.0,[10.0],"[100%, 0%, 0%, 0%, 0%]",프랑켄슈타인,소설,0,False,0,"[과학, 동화, 판타지, 예술]","[열망, 열정, 자유, 희망, 고민]","[심리, 동물, 인간, 관계]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
175534,귀****면,9788995696972,4.0,[7.3],"[10%, 60%, 30%, 0%, 0%]",성공한 사람들은 말의 절반이 칭찬이다,자기계발,281,False,8,"[시대, 철학]","[긍정, 희망, 절망, 행복, 도전]","[심리, 관계, 생각, 사회, 성공, 인간, 정치]"
175535,q******o,9788995696972,3.0,[7.3],"[10%, 60%, 30%, 0%, 0%]",성공한 사람들은 말의 절반이 칭찬이다,자기계발,281,False,8,"[시대, 철학]","[긍정, 희망, 절망, 행복, 도전]","[심리, 관계, 생각, 사회, 성공, 인간, 정치]"
175536,c*******x,9788995696972,3.0,[7.3],"[10%, 60%, 30%, 0%, 0%]",성공한 사람들은 말의 절반이 칭찬이다,자기계발,281,False,8,"[시대, 철학]","[긍정, 희망, 절망, 행복, 도전]","[심리, 관계, 생각, 사회, 성공, 인간, 정치]"
175537,북****구,9788995696972,3.0,[7.3],"[10%, 60%, 30%, 0%, 0%]",성공한 사람들은 말의 절반이 칭찬이다,자기계발,281,False,8,"[시대, 철학]","[긍정, 희망, 절망, 행복, 도전]","[심리, 관계, 생각, 사회, 성공, 인간, 정치]"


In [46]:
#rating_books.to_pickle('D:/2022/추천시스템_프로젝트/books_all.pkl', protocol = 4)

In [8]:
# Pivot again, this time with the title column as the column key.
ratings_matrix = rating_books.pivot_table(
    values='review_rank',
    index='review_id',
    columns='title',
)
ratings_matrix

title,"#걸보스 Girlboss - 훔친 책을 팔던 소녀, 5년 만에 1000억대 CEO가 되다","#너에게 - 너에게 보내는 편지, 완글",#씬,#예능작가 - 예능작가 16인의 생생한 방송 이야기,#킬러스타그램,&quot;죽을 만큼 힘들면 회사 그만두지그래&quot;가 안 되는 이유,'나의 나무' 아래서,'뉴욕 52번가' 하수구의 철학자 라바,'시인' 임경남의 나 데리고 잘 사는 법,(만화편) 2억 빚을 진 내게 우주님이 가르쳐준 운이 풀리는 말버릇,...,힘들 때 펴보라던 편지 - 영혼을 깨우는 선승들의 일화 301,힘들 땐 잠깐 쉬었다 가도 괜찮아 - 오늘 행복해지고 싶은 당신에게,힘들고 지칠 때 희망을 주는 이야기,힘들면 힘 내려놔 - 나를 믿는 습관,힘들어도 괴롭진 않아 - 원유헌의 구례일기,힘들었던 날들을 좋았던 날들로,"힘이 되는 말, 독이 되는 말 - 상처 주지 않고 미움 받지 않는 인간관계의 지혜","힙 피플, 나라는 세계 - 나의 쓸모와 딴짓","女子, 사임당","茶로 만난 인연 - 향기를 품은 사람들이 내게 다가왔다, 홍차 에세이"
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
'***,,,,,,,,,,,...,,,,,,,,,,
'******아,,,,,,,,,,,...,,,,,,,,,,
(********),,,,,,,,,,,...,,,,,,,,,,
****,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
힙*터,,,,,,,,,,,...,,,,,,,,,,
힝*,,,,,,,,,,,...,,,,,,,,,,
不**哉,,,,,,,,,,,...,,,,,,,,,,
陸*,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# Replace every NaN cell with 0.
ratings_matrix = ratings_matrix.fillna(value=0)
ratings_matrix

title,"#걸보스 Girlboss - 훔친 책을 팔던 소녀, 5년 만에 1000억대 CEO가 되다","#너에게 - 너에게 보내는 편지, 완글",#씬,#예능작가 - 예능작가 16인의 생생한 방송 이야기,#킬러스타그램,&quot;죽을 만큼 힘들면 회사 그만두지그래&quot;가 안 되는 이유,'나의 나무' 아래서,'뉴욕 52번가' 하수구의 철학자 라바,'시인' 임경남의 나 데리고 잘 사는 법,(만화편) 2억 빚을 진 내게 우주님이 가르쳐준 운이 풀리는 말버릇,...,힘들 때 펴보라던 편지 - 영혼을 깨우는 선승들의 일화 301,힘들 땐 잠깐 쉬었다 가도 괜찮아 - 오늘 행복해지고 싶은 당신에게,힘들고 지칠 때 희망을 주는 이야기,힘들면 힘 내려놔 - 나를 믿는 습관,힘들어도 괴롭진 않아 - 원유헌의 구례일기,힘들었던 날들을 좋았던 날들로,"힘이 되는 말, 독이 되는 말 - 상처 주지 않고 미움 받지 않는 인간관계의 지혜","힙 피플, 나라는 세계 - 나의 쓸모와 딴짓","女子, 사임당","茶로 만난 인연 - 향기를 품은 사람들이 내게 다가왔다, 홍차 에세이"
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'***,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'******아,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(********),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
****,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
힙*터,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
힝*,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
不**哉,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
陸*,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Unfinished scratch lookup, kept commented out so the notebook can run top to
# bottom: the original expression was missing its closing bracket (a
# SyntaxError), and `a` and `i` are not defined in the cells shown above.
# a.loc[a.head(3).index[i]]

### 도서와 도서 간 유사도 산출

In [10]:
# Switch to an item-by-user layout by swapping the rows and columns.
ratings_matrix_T = ratings_matrix.T

print(ratings_matrix_T.shape)
ratings_matrix_T.iloc[:3]

(28379, 15626)


review_id,Unnamed: 1_level_0,'***,'******아,(********),****,*****,**********************,-*,.*,.*.,...,힘*나,힘*내,힘*니,힘*지,힙**녀,힙*터,힝*,不**哉,陸*,Ｋ***Ｏ
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"#걸보스 Girlboss - 훔친 책을 팔던 소녀, 5년 만에 1000억대 CEO가 되다",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"#너에게 - 너에게 보내는 편지, 완글",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#씬,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Cosine similarity between every pair of rows (titles) of the transposed matrix.
from sklearn.metrics.pairwise import cosine_similarity

# Given a single matrix, cosine_similarity compares its rows with each other.
item_sim = cosine_similarity(ratings_matrix_T)

# Wrap the returned NumPy array in a DataFrame labeled by title on both axes.
item_sim_df = pd.DataFrame(
    item_sim,
    index=ratings_matrix_T.index,
    columns=ratings_matrix_T.index,
)

print(item_sim_df.shape)
item_sim_df.head(3)

(28379, 28379)


title,"#걸보스 Girlboss - 훔친 책을 팔던 소녀, 5년 만에 1000억대 CEO가 되다","#너에게 - 너에게 보내는 편지, 완글",#씬,#예능작가 - 예능작가 16인의 생생한 방송 이야기,#킬러스타그램,&quot;죽을 만큼 힘들면 회사 그만두지그래&quot;가 안 되는 이유,'나의 나무' 아래서,'뉴욕 52번가' 하수구의 철학자 라바,'시인' 임경남의 나 데리고 잘 사는 법,(만화편) 2억 빚을 진 내게 우주님이 가르쳐준 운이 풀리는 말버릇,...,힘들 때 펴보라던 편지 - 영혼을 깨우는 선승들의 일화 301,힘들 땐 잠깐 쉬었다 가도 괜찮아 - 오늘 행복해지고 싶은 당신에게,힘들고 지칠 때 희망을 주는 이야기,힘들면 힘 내려놔 - 나를 믿는 습관,힘들어도 괴롭진 않아 - 원유헌의 구례일기,힘들었던 날들을 좋았던 날들로,"힘이 되는 말, 독이 되는 말 - 상처 주지 않고 미움 받지 않는 인간관계의 지혜","힙 피플, 나라는 세계 - 나의 쓸모와 딴짓","女子, 사임당","茶로 만난 인연 - 향기를 품은 사람들이 내게 다가왔다, 홍차 에세이"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"#걸보스 Girlboss - 훔친 책을 팔던 소녀, 5년 만에 1000억대 CEO가 되다",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087538,0.0,0.0,...,0.317347,0.0,0.0,0.0,0.0,0.0,0.105727,0.0,0.0,0.089343
"#너에게 - 너에게 보내는 편지, 완글",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#씬,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#item_sim_df.to_pickle('item_sim_df.pkl')

In [12]:
# The 6 highest-similarity entries for this title (the title itself is included).
(
    item_sim_df["달러구트 꿈 백화점 (50만 부 기념 드림 에디션) - 주문하신 꿈은 매진입니다"]
    .sort_values(ascending=False)
    .iloc[:6]
)

title
달러구트 꿈 백화점 (50만 부 기념 드림 에디션) - 주문하신 꿈은 매진입니다     1.000000
미드나잇 라이브러리 (25만 부 기념 퍼플 에디션)                     0.439088
시나몬 아이스크림                                        0.433861
소설 사기 3 : 통일천하 - 사마천의 사기열전 소설로 만나다, 개정판          0.433861
사랑할 시간이 많지 않다                                    0.433861
나의 행동이 곧 나의 운명이다 - 평범한 가정주부에서 여성CEO의 대모가 되기까지    0.433861
Name: 달러구트 꿈 백화점 (50만 부 기념 드림 에디션) - 주문하신 꿈은 매진입니다, dtype: float64

In [15]:
# The 5 highest-similarity entries after skipping the first (top-ranked) row,
# which is the title itself in the output above.
(
    item_sim_df["달러구트 꿈 백화점 (50만 부 기념 드림 에디션) - 주문하신 꿈은 매진입니다"]
    .sort_values(ascending=False)
    .iloc[1:6]
)

title
미드나잇 라이브러리 (25만 부 기념 퍼플 에디션)                     0.439088
시나몬 아이스크림                                        0.433861
소설 사기 3 : 통일천하 - 사마천의 사기열전 소설로 만나다, 개정판          0.433861
사랑할 시간이 많지 않다                                    0.433861
나의 행동이 곧 나의 운명이다 - 평범한 가정주부에서 여성CEO의 대모가 되기까지    0.433861
Name: 달러구트 꿈 백화점 (50만 부 기념 드림 에디션) - 주문하신 꿈은 매진입니다, dtype: float64

# 2. 잠재기반 협업필터링

In [6]:
%%time
# 경사하강법을 이용한 행렬 분해(4~5분 정도 걸림)
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)
# K=50 잠재요인, steps=200: 200번 반복 수행, learning rate: 학습률, r_lambda: L2 규제 계수

pred_matrix = np.dot(P, Q.T) # 행렬의 곱(내적)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step in tqdm_notebook(range(steps)):


  0%|          | 0/200 [00:00<?, ?it/s]

### iteration step :  0  rmse :  4.635983242870571
### iteration step :  10  rmse :  1.5563443467274514
### iteration step :  20  rmse :  0.8605012698344867
### iteration step :  30  rmse :  0.5671260871462497
### iteration step :  40  rmse :  0.4023264294187457
### iteration step :  50  rmse :  0.2993512461726182
### iteration step :  60  rmse :  0.22469083218967378
### iteration step :  70  rmse :  0.16879415873427656
### iteration step :  80  rmse :  0.1314566636163854
### iteration step :  90  rmse :  0.10739309326930924
### iteration step :  100  rmse :  0.09103356556114023
### iteration step :  110  rmse :  0.07923583374016843
### iteration step :  120  rmse :  0.07039333351180162
### iteration step :  130  rmse :  0.06359057910241542
### iteration step :  140  rmse :  0.05824377606265089
### iteration step :  150  rmse :  0.05396835876235124
### iteration step :  160  rmse :  0.05050333180150111
### iteration step :  170  rmse :  0.047662791883326006
### iteration step :  180  r