In [4]:
import pandas as pd
import numpy as np
import os, sys, gc
from tqdm.notebook import tqdm
import warnings 
warnings.filterwarnings('ignore')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
path = '/content/drive/MyDrive/input/goodbooks10k/'

In [9]:
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")

In [10]:
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)

In [11]:
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]

In [12]:
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()): 
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])

HBox(children=(FloatProgress(value=0.0, max=53424.0), HTML(value='')))




In [13]:
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()

### TF-IDF를 이용한 Contents Based Model

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['title'])
print(tfidf_matrix.shape)

(10000, 9019)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

(10000, 10000)

In [16]:
# book title과 id를 매핑할 딕셔너리 생성
book2id = {}
for i, c in enumerate(books['title']): book2id[i] = c
# id와 book title을 매핑할 딕셔너리 생성
id2book = {}
for i, c in book2id.items(): id2book[c] = i
# book_id과 title을 매핑할 딕셔너리 생성
bookid2book = {}
for i, j in zip(books['title'].values, books['book_id'].values):
    bookid2book[i] = j 

In [17]:
books['title'].head()

0              The Hunger Games (The Hunger Games, #1)
1    Harry Potter and the Sorcerer's Stone (Harry P...
2                              Twilight (Twilight, #1)
3                                To Kill a Mockingbird
4                                     The Great Gatsby
Name: title, dtype: object

In [18]:
idx = id2book['Twilight (Twilight, #1)']
sim_scores = [(book2id[i], c) for i, c in enumerate(cosine_matrix[idx]) if i != idx]
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
sim_scores[0:10]

[('The Twilight Saga (Twilight, #1-4)', 0.920347418277986),
 ('The Twilight Collection (Twilight, #1-3)', 0.8786339079447184),
 ('The Twilight Saga Complete Collection  (Twilight, #1-4 + 3.5)',
  0.7697532056304309),
 ('Twilight and History', 0.7465001575650626),
 ('The Twilight Saga: The Official Illustrated Guide (Twilight, #4.5)',
  0.7045174300631831),
 ('Twilight Eyes', 0.6770737331426326),
 ('Twilight (The Mediator, #6)', 0.6377631333498953),
 ('New Moon (Twilight, #2)', 0.6185575138625542),
 ('Eclipse (Twilight, #3)', 0.612819563854136),
 ('The Servants of Twilight', 0.5837817298466093)]

1. 학습셋에서 제목이 있는 경우에 대해서만 진행
2. 각 유저별로 읽은 책의 목록을 수집
3. 읽은 책과 유사한 책 추출
4. 모든 책에 대해서 유사도를 더한 값을 계산
5. 3에서 유사도가 가장 높은 순서대로 추출

In [19]:
train = pd.merge(train, books[['book_id', 'title']], how='left', on='book_id')
train.head()

Unnamed: 0,user_id,book_id,title
0,1,6285,
1,2,8034,
2,3,9014,The Long Walk
3,4,8464,
4,5,6646,


In [20]:
# 0. 학습셋에서 제목이 있는 경우에 대해서만 진행
tf_train = train[train['title'].notnull()].reset_index(drop=True)
tf_train['idx2title'] = tf_train['title'].apply(lambda x: id2book[x])
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,3,9014,The Long Walk,1287
1,7,4588,Extremely Loud and Incredibly Close,248
2,7,4588,Extremely Loud and Incredibly Close,248
3,10,5084,My Life in France,1440
4,14,7604,Lolita,120


In [21]:
idx2title2book = {}
for i, j in zip(tf_train['idx2title'].values, tf_train['book_id'].values):
    idx2title2book[i] = j

In [22]:
# 1. 각 유저별로 읽은 책의 목록 수집
user = 7
read_list = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()
seen = read_list[read_list['user_id'] == user]['unique'].values[0]
seen

array([248])

In [23]:
# 2. 읽은 책과 유사한 책 추출
# 248번째 책과 다른 책들간의 유사도
cosine_matrix[248]

array([0., 0., 0., ..., 0., 0., 0.])

In [24]:
# 2. 읽은 책과 유사한 책 추출
total_cosine_sim = np.zeros(len(book2id))
for book_ in seen:
    # 3. 모든 책에 대해서 유사도를 더한 값을 계산
    total_cosine_sim += cosine_matrix[book_]

In [25]:
# 4. 3에서 유사도가 가장 높은 순서대로 추출
sim_scores = [(i, c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스 추출
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True) # 유사도가 높은 순서대로 정렬
sim_scores[0:5]

[(3160, 0.3451654965519055),
 (9459, 0.33683651502400813),
 (7969, 0.30892286302140143),
 (9042, 0.26130119036804794),
 (6490, 0.2530815324721045)]

In [26]:
book2id[3160]

'Stay Close'

In [27]:
bookid2book[book2id[3160]]

'11737271'

In [28]:
tf_train['user_id'].unique()

array([    3,     7,    10, ..., 53406, 53408, 53420])

In [29]:
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,3,9014,The Long Walk,1287
1,7,4588,Extremely Loud and Incredibly Close,248
2,7,4588,Extremely Loud and Incredibly Close,248
3,10,5084,My Life in France,1440
4,14,7604,Lolita,120


In [30]:
# 전체 영화에 대해 진행
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2= tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()

for user in tqdm(train['user_id'].unique()):
    rec_list = []
    
    # 만약 Tf-idf 소속의 추천대상이라면 콘텐츠 기반의 추천
    if user in tf_train['user_id'].unique():
        # 1. 각 유저별로 읽은 책의 목록 수집
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        # 2. 읽은 책과 유사한 책 추출
        total_cosine_sim = np.zeros(len(book2id))
        for book_ in seen:
            # 3. 모든 책에 대해 유사도를 더한 값을 계산
            total_cosine_sim += cosine_matrix[book_]
            
        # 4. 3에서 유사도가 가장 높은 순서대로 추출
        sim_scores = [(bookid2book[book2id[i]], c) for i, c in enumerate(total_cosine_sim) if i not in seen]
        recs = sorted(sim_scores, key = lambda x: x[1], reverse=True)[0:300]
        for rec in recs:
            if rec not in seen:
                rec_list.append(rec)
                
    # 그렇지 않으면 인기도 기반의 추천
    else:
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:400]:
            if rec not in seen:
                rec_list.append(rec)
                
    total_rec_list[user] = rec_list[0:200]
            

HBox(children=(FloatProgress(value=0.0, max=53382.0), HTML(value='')))




In [31]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [32]:
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

MAP@200: 8.792647421603096e-05
NDCG@200: 0.0008922079511553609
EntDiv@200: 6.916300598784509


### Word2Vec을 이용한 추천시스템
- Tag 간 유사도
- 제목간의 유사도
- 책의 읽은 순서를 통한 유사도

In [33]:
agg = train.groupby(['user_id'])['book_id'].agg({'unique'})
agg.head()

Unnamed: 0_level_0,unique
user_id,Unnamed: 1_level_1
1,[6285]
2,[8034]
3,[9014]
4,[8464]
5,"[6646, 4829]"


In [34]:
# int 형식은 word2vec에서 학습이 되지 않아 string으로 변경
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [35]:
# word2vec 학습 진행
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window=5,
                           min_count=1, workers=4, iter=200, sg=1)

In [46]:
embedding_model.wv.most_similar(positive=['4893'], topn=10)
"""[('7264', 0.8029036521911621),
 ('8470', 0.8020585179328918),
 ('9864', 0.7978218793869019),
 ('8923', 0.7909978628158569),
 ('3734', 0.7892516851425171),
 ('8508', 0.7859364748001099),
 ('9537', 0.783882737159729),
 ('9323', 0.7777514457702637),
 ('462', 0.7774814367294312),
 ('9280', 0.7748066186904907)]"""
for i in embedding_model.wv.most_similar(positive=['4893'], topn=5):
     print(i[0])

7264
8470
9864
8923
3734


In [48]:
# 전체 영화에 대해 진행
total_rec_list = {}

read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
for user in tqdm(train['user_id'].unique()):
    rec_list = []
    seen = read_list[read_list['user_id'] == user]['unique'].values[0]
    word2vec_dict = {}
    for book in seen: # 본 책과 유사한 300 개의 책을 뽑는 word2vec 모델
        for i in embedding_model.wv.most_similar(positive=[book], topn=300):
            if i[0] not in seen: # 본 책이 아닐 때
                if i[0] not in word2vec_dict.keys(): # 이전에 word2vec으로 추천한 적이 없는 경우
                    word2vec_dict[i[0]] = i[1] # word2vec_dict에 키와 값 추가
                else: # 이전에 word2vec으로 추천한 적이 있는 경우
                    word2vec_dict[i[0]] += i[1] # 해당 책의 확률을 증가 시킴

    rec_list = list(dict(sorted(word2vec_dict.items(), # 해당 책의 확률 내림차순으로 정렬
                                key = lambda x: x[1], reverse=True)).keys())
    
    total_rec_list[user] = rec_list[0:200] # 200권 저장




HBox(children=(FloatProgress(value=0.0, max=53382.0), HTML(value='')))




In [50]:
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

MAP@200: 0.06373362702004991
NDCG@200: 0.19406269857668312
EntDiv@200: 8.904332683259623


### 태그를 통한 유사도 계산

In [52]:
book_tags.columns = ['book_id', 'tag_id', 'count']
book_tags['book_id'] = book_tags['book_id'].astype(str)
book_tags['tag_id'] = book_tags['tag_id'].astype(str)

tags['tag_id'] = tags['tag_id'].astype(str)

book_tags = pd.merge(book_tags, tags, how='left', on='tag_id')
book_tags.head()

Unnamed: 0,book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,1,11305,37174,fantasy
2,1,11557,34173,favorites
3,1,8717,12986,currently-reading
4,1,33114,12716,young-adult


In [53]:
agg = book_tags.groupby(['book_id'])['tag_name'].agg({'unique'}).reset_index()
agg.head()

Unnamed: 0,book_id,unique
0,1,"[to-read, fantasy, favorites, currently-readin..."
1,10,"[to-read, favorites, fantasy, currently-readin..."
2,10006,"[to-read, fiction, currently-reading, rory-gil..."
3,1000751,"[to-read, classics, childrens, fiction, curren..."
4,10008056,"[to-read, default, currently-reading, krimi, c..."


In [54]:
# 태그간 유사도 계산
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [55]:
from gensim.models import doc2vec
doc_vectorizer = doc2vec.Doc2Vec(
    dm = 0, # PV-DBOW / default 1
    dbow_words = 1, # w2v와 동시에 진행
    window = 10, # 이웃 단어 10개
    size = 100, # 벡터 사이즈 100
    alpha = 0.025, # 학습률 0.025
    seed = 1234, # 시드값
    min_count = 5, # 빈도 낮은건 무시
    min_alpha = 0.025,  # 최소 학습률
    workers = 4, # cpu 코어수
    hs = 1, # hierar chical 소프트맥스
    negative = 10 # negative 샘플링
)

In [56]:
from collections import namedtuple
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(c, [d]) for c, d in agg[['unique', 'book_id']].values]

In [59]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [60]:
# 벡터 문서 학습
from time import time
start = time()
for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, 
                         epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002
    doc_vectorizer.min_alpha = doc_vectorizer.alpha

end = time()
print('During Time: {}'.format(end - start))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 511.6239986419678


In [61]:
doc_vectorizer.docvecs.most_similar('1', topn=20)

[('136251', 0.8377048969268799),
 ('15881', 0.8329390287399292),
 ('3', 0.8081457614898682),
 ('5', 0.7997579574584961),
 ('6', 0.7975836396217346),
 ('10', 0.7586697340011597),
 ('862041', 0.7193409204483032),
 ('2', 0.7020633220672607),
 ('28187', 0.6340987682342529),
 ('111450', 0.6320762634277344),
 ('6294', 0.6319012641906738),
 ('3950967', 0.628352165222168),
 ('1317181', 0.6273409128189087),
 ('12127750', 0.6234869360923767),
 ('100464', 0.6106216907501221),
 ('99298', 0.6094565391540527),
 ('6164358', 0.6073373556137085),
 ('2120932', 0.603802502155304),
 ('4502507', 0.6030881404876709),
 ('5907', 0.5954141616821289)]

In [62]:
train.head()

Unnamed: 0,user_id,book_id,title
0,1,6285,
1,2,8034,
2,3,9014,The Long Walk
3,4,8464,
4,5,6646,


In [63]:
# tag 정보가 있는 책이 있고 없는 책이 있어 해당 책만 추출
agg['type'] = '1'
train = pd.merge(train, agg, how='left', on='book_id')

In [65]:
# 전체 영화에 대해 진행
total_rec_list = {}
read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = train[train['type'] == '1'].groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
for user in tqdm(train['user_id'].unique()):
    rec_list = []
    if user in read_list2['user_id'].unique():
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        doc2vec_dict = {}
        for book in seen:
            for i in doc_vectorizer.docvecs.most_similar(positive=[book], topn=300):
                if i[0] not in doc2vec_dict.keys():
                    doc2vec_dict[i[0]] = i[1]
                else:
                    doc2vec_dict[i[0]] += i[1]

        rec_list = list(dict(sorted(doc2vec_dict.items(), 
                                    key = lambda x: x[1], reverse = True)).keys())
    else:
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:300]:
            if rec not in seen:
                rec_list.append(rec)
    total_rec_list[user] = rec_list[0:200]

HBox(children=(FloatProgress(value=0.0, max=53382.0), HTML(value='')))




In [66]:
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

MAP@200: 0.00017676487652534752
NDCG@200: 0.0017342819446356426
EntDiv@200: 6.98203375757848
