In [1]:
import pandas as pd
import numpy as np
import plotnine 
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
import warnings 
warnings.filterwarnings('ignore')

In [2]:
path = "./books/"

In [3]:
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")

In [4]:
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)

In [9]:
print(train.shape)
train.head()

(477592, 2)


Unnamed: 0,user_id,book_id
0,1,4893
1,2,8855
2,3,9049
3,4,3273
4,5,4829


In [10]:
print(test.shape)
test.head()

(592790, 2)


Unnamed: 0,user_id,book_id
0,1,1180
1,1,6285
2,2,8034
3,2,9762
4,3,9014


In [12]:
print(books.shape)
books.head()

(10000, 23)


Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [15]:
# 상위 500개의 도서목록
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]
popular_rec_model.shape

(500,)

In [26]:
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
#for user in tqdm(sol['user_id'].unique()): 
    #gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])
    
len(sol)


53424

In [27]:
for idx in range(len(sol)):
    gt[sol['user_id'][idx]] = sol['unique'][idx].tolist()

In [30]:
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()
rec_df

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5
...,...
53377,53420
53378,53421
53379,53422
53380,53423


# TF-IDF를 이용한 contents-based model

-> TF-IDF가 자연어모델 -> 타이틀을 구성하는 단어들을 토대로 벡터들간의 유사도를 측정해서 아이템을 추천하자.

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['title'])
print(tfidf_matrix.shape)

(10000, 9019)


In [36]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

(10000, 10000)

In [37]:
cosine_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [39]:
# book title와 id를 매핑할 dictionary를 생성해줍니다. 
book2id = {}
for i, c in enumerate(books['title']): book2id[i] = c

# id와 book title를 매핑할 dictionary를 생성해줍니다. 
id2book = {}
for i, c in book2id.items(): id2book[c] = i
    
# book_id와 title를 매핑할 dictionary를 생성해줍니다.
bookid2book = {}
for i, j in zip(books['title'].values, books['book_id'].values):
    bookid2book[i] = j

In [56]:
idx = id2book['Twilight (Twilight, #1)']  
sim_scores = [(book2id[index], value) for index,value in enumerate(cosine_matrix[idx]) if idx != index]
sim_scores.sort(key = lambda x: x[1], reverse = True)
sim_scores[:10]

[('The Twilight Saga (Twilight, #1-4)', 0.920347418277986),
 ('The Twilight Collection (Twilight, #1-3)', 0.8786339079447184),
 ('The Twilight Saga Complete Collection  (Twilight, #1-4 + 3.5)',
  0.7697532056304309),
 ('Twilight and History', 0.7465001575650626),
 ('The Twilight Saga: The Official Illustrated Guide (Twilight, #4.5)',
  0.7045174300631831),
 ('Twilight Eyes', 0.6770737331426326),
 ('Twilight (The Mediator, #6)', 0.6377631333498953),
 ('New Moon (Twilight, #2)', 0.6185575138625542),
 ('Eclipse (Twilight, #3)', 0.612819563854136),
 ('The Servants of Twilight', 0.5837817298466093)]

# 이번에는 다른 방식으로 진행.
0. 학습셋에서 제목이 있는 경우에 대해서만 진행
1. 각 유저별로 읽은 책의 목록을 수집 
2. 읽은 책과 유사한 책 추출 
3. 모든 책에 대해서 유사도를 더한 값을 계산 
4. 3에서 유사도가 가장 높은 순서대로 추출 

In [57]:
train = pd.merge(train, books[['book_id', 'title']], how='left', on='book_id')
train.head()

Unnamed: 0,user_id,book_id,title
0,1,4893,
1,2,8855,
2,3,9049,
3,4,3273,Moloka'i
4,5,4829,


In [62]:
# 0. 학습셋에서 제목이 있는 경우에 대해서만 진행
tf_train = train[train["title"].notnull()].reset_index(drop = True)
tf_train["idx2title"] = tf_train["title"].apply(lambda x: id2book[x])
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,4,3273,Moloka'i,1215
1,7,4138,Naked,343
2,7,4588,Extremely Loud and Incredibly Close,248
3,9,8676,Unlimited Power : The New Science Of Personal ...,4701
4,10,5907,The Hobbit,6


In [65]:
#key = idx2title, value = book_id
idx2title2book = {}
for i,j in zip(tf_train["idx2title"].values, tf_train["book_id"].values):
    idx2title2book[i] = j

In [83]:
# 1. 각 유저별로 읽은 책의 목록을 수집 
user = 7 # id가 7인 유저가 본 목록은?
read_list = tf_train.groupby(['user_id'])["idx2title"].agg({"unique"}).reset_index()
read_list.head()

Unnamed: 0,user_id,unique
0,4,[1215]
1,7,"[343, 248]"
2,9,[4701]
3,10,"[6, 1440]"
4,14,[120]


In [98]:
seen = read_list[read_list["user_id"] == 7]["unique"].values[0]
for saw in seen:
    print(book2id[saw])

Naked
Extremely Loud and Incredibly Close


In [90]:
# 2. 읽은 책과 유사한 책 추출
total_cosine_sim = np.zeros(len(book2id))
for book_ in seen:
     # 3. 모든 책에 대해서 유사도를 더한 값을 계산 
    # 343번째 책과 248의 유사도가 모두 결합된 유사도
    total_cosine_sim += cosine_matrix[book_]

In [91]:
total_cosine_sim

array([0., 0., 0., ..., 0., 0., 0.])

In [96]:
# 4. 3에서 유사도가 가장 높은 순서대로 추출
sim_scores = [(book2id[i],c) for i,c in enumerate(total_cosine_sim) if i not in seen]
sim_scores.sort(key = lambda x: x[1], reverse = True)
sim_scores[:10]


[('The Naked and the Dead', 0.793036327171204),
 ('The Naked Face', 0.6915730356677104),
 ('Juliet, Naked', 0.6607508855409738),
 ('Naked Lunch', 0.6390974315343301),
 ("The Naked God (Night's Dawn, #3)", 0.5820260477746269),
 ('The Naked Sun (Robot #2)', 0.5722074713852562),
 ('Naked in Death (In Death, #1)', 0.569400924005893),
 ('Naked (The Blackstone Affair, #1)', 0.5440917692510238),
 ('Naked Empire (Sword of Truth, #8)', 0.5403329094870779),
 ('Naked Heat (Nikki Heat, #2)', 0.3935969471434931)]

In [99]:
# 전체영화에 대해서 진행
## 전체 영화에 대해서 진행 
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()

for user in tqdm(train['user_id'].unique()):
    rec_list = []
        
    # 만약 TF-IDF 소속의 추천대상이라면 Contents 기반의 추천 
    if user in tf_train['user_id'].unique():
        # 1. 각 유저별로 읽은 책의 목록을 수집 
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        # 2. 읽은 책과 유사한 책 추출 
        total_cosine_sim = np.zeros(len(book2id))
        for book_ in seen: 
            # 3. 모든 책에 대해서 유사도를 더한 값을 계산 
            # 343번째 책과 248의 유사도가 모두 결합된 유사도
            total_cosine_sim += cosine_matrix[book_]
            
        # 4. 3에서 유사도가 가장 높은 순서대로 추출
        sim_scores = [(bookid2book[book2id[i]], c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출 
        recs = sorted(sim_scores, key = lambda x: x[1], reverse=True)[0:300] # 유사도가 높은 순서대로 정렬 
        for rec in recs: 
            if rec not in seen:
                rec_list.append(rec)   
        
    # 그렇지 않으면 인기도 기반의 추천 
    else: 
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:400]:
            if rec not in seen:
                rec_list.append(rec)
                
    total_rec_list[user] = rec_list[0:200]

  0%|          | 0/53382 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [None]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()



# Word2vec을 이용한 추천시스템
-> user가 본 영화의 id를 하나의 string으로 표현함.
-> 

In [100]:
agg = train.groupby(['user_id'])['book_id'].agg({'unique'})
agg.head()


Unnamed: 0_level_0,unique
user_id,Unnamed: 1_level_1
1,[4893]
2,[8855]
3,[9049]
4,[3273]
5,"[4829, 6703]"


In [101]:
#int형식은 Word2vec에서 학습이 안되서 string으로 변경
sentence = []
for user_sentence in agg['unique']:
    sentence.append(list(map(str,user_sentence)))
    

In [104]:
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, vector_size = 20, window = 5,
                          min_count = 1, workers = 4, epochs =200, sg = 1) # sg=1: skip방식, sg=0 Cbow


In [110]:
embedding_model.wv.most_similar(positive=['4893'], topn = 10)


[('8675', 0.8483825325965881),
 ('8618', 0.8467643857002258),
 ('9864', 0.8355849385261536),
 ('7375', 0.8334120512008667),
 ('5081', 0.8295274376869202),
 ('8469', 0.8250794410705566),
 ('9291', 0.8236818909645081),
 ('9714', 0.8226191997528076),
 ('6291', 0.8215946555137634),
 ('9984', 0.8156006932258606)]

In [121]:
## 전체 영화에 대해서 진행 
total_rec_list = {}

read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
for user in tqdm(train['user_id'].unique()):
    rec_list = []     
    seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
    word2vec_dict = {}
    for book in seen: 
        for i in embedding_model.wv.most_similar(positive=[book], topn=300):
            if i[0] not in seen: 
                if i[0] not in word2vec_dict.keys(): 
                    word2vec_dict[i[0]] = i[1]
                else:
                    word2vec_dict[i[0]] += i[1]
                
    rec_list = list(dict(sorted(word2vec_dict.items(), key = lambda x: x[1], reverse=True)).keys())
    total_rec_list[user] = rec_list[0:200]

  0%|          | 0/53382 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [124]:
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()


NameError: name 'evaluate' is not defined

# 태그를 통한 유사도 계산

In [125]:
book_tags.columns = ['book_id', 'tag_id', 'count']
book_tags['book_id'] = book_tags['book_id'].astype(str)
book_tags['tag_id'] = book_tags['tag_id'].astype(str)

tags['tag_id'] = tags['tag_id'].astype(str)

book_tags = pd.merge(book_tags, tags, how='left', on='tag_id')
book_tags.head()

Unnamed: 0,book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,1,11305,37174,fantasy
2,1,11557,34173,favorites
3,1,8717,12986,currently-reading
4,1,33114,12716,young-adult


In [127]:
agg = book_tags.groupby(['book_id'])['tag_name'].agg({'unique'}).reset_index()
agg.head()

Unnamed: 0,book_id,unique
0,1,"[to-read, fantasy, favorites, currently-readin..."
1,10,"[to-read, favorites, fantasy, currently-readin..."
2,10006,"[to-read, fiction, currently-reading, rory-gil..."
3,1000751,"[to-read, classics, childrens, fiction, curren..."
4,10008056,"[to-read, default, currently-reading, krimi, c..."


In [130]:
# 태그간의 유사도 계산 
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))
sentence[0]

['to-read',
 'fantasy',
 'favorites',
 'currently-reading',
 'young-adult',
 'fiction',
 'harry-potter',
 'books-i-own',
 'owned',
 'ya',
 'series',
 'favourites',
 'magic',
 'childrens',
 'owned-books',
 're-read',
 'adventure',
 'children',
 'j-k-rowling',
 'children-s',
 'sci-fi-fantasy',
 'childhood',
 'all-time-favorites',
 'default',
 'my-books',
 'classics',
 'reread',
 'i-own',
 'audiobook',
 '5-stars',
 'children-s-books',
 'favorite-books',
 'kids',
 'novels',
 'fantasy-sci-fi',
 'favorite',
 'middle-grade',
 'audiobooks',
 'paranormal',
 'read-more-than-once',
 'my-library',
 'ya-fantasy',
 'teen',
 'witches',
 'english',
 'urban-fantasy',
 'british',
 'jk-rowling',
 'books',
 'read-in-2016',
 'supernatural',
 're-reads',
 'mystery',
 'ya-fiction',
 'harry-potter-series',
 'my-favorites',
 'own-it',
 'childrens-books',
 'library',
 'audio',
 'young-adult-fiction',
 'novel',
 '2005',
 'scifi-fantasy',
 'wizards',
 'faves',
 'favorite-series',
 'read-in-2015',
 'made-me-cry',


In [132]:
from gensim.models import doc2vec
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    vector_size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [153]:
from collections import namedtuple

TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(c, [d]) for c, d in agg[['unique', 'book_id']].values]


In [135]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [152]:
for c,d in agg[['unique', 'book_id']].values:
    print(type(c),type(d))
    break

<class 'numpy.ndarray'> <class 'str'>


In [154]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

  0%|          | 0/5 [00:00<?, ?it/s]

During Time: 463.74789118766785


In [None]:
doc_vectorizer.docvecs.most_similar('1', topn=20)

In [None]:
# tag 정보가 있는 책이 있고 아닌 책이 있어서 해당 책만 추출 
agg['type'] = '1'
train = pd.merge(train, agg, how='left', on='book_id')

In [155]:
## 전체 영화에 대해서 진행 
total_rec_list = {}
read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = train[train['type'] == '1'].groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
for user in tqdm(train['user_id'].unique()):
    rec_list = []
    if user in read_list2['user_id'].unique():
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        doc2vec_dict = {}
        for book in seen: 
            for i in doc_vectorizer.docvecs.most_similar(positive=[book], topn=300): 
                if i[0] not in doc2vec_dict.keys(): 
                    doc2vec_dict[i[0]] = i[1]
                else:
                    doc2vec_dict[i[0]] += i[1]

        rec_list = list(dict(sorted(doc2vec_dict.items(), key = lambda x: x[1], reverse=True)).keys())
    else:
        
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:300]:
            if rec not in seen:
                rec_list.append(rec)
    total_rec_list[user] = rec_list[0:200]

KeyError: 'type'