In [3]:
import pandas as pd
import numpy as np
import os, sys, gc
from tqdm.notebook import tqdm
import warnings 
warnings.filterwarnings('ignore')

In [4]:
path = './input/'

In [5]:
books = pd.read_csv(path + "books.csv")
book_tags = pd.read_csv(path + "book_tags.csv")
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")
tags = pd.read_csv(path + "tags.csv")
to_read = pd.read_csv(path + "to_read.csv")

In [6]:
train['book_id'] = train['book_id'].astype(str)
test['book_id'] = test['book_id'].astype(str)
books['book_id'] = books['book_id'].astype(str)

In [7]:
popular_rec_model = books.sort_values(by='books_count', ascending=False)['book_id'].values[0:500]

In [8]:
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
gt = {}
for user in tqdm(sol['user_id'].unique()): 
    gt[user] = list(sol[sol['user_id'] == user]['unique'].values[0])

  0%|          | 0/53424 [00:00<?, ?it/s]

In [9]:
rec_df = pd.DataFrame()
rec_df['user_id'] = train['user_id'].unique()

### TF-IDF를 이용한 Contents Based Model

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['title'])
print(tfidf_matrix.shape)

(10000, 9019)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

(10000, 10000)

In [13]:
# book title과 id를 매핑할 딕셔너리 생성
book2id = {}
for i, c in enumerate(books['title']): book2id[i] = c
# id와 book title을 매핑할 딕셔너리 생성
id2book = {}
for i, c in book2id.items(): id2book[c] = i
# book_id과 title을 매핑할 딕셔너리 생성
bookid2book = {}
for i, j in zip(books['title'].values, books['book_id'].values):
    bookid2book[i] = j 

In [14]:
books['title'].head()

0              The Hunger Games (The Hunger Games, #1)
1    Harry Potter and the Sorcerer's Stone (Harry P...
2                              Twilight (Twilight, #1)
3                                To Kill a Mockingbird
4                                     The Great Gatsby
Name: title, dtype: object

In [16]:
idx = id2book['Twilight (Twilight, #1)']
sim_scores = [(book2id[i], c) for i, c in enumerate(cosine_matrix[idx]) if i != idx]
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)
sim_scores[0:10]

[('The Twilight Saga (Twilight, #1-4)', 0.920347418277986),
 ('The Twilight Collection (Twilight, #1-3)', 0.8786339079447184),
 ('The Twilight Saga Complete Collection  (Twilight, #1-4 + 3.5)',
  0.7697532056304309),
 ('Twilight and History', 0.7465001575650626),
 ('The Twilight Saga: The Official Illustrated Guide (Twilight, #4.5)',
  0.7045174300631831),
 ('Twilight Eyes', 0.6770737331426326),
 ('Twilight (The Mediator, #6)', 0.6377631333498953),
 ('New Moon (Twilight, #2)', 0.6185575138625542),
 ('Eclipse (Twilight, #3)', 0.612819563854136),
 ('The Servants of Twilight', 0.5837817298466093)]

1. 학습셋에서 제목이 있는 경우에 대해서만 진행
2. 각 유저별로 읽은 책의 목록을 수집
3. 읽은 책과 유사한 책 추출
4. 모든 책에 대해서 유사도를 더한 값을 계산
5. 3에서 유사도가 가장 높은 순서대로 추출

In [17]:
train = pd.merge(train, books[['book_id', 'title']], how='left', on='book_id')
train.head()

Unnamed: 0,user_id,book_id,title
0,1,6285,
1,2,8034,
2,3,9014,The Long Walk
3,4,8464,
4,5,6646,


In [18]:
# 0. 학습셋에서 제목이 있는 경우에 대해서만 진행
tf_train = train[train['title'].notnull()].reset_index(drop=True)
tf_train['idx2title'] = tf_train['title'].apply(lambda x: id2book[x])
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,3,9014,The Long Walk,1287
1,7,4588,Extremely Loud and Incredibly Close,248
2,7,4588,Extremely Loud and Incredibly Close,248
3,10,5084,My Life in France,1440
4,14,7604,Lolita,120


In [19]:
idx2title2book = {}
for i, j in zip(tf_train['idx2title'].values, tf_train['book_id'].values):
    idx2title2book[i] = j

In [20]:
# 1. 각 유저별로 읽은 책의 목록 수집
user = 7
read_list = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()
seen = read_list[read_list['user_id'] == user]['unique'].values[0]
seen

array([248], dtype=int64)

In [21]:
# 2. 읽은 책과 유사한 책 추출
# 248번째 책과 다른 책들간의 유사도
cosine_matrix[248]

array([0., 0., 0., ..., 0., 0., 0.])

In [22]:
# 2. 읽은 책과 유사한 책 추출
total_cosine_sim = np.zeros(len(book2id))
for book_ in seen:
    # 3. 모든 책에 대해서 유사도를 더한 값을 계산
    total_cosine_sim += cosine_matrix[book_]

In [23]:
# 4. 3에서 유사도가 가장 높은 순서대로 추출
sim_scores = [(i, c) for i, c in enumerate(total_cosine_sim) if i not in seen] # 자기 자신을 제외한 영화들의 유사도 및 인덱스 추출
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True) # 유사도가 높은 순서대로 정렬
sim_scores[0:5]

[(3160, 0.3451654965519055),
 (9459, 0.33683651502400813),
 (7969, 0.30892286302140143),
 (9042, 0.26130119036804794),
 (6490, 0.2530815324721045)]

In [24]:
book2id[3160]

'Stay Close'

In [25]:
bookid2book[book2id[3160]]

'11737271'

In [26]:
tf_train['user_id'].unique()

array([    3,     7,    10, ..., 53406, 53408, 53420], dtype=int64)

In [27]:
tf_train.head()

Unnamed: 0,user_id,book_id,title,idx2title
0,3,9014,The Long Walk,1287
1,7,4588,Extremely Loud and Incredibly Close,248
2,7,4588,Extremely Loud and Incredibly Close,248
3,10,5084,My Life in France,1440
4,14,7604,Lolita,120


In [29]:
# 전체 영화에 대해 진행
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2= tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()

for user in tqdm(train['user_id'].unique()):
    rec_list = []
    
    # 만약 Tf-idf 소속의 추천대상이라면 콘텐츠 기반의 추천
    if user in tf_train['user_id'].unique():
        # 1. 각 유저별로 읽은 책의 목록 수집
        seen = read_list2[read_list2['user_id'] == user]['unique'].values[0]
        # 2. 읽은 책과 유사한 책 추출
        total_cosine_sim = np.zeros(len(book2id))
        for book_ in seen:
            # 3. 모든 책에 대해 유사도를 더한 값을 계산
            total_cosine_sim += cosine_matrix[book_]
            
        # 4. 3에서 유사도가 가장 높은 순서대로 추출
        sim_scores = [(bookid2book[book2id[i]], c) for i, c in enumerate(total_cosine_sim) if i not in seen]
        recs = sorted(sim_scores, key = lambda x: x[1], reverse=True)[0:300]
        for rec in recs:
            if rec not in seen:
                rec_list.append(rec)
                
    # 그렇지 않으면 인기도 기반의 추천
    else:
        seen = read_list1[read_list1['user_id'] == user]['unique'].values[0]
        for rec in popular_rec_model[0:400]:
            if rec not in seen:
                rec_list.append(rec)
                
    total_rec_list[user] = rec_list[0:200]
            

  0%|          | 0/53382 [00:00<?, ?it/s]

In [30]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt 
        self.topn = topn 
        
    def _ndcg(self):
        Q, S = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            dcg = 0.0
            idcg = sum([1.0 / math.log(i + 2, 2) for i in range(min(len(seen), len(rec)))])
            for i, r in enumerate(rec):
                if r not in seen:
                    continue
                rank = i + 1
                dcg += 1.0 / math.log(rank + 1, 2)
            ndcg = dcg / idcg
            S += ndcg
            Q += 1
        return S / Q


    def _map(self):
        n, ap = 0.0, 0.0
        for u, seen in six.iteritems(self.gt):
            seen = list(set(seen))
            rec = self.recs.get(u, [])
            if not rec or len(seen) == 0:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    _ap += (correct / (i + 1.0))
            _ap /= min(len(seen), len(rec))
            ap += _ap
            n += 1.0
        return ap / n


    def _entropy_diversity(self):
        sz = float(len(self.recs)) * self.topn
        freq = {}
        for u, rec in six.iteritems(self.recs):
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        ent = -sum([v / sz * math.log(v / sz) for v in six.itervalues(freq)])
        return ent
    
    def _evaluate(self):
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [31]:
evaluate_func = evaluate(recs=total_rec_list, gt=gt, topn=200)
evaluate_func._evaluate()

MAP@200: 8.792647421603096e-05
NDCG@200: 0.0008922079511553609
EntDiv@200: 6.916300598784509
