In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.manifold import TSNE
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Train a Doc2Vec model that embeds every document as a fixed-length vector
# (128 dimensions by default; default epochs were raised from 40 to 300).

def make_doc2vec_models(tagged_data, vector_size = 128, window = 3, epochs = 300, min_counts = 0, workers = 4):
  """Train and return a gensim ``Doc2Vec`` model.

  Args:
    tagged_data: Training corpus, passed as the first positional argument
      of ``Doc2Vec`` (the notebook passes a list of ``TaggedDocument``).
    vector_size: Dimensionality of the learned vectors.
    window: Context window size forwarded to ``Doc2Vec``.
    epochs: Number of training epochs forwarded to ``Doc2Vec``.
    min_counts: Minimum token frequency. Forwarded to gensim as
      ``min_count``; the plural parameter name is kept so existing callers
      keep working.
    workers: Number of worker threads forwarded to ``Doc2Vec``.

  Returns:
    The ``Doc2Vec`` instance built from ``tagged_data``.
  """
  # gensim's keyword is `min_count` (singular). Passing `min_counts` is not a
  # recognised option, so the requested threshold never reached the model.
  model = Doc2Vec(tagged_data, vector_size = vector_size, window = window, epochs = epochs, min_count = min_counts, workers = workers)
  return model

In [None]:
# Build the per-document data consumed by Doc2Vec.
# The tagged form is what the Doc2Vec model is trained on.

def make_doc2vec_webtoon(data, column, tagged_document = False):
  """Tokenise one text column into Doc2Vec-ready documents.

  Args:
    data: DataFrame holding the documents; its index values become the tags.
    column: Name of the column containing whitespace-separated text.
    tagged_document: When True, wrap every document in a ``TaggedDocument``.

  Returns:
    A list with one entry per row of ``data``, in row order. Each entry is
    ``([tag], words)`` by default, or ``TaggedDocument(words=words,
    tags=[tag])`` when ``tagged_document`` is True.
  """
  data_doc = []
  for tag, doc in zip(data.index, data[column]):
    # split() without an argument collapses runs of whitespace, so repeated,
    # leading or trailing spaces no longer produce empty-string tokens.
    # Non-string cells (e.g. NaN from an empty CSV field) become an empty
    # document instead of raising; the row is kept so list positions stay
    # aligned with the rows of `data`.
    words = doc.split() if isinstance(doc, str) else []
    data_doc.append(([tag], words))
  if tagged_document:
    return [TaggedDocument(words = words, tags = tags) for tags, words in data_doc]
  return data_doc

In [None]:
# Build a user embedding.
# Averages the document vectors of the items in the user's history.

def make_user_embedding(index_list, data_doc, model):
    """Average the document vectors of a user's viewing history.

    Args:
        index_list: Positions into ``data_doc`` of the items the user viewed.
            NOTE(review): callers pass DataFrame index labels, which only
            match list positions for a default 0..n-1 index — confirm.
        data_doc: List of ``([tag], words)`` pairs.
        model: Trained Doc2Vec model exposing document vectors as ``dv``
            (gensim 4) or ``docvecs`` (older gensim).

    Returns:
        1-D numpy array: the element-wise mean of the selected vectors.

    Raises:
        ValueError: If ``index_list`` is empty. Averaging nothing would
            otherwise produce NaN and fail later in a confusing place.
    """
    index_list = list(index_list)
    if not index_list:
        raise ValueError("index_list is empty: cannot average an empty user history")

    # Prefer the gensim 4 attribute; fall back to the older name.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

    history_tags = [data_doc[i][0][0] for i in index_list]
    history_vectors = np.array([doc_vectors[tag] for tag in history_tags])
    return np.mean(history_vectors, axis = 0)

In [None]:
# Return the recommendation result: the most similar webtoons (top 5 by default).

def recommendation_webtoon(user, data_doc, model, top_n = 5, data = None):
  """Rank all documents by cosine similarity to a user embedding.

  Args:
    user: 1-D user embedding with the same length as the model's vectors.
    data_doc: List of ``([tag], words)`` pairs; ``tag`` identifies both the
      document vector in the model and the row label in the catalog.
    model: Trained Doc2Vec model exposing document vectors as ``dv``
      (gensim 4) or ``docvecs`` (older gensim).
    top_n: Number of rows to return (default 5, the previous fixed value).
    data: Catalog DataFrame to select rows from. Defaults to the notebook's
      global ``webtoon`` frame, which is what the function always used.

  Returns:
    DataFrame with the ``top_n`` most similar rows, best match first.
    Items from the user's own history are not filtered out.
  """
  catalog = webtoon if data is None else data
  if len(data_doc) == 0:
    return catalog.iloc[0:0]

  # Prefer the gensim 4 attribute; fall back to the older name.
  doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

  tags = [doc_tags[0] for doc_tags, _ in data_doc]
  doc_matrix = np.vstack([doc_vectors[tag] for tag in tags])

  # reshape(1, -1) works for any vector size (previously hard-coded to 128),
  # and one call scores every document instead of one call per document.
  scores = cosine_similarity(np.asarray(user).reshape(1, -1), doc_matrix).reshape(-1)
  top_positions = np.argsort(-scores)[:top_n]

  # argsort yields positions in data_doc; translate them to the tags (row
  # labels) before using .loc so a non-default index still selects correctly.
  return catalog.loc[[tags[pos] for pos in top_positions], :]

In [None]:
# Show the contents viewed by the randomly sampled user.

def user_contents(data):
  """Print the ``title`` column of ``data`` as a single-column frame.

  Args:
    data: DataFrame containing a ``title`` column.

  Returns:
    None. The frame is written to standard output.
  """
  titles = data.loc[:, ['title']]
  print(titles)

In [None]:
# Upload the dataset through the Colab file picker when running on Colab.
try:
  from google.colab import files
except ImportError:
  # Not running on Google Colab: skip the upload so the notebook still runs.
  # The CSV read by the next cell must then already be in the working directory.
  files = None
myfile = files.upload() if files is not None else None

In [None]:
# Load the preprocessed webtoon dataset.
webtoon = pd.read_csv('webtoon_processing_3.csv')

# Fail fast with a readable message if a column used by later cells is missing.
required_columns = {'title', 'genre', 'title_genre_story', 'komoran'}
missing_columns = required_columns - set(webtoon.columns)
assert not missing_columns, "webtoon CSV is missing columns: {}".format(sorted(missing_columns))

webtoon.head()


In [None]:
# Each text column is converted twice:
#   * the TaggedDocument form is used to train a Doc2Vec model;
#   * the plain ([tag], words) form is used afterwards to build user
#     embeddings and to compute cosine similarities.

# Combined title / genre / story text.
webtoon_comb_tag = make_doc2vec_webtoon(data = webtoon, column = 'title_genre_story', tagged_document = True)
webtoon_comb = make_doc2vec_webtoon(data = webtoon, column = 'title_genre_story')

# 'komoran' column (presumably Komoran-tokenised text — confirm against the preprocessing step).
webtoon_nouns_tag = make_doc2vec_webtoon(data = webtoon, column = 'komoran', tagged_document = True)
webtoon_nouns = make_doc2vec_webtoon(data = webtoon, column = 'komoran')

In [None]:
# Train one Doc2Vec model per document representation.
# No seed is passed to Doc2Vec, so vectors may differ between runs.
# NOTE(review): model_nouns is not used by any cell visible in this notebook.
model_comb = make_doc2vec_models(tagged_data = webtoon_comb_tag)
model_nouns = make_doc2vec_models(tagged_data = webtoon_nouns_tag)

In [None]:
# user1, user2 and user3 each simulate one user's viewing history for a genre.
# user1: 5 randomly sampled action-genre webtoons.
# random.sample raises ValueError if the genre has fewer than 5 rows.
action_index = webtoon.loc[webtoon.genre == '액션'].index.tolist()
user1 = webtoon.loc[random.sample(action_index, 5), :]
user_contents(user1)

In [None]:
# user2: 5 randomly sampled thriller-genre webtoons.
# random.sample raises ValueError if the genre has fewer than 5 rows.
thriller_index = webtoon.loc[webtoon.genre == '스릴러'].index.tolist()
user2 = webtoon.loc[random.sample(thriller_index, 5), :]
user_contents(user2)

In [None]:
# user3: 5 randomly sampled fantasy-genre webtoons.
# random.sample raises ValueError if the genre has fewer than 5 rows.
fantasy_index = webtoon.loc[webtoon.genre == '판타지'].index.tolist()
user3 = webtoon.loc[random.sample(fantasy_index, 5), :]
user_contents(user3)

In [None]:
# Average each sampled user's history into a single embedding vector
# (128 values with the default vector_size), one per genre.
user_action = make_user_embedding(index_list = user1.index.tolist(), data_doc = webtoon_comb, model = model_comb)
user_thriller = make_user_embedding(index_list = user2.index.tolist(), data_doc = webtoon_comb, model = model_comb)
user_fantasy = make_user_embedding(index_list = user3.index.tolist(), data_doc = webtoon_comb, model = model_comb)

In [None]:
# Recommendations for the action-genre user.
result = recommendation_webtoon(user = user_action, data_doc = webtoon_comb, model = model_comb)
result[['title', 'genre', 'title_genre_story']]

In [None]:
# Recommendations for the thriller-genre user.
result = recommendation_webtoon(user = user_thriller, data_doc = webtoon_comb, model = model_comb)
result[['title', 'genre', 'title_genre_story']]

In [None]:
# Recommendations for the fantasy-genre user.
result = recommendation_webtoon(user = user_fantasy, data_doc = webtoon_comb, model = model_comb)
result[['title', 'genre', 'title_genre_story']]