In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim # word2vec의 

<추천시스템 3강>

통계기반의 단점
-대규모 말뭉치를 다룰 때 메모리상의 문제가 발생
-한번에 학습데이터 전체 진행(gpu 활용 힘듬)
-학습을 통해서 개선하기가 힘듬

1. word2vec

주변단어가 주어질때 중심단어를 추측하는 작업
you ? goodbye and i say hello -> ? 추측

정의 : 저차원 공간에 벡터로 매핑
'비슷한 위치에 등장하는 단어들은 비슷한 의미를 가진다는 가정'

skip_gram이 성능이 더좋고 많이 사용함

cbow : 주변 단어로 중간 단어를 예측함
ex) you ? say goodbye and i say hello
skip_gram : 중간에 있는 단어로 주변 단어들을 예측함 
ex) ? say ? and i say hello


## 캐글예제 : https://wikidocs.net/24603

In [2]:
import os
print(os.listdir("C:/data/archive"))

['credits.csv', 'keywords.csv', 'links.csv', 'links_small.csv', 'movies_metadata.csv', 'ratings.csv', 'ratings_small.csv']


In [10]:
movie = pd.read_csv( 'C:/data/archive/ratings.csv', low_memory=False)
movie.head(2) # 영화데이터

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [5]:
import warnings
warnings.filterwarnings(action='ignore')

In [11]:
# time stamp가 순서대로 되어있지 않아서 다시 정렬
movie = movie.sort_values(by='timestamp', ascending=True).reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,38150,1176,4.0,789652004
1,44717,1079,3.0,789652009
2,44717,47,5.0,789652009
3,44717,21,3.0,789652009
4,190860,21,5.0,822873600


In [12]:
meta = pd.read_csv( 'C:/data/archive/movies_metadata.csv', low_memory=False)
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [13]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [14]:
meta = meta.rename(columns={'id':'movieId'}) # 컬럼명이 안맞아서 변경
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')

In [15]:
movie.head()# original_title에 null값이 많음

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,38150,1176,4.0,789652004,
1,44717,1079,3.0,789652009,
2,44717,47,5.0,789652009,
3,44717,21,3.0,789652009,The Endless Summer
4,190860,21,5.0,822873600,The Endless Summer


In [16]:
#original_title에 null값 제거
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)

In [17]:
agg = movie.groupby(['userId'])['original_title'].agg({'unique'})
agg.head()


Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Young and Innocent, Shuang ma lian huan, Cesa..."
2,"[La passion de Jeanne d'Arc, La belle et la bê..."
3,"[I Love You to Death, Once Were Warriors, Mons..."
4,"[Muxmäuschenstill, Batman & Robin, Hidalgo, 12..."
5,"[Star Trek III: The Search for Spock, The Curs..."


In [18]:
movie['original_title'].unique()

array(['The Endless Summer', 'Apocalypse Now', 'Finding Nemo', ...,
       'Lost River', 'Friends & Lovers', 'The Chechahcos'], dtype=object)

# word2vec 적용

In [19]:
# int형식은 Word2vec에서 학습이 안되어서 String으로 변경해줍니다. 
sentence = []
for user_sentence in agg['unique'].values:
    sentence.append(list(map(str, user_sentence)))

In [20]:
# Word2vec의 학습을 진행해줍니다. 
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, iter=200, sg=1)
# size : 웨이트의 크기 정함 window : 주변단어 얼마나 볼지
# workers : cpu 자원을 몇명이서 학습에 참여할지
# iter : 에폭 몇바퀴 
# sg : 0->cbow, 1-> skip-gram

In [21]:
embedding_model.wv.most_similar(positive=['Spider-Man 2'], topn=10)

[('The Flintstones', 0.9286708831787109),
 ('Solaris', 0.9179903268814087),
 ('To Die For', 0.9111825823783875),
 ('Chill Factor', 0.909142255783081),
 ('Lammbock', 0.8984283208847046),
 ('Les Poupées Russes', 0.8824986815452576),
 ('Domicile Conjugal', 0.8824634552001953),
 ('Flatliners', 0.873989462852478),
 ('Forrest Gump', 0.8685811161994934),
 ('Land of Plenty', 0.8601124286651611)]

# Doc2vec 적용

In [22]:
from gensim.models import doc2vec

In [25]:
meta = pd.read_csv( 'C:/data/archive/movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)

In [31]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peopl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peopl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [32]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=44512.0), HTML(value='')))




In [33]:
meta['pre_overview'] = overview

In [34]:
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [35]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [36]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

Doc2Vec(dbow+w,d100,n10,hs,w10,mc5,s0.001,t4)


In [37]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


During Time: 565.5213885307312


In [38]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)

[("Schindler's List", 0.7048392295837402),
 ('Letzte Worte', 0.7040135264396667),
 ('エクスマキナ', 0.7036435008049011),
 ('Children in the Surf at Coney Island', 0.6916139125823975),
 ('It Stains the Sands Red', 0.6908447742462158),
 ('Unstrung Heroes', 0.6769487261772156),
 ('Live Forever as You Are Now with Alan Resnick', 0.6738638877868652),
 ('La moutarde me monte au nez', 0.6725764870643616),
 ('The Aristocats', 0.6717544198036194),
 ('Due Amici', 0.6709775924682617),
 ('El vendedor de humo', 0.6664122343063354),
 ('Stryapukha', 0.6659893989562988),
 ('Celtic Pride', 0.6519331336021423),
 ('Milk Money', 0.6507867574691772),
 ('McLaren', 0.6500544548034668),
 ('Der Sandmann', 0.6482512354850769),
 ('Castle Freak', 0.6476936340332031),
 ('東京喰種 トーキョーグール', 0.6471816301345825),
 ('По следам бременских музыкантов', 0.64649897813797),
 ('Przechodzien', 0.6461136341094971)]

In [39]:
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)

[('The Great Ecstasy of Robert Carmichael', 0.7870239615440369),
 ('Handsome Harry', 0.7349021434783936),
 ('Day Dreams', 0.733238935470581),
 ('Never Let Me Go', 0.7234851121902466),
 ('Kasaba', 0.721640944480896),
 ('Il deserto dei Tartari', 0.7213167548179626),
 ('$ Dollars', 0.7193742990493774),
 ('Cold Weather', 0.7144654989242554),
 ('কাঞ্চনজঙ্ঘা', 0.7143682837486267),
 ('Demonic Toys', 0.7085671424865723),
 ('Fantasma', 0.704412043094635),
 ('Se sei vivo spara', 0.7034504413604736),
 ('Emmas Glück', 0.7026129961013794),
 ('밤과 낮', 0.687443196773529),
 ('1990: I guerrieri del Bronx', 0.6826761960983276),
 ('The Blacksmith', 0.6810147762298584),
 ('No One Dies in Lily Dale', 0.6793454885482788),
 ('Zamilované Maso', 0.6785552501678467),
 ('La suerte está echada', 0.6771343946456909),
 ('Dragonlance: Dragons Of Autumn Twilight', 0.6768776178359985)]

컨텐츠 기반 모델의 단점: 
item의 feature을 추출해야하고 이를 기반으로 추천하므로 제대로 추출 못하면 정확도가 낮음
따라서 도메인 지식이 분석시에 필요함

기존의 item과 유사한 item위주로 추천하기 때문에 새로운 장르 추천힘듬
새로운 사용자에 대해서 충분한 평점이 쌓이기 전까지 추천힘듬
but 'cold start'에 대해서 사용자가 평점을 매기지 않은 새로운 item도 추천가능