In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim


In [13]:
path = "./movies/"

In [14]:
movie = pd.read_csv(path + 'ratings.csv', low_memory = False)
movie.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


In [15]:
movie = movie.sort_values(by = "timestamp", ascending = True).reset_index(drop = True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,383,21,3.0,789652009
1,383,47,5.0,789652009
2,383,1079,3.0,789652009
3,409,21,5.0,828212412
4,409,25,4.0,828212412


In [16]:
meta = pd.read_csv(path + "movies_metadata.csv", low_memory = False)
meta.head(2)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [17]:
meta.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [18]:
meta = meta.rename(columns = {"id":"movieId"})

In [19]:
meta["movieId"] = meta["movieId"].astype(str)
movie["movieId"] = movie["movieId"].astype(str)

In [20]:
movie = pd.merge(movie, meta[["movieId","original_title"]], how = "left", on = "movieId")

In [21]:
print(movie.shape)
movie.head()

(100009, 5)


Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,383,21,3.0,789652009,The Endless Summer
1,383,47,5.0,789652009,
2,383,1079,3.0,789652009,
3,409,21,5.0,828212412,The Endless Summer
4,409,25,4.0,828212412,Jarhead


# 5회이상 구매된/한 아이템과 유저만 이용하자.

In [23]:
# original_title is notnull.
movie = movie[movie["original_title"].notnull()].reset_index(drop = True)


In [24]:
item_over5 = movie.groupby("movieId")["userId"].agg("nunique").reset_index()
item_over5 = item_over5[item_over5["userId"] >= 5]["movieId"].values

user_over5 = movie.groupby("userId")["movieId"].agg("nunique").reset_index()
user_over5 = user_over5[user_over5["movieId"] >= 5]["userId"].values

In [25]:
movie = movie[movie["userId"].isin(user_over5)]
movie = movie[movie["movieId"].isin(item_over5)]

# word2vec
https://www.sallys.space/blog/2018/04/05/Word2vec,-skip-gram-model/

In [26]:
agg = movie.groupby(["userId"])["original_title"].agg({'unique'})


In [27]:
sentence = [] # i번째 user가 본 영화목록들
for user_sentence in agg["unique"].values:
    sentence.append(list(map(str, user_sentence)))

In [28]:
# Word2vec의 학습을 진행
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, vector_size = 20, window = 5,
                          min_count = 1, workers= 4, epochs=200, sg = 1)


In [29]:
embedding_model.wv.most_similar(positive=["Spider-Man 2"], topn = 10)


[('Forrest Gump', 0.782939076423645),
 ('Sunrise: A Song of Two Humans', 0.7318478226661682),
 ('Snow Cake', 0.7284685373306274),
 ('Krull', 0.7063934803009033),
 ('Fail-Safe', 0.6992067694664001),
 ('Domicile Conjugal', 0.6876279711723328),
 ('Tillsammans', 0.6866089105606079),
 ('Some Like It Hot', 0.6853487491607666),
 ('Conquest of the Planet of the Apes', 0.6780926585197449),
 ("L'Aile ou la Cuisse", 0.6709350347518921)]

In [30]:
sentence

[['Jay and Silent Bob Strike Back',
  'Vivement dimanche!',
  'Rocky III',
  'American Pie',
  'My Tutor',
  'Greed'],
 ['Terminator 3: Rise of the Machines',
  'The Conversation',
  'The Hours',
  '48 Hrs.',
  'Back to the Future Part II',
  'Silent Hill',
  'Crustacés et coquillages',
  'Lost in Translation',
  'Night on Earth',
  "Dave Chappelle's Block Party",
  "Ocean's Eleven",
  'Sissi',
  'Live and Let Die',
  'A Clockwork Orange',
  'Солярис',
  'Sommer vorm Balkon',
  'La science des rêves',
  'Trois couleurs : Rouge',
  'Grbavica',
  'Czlowiek z zelaza',
  'Le Mépris',
  'Batman Returns',
  'Romeo + Juliet',
  'Monsoon Wedding',
  'Stand by Me',
  'Lucky Number Slevin',
  'Cat on a Hot Tin Roof',
  'The Dark',
  'The Devil Wears Prada',
  'Lili Marleen',
  'Star Trek IV: The Voyage Home',
  'A Nightmare on Elm Street',
  'Notting Hill',
  'Once Were Warriors',
  'Reservoir Dogs',
  '2001: A Space Odyssey',
  'Rebecca',
  'Psycho',
  'The Poseidon Adventure',
  'Batman Begins

# Doc2Vec

https://lovit.github.io/nlp/representation/2018/03/26/word_doc_embedding/

In [31]:
from gensim.models import doc2vec
meta = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)


In [32]:
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
import re 
stop_words = set(stopwords.words('english')) 

overview = []
for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[^A-Za-z0-9]+', ' ', str(word_tokens))
    sentence = sentence.strip()
    
    sentence_tokens = word_tokenize(sentence)
    result = ''
    for token in sentence_tokens: 
        if token not in stop_words:
            result += ' ' + token 
    result = result.strip().lower()
    overview.append(result)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/igyuseog/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
meta['pre_overview'] = overview
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # w2v simultaneous with DBOW d2v / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # learning-rate
    seed=1234,
    min_count=5,    # ignore with freq lower
    min_alpha=0.025, # min learning-rate
    workers=4,   # multi cpu
    hs = 1,          # hierar chical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [None]:
from collections import namedtuple

agg = meta[['id', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for d, c in agg[['original_title', 'pre_overview']].values]

In [None]:
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

In [None]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

In [None]:
doc_vectorizer.docvecs.most_similar('Toy Story', topn=20)
doc_vectorizer.docvecs.most_similar('Harry Potter and the Deathly Hallows: Part 1', topn=20)