## Word2Vec을 이용한 추천시스템(영화추천)

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import os

In [2]:
import warnings
warnings.filterwarnings(action = 'ignore')

In [3]:
path = '/content/drive/MyDrive/Movie_recommend/movielens'

In [4]:
movie = pd.read_csv(os.path.join(path, 'ratings.csv'), low_memory = False)
movie = movie.sort_values(by = 'timestamp').reset_index(drop = True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,429,595,5.0,828124615
1,429,588,5.0,828124615
2,429,590,5.0,828124615
3,429,592,5.0,828124615
4,429,432,3.0,828124615


In [9]:
meta = pd.read_csv(os.path.join(path, 'movies_metadata.csv'), low_memory = False)
meta.columns

Unnamed: 0,userId,movieId,rating,timestamp
0,429,595,5.0,828124615
1,429,468,3.0,828124615
2,429,22,4.0,828124615
3,429,150,5.0,828124615
4,429,161,5.0,828124615


In [10]:
meta = meta.rename(columns={'id':'movieId'})
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [11]:
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

In [12]:
movie = pd.merge(movie, meta[['movieId', 'original_title']], how = 'left', on = 'movieId')

In [13]:
movie

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,429,595,5.0,828124615,To Kill a Mockingbird
1,429,468,3.0,828124615,My Own Private Idaho
2,429,22,4.0,828124615,Pirates of the Caribbean: The Curse of the Bla...
3,429,150,5.0,828124615,48 Hrs.
4,429,161,5.0,828124615,Ocean's Eleven
...,...,...,...,...,...
100836,514,187031,2.5,1537674927,
100837,514,187595,3.0,1537674946,
100838,514,5247,2.5,1537757040,Mercy
100839,514,5246,1.5,1537757059,


In [14]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)
movie

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,429,595,5.0,828124615,To Kill a Mockingbird
1,429,468,3.0,828124615,My Own Private Idaho
2,429,22,4.0,828124615,Pirates of the Caribbean: The Curse of the Bla...
3,429,150,5.0,828124615,48 Hrs.
4,429,161,5.0,828124615,Ocean's Eleven
...,...,...,...,...,...
42175,331,1676,4.0,1537235373,Will Penny
42176,272,158238,4.0,1537475893,Stolen Seas
42177,210,122906,4.5,1537632293,About Time
42178,514,5247,2.5,1537757040,Mercy


In [20]:
agg = movie.groupby(['userId'])['original_title'].agg(['unique']) # aggregate 약자
agg[:1]

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Roman Holiday, The Wedding Planner, Der Tunne..."


##### word2vec 적용

In [23]:
sentence = []

for user_sentence in agg['unique'].values:
  # print(user_sentence)
  sentence.append(list(map(str, user_sentence)))

In [25]:
from gensim.models import Word2Vec

embedding_model = Word2Vec(sentence, size = 20, window = 5, min_count = 1, workers = 4, iter = 200, sg =1)

In [28]:
embedding_model.wv.most_similar(positive = ['Spider-Man 2'], topn = 10)

[('Domicile Conjugal', 0.7937459945678711),
 ('Mai ming xiao zi', 0.7744268178939819),
 ('Dialogue avec mon jardinier', 0.7728022933006287),
 ('Sunrise: A Song of Two Humans', 0.7368977665901184),
 ('Inspector Gadget', 0.725652813911438),
 ('Helen', 0.7208864092826843),
 ('La strada', 0.7163882851600647),
 ('Forrest Gump', 0.7162166237831116),
 ('Poodle Springs', 0.7079588770866394),
 ('Licence to Kill', 0.7056770324707031)]

In [53]:
from gensim.models import doc2vec

In [30]:
meta = pd.read_csv(os.path.join(path, 'movies_metadata.csv'), low_memory=False)
meta = meta.rename(columns={'id':'movieId'})
meta = meta[meta['original_title'].notnull()].reset_index(drop = True)
meta = meta[meta['overview'].notnull()].reset_index(drop = True)
meta['overview']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
44507          Rising and falling between a man and woman.
44508    An artist struggles to finish his work while a...
44509    When one of her hits goes wrong, a professiona...
44510    In a small town live two brothers, one a minis...
44511    50 years after decriminalisation of homosexual...
Name: overview, Length: 44512, dtype: object

In [34]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
import re
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

overview = []

for words in tqdm(meta['overview']):
  word_tokens = word_tokenize(words)
  sentence = re.sub('[A-Za-z0-9]+', ' ', str(word_tokens))
  sentence.strip()
# sentence
  sent_tokens = sent_tokenize(sentence)
  result = ''
  for token in sent_tokens:
    if token not in stop_words:
      result += ' ' + token
  result = result.strip().lower()
  overview.append(result)
print(result)

100%|██████████| 44512/44512 [00:23<00:00, 1912.06it/s]

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ',', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '.']





In [40]:
meta['pre_overview'] = overview

In [42]:
meta['pre_overview']

0        [' ', ' ', ' ', ',', ' ', "' ", ' ', ' ', ' ',...
1        [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
2        [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' - '...
3        [' ', ' ', ',', ' ', ' ', ' ', ' ', ',', ' ', ...
4        [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
                               ...                        
44507        [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '.']
44508    [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
44509    [' ', ' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ...
44510    [' ', ' ', ' ', ' ', ' ', ' ', ' ', ',', ' ', ...
44511    [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ...
Name: pre_overview, Length: 44512, dtype: object

In [60]:
doc_vector = doc2vec.Doc2Vec(
    dm = 0,
    dbow_words = 1,
    window = 1,
    size = 1,
    alpha = 0.025,
    seed = 1234,
    min_count = 0.025,
    workers = 4,
    hs = 1,
    negative = 10
)

In [61]:
from collections import namedtuple

agg = meta[['movieId', 'original_title', 'pre_overview']]
TaggedDocument = namedtuple('ToggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument((c), [d]) for c, d in agg[['original_title', 'pre_overview']].values]

In [56]:
doc_vector.build_vocab(tagged_train_docs)
print(str(doc_vector))

Doc2Vec(dbow+w,d1,n10,hs,w1,mc5,s0.001,t4)


In [58]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
  doc_vector.train(tagged_train_docs, total_examples = doc_vector.corpus_count, epochs = doc_vector.iter)
  doc_vector.alpha -= 0.002
  doc_vector.min_alpha = doc_vector.alpha

end = time()
print(f"During Time: {end-start}")

100%|██████████| 5/5 [01:57<00:00, 23.48s/it]

During Time: 117.39596199989319





In [63]:
doc_vector.docvecs.most_similar(['Spider-Man 2'], topn = 10)

AttributeError: ignored