In [1]:
from google.colab import drive
drive.mount("/gdrive")

Mounted at /gdrive


In [2]:
import os

IS_COLAB = True

if IS_COLAB:
  project_path = "/gdrive/MyDrive/colab/Factorization_Machine/"
else:
  project_path = os.path.dirname(os.path.abspath("__file__"))

data_path = os.path.join(project_path,"data")

In [3]:
ls -l /gdrive/MyDrive/colab/Factorization_Machine/data

total 18941
-rw------- 1 root root   197979 Nov 19 04:30 links.csv
-rw------- 1 root root   494431 Nov 24 01:17 movies.csv
-rw------- 1 root root  2483723 Nov 24 01:17 ratings.csv
-rw------- 1 root root   118660 Nov 19 04:30 tags.csv
-rw------- 1 root root   238833 Nov 27 08:36 tag_tokenizer_bpe_100.model
-rw------- 1 root root      672 Nov 27 08:36 tag_tokenizer_bpe_100.vocab
-rw------- 1 root root   370999 Nov 27 05:52 tag_tokenizer_bpe_9000.model
-rw------- 1 root root   114127 Nov 27 05:52 tag_tokenizer_bpe_9000.vocab
-rw------- 1 root root   271684 Nov 27 08:36 tag_tokenizer_input_bpe_100.txt
-rw------- 1 root root   236678 Nov 27 05:52 tag_tokenizer_input_bpe_24000.txt
-rw------- 1 root root   271684 Nov 27 05:57 tag_tokenizer_input_bpe_9000.txt
-rw------- 1 root root   276634 Nov 27 08:36 tag_w2v_bpe_100.model
-rw------- 1 root root   613252 Nov 27 08:36 title_tokenizer_bpe_24000.model
-rw------- 1 root root   340369 Nov 27 08:36 title_tokenizer_bpe_24000.vocab
-rw------- 1 root

In [4]:
!pip install sentencepiece

from gensim.models import  Word2Vec as w2v
import sentencepiece as spm
from tqdm import tqdm_notebook

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 5.7MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94


In [5]:
import pandas as pd
import numpy as np

csv_movies = os.path.join(data_path,"movies.csv")
csv_ratings = os.path.join(data_path,"ratings.csv")
csv_tags = os.path.join(data_path,"tags.csv")
csv_links =  os.path.join(data_path,"links.csv")

movies = pd.read_csv(csv_movies)
ratings = pd.read_csv(csv_ratings)
tags = pd.read_csv(csv_tags)
links = pd.read_csv(csv_links)

org_movies = movies.copy()
org_ratings = ratings.copy()
org_tags = tags.copy()
org_links = links.copy()

In [6]:
dict_mid_2_nid= {mid:nid for nid, mid in enumerate(movies.movieId.unique())}
dict_nid_2_mid = {nid:mid for mid,nid in dict_mid_2_nid.items()}

dict_uid_2_nid= {uid:nid for nid, uid in enumerate(ratings.userId.unique())}
dict_nid_2_uid = {nid:uid for uid,nid in dict_uid_2_nid.items()}

movies.loc[:,"m_nid"] = movies.movieId.apply(lambda x:dict_mid_2_nid[x])

ratings.loc[:,"u_nid"] = ratings.userId.apply(lambda x:dict_uid_2_nid[x])
ratings.loc[:,"m_nid"] = ratings.movieId.apply(lambda x:dict_mid_2_nid[x])

tags.loc[:,"u_nid"] = tags.userId.apply(lambda x:dict_uid_2_nid[x])
tags.loc[:,"m_nid"] = tags.movieId.apply(lambda x:dict_mid_2_nid[x])

links.loc[:,"m_nid"] = tags.movieId.apply(lambda x:dict_mid_2_nid[x])

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,2
2,1,6,4.0,964982224,0,5
3,1,47,5.0,964983815,0,43
4,1,50,5.0,964982931,0,46


In [9]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,u_nid,m_nid
0,2,60756,funny,1445714994,1,6801
1,2,60756,Highly quotable,1445714996,1,6801
2,2,60756,will ferrell,1445714992,1,6801
3,2,89774,Boxing story,1445715207,1,7697
4,2,89774,MMA,1445715200,1,7697


In [10]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId,m_nid
0,1,114709,862.0,6801.0
1,2,113497,8844.0,6801.0
2,3,113228,15602.0,6801.0
3,4,114885,31357.0,7697.0
4,5,113041,11862.0,7697.0


In [11]:
sorted_ratings = ratings.sort_values(by=["userId","timestamp"])
sorted_ratings['b4_timestamp'] = sorted_ratings.groupby(['userId'])['timestamp'].shift(1)
mc_sorted_ratings= sorted_ratings.groupby(['userId',"timestamp"])["m_nid"].count().to_frame(name="m_count")
sorted_ratings = pd.merge(left=sorted_ratings, right=mc_sorted_ratings, on=["userId","timestamp"], how="left")
sorted_ratings.loc[:,"b4_timestamp"] = sorted_ratings.b4_timestamp.fillna(0).astype("int64")
sorted_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count
0,1,804,4.0,964980499,0,632,0,2
1,1,1210,5.0,964980499,0,911,964980499,2
2,1,2018,5.0,964980523,0,1493,964980499,3
3,1,2628,4.0,964980523,0,1979,964980523,3
4,1,2826,4.0,964980523,0,2126,964980523,3


In [12]:
previous_timestamp = 0
counter = 0
former_movies = list()
now_movies = list()
movie_vectors = list()

for idx,row in tqdm_notebook(sorted_ratings.iterrows()):
  counter += 1
  m_nid = row["m_nid"].astype("int")
  m_count = row["m_count"]
  timestamp = row["timestamp"]
  b4_timestamp = row["b4_timestamp"]

  if previous_timestamp == 0 and counter <= m_count:
    now_movies.append(m_nid)
    movie_vector = np.zeros(len(dict_mid_2_nid)) 
    
    if counter == m_count:
      former_movies = now_movies
      previous_timestamp = timestamp
      now_movies = list()
      counter = 0
  
  elif previous_timestamp != 0 and counter <= m_count:
    now_movies.append(m_nid)
    movie_vector = np.zeros(len(dict_mid_2_nid))
    movie_vector[former_movies] = 1

    if counter == m_count:
      former_movies = now_movies
      previous_timestamp = timestamp
      now_movies = list()
      counter = 0

  movie_vectors.append(movie_vector)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [13]:
sorted_ratings.loc[:,"last_rate_vector"] = pd.Series(movie_vectors)
sorted_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector
0,1,804,4.0,964980499,0,632,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,1210,5.0,964980499,0,911,964980499,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,2018,5.0,964980523,0,1493,964980499,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,2628,4.0,964980523,0,1979,964980523,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,2826,4.0,964980523,0,2126,964980523,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
ratings = sorted_ratings.sort_values(by=["userId","movieId"]).copy()
ratings = ratings.reset_index(drop=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [16]:
unique_genres = np.unique(np.concatenate(np.array(movies.genres.apply(lambda x: np.array(x.split("|"))).tolist())))
dict_gid_2_gnr = { gid:genre for gid, genre in enumerate(unique_genres)} 
dict_gnr_2_gid = { genre:gid for gid, genre in dict_gid_2_gnr.items()}
unique_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [17]:
gnr_vectors = list()
for gnrs in movies.genres.apply(lambda x: x.split("|")).tolist():
  gnr_vector = np.zeros(len(unique_genres))
  for gnr in gnrs:
    gnr_vector[dict_gnr_2_gid[gnr]] = 1
  gnr_vectors.append(gnr_vector)

In [18]:
movies.loc[:,"genres_vectors"] = pd.Series(gnr_vectors)
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [19]:
import re

def get_year(regex, string):
  try:
    found = re.search(regex,string).group(0)[1:-1]
  except AttributeError:
    found = ''
  return found

regex = '\([0-9]{4}\)'
movies.loc[:,"launch_year"] = movies.title.apply(lambda x:get_year(regex,x))

In [20]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995


In [21]:
movies.loc[movies.launch_year=="","launch_year"] = 0
movies.loc[:,"launch_year"] = movies.launch_year.astype("int")
movies.loc[movies.launch_year==0].head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year
6059,40697,Babylon 5,Sci-Fi,6059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,9031,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9091,143410,Hyena Road,(no genres listed),9091,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),9138,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9179,149334,Nocturnal Animals,Drama|Thriller,9179,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0


In [22]:
movies.loc[movies.launch_year != 0,"movie_title"] = movies.loc[movies.launch_year!= 0,"title"].apply(lambda x:x[:-6].strip()) 
movies.loc[movies.launch_year== 0,"movie_title"] = movies.loc[movies.launch_year== 0,"title"]

In [23]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year,movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II


In [24]:
### 특수문자 제외 및 단어별 배열로 변경
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

vocab_size = 24000
method = "bpe"

input_file_path = os.path.join(data_path,"title_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"title_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"title_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
sentences = movies.movie_title.to_list()


with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    f.write(sentence+'\n')

### train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("title_tokenizer {} is generated".format(tokenizer_name))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            # if len(token) > 1:
            new_tokens.append(token)
        # if len(new_tokens) > 1:
        tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"title_w2v_{}_{}.model".format(method,vocab_size))
print("start train_title_w2v....")

size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("title_w2v {} is generated".format(w2v_name))

# ### get embedding

sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)

w2v_model = w2v.load(w2v_name)

sentence_embs = []
# # 학습이 안되는 벡터들에 대해서는 0값이 나으므로 0벡터로 input 처리 수행
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = []
  for p_word in sentence:
    try:
      word_embs.append(w2v_model.wv[p_word])
    except KeyError:
      pass
  if len(word_embs):
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(200).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))


title_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/title_tokenizer_bpe_24000 is generated
start train_title_w2v....
title_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/title_w2v_bpe_24000.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))




In [25]:
movies.loc[:,"title_vector"] = pd.Series(sentence_embs)
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year,movie_title,title_vector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story,"[0.06252102553844452, -0.1304159313440323, 0.0..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji,"[0.013290229253470898, -0.030922923237085342, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men,"[0.10823403298854828, -0.22381934523582458, 0...."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale,"[0.08037611097097397, -0.18362760543823242, 0...."
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II,"[0.10152804851531982, -0.267787903547287, -0.0..."


In [26]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,u_nid,m_nid
0,2,60756,funny,1445714994,1,6801
1,2,60756,Highly quotable,1445714996,1,6801
2,2,60756,will ferrell,1445714992,1,6801
3,2,89774,Boxing story,1445715207,1,7697
4,2,89774,MMA,1445715200,1,7697


In [27]:
from collections import OrderedDict
set_tag_user_movie = set(zip(tags.userId, tags.movieId))

dict_tag_string = OrderedDict()
for user, movie in set_tag_user_movie:
  list_tag = tags.loc[np.logical_and(tags.userId==user,tags.movieId==movie),"tag"].tolist()
  dict_tag_string[(user,movie)] = list_tag

dict_tag_string = OrderedDict(sorted(dict_tag_string.items()))

In [28]:
user_series = pd.Series(list(map(lambda x:x[0],dict_tag_string.keys())))
movie_series = pd.Series(list(map(lambda x:x[1],dict_tag_string.keys()))) 
tag_series = pd.Series(list(dict_tag_string.values()))

df_tag_list = pd.concat([user_series,movie_series,tag_series],axis=1)
df_tag_list.columns = ["userId","movieId","tags"]
df_tag_list.head()

Unnamed: 0,userId,movieId,tags
0,2,60756,"[funny, Highly quotable, will ferrell]"
1,2,89774,"[Boxing story, MMA, Tom Hardy]"
2,2,106782,"[drugs, Leonardo DiCaprio, Martin Scorsese]"
3,7,48516,[way too long]
4,18,431,"[Al Pacino, gangster, mafia]"


In [29]:
ratings = pd.merge(left=ratings, right=df_tag_list, on=["userId","movieId"], how="left")
ratings.loc[:,"tags"]= ratings.tags.fillna(" ")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector,tags
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",


In [30]:
### 특수문자 제외 및 단어별 배열로 변경
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

vocab_size = 100
method = "bpe"

input_file_path = os.path.join(data_path,"tag_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"tag_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"tag_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
tag_sentences = ratings.tags.to_list()

sentences = []
for tag_sentence in tag_sentences:
  sentence = " ".join(tag_sentence)
  sentences.append(sentence)

with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    tag_sentence = " ".join(sentence)
    f.write(tag_sentence+'\n')

### train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("tag_tokenizer {} is generated".format(tokenizer_name))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            # if len(token) > 1:
            new_tokens.append(token)
        # if len(new_tokens) > 1:
        tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"tag_w2v_{}_{}.model".format(method,vocab_size))
print("start train_tag_w2v....")

size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("tag_w2v {} is generated".format(w2v_name))

# ### get embedding

sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)

w2v_model = w2v.load(w2v_name)

sentence_embs = []
# # 학습이 안되는 벡터들에 대해서는 0값이 나으므로 0벡터로 input 처리 수행
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = []
  for p_word in sentence:
    try:
      word_embs.append(w2v_model.wv[p_word])
    except KeyError:
      pass
  if len(word_embs):
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(200).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=100836.0), HTML(value='')))


tag_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/tag_tokenizer_bpe_100 is generated
start train_tag_w2v....
tag_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/tag_w2v_bpe_100.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=100836.0), HTML(value='')))




In [31]:
ratings.loc[:,"tag_vector"] = pd.Series(sentence_embs)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector,tags,tag_vector
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [32]:
# user, movie, rating, movie_gnr_vec, movie_title_vec, launch_year, user_movie_tag_vector, last_rate_vector

target_ratings = ratings.loc[:,["u_nid","m_nid","tag_vector","last_rate_vector","rating"]]
target_movies = movies.loc[:,["m_nid","genres_vectors","title_vector","launch_year"]]

data = pd.merge(left=target_ratings, right=target_movies, on="m_nid", how="left")
data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,rating,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.06252102553844452, -0.1304159313440323, 0.0...",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.10823403298854828, -0.22381934523582458, 0....",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.03938112035393715, -0.07929226011037827, 0....",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.017066407948732376, -0.21261781454086304, -...",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0187482051551342, -0.19618019461631775, -0....",1995


In [33]:
y_data = data.loc[:,"rating"]
X_data = data.drop("rating",axis=1)
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.06252102553844452, -0.1304159313440323, 0.0...",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.10823403298854828, -0.22381934523582458, 0....",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.03938112035393715, -0.07929226011037827, 0....",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.017066407948732376, -0.21261781454086304, -...",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0187482051551342, -0.19618019461631775, -0....",1995


In [34]:
u_nid_vectors = list()
m_nid_vectors = list()

for idx, row in tqdm_notebook(X_data.iterrows()):
  u_nid_vector = np.zeros(len(dict_nid_2_uid))
  m_nid_vector = np.zeros(len(dict_nid_2_mid))

  u_nid = row["u_nid"]
  m_nid= row["m_nid"]

  u_nid_vectors.append(u_nid_vector)
  m_nid_vectors.append(m_nid_vector)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [35]:
X_data.loc[:,"u_nid"] = pd.Series(u_nid_vectors)
X_data.loc[:,"m_nid"] = pd.Series(m_nid_vectors)

In [36]:
from sklearn.preprocessing import StandardScaler

launch_years = np.array(X_data.loc[X_data.launch_year != 0].launch_year.to_list()).reshape(-1,1)

scaler = StandardScaler()
scaled_year = scaler.fit_transform(launch_years)

In [37]:
X_data.loc[X_data.launch_year !=0, "launch_year"] = scaled_year

In [None]:
X_data_path = os.path.join(data_path, "X_data.npy")
y_data_path = os.path.join(data_path, "y_data.npy")
np.save(X_data_path,X_data.values)
np.save(y_data_path,y_data.values)