In [142]:
# Mount Google Drive at /gdrive so the files stored there can be read from this runtime.
from google.colab import drive
drive.mount("/gdrive")

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [143]:
import os

# Project root on the mounted Drive, and the "data" sub-folder inside it.
project_path = "/gdrive/MyDrive/colab/Factorization_Machine/"
data_path = os.path.join(project_path,"data")

In [144]:
# List the data folder; the path comes from `data_path` so it is defined in one place only.
!ls -l {data_path}

total 3219
-rw------- 1 root root  197979 Nov 19 04:30 links.csv
-rw------- 1 root root  494431 Nov 24 01:17 movies.csv
-rw------- 1 root root 2483723 Nov 24 01:17 ratings.csv
-rw------- 1 root root  118660 Nov 19 04:30 tags.csv


In [145]:
import pandas as pd

# Full path of each CSV file inside the data folder.
csv_movies, csv_ratings, csv_tags, csv_links = (
    os.path.join(data_path, file_name)
    for file_name in ("movies.csv", "ratings.csv", "tags.csv", "links.csv")
)

# Read the four tables, in the same order as the paths above.
movies, ratings, tags, links = (
    pd.read_csv(csv_path)
    for csv_path in (csv_movies, csv_ratings, csv_tags, csv_links)
)

# Keep an untouched copy of every frame as it was right after loading.
org_movies, org_ratings, org_tags, org_links = (
    frame.copy() for frame in (movies, ratings, tags, links)
)


In [146]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [147]:
import numpy as np

In [148]:
# Collect every distinct genre label. Each `genres` value is a "|"-separated
# string, so split each one and flatten the per-movie lists with a single
# np.concatenate call. The lists have different lengths; wrapping them in
# np.array first would build a ragged array, which recent NumPy releases reject.
genre_lists = movies.genres.apply(lambda x: x.split("|")).tolist()
unique_genres = np.unique(np.concatenate(genre_lists))

# Two-way lookup between a genre name and its position in `unique_genres`.
dict_gid_2_gnr = dict(enumerate(unique_genres))
dict_gnr_2_gid = {genre: gid for gid, genre in dict_gid_2_gnr.items()}
unique_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [149]:
def _genres_to_multi_hot(genre_names):
  """Return a float vector with 1 at the index of every genre in `genre_names`."""
  multi_hot = np.zeros(len(unique_genres))
  multi_hot[[dict_gnr_2_gid[name] for name in genre_names]] = 1
  return multi_hot

# One multi-hot vector per movie, in the same row order as `movies`.
gnr_vectors = [
  _genres_to_multi_hot(genre_string.split("|"))
  for genre_string in movies.genres.tolist()
]

In [150]:
# Store the per-movie genre vectors in a new column, then preview the result.
# pd.Series(gnr_vectors) carries a fresh 0..n-1 index; this assumes `movies`
# still has its default integer index so the rows line up — TODO confirm.
movies.loc[:,"genres_vectors"] = pd.Series(gnr_vectors)
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [155]:
import re

def get_year(regex, string):
  """Return capture group 1 of the first `regex` match in `string`.

  An empty string is returned when `regex` does not match at all.
  """
  match = re.search(regex, string)
  if match is None:
    return ''
  return match.group(1)

# Only accept a four-digit year in parentheses at the very end of the title,
# e.g. "Toy Story (1995)". Matching any four digits would pick up numbers that
# belong to the title itself: a title such as "2001: A Space Odyssey (1968)"
# would yield "2001". Titles without a trailing "(YYYY)" get an empty string.
regex = r'\(([0-9]{4})\)\s*$'
movies.loc[:,"launch_year"] = movies.title.apply(lambda x:get_year(regex,x))

In [156]:
# Build the title without its trailing "(YYYY)" suffix. Slicing off the last
# six characters corrupts any title that does not end with exactly "(YYYY)"
# (trailing whitespace, a year range, or digits that are part of the name), so
# the suffix is removed by pattern instead. Titles with no trailing year are
# kept as they are, apart from surrounding whitespace being stripped.
trailing_year = re.compile(r'\s*\([0-9]{4}\)\s*$')
movies.loc[:,"movie_title"] = movies.title.apply(lambda x: trailing_year.sub('', x).strip())

In [167]:
# Preview the table again, now including the launch_year and movie_title columns.
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II


In [210]:
!pip install sentencepiece

from gensim.models import  Word2Vec
import sentencepiece as spm
from tqdm import tqdm_notebook  # kept in case other cells still reference it
from tqdm.notebook import tqdm  # replacement named by the tqdm deprecation warning


regex ='[A-Za-z0-9]+'

# Drop special characters and turn every title into a list of words
list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()


input_file_path = os.path.join(data_path,"tokenizer.txt")
# `tokenizer_name` is passed to SentencePiece as --model_prefix, and the trainer
# writes "<prefix>.model" / "<prefix>.vocab". The prefix therefore must NOT
# carry the ".model" suffix itself, otherwise the trained file ends up as
# "tokenizer.model.model" while the loader below reads a different file.
tokenizer_name = os.path.join(data_path,"tokenizer")
tokenizer_name_model = tokenizer_name + ".model"

# make_input4tokenizer: one cleaned title per line
sentences = list(map(lambda x:" ".join(x),list_title_frac))

with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm(sentences):
    f.write(sentence+'\n')

# train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

vocab_size = 300
method = "bpe"

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
# Report the model file that was just written and is loaded next.
print("tokenizer {} is generated".format(tokenizer_name_model))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    """Tokenize every sentence with `sp` and return the cleaned token lists.

    For each sentence the pieces from `sp.EncodeAsPieces` have the "▁" marker
    removed, and pieces of one character or less are discarded. A sentence is
    kept only when more than one token survives that filtering.
    """
    kept_sentences = []
    for text in sentences:
        pieces = [piece.replace("▁", "") for piece in sp.EncodeAsPieces(text)]
        long_pieces = [piece for piece in pieces if len(piece) > 1]
        if len(long_pieces) > 1:
            kept_sentences.append(long_pieces)
    return kept_sentences


def get_tokens_from_sentence(sp, sentence):
    """Tokenize one sentence with `sp` and return its cleaned tokens.

    The "▁" marker is removed from every piece returned by
    `sp.EncodeAsPieces`, and pieces of one character or less are discarded.
    """
    stripped = (piece.replace("▁", "") for piece in sp.EncodeAsPieces(sentence))
    return [piece for piece in stripped if len(piece) > 1]

# Tokenize all cleaned titles; titles left with one token or fewer are dropped by the helper.
tokenized_sentences = get_tokens_from_sentences(sp,sentences)



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))


tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/tokenizer.model is generated


In [173]:
# list_title_frac embedding
# NOTE(review): `model` is not defined in any cell visible here — presumably a
# trained gensim Word2Vec model from a cell that was removed; confirm before
# relying on Restart & Run All.
word_vectors = model.wv
# gensim 4 removed KeyedVectors.vocab in favour of key_to_index; use whichever
# attribute the installed release provides.
vocab_index = word_vectors.key_to_index if hasattr(word_vectors, "key_to_index") else word_vectors.vocab
vocabs = vocab_index.keys()
word_vectors_list = [word_vectors[v] for v in vocabs]

In [140]:
# Show the movies whose cleaned title is shorter than five characters.
short_title_mask = movies.movie_title.map(len) < 5
movies.loc[short_title_mask]

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title
5,6,Heat (1995),Action|Crime|Thriller,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",1995,Heat
32,34,Babe (1995),Children|Drama,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...",1995,Babe
112,132,Jade (1995),Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jade
147,175,Kids (1995),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",1995,Kids
161,190,Safe (1995),Thriller,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Safe
...,...,...,...,...,...,...
9584,175303,It (2017),Horror,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2017,It
9621,177765,Coco (2017),Adventure|Animation|Children,"[0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",2017,Coco
9642,179813,LBJ (2017),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",2017,LBJ
9716,188797,Tag (2018),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",2018,Tag
