In [3]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
import os

IS_COLAB = True

if IS_COLAB:
  project_path = "/gdrive/MyDrive/colab/Factorization_Machine/"
else:
  project_path = os.path.dirname(os.path.abspath("__file__"))

data_path = os.path.join(project_path,"data")

In [5]:
ls -l /gdrive/MyDrive/colab/Factorization_Machine/data

total 18944
-rw------- 1 root root   197979 Nov 19 04:30 links.csv
-rw------- 1 root root   494431 Nov 24 01:17 movies.csv
-rw------- 1 root root  2483723 Nov 24 01:17 ratings.csv
-rw------- 1 root root   118660 Nov 19 04:30 tags.csv
-rw------- 1 root root   238833 Dec  2 02:30 tag_tokenizer_bpe_100.model
-rw------- 1 root root      672 Dec  2 02:30 tag_tokenizer_bpe_100.vocab
-rw------- 1 root root   370999 Nov 27 05:52 tag_tokenizer_bpe_9000.model
-rw------- 1 root root   114127 Nov 27 05:52 tag_tokenizer_bpe_9000.vocab
-rw------- 1 root root   271684 Dec  2 02:30 tag_tokenizer_input_bpe_100.txt
-rw------- 1 root root   236678 Nov 27 05:52 tag_tokenizer_input_bpe_24000.txt
-rw------- 1 root root   271684 Nov 27 05:57 tag_tokenizer_input_bpe_9000.txt
-rw------- 1 root root   276147 Dec  2 02:30 tag_w2v_bpe_100.model
-rw------- 1 root root   613252 Dec  2 02:30 title_tokenizer_bpe_24000.model
-rw------- 1 root root   340369 Dec  2 02:30 title_tokenizer_bpe_24000.vocab
-rw------- 1 root

In [6]:
!pip install sentencepiece

from gensim.models import  Word2Vec as w2v
import sentencepiece as spm
from tqdm import tqdm_notebook

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.5MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94


In [7]:
import pandas as pd
import numpy as np

csv_movies = os.path.join(data_path,"movies.csv")
csv_ratings = os.path.join(data_path,"ratings.csv")
csv_tags = os.path.join(data_path,"tags.csv")
csv_links =  os.path.join(data_path,"links.csv")

movies = pd.read_csv(csv_movies)
ratings = pd.read_csv(csv_ratings)
tags = pd.read_csv(csv_tags)
links = pd.read_csv(csv_links)

org_movies = movies.copy()
org_ratings = ratings.copy()
org_tags = tags.copy()
org_links = links.copy()

In [8]:
dict_mid_2_nid= {mid:nid for nid, mid in enumerate(movies.movieId.unique())}
dict_nid_2_mid = {nid:mid for mid,nid in dict_mid_2_nid.items()}

dict_uid_2_nid= {uid:nid for nid, uid in enumerate(ratings.userId.unique())}
dict_nid_2_uid = {nid:uid for uid,nid in dict_uid_2_nid.items()}

movies.loc[:,"m_nid"] = movies.movieId.apply(lambda x:dict_mid_2_nid[x])

ratings.loc[:,"u_nid"] = ratings.userId.apply(lambda x:dict_uid_2_nid[x])
ratings.loc[:,"m_nid"] = ratings.movieId.apply(lambda x:dict_mid_2_nid[x])

tags.loc[:,"u_nid"] = tags.userId.apply(lambda x:dict_uid_2_nid[x])
tags.loc[:,"m_nid"] = tags.movieId.apply(lambda x:dict_mid_2_nid[x])

links.loc[:,"m_nid"] = tags.movieId.apply(lambda x:dict_mid_2_nid[x])

In [9]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,2
2,1,6,4.0,964982224,0,5
3,1,47,5.0,964983815,0,43
4,1,50,5.0,964982931,0,46


In [11]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,u_nid,m_nid
0,2,60756,funny,1445714994,1,6801
1,2,60756,Highly quotable,1445714996,1,6801
2,2,60756,will ferrell,1445714992,1,6801
3,2,89774,Boxing story,1445715207,1,7697
4,2,89774,MMA,1445715200,1,7697


In [12]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId,m_nid
0,1,114709,862.0,6801.0
1,2,113497,8844.0,6801.0
2,3,113228,15602.0,6801.0
3,4,114885,31357.0,7697.0
4,5,113041,11862.0,7697.0


In [13]:
sorted_ratings = ratings.sort_values(by=["userId","timestamp"])
sorted_ratings['b4_timestamp'] = sorted_ratings.groupby(['userId'])['timestamp'].shift(1)
mc_sorted_ratings= sorted_ratings.groupby(['userId',"timestamp"])["m_nid"].count().to_frame(name="m_count")
sorted_ratings = pd.merge(left=sorted_ratings, right=mc_sorted_ratings, on=["userId","timestamp"], how="left")
sorted_ratings.loc[:,"b4_timestamp"] = sorted_ratings.b4_timestamp.fillna(0).astype("int64")
sorted_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count
0,1,804,4.0,964980499,0,632,0,2
1,1,1210,5.0,964980499,0,911,964980499,2
2,1,2018,5.0,964980523,0,1493,964980499,3
3,1,2628,4.0,964980523,0,1979,964980523,3
4,1,2826,4.0,964980523,0,2126,964980523,3


In [14]:
previous_timestamp = 0
counter = 0
former_movies = list()
now_movies = list()
movie_vectors = list()

for idx,row in tqdm_notebook(sorted_ratings.iterrows()):
  counter += 1
  m_nid = row["m_nid"].astype("int")
  m_count = row["m_count"]
  timestamp = row["timestamp"]
  b4_timestamp = row["b4_timestamp"]

  if previous_timestamp == 0 and counter <= m_count:
    now_movies.append(m_nid)
    movie_vector = np.zeros(len(dict_mid_2_nid)) 
    
    if counter == m_count:
      former_movies = now_movies
      previous_timestamp = timestamp
      now_movies = list()
      counter = 0
  
  elif previous_timestamp != 0 and counter <= m_count:
    now_movies.append(m_nid)
    movie_vector = np.zeros(len(dict_mid_2_nid))
    movie_vector[former_movies] = 1

    if counter == m_count:
      former_movies = now_movies
      previous_timestamp = timestamp
      now_movies = list()
      counter = 0

  movie_vectors.append(movie_vector)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [15]:
sorted_ratings.loc[:,"last_rate_vector"] = pd.Series(movie_vectors)
sorted_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector
0,1,804,4.0,964980499,0,632,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,1210,5.0,964980499,0,911,964980499,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,2018,5.0,964980523,0,1493,964980499,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,2628,4.0,964980523,0,1979,964980523,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,2826,4.0,964980523,0,2126,964980523,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [16]:
ratings = sorted_ratings.sort_values(by=["userId","movieId"]).copy()
ratings = ratings.reset_index(drop=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [17]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [18]:
unique_genres = np.unique(np.concatenate(np.array(movies.genres.apply(lambda x: np.array(x.split("|"))).tolist())))
dict_gid_2_gnr = { gid:genre for gid, genre in enumerate(unique_genres)} 
dict_gnr_2_gid = { genre:gid for gid, genre in dict_gid_2_gnr.items()}
unique_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [19]:
gnr_vectors = list()
for gnrs in movies.genres.apply(lambda x: x.split("|")).tolist():
  gnr_vector = np.zeros(len(unique_genres))
  for gnr in gnrs:
    gnr_vector[dict_gnr_2_gid[gnr]] = 1
  gnr_vectors.append(gnr_vector)

In [20]:
movies.loc[:,"genres_vectors"] = pd.Series(gnr_vectors)
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [21]:
import re

def get_year(regex, string):
  try:
    found = re.search(regex,string).group(0)[1:-1]
  except AttributeError:
    found = ''
  return found

regex = '\([0-9]{4}\)'
movies.loc[:,"launch_year"] = movies.title.apply(lambda x:get_year(regex,x))

In [22]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995


In [23]:
movies.loc[movies.launch_year=="","launch_year"] = 0
movies.loc[:,"launch_year"] = movies.launch_year.astype("int")
movies.loc[movies.launch_year==0].head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year
6059,40697,Babylon 5,Sci-Fi,6059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,9031,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9091,143410,Hyena Road,(no genres listed),9091,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),9138,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9179,149334,Nocturnal Animals,Drama|Thriller,9179,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0


In [24]:
movies.loc[movies.launch_year != 0,"movie_title"] = movies.loc[movies.launch_year!= 0,"title"].apply(lambda x:x[:-6].strip()) 
movies.loc[movies.launch_year== 0,"movie_title"] = movies.loc[movies.launch_year== 0,"title"]

In [25]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year,movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II


In [26]:
### 특수문자 제외 및 단어별 배열로 변경
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

vocab_size = 24000
method = "bpe"

input_file_path = os.path.join(data_path,"title_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"title_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"title_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
sentences = movies.movie_title.to_list()


with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    f.write(sentence+'\n')

### train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("title_tokenizer {} is generated".format(tokenizer_name))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            # if len(token) > 1:
            new_tokens.append(token)
        # if len(new_tokens) > 1:
        tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"title_w2v_{}_{}.model".format(method,vocab_size))
print("start train_title_w2v....")

size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("title_w2v {} is generated".format(w2v_name))

# ### get embedding

sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)

w2v_model = w2v.load(w2v_name)

sentence_embs = []
# # 학습이 안되는 벡터들에 대해서는 0값이 나으므로 0벡터로 input 처리 수행
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = []
  for p_word in sentence:
    try:
      word_embs.append(w2v_model.wv[p_word])
    except KeyError:
      pass
  if len(word_embs):
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(200).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))


title_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/title_tokenizer_bpe_24000 is generated
start train_title_w2v....
title_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/title_w2v_bpe_24000.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))




In [27]:
movies.loc[:,"title_vector"] = pd.Series(sentence_embs)
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year,movie_title,title_vector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story,"[-0.0016190423630177975, 0.020851947367191315,..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji,"[-0.0073105162009596825, 0.0023418047931045294..."
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men,"[-0.02799421176314354, -0.0012682080268859863,..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale,"[0.010159829631447792, 0.00872352346777916, 0...."
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II,"[-0.009705641306936741, -0.001838729134760797,..."


In [28]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,u_nid,m_nid
0,2,60756,funny,1445714994,1,6801
1,2,60756,Highly quotable,1445714996,1,6801
2,2,60756,will ferrell,1445714992,1,6801
3,2,89774,Boxing story,1445715207,1,7697
4,2,89774,MMA,1445715200,1,7697


In [29]:
from collections import OrderedDict
set_tag_user_movie = set(zip(tags.userId, tags.movieId))

dict_tag_string = OrderedDict()
for user, movie in set_tag_user_movie:
  list_tag = tags.loc[np.logical_and(tags.userId==user,tags.movieId==movie),"tag"].tolist()
  dict_tag_string[(user,movie)] = list_tag

dict_tag_string = OrderedDict(sorted(dict_tag_string.items()))

In [30]:
user_series = pd.Series(list(map(lambda x:x[0],dict_tag_string.keys())))
movie_series = pd.Series(list(map(lambda x:x[1],dict_tag_string.keys()))) 
tag_series = pd.Series(list(dict_tag_string.values()))

df_tag_list = pd.concat([user_series,movie_series,tag_series],axis=1)
df_tag_list.columns = ["userId","movieId","tags"]
df_tag_list.head()

Unnamed: 0,userId,movieId,tags
0,2,60756,"[funny, Highly quotable, will ferrell]"
1,2,89774,"[Boxing story, MMA, Tom Hardy]"
2,2,106782,"[drugs, Leonardo DiCaprio, Martin Scorsese]"
3,7,48516,[way too long]
4,18,431,"[Al Pacino, gangster, mafia]"


In [31]:
ratings = pd.merge(left=ratings, right=df_tag_list, on=["userId","movieId"], how="left")
ratings.loc[:,"tags"]= ratings.tags.fillna(" ")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector,tags
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",


In [32]:
### 특수문자 제외 및 단어별 배열로 변경
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

vocab_size = 100
method = "bpe"

input_file_path = os.path.join(data_path,"tag_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"tag_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"tag_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
tag_sentences = ratings.tags.to_list()

sentences = []
for tag_sentence in tag_sentences:
  sentence = " ".join(tag_sentence)
  sentences.append(sentence)

with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    tag_sentence = " ".join(sentence)
    f.write(tag_sentence+'\n')

### train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("tag_tokenizer {} is generated".format(tokenizer_name))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            # if len(token) > 1:
            new_tokens.append(token)
        # if len(new_tokens) > 1:
        tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"tag_w2v_{}_{}.model".format(method,vocab_size))
print("start train_tag_w2v....")

size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("tag_w2v {} is generated".format(w2v_name))

# ### get embedding

sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)

w2v_model = w2v.load(w2v_name)

sentence_embs = []
# # 학습이 안되는 벡터들에 대해서는 0값이 나으므로 0벡터로 input 처리 수행
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = []
  for p_word in sentence:
    try:
      word_embs.append(w2v_model.wv[p_word])
    except KeyError:
      pass
  if len(word_embs):
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(200).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=100836.0), HTML(value='')))


tag_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/tag_tokenizer_bpe_100 is generated
start train_tag_w2v....
tag_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/tag_w2v_bpe_100.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=100836.0), HTML(value='')))




In [33]:
ratings.loc[:,"tag_vector"] = pd.Series(sentence_embs)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector,tags,tag_vector
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [34]:
# user, movie, rating, movie_gnr_vec, movie_title_vec, launch_year, user_movie_tag_vector, last_rate_vector

target_ratings = ratings.loc[:,["u_nid","m_nid","tag_vector","last_rate_vector","rating"]]
target_movies = movies.loc[:,["m_nid","genres_vectors","title_vector","launch_year"]]

data = pd.merge(left=target_ratings, right=target_movies, on="m_nid", how="left")
data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,rating,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.0016190423630177975, 0.020851947367191315,...",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.02799421176314354, -0.0012682080268859863,...",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.021820921450853348, -0.013219483196735382,...",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.09966608136892319, 0.17752325534820557, -0....",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.012014994397759438, 0.018455717712640762, -...",1995


In [35]:
y_data = data.loc[:,"rating"]
X_data = data.drop("rating",axis=1)
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.0016190423630177975, 0.020851947367191315,...",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.02799421176314354, -0.0012682080268859863,...",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.021820921450853348, -0.013219483196735382,...",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.09966608136892319, 0.17752325534820557, -0....",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.012014994397759438, 0.018455717712640762, -...",1995


In [36]:
num_tag_vec = len(X_data["tag_vector"][0])
num_last_rate_vec = len(X_data["last_rate_vector"][0])
num_gnrs_vec = len(X_data["genres_vectors"][0])
num_title_vec = len(X_data["title_vector"][0])

print(num_tag_vec, num_last_rate_vec, num_gnrs_vec, num_title_vec)

200 9742 20 200


In [37]:
# from sklearn.preprocessing import StandardScaler

# launch_years = np.array(X_data.loc[X_data.launch_year != 0].launch_year.to_list()).reshape(-1,1)

# scaler = StandardScaler()
# scaled_year = scaler.fit_transform(launch_years)

# X_data.loc[X_data.launch_year !=0, "launch_year"] = scaled_year

In [38]:
# X_data.loc[X_data.launch_year !=0, "launch_year"] = scaled_year

In [39]:
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.0016190423630177975, 0.020851947367191315,...",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.02799421176314354, -0.0012682080268859863,...",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.021820921450853348, -0.013219483196735382,...",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.09966608136892319, 0.17752325534820557, -0....",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.012014994397759438, 0.018455717712640762, -...",1995


In [40]:
X_data.loc[:, "user_id"] = X_data.u_nid.apply(lambda x: dict_nid_2_uid[x])
X_data.loc[:, "movie_id"] = X_data.m_nid.apply(lambda x: dict_nid_2_mid[x])

In [41]:
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year,user_id,movie_id
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.0016190423630177975, 0.020851947367191315,...",1995,1,1
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.02799421176314354, -0.0012682080268859863,...",1995,1,3
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.021820921450853348, -0.013219483196735382,...",1995,1,6
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.09966608136892319, 0.17752325534820557, -0....",1995,1,47
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.012014994397759438, 0.018455717712640762, -...",1995,1,50


In [42]:
X_columns = ["user_id", "movie_id","tag_vector","last_rate_vector","genres_vectors","title_vector","launch_year"]
X_data = X_data.loc[:,X_columns].copy()
X_data.head()

Unnamed: 0,user_id,movie_id,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.0016190423630177975, 0.020851947367191315,...",1995
1,1,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.02799421176314354, -0.0012682080268859863,...",1995
2,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.021820921450853348, -0.013219483196735382,...",1995
3,1,47,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.09966608136892319, 0.17752325534820557, -0....",1995
4,1,50,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.012014994397759438, 0.018455717712640762, -...",1995


interaction

In [183]:
interactions = X_data[["user_id","movie_id"]]
interactions.columns = ["user_id","item_id"]
interactions.head()

Unnamed: 0,user_id,item_id
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50


item_features

In [184]:
target_movies.loc[:,"movie_id"] = target_movies.m_nid.apply(lambda x:dict_nid_2_mid[x])
target_movies = target_movies.loc[:,["movie_id","genres_vectors","title_vector","launch_year"]]
target_movies.head()

AttributeError: ignored

In [185]:
from sklearn.preprocessing import StandardScaler

launch_years = np.array(target_movies.loc[target_movies.launch_year != 0].launch_year.to_list()).reshape(-1,1)

scaler = StandardScaler()
scaled_year = scaler.fit_transform(launch_years)

target_movies.loc[target_movies.launch_year !=0, "launch_year"] = scaled_year
target_movies.head()

Unnamed: 0,movie_id,genres_vectors,title_vector,launch_year
0,1,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.0016190423630177975, 0.020851947367191315,...",0.020846
1,2,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0073105162009596825, 0.0023418047931045294...",0.020846
2,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.02799421176314354, -0.0012682080268859863,...",0.020846
3,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...","[0.010159829631447792, 0.00872352346777916, 0....",0.020846
4,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[-0.009705641306936741, -0.001838729134760797,...",0.020846


In [186]:
len_gnr_vec = len(target_movies.iloc[0]["genres_vectors"])
len_title_vec = len(target_movies.iloc[0]["title_vector"])

gnr_vec_cols = ["gv_{}".format(i) for i in range(len_gnr_vec)]
title_vec_cols = ["ttv_{}".format(i) for i in range(len_title_vec)]

In [187]:
df_gnr_vec =pd.DataFrame(target_movies["genres_vectors"].to_list(),columns=gnr_vec_cols)
df_title_vec =pd.DataFrame(target_movies["title_vector"].to_list(),columns=title_vec_cols)

In [188]:
item_features = pd.concat([target_movies["movie_id"],df_gnr_vec,df_title_vec,target_movies["launch_year"]],axis=1)
item_features.columns = ["item_id"] + item_features.columns[1:].to_list()
item_features.head()

Unnamed: 0,item_id,gv_0,gv_1,gv_2,gv_3,gv_4,gv_5,gv_6,gv_7,gv_8,gv_9,gv_10,gv_11,gv_12,gv_13,gv_14,gv_15,gv_16,gv_17,gv_18,gv_19,ttv_0,ttv_1,ttv_2,ttv_3,ttv_4,ttv_5,ttv_6,ttv_7,ttv_8,ttv_9,ttv_10,ttv_11,ttv_12,ttv_13,ttv_14,ttv_15,ttv_16,ttv_17,ttv_18,...,ttv_161,ttv_162,ttv_163,ttv_164,ttv_165,ttv_166,ttv_167,ttv_168,ttv_169,ttv_170,ttv_171,ttv_172,ttv_173,ttv_174,ttv_175,ttv_176,ttv_177,ttv_178,ttv_179,ttv_180,ttv_181,ttv_182,ttv_183,ttv_184,ttv_185,ttv_186,ttv_187,ttv_188,ttv_189,ttv_190,ttv_191,ttv_192,ttv_193,ttv_194,ttv_195,ttv_196,ttv_197,ttv_198,ttv_199,launch_year
0,1,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.001619,0.020852,0.014607,-0.07447,0.037693,-0.022565,0.045284,0.148651,-0.059135,0.112576,0.039974,-0.052657,-0.101298,-0.003653,-0.016639,-0.062109,0.064035,0.021209,-0.094346,...,-0.045125,0.053715,0.021413,-0.111884,-0.031264,0.077764,0.047066,-0.05391,-0.088029,-0.037358,0.091443,-0.021182,-0.014649,-0.095442,-0.034684,-0.043085,-0.077699,-0.099904,-0.076316,-0.120376,-0.01952,-0.060019,0.034681,0.056294,0.082642,0.104642,-0.009676,-0.097496,-0.016171,-0.029408,-0.165758,0.071256,-0.007154,-0.053782,-0.064591,0.086768,-0.005433,0.063536,-0.009162,0.020846
1,2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.007311,0.002342,0.010191,-0.014122,0.024671,-0.024247,0.004481,0.02866,-0.005594,0.029289,0.004143,-0.011158,-0.026253,-0.00273,-0.004685,-0.007629,-0.003728,0.01375,-0.008839,...,-0.015092,0.015436,0.005814,-0.022853,-0.002913,0.010017,0.014642,-0.015833,-0.000161,-0.002677,0.004497,0.006157,-0.003202,-0.01136,0.0055,-0.003886,-0.006371,-0.034481,-0.018362,-0.021787,0.003398,-0.016522,0.008819,-0.000587,0.036083,0.025864,0.001697,-0.027222,0.008119,-0.001907,-0.027025,0.010351,-0.002751,-0.023396,-0.011686,0.030641,-0.001899,0.020159,0.000216,0.020846
2,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.027994,-0.001268,0.036716,-0.117162,0.068892,-0.056321,0.056485,0.21603,-0.06282,0.145964,0.018308,-0.063717,-0.166279,-0.006023,-0.058182,-0.11343,0.05996,0.039956,-0.11916,...,-0.068407,0.08318,0.036119,-0.168197,-0.052839,0.089922,0.057375,-0.104971,-0.131909,-0.034791,0.117728,-0.004331,-0.009111,-0.127302,-0.02197,-0.045779,-0.08846,-0.163543,-0.102422,-0.169728,-0.017971,-0.091514,0.083277,0.079518,0.164443,0.174463,-0.015113,-0.181278,-0.012017,-0.021333,-0.222293,0.090442,-0.018218,-0.100902,-0.069906,0.158568,-0.05146,0.095885,-6.9e-05,0.020846
3,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.01016,0.008724,0.018908,-0.106886,0.044604,-0.02785,0.06272,0.177042,-0.058275,0.113545,0.027842,-0.066603,-0.131301,0.015575,-0.045344,-0.081402,0.042827,0.018336,-0.096594,...,-0.050559,0.082381,0.019653,-0.12673,-0.039895,0.072926,0.044712,-0.055442,-0.102742,-0.050351,0.099289,-0.02731,-0.004189,-0.097082,-0.030951,-0.033054,-0.077786,-0.125809,-0.078104,-0.128231,-0.014607,-0.081827,0.058767,0.053064,0.110467,0.123124,-0.00669,-0.13709,0.005266,-0.011584,-0.180073,0.075501,-0.000978,-0.068578,-0.065922,0.102897,-0.02404,0.067549,0.019952,0.020846
4,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.009706,-0.001839,-0.004205,-0.151756,-0.004311,0.010818,0.084913,0.283817,-0.091185,0.1342,0.020124,-0.094595,-0.220452,-0.008585,-0.087853,-0.150233,0.149963,-0.000101,-0.176484,...,-0.066,0.07414,-0.010416,-0.217991,-0.101602,0.142051,0.054253,-0.087568,-0.205137,-0.055496,0.21355,-0.046543,0.016487,-0.179826,-0.072562,-0.055356,-0.126756,-0.135448,-0.089017,-0.210058,0.01049,-0.131375,0.095569,0.125783,0.11555,0.170745,-0.04534,-0.231334,-0.024179,-0.033537,-0.260236,0.151213,0.021207,-0.106634,-0.096092,0.122233,-0.046903,0.116846,-0.008933,0.020846


In [189]:
item_features.item_id.nunique()

9742

In [191]:
interactions.item_id.nunique()

9724

In [193]:
drop_target = [if_item  for if_item in item_features.item_id.unique() if if_item not in interactions.item_id.unique()]

In [194]:
# interaction에 없는 item_id의 feature 정보는 제외
n_item_features= item_features.loc[~item_features.item_id.apply(lambda x: x in drop_target)]

In [196]:
np.random.seed(1492)
interactions["random"] = np.random.random(size=len(interactions))
test_pct = 0.2

train_mask = interactions['random'] <  (1 - test_pct)
valid_mask = interactions['random'] >= (1 - test_pct)

interactions_train = interactions[train_mask][['user_id', 'item_id']]
interactions_valid = interactions[valid_mask][['user_id', 'item_id']]

train_users = np.sort(interactions_train.user_id.unique())
valid_users = np.sort(interactions_valid.user_id.unique())
cold_start_users = set(valid_users) - set(train_users)

train_items = np.sort(interactions_train.item_id.unique())
valid_items = np.sort(interactions_valid.item_id.unique())
cold_start_items = set(valid_items) - set(train_items)

print("train shape: {}".format(interactions_train.shape))
print("valid shape: {}".format(interactions_valid.shape))

print("train users: {}".format(len(train_users)))
print("valid users: {}".format(len(valid_users)))
print("cold-start users: {}".format(cold_start_users))

print("train items: {}".format(len(train_items)))
print("valid items: {}".format(len(valid_items)))
print("cold-start items: {}".format(cold_start_items))

train shape: (80392, 2)
valid shape: (20444, 2)
train users: 610
valid users: 610
cold-start users: set()
train items: 8967
valid items: 5172
cold-start items: {83969, 2056, 155659, 6158, 77841, 6163, 4116, 79897, 131098, 96283, 118814, 131104, 4129, 159779, 6181, 6182, 65577, 8238, 135216, 118834, 143410, 86068, 98361, 4154, 77881, 6204, 36931, 26693, 133195, 6223, 6225, 149590, 120919, 141400, 26713, 4189, 65631, 6241, 143458, 151653, 53355, 77931, 8302, 26736, 118896, 51314, 117, 2165, 96373, 2172, 73858, 90243, 71810, 90245, 100487, 127116, 163981, 151695, 8336, 163985, 32914, 149, 32917, 127134, 26791, 6316, 100527, 82095, 121007, 57526, 84156, 92348, 110781, 102590, 104644, 6342, 6344, 121035, 127180, 129229, 151759, 78034, 6358, 219, 172253, 4323, 2281, 8427, 127212, 168174, 241, 243, 6390, 80124, 8446, 2304, 8450, 96518, 88327, 78088, 8462, 51471, 90384, 69904, 96530, 180497, 26901, 43289, 176413, 6429, 106785, 295, 51498, 4394, 301, 53550, 26928, 67888, 4402, 6448, 170289, 645

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [205]:
# user_features = user_features[user_features.user_id.isin(train_users)]
item_features = item_features[item_features.item_id.isin(train_items)]
item_features.shape

(8967, 222)

Rankfm

In [63]:
!pip install rankfm

Collecting rankfm
[?25l  Downloading https://files.pythonhosted.org/packages/75/4a/34c98f1c2784204ba1bfccc98b9bf19c21cf2fb2a058582ca2961b02dd80/rankfm-0.2.5.tar.gz (145kB)
[K     |██▎                             | 10kB 24.3MB/s eta 0:00:01[K     |████▌                           | 20kB 19.5MB/s eta 0:00:01[K     |██████▊                         | 30kB 11.5MB/s eta 0:00:01[K     |█████████                       | 40kB 9.8MB/s eta 0:00:01[K     |███████████▎                    | 51kB 5.3MB/s eta 0:00:01[K     |█████████████▌                  | 61kB 5.9MB/s eta 0:00:01[K     |███████████████▊                | 71kB 6.4MB/s eta 0:00:01[K     |██████████████████              | 81kB 6.5MB/s eta 0:00:01[K     |████████████████████▎           | 92kB 6.8MB/s eta 0:00:01[K     |██████████████████████▌         | 102kB 6.9MB/s eta 0:00:01[K     |████████████████████████▊       | 112kB 6.9MB/s eta 0:00:01[K     |███████████████████████████     | 122kB 6.9MB/s eta 0:00:01[K 

In [206]:
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall, diversity

model = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.10, learning_schedule='invscaling')
model

<rankfm.rankfm.RankFM at 0x7ff295158c50>

In [212]:
interactions_train.item_id.unique()

array([     1,      6,     47, ..., 160527, 160836, 163937])

In [215]:
n_item_features = n_item_features.loc[n_item_features.item_id.isin(interactions_train.item_id.unique())]

In [216]:
model.fit(interactions = interactions_train, item_features= n_item_features, epochs=20,verbose=True)


training epoch: 0
log likelihood: -44315.4296875

training epoch: 1
log likelihood: -41140.01953125

training epoch: 2
log likelihood: -40590.671875

training epoch: 3
log likelihood: -40032.69921875

training epoch: 4
log likelihood: -38975.2109375

training epoch: 5
log likelihood: -38289.23046875

training epoch: 6
log likelihood: -37383.6484375

training epoch: 7
log likelihood: -36384.01171875

training epoch: 8
log likelihood: -35681.3203125

training epoch: 9
log likelihood: -34959.87890625

training epoch: 10
log likelihood: -34325.96875

training epoch: 11
log likelihood: -33674.51953125

training epoch: 12
log likelihood: -33256.28125

training epoch: 13
log likelihood: -32794.58984375

training epoch: 14
log likelihood: -32336.529296875

training epoch: 15
log likelihood: -32013.0

training epoch: 16
log likelihood: -31739.2109375

training epoch: 17
log likelihood: -31555.890625

training epoch: 18
log likelihood: -31245.939453125

training epoch: 19
log likelihood: -31239

In [217]:
valid_scores = model.predict(interactions_valid, cold_start='nan') 
print(valid_scores.shape)
pd.Series(valid_scores).describe()

(20444,)


count    19623.000000
mean         1.299727
std          1.198021
min         -2.426176
25%          0.362357
50%          1.274077
75%          2.177650
max          5.143312
dtype: float64

In [301]:
valid_recommendations = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='nan')
valid_recommendations

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,2716,1407,2081,1391,223,3114,3175,588,1968,2605
2,68157,80463,48516,59315,2959,69122,112183,593,112556,54286
3,1196,1200,1214,1197,1356,1036,2918,2161,1282,1269
4,2858,1247,223,1,858,1172,50,2396,2987,1285
5,780,318,593,356,454,592,32,480,648,350
...,...,...,...,...,...,...,...,...,...,...
606,296,7153,3949,1246,8949,923,4993,92259,47,3996
607,1210,648,593,1580,1200,1197,588,1073,1240,1198
608,333,2115,2683,3793,1517,2700,466,1690,3253,1197
609,780,153,377,593,165,32,95,349,434,648


In [302]:
k = 10

In [303]:
most_popular = interactions_train.groupby('item_id')['user_id'].count().sort_values(ascending=False)[:k]
most_popular

item_id
318     263
356     262
296     252
593     227
2571    225
260     199
480     198
110     186
589     182
50      175
Name: user_id, dtype: int64

In [304]:
test_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
test_user_items = {key: val for key, val in test_user_items.items() if key in set(train_users)}

base_hrt = np.mean([int(len(set(most_popular.index) & set(val)) > 0)                       for key, val in test_user_items.items()])
base_pre = np.mean([len(set(most_popular.index) & set(val)) / len(set(most_popular.index)) for key, val in test_user_items.items()])
base_rec = np.mean([len(set(most_popular.index) & set(val)) / len(set(val))                for key, val in test_user_items.items()])

print("number of test users: {}".format(len(test_user_items)))
print("baseline hit rate: {:.3f}".format(base_hrt))
print("baseline precision: {:.3f}".format(base_pre))
print("baseline recall: {:.3f}".format(base_rec))

number of test users: 610
baseline hit rate: 0.515
baseline precision: 0.081
baseline recall: 0.048


## predict dcg for baseline [popular]

In [322]:
def idcg(l):
  return sum((1.0 / np.log(i + 2) for i in range(l)))

In [336]:
def get_data(obj):
    """get the numeric data from either a pd.dataframe or np.ndarray

    :param obj: pd.dataframe or np.ndarray
    :return: the object's underlying np.ndarray data
    """

    if obj.__class__.__name__ in ('DataFrame', 'Series'):
        data = obj.values
    elif obj.__class__.__name__ == 'ndarray':
        data = obj
    else:
        raise TypeError("input data must be in either pd.dataframe/pd.series or np.ndarray format")
    return data


In [337]:
def discounted_cumulative_gain(model, test_interactions, k=10, filter_previous=False):
    """evaluate discounted cumulative gain wrt out-of-sample observed interactions

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: mean discounted cumulative gain wrt the test users
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # transform interactions into a user -> items dictionary
    test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
    test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
    test_users = list(test_user_items.keys())

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
    comm_user = test_recs.index.values

    # calculate the discounted cumulative gain (sum of inverse log scaled ranks of relevant items) wrt common users
    # 맞는 노래의 index를 가져오기.
    match_indexes = [np.where(test_recs.loc[u].isin(set(test_recs.loc[u]) & test_user_items[u]))[0] for u in comm_user]
    # discounted_cumulative_gain = np.mean([np.sum(1 / np.log2(index + 2)) if len(index) > 0 else 0 for index in match_indexes])
    dgcs = np.array([np.sum(1 / np.log2(index + 2)) if len(index) > 0 else 0 for index in match_indexes])
    idcgs = np.array([idcg(k) for idx in range(len(dgcs))])
    ndcgs = dgcs / idcgs

    return np.mean(ndcgs)


In [338]:
d1 = np.array([i+2 for i in range(10)])
d2 = np.array([i+1%5 for i in range(10)])

In [339]:
%%time
# model_hit_rate = hit_rate(model, interactions_valid, k=k)
# model_reciprocal_rank = reciprocal_rank(model, interactions_valid, k=k)
model_dcg = discounted_cumulative_gain(model, interactions_valid, k=k)
# model_precision = precision(model, interactions_valid, k=k)
# model_recall = recall(model, interactions_valid, k=k)

CPU times: user 24 s, sys: 8.98 ms, total: 24 s
Wall time: 24 s


In [312]:
print("hit_rate: {:.3f}".format(model_hit_rate))
print("reciprocal_rank: {:.3f}".format(model_reciprocal_rank))
print("dcg: {:.3f}".format(model_dcg, 3))
print("precision: {:.3f}".format(model_precision))
print("recall: {:.3f}".format(model_recall))

hit_rate: 0.528
reciprocal_rank: 0.208
dcg: 0.378
precision: 0.080
recall: 0.060


In [340]:
print("dcg: {:.3f}".format(model_dcg, 3))

dcg: 0.058


In [314]:
model_dcg

0.3782754228922356

In [315]:
idcg = 0.0
for i in range(10):
  idcg += 1.0 / np.log(i+2)

In [316]:
idcg

6.554970525044798

Implicit Baseline

In [276]:
!pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/bc/07/c0121884722d16e2c5beeb815f6b84b41cbf22e738e4075f1475be2791bc/implicit-0.4.4.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 6.6MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.4-cp36-cp36m-linux_x86_64.whl size=3419451 sha256=5011bbfb16495aeffb893eed2fe3181b22581e51286b59b0db41362cd5f7f003
  Stored in directory: /root/.cache/pip/wheels/bf/d4/ec/fd4f622fcbefb7521f149905295b2c26adecb23af38aa28217
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.4


In [286]:
n_users_train = interactions.user_id.max()
n_items_train = interactions.item_id.max()

In [288]:
from scipy.sparse import csr_matrix

# create zero-based index position <-> user/item ID mappings
index_to_user = pd.Series(np.sort(np.unique(interactions_train['user_id'])))
index_to_item = pd.Series(np.sort(np.unique(interactions_train['item_id'])))

# create reverse mappings from user/item ID to index positions
user_to_index = pd.Series(data=index_to_user.index, index=index_to_user.values)
item_to_index = pd.Series(data=index_to_item.index, index=index_to_item.values)

# convert user/item identifiers to index positions
interactions_train_imp = interactions_train.copy()
interactions_train_imp['user_id'] = interactions_train['user_id'].map(user_to_index)
interactions_train_imp['item_id'] = interactions_train['item_id'].map(item_to_index)

rows = interactions_train_imp['user_id']
cols = interactions_train_imp['item_id']
data = np.ones(len(rows))

# n_users_train
user_items_imp = csr_matrix((data, (rows, cols)), shape=(n_users_train, n_items_train))
item_users_imp = user_items_imp.T.tocsr()

In [291]:
from implicit.als import AlternatingLeastSquares

imp_model = AlternatingLeastSquares(factors=20)
imp_model.fit(item_users_imp)

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [298]:
recs_imp = imp_model.recommend_all(user_items=user_items_imp, N=10, filter_already_liked_items=False)
recs_imp = pd.DataFrame(recs_imp, index=index_to_user.values).apply(lambda c: c.map(index_to_item))

HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




In [299]:
valid_user_items = interactions_valid.groupby('user_id')['item_id'].apply(set).to_dict()
combined_users = set(train_users) & set(valid_users)

imp_hrt = np.mean([int(len(set(recs_imp.loc[u]) & valid_user_items[u]) > 0) for u in combined_users])
imp_pre = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(recs_imp.loc[u]) for u in combined_users])
imp_rec = np.mean([len(set(recs_imp.loc[u]) & valid_user_items[u]) / len(valid_user_items[u]) for u in combined_users])

In [300]:
print(imp_hrt)
print(imp_pre)
print(imp_rec)

0.5131147540983606
0.07967213114754099
0.06547803694496168


In [297]:
recs_imp.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1,858,589,1968,1200,2918,919,1036,1374,2762,1221
2,2959,2571,296,356,593,68157,7153,59315,4993,2329
3,1200,924,1214,1499,2641,1375,2985,1222,1994,3702
4,2858,1,2396,608,858,50,1206,527,318,1270
5,356,592,480,318,380,165,593,454,377,161
