* https://www.kaggle.com/niyamatalmass/lightfm-hybrid-recommendation-system
* https://towardsdatascience.com/how-i-would-explain-building-lightfm-hybrid-recommenders-to-a-5-year-old-b6ee18571309

In [1]:
from google.colab import drive
drive.mount("/gdrive")

Mounted at /gdrive


In [2]:
!pip install lightfm

Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/5e/fe/8864d723daa8e5afc74080ce510c30f7ad52facf6a157d4b42dec83dfab4/lightfm-1.16.tar.gz (310kB)
[K     |█                               | 10kB 20.4MB/s eta 0:00:01[K     |██▏                             | 20kB 27.2MB/s eta 0:00:01[K     |███▏                            | 30kB 26.7MB/s eta 0:00:01[K     |████▎                           | 40kB 19.6MB/s eta 0:00:01[K     |█████▎                          | 51kB 16.1MB/s eta 0:00:01[K     |██████▍                         | 61kB 17.8MB/s eta 0:00:01[K     |███████▍                        | 71kB 14.5MB/s eta 0:00:01[K     |████████▌                       | 81kB 15.9MB/s eta 0:00:01[K     |█████████▌                      | 92kB 15.5MB/s eta 0:00:01[K     |██████████▋                     | 102kB 14.1MB/s eta 0:00:01[K     |███████████▋                    | 112kB 14.1MB/s eta 0:00:01[K     |████████████▊                   | 122kB 14.1MB/s eta 0:

In [3]:
ls -l /gdrive/MyDrive/colab/Factorization_Machine/data

total 18925
-rw------- 1 root root   197979 Nov 19 04:30 links.csv
-rw------- 1 root root   494431 Nov 24 01:17 movies.csv
-rw------- 1 root root  2483723 Nov 24 01:17 ratings.csv
-rw------- 1 root root   118660 Nov 19 04:30 tags.csv
-rw------- 1 root root   238833 Dec  3 01:17 tag_tokenizer_bpe_100.model
-rw------- 1 root root      672 Dec  3 01:17 tag_tokenizer_bpe_100.vocab
-rw------- 1 root root   370999 Nov 27 05:52 tag_tokenizer_bpe_9000.model
-rw------- 1 root root   114127 Nov 27 05:52 tag_tokenizer_bpe_9000.vocab
-rw------- 1 root root   271684 Dec  3 01:17 tag_tokenizer_input_bpe_100.txt
-rw------- 1 root root   236678 Nov 27 05:52 tag_tokenizer_input_bpe_24000.txt
-rw------- 1 root root   271684 Nov 27 05:57 tag_tokenizer_input_bpe_9000.txt
-rw------- 1 root root   276196 Dec  3 01:17 tag_w2v_bpe_100.model
-rw------- 1 root root   613252 Dec  4 07:37 title_tokenizer_bpe_24000.model
-rw------- 1 root root   340369 Dec  4 07:37 title_tokenizer_bpe_24000.vocab
-rw------- 1 root

In [4]:
import pandas as pd
import numpy as np
import os

data_path = "/gdrive/MyDrive/colab/Factorization_Machine/data"

# Resolve the four MovieLens CSV files inside the mounted Drive folder.
ratings_path, movies_path, links_path, tags_path = (
    os.path.join(data_path, file_name)
    for file_name in ("ratings.csv", "movies.csv", "links.csv", "tags.csv")
)

# Read every file into its own DataFrame, in the same order as the paths above.
ratings, movies, links, tags = (
    pd.read_csv(csv_path)
    for csv_path in (ratings_path, movies_path, links_path, tags_path)
)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


## [movies] Genre Vectorization

In [9]:
# Build the sorted genre vocabulary. np.concatenate accepts the list of
# per-movie genre lists directly; wrapping that ragged list in np.array()
# first is rejected by current NumPy releases.
unique_genres = np.unique(np.concatenate(movies.genres.str.split("|").tolist()))
# Two-way lookup between a genre name and its position in the vocabulary.
dict_gid_2_gnr = dict(enumerate(unique_genres))
dict_gnr_2_gid = { genre:gid for gid, genre in dict_gid_2_gnr.items()}
unique_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [10]:
# Multi-hot encode each movie's genres: one int8 slot per vocabulary entry,
# set to 1 for every genre named in the pipe-separated string.
n_genres = len(unique_genres)
gnr_vectors = []
for genre_string in movies.genres.tolist():
  active_ids = [dict_gnr_2_gid[name] for name in genre_string.split("|")]
  gnr_vector = np.zeros(n_genres, dtype=np.int8)
  gnr_vector[active_ids] = 1
  gnr_vectors.append(gnr_vector)

In [11]:
movies.loc[:,"genres_vectors"] = pd.Series(gnr_vectors)
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,5,Father of the Bride Part II (1995),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## [movies] Title Word2Vec

### 1. make year vector

In [12]:
import re

def get_year(regex, string):
  """Return the first match of `regex` in `string` without its first and last character.

  With the pattern used in this notebook the match looks like "(1995)", so the
  surrounding parentheses are dropped. An empty string is returned when
  nothing matches.
  """
  match = re.search(regex, string)
  if match is None:
    return ''
  return match.group(0)[1:-1]

# Raw string: "\(" is not a valid escape in a normal string literal and
# triggers an invalid-escape warning on recent Python versions.
regex = r'\([0-9]{4}\)'
# Extract the four-digit release year from each title ('' when absent).
movies.loc[:,"launch_year"] = movies.title.apply(lambda x:get_year(regex,x))

In [13]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1995
4,5,Father of the Bride Part II (1995),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995


In [14]:
# Titles without a "(YYYY)" token get the sentinel year 0.
movies.loc[movies.launch_year=="","launch_year"] = 0
# Replace the column outright: assigning through .loc[:, col] writes into the
# existing object-dtype column on recent pandas, so the values would never
# actually become an integer column.
movies["launch_year"] = movies.launch_year.astype("int")
movies.loc[movies.launch_year==0].head()

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year
6059,40697,Babylon 5,Sci-Fi,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9091,143410,Hyena Road,(no genres listed),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9179,149334,Nocturnal Animals,Drama|Thriller,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0


In [15]:
# Derive the bare title. Only a "(YYYY)" token at the very end of the title
# is removed; blindly slicing off the last six characters corrupts titles
# that carry trailing whitespace or extra text after the year.
has_year = movies.launch_year != 0
movies["movie_title"] = movies["title"]
movies.loc[has_year, "movie_title"] = (
    movies.loc[has_year, "title"]
    .str.replace(r"\s*\([0-9]{4}\)\s*$", "", regex=True)
    .str.strip()
)

In [16]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,Father of the Bride Part II


In [17]:
# Two-way lookup between a release year and its rank among the sorted distinct years.
sorted_years = np.sort(movies.launch_year.unique())
dict_id_2_year = dict(enumerate(sorted_years))
dict_year_2_id = {year: idx for idx, year in dict_id_2_year.items()}

In [18]:
# One-hot encode the release year of every movie.
# Looking the ids up in one pass avoids iterrows() and avoids binding the
# builtin name `id` at notebook scope.
year_ids = [dict_year_2_id[year] for year in movies["launch_year"]]
# Row i of the identity matrix is the one-hot vector for year id i.
launch_year_vectors = list(np.eye(len(dict_year_2_id), dtype=np.int8)[year_ids])

In [19]:
movies.loc[:,"launch_year_vectors"]= pd.Series(launch_year_vectors)

In [20]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title,launch_year_vectors
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995,Toy Story,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995,Jumanji,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,Grumpier Old Men,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1995,Waiting to Exhale,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,Father of the Bride Part II (1995),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,Father of the Bride Part II,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [21]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 20.9MB/s eta 0:00:01[K     |▋                               | 20kB 29.5MB/s eta 0:00:01[K     |▉                               | 30kB 25.8MB/s eta 0:00:01[K     |█▏                              | 40kB 19.8MB/s eta 0:00:01[K     |█▌                              | 51kB 16.1MB/s eta 0:00:01[K     |█▊                              | 61kB 18.3MB/s eta 0:00:01[K     |██                              | 71kB 16.4MB/s eta 0:00:01[K     |██▍                             | 81kB 16.7MB/s eta 0:00:01[K     |██▋                             | 92kB 15.9MB/s eta 0:00:01[K     |███                             | 102kB 14.2MB/s eta 0:00:01[K     |███▎                            | 112kB 14.2MB/s eta 0:00:01[K     |███▌        

### 2. make title vector

In [22]:
from tqdm import tqdm_notebook
from gensim.models import Word2Vec as w2v
import sentencepiece as spm

### Exclude special characters and convert each title into an array of words (disabled)
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

# Tokenizer settings; both values are also embedded in the artefact file names below.
vocab_size = 24000
method = "bpe"

# Paths of the tokenizer training corpus, the model prefix handed to the
# trainer, and the ".model" file that is loaded back afterwards.
input_file_path = os.path.join(data_path,"title_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"title_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"title_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
# One training sentence per movie: the title with the release year removed.
sentences = movies.movie_title.to_list()


# Write the corpus file, one title per line.
with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    f.write(sentence+'\n')

### train_tokenizer
# Command-line style argument string for the SentencePiece trainer. The
# trailing backslashes continue the string literal, so this is one long line.
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("title_tokenizer {} is generated".format(tokenizer_name))
# Load the model file back so the titles can be tokenized with it.
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    """Tokenize every sentence with `sp` and strip the "▁" marker from each piece.

    Returns one list of pieces per input sentence, in input order. Nothing is
    filtered out, so a piece that consisted only of the marker becomes "".
    """
    return [
        [piece.replace("▁", "") for piece in sp.EncodeAsPieces(sentence)]
        for sentence in sentences
    ]


def get_tokens_from_sentence(sp, sentence):
    """Tokenize one sentence with `sp`, strip the "▁" marker, and keep pieces longer than one character."""
    stripped = (piece.replace("▁", "") for piece in sp.EncodeAsPieces(sentence))
    return [token for token in stripped if len(token) > 1]

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"title_w2v_{}_{}.model".format(method,vocab_size))
print("start train_title_w2v....")

# Word2Vec hyper-parameters.
size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("title_w2v {} is generated".format(w2v_name))

### get embedding
# The tokenizer `sp` loaded earlier in this cell is still valid, so it is not reloaded.
# Read the model back from disk so the embeddings come from the saved file.
w2v_model = w2v.load(w2v_name)

sentence_embs = []
# A title embedding is the average of its token vectors. Tokens without a
# trained vector are skipped, and a title with no known token gets an
# all-zero vector whose length follows `size`.
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = [w2v_model.wv[p_word] for p_word in sentence if p_word in w2v_model.wv]
  if word_embs:
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(size).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))


title_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/title_tokenizer_bpe_24000 is generated
start train_title_w2v....
title_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/title_w2v_bpe_24000.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))




In [23]:
movies.loc[:,"title_vector"] = pd.Series(sentence_embs)
movies.head()

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title,launch_year_vectors,title_vector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995,Toy Story,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.023692114278674126, 0.12745040655136108, 0...."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1995,Jumanji,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.005723795387893915, 0.011553888209164143, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,Grumpier Old Men,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.008606274612247944, 0.13409554958343506, -..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1995,Waiting to Exhale,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01480109803378582, 0.12307507544755936, 0.0..."
4,5,Father of the Bride Part II (1995),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1995,Father of the Bride Part II,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.03158659115433693, 0.1915138214826584, -0.0..."


In [24]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Data Preparation for LightFM

In [25]:
org_ratings = ratings.copy()
org_movies = movies.copy()

In [26]:
# Shift the 1-based MovieLens ids to 0-based ids. The values are derived from
# the untouched org_* copies so that re-running this cell does not shift the
# ids a second time.
ratings.loc[:,"movieId"] = (org_ratings.movieId - 1)
ratings.loc[:,"userId"] = (org_ratings.userId - 1)

movies.loc[:,"movieId"] = (org_movies.movieId - 1)

### make item features

In [27]:
len_gnr_vec = len(movies.iloc[0]["genres_vectors"])
len_year_vec = len(movies.iloc[0]["launch_year_vectors"])
# len_title_vec = len(movies.iloc[0]["title_vector"])

In [28]:
gnr_vec_cols = [ "gv_{}".format(i) for i in range(len_gnr_vec)]
year_vec_cols = [ "yv_{}".format(i) for i in range(len_year_vec)]
# title_vec_cols = [ "tv_{}".format(i) for i in range(len_title_vec)]

In [29]:
list_if = [ gnr_vec+":"+"{}".format(i) for gnr_vec in gnr_vec_cols for i in range(2)] + [ year_vec+":"+"{}".format(i) for year_vec in year_vec_cols for i in range(2)]

In [30]:
# Turn every encoded vector into feature tags of the form "<column>:<value>",
# e.g. "gv_2:1", one tag per vector position.
gnr_vec_list = [
    ["{}:{}".format(gnr_vec_cols[pos], flag) for pos, flag in enumerate(vector)]
    for vector in movies.genres_vectors.to_list()
]
year_vec_list = [
    ["{}:{}".format(year_vec_cols[pos], flag) for pos, flag in enumerate(vector)]
    for vector in movies.launch_year_vectors.to_list()
]
# title_vec_list = list(map(lambda x: [ "{}".format(title_vec_cols[idx])+":"+"{}".format(value) for idx,value in enumerate(x)] ,movies.title_vector.to_list()))

In [31]:
# item_vec_list = np.concatenate([np.array(gnr_vec_list), np.array(year_vec_list), np.array(title_vec_list)],axis=1)
item_vec_list = np.concatenate([np.array(gnr_vec_list), np.array(year_vec_list)],axis=1)

In [32]:
item_tuple= list(zip(movies.movieId, item_vec_list))

## make dataset

In [33]:
from lightfm.data import Dataset

dataset = Dataset()

# Register every integer id from 0 up to the largest id seen, plus the item feature names.
n_users = ratings.userId.max() + 1
n_items = movies.movieId.max() + 1
user_ids = list(range(n_users))
movie_ids = list(range(n_items))

dataset.fit(users=user_ids, items=movie_ids, item_features=list_if)

### define interaction

In [34]:
ratings.loc[:,"user_movie_rating"] = pd.Series(list(zip(ratings.userId, ratings.movieId, ratings.rating)))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_movie_rating
0,0,0,4.0,964982703,"(0, 0, 4.0)"
1,0,2,4.0,964981247,"(0, 2, 4.0)"
2,0,5,4.0,964982224,"(0, 5, 4.0)"
3,0,46,5.0,964983815,"(0, 46, 5.0)"
4,0,49,5.0,964982931,"(0, 49, 5.0)"


In [35]:
interactions, weights = dataset.build_interactions(ratings["user_movie_rating"])

In [36]:
interactions

<610x193609 sparse matrix of type '<class 'numpy.int32'>'
	with 100836 stored elements in COOrdinate format>

In [37]:
weights

<610x193609 sparse matrix of type '<class 'numpy.float32'>'
	with 100836 stored elements in COOrdinate format>

In [38]:
item_tuple[0]

(0, array(['gv_0:0', 'gv_1:0', 'gv_2:1', 'gv_3:1', 'gv_4:1', 'gv_5:1',
        'gv_6:0', 'gv_7:0', 'gv_8:0', 'gv_9:1', 'gv_10:0', 'gv_11:0',
        'gv_12:0', 'gv_13:0', 'gv_14:0', 'gv_15:0', 'gv_16:0', 'gv_17:0',
        'gv_18:0', 'gv_19:0', 'yv_0:0', 'yv_1:0', 'yv_2:0', 'yv_3:0',
        'yv_4:0', 'yv_5:0', 'yv_6:0', 'yv_7:0', 'yv_8:0', 'yv_9:0',
        'yv_10:0', 'yv_11:0', 'yv_12:0', 'yv_13:0', 'yv_14:0', 'yv_15:0',
        'yv_16:0', 'yv_17:0', 'yv_18:0', 'yv_19:0', 'yv_20:0', 'yv_21:0',
        'yv_22:0', 'yv_23:0', 'yv_24:0', 'yv_25:0', 'yv_26:0', 'yv_27:0',
        'yv_28:0', 'yv_29:0', 'yv_30:0', 'yv_31:0', 'yv_32:0', 'yv_33:0',
        'yv_34:0', 'yv_35:0', 'yv_36:0', 'yv_37:0', 'yv_38:0', 'yv_39:0',
        'yv_40:0', 'yv_41:0', 'yv_42:0', 'yv_43:0', 'yv_44:0', 'yv_45:0',
        'yv_46:0', 'yv_47:0', 'yv_48:0', 'yv_49:0', 'yv_50:0', 'yv_51:0',
        'yv_52:0', 'yv_53:0', 'yv_54:0', 'yv_55:0', 'yv_56:0', 'yv_57:0',
        'yv_58:0', 'yv_59:0', 'yv_60:0', 'yv_61:0', 'yv

In [39]:
item_features = dataset.build_item_features(item_tuple, normalize=False)

In [40]:
from lightfm import LightFM

# model = LightFM(no_components=150, learning_rate=0.05, loss="warp", random_state =101)

In [41]:
from lightfm import cross_validation

class TrainLightFM:
  """Helper that builds and fits a LightFM model, optionally on a random train split."""

  def __init__(self):
    pass

  def train_test_split(self, interactions, weights, random_state=101):
    """Split the interaction and weight matrices into train and test parts.

    Parameters
    ----------
    interactions, weights:
        Matrices accepted by `cross_validation.random_train_test_split`.
    random_state: int or None
        Seed for the split. When None, a single seed is drawn so that both
        matrices are still split with the same seed.

    Returns
    -------
    (train_interactions, test_interactions, train_weights, test_weights)
    """
    if random_state is None:
      random_state = np.random.randint(0, 2**31 - 1)

    # Each call gets its own generator built from the same seed.
    # NOTE(review): this keeps the two splits aligned only if both matrices
    # store their entries in the same order - confirm against build_interactions.
    train_interactions, test_interactions = cross_validation.random_train_test_split(
        interactions, random_state=np.random.RandomState(random_state))
    train_weights, test_weights = cross_validation.random_train_test_split(
        weights, random_state=np.random.RandomState(random_state))

    return train_interactions, test_interactions, train_weights, test_weights

  def fit(self, interactions, weights, user_features, item_features, cross_validation=False, no_components=150, learning_rate=0.05, loss="warp", random_state=101, verbose=True, num_threads=4, epochs=5):
    """Create a LightFM model and fit it.

    With `cross_validation=True` the data is split first and the model is
    fitted on the train part only; the return value is then
    (model, train_interactions, test_interactions, train_weights, test_weights).
    Otherwise the model is fitted on everything and only the model is returned.

    Note that the boolean `cross_validation` parameter hides the imported
    module of the same name inside this method; the split itself lives in
    `train_test_split`, where the module is still visible.
    """
    # Keyword arguments are required here: the second positional parameter of
    # LightFM is `k`, not `learning_rate`, so passing it positionally would
    # silently leave the learning rate at its default.
    model = LightFM(no_components=no_components, learning_rate=learning_rate, loss=loss, random_state=random_state)

    if cross_validation:
      train_interactions, test_interactions, train_weights, test_weights = self.train_test_split(interactions, weights, random_state)
      model.fit(train_interactions, user_features=user_features, item_features=item_features, sample_weight=train_weights,epochs=epochs, num_threads=num_threads, verbose=verbose)

      return model, train_interactions, test_interactions, train_weights, test_weights

    else:
      model.fit(interactions, item_features=item_features, user_features=user_features, sample_weight=weights, epochs=epochs, num_threads=num_threads, verbose=verbose)

      return model


In [42]:
lightFM_trainer = TrainLightFM()

# non_val_model = lightFM_trainer.fit(interactions, weights, user_features=None, item_features=item_features, cross_validation=False, no_components=150, learning_rate=0.05, loss="warp", random_state=101, verbose=True, num_threads=4, epochs=5)
val_model, train_interactions, test_interactions, train_weights, test_weights = lightFM_trainer.fit(interactions, weights, user_features=None, item_features=item_features, cross_validation=True, no_components=150, learning_rate=0.05, loss="warp", random_state=101, verbose=True, num_threads=4, epochs=100)

Epoch: 100%|██████████| 100/100 [02:30<00:00,  1.50s/it]


In [43]:
class LightFMRecommendations:
  """Rank movies for a user with a fitted LightFM model."""

  def __init__(self, lightfm_model,  user_features, item_features, movies, ratings):
    """Store the fitted model, its feature matrices, and the movies/ratings frames.

    `movies` needs a "movieId" column; `ratings` needs "userId" and "movieId".
    """
    self.model = lightfm_model
    self.user_features = user_features
    self.item_features = item_features
    self.movies = movies
    self.ratings = ratings

  def previous_rated_movies(self, user_id):
    """Return the rows of `ratings` that belong to `user_id`."""
    previous_rated_movies = self.ratings.loc[self.ratings["userId"] == user_id]

    return previous_rated_movies

  def _filter_item_by_user(self, user_id, filter_previous=False):
    """Return the candidate movies, optionally without those the user already rated."""
    if filter_previous:
      previous_rated_movies = self.previous_rated_movies(user_id)
      list_prev_rated_movieIds = previous_rated_movies["movieId"].values.tolist()
      movies_for_prediction = self.movies.loc[~self.movies["movieId"].isin(list_prev_rated_movieIds)]
    else:
      movies_for_prediction = self.movies

    return movies_for_prediction

  def recommend_by_user_id(self, user_id, filter_previous=False, num_prediction=10):
    """Return the `num_prediction` highest-scoring movies for `user_id`.

    The result is a new DataFrame with an added "recommendation_score"
    column, sorted by that score in descending order. The frame passed to
    the constructor is left unchanged.
    """
    movies_for_prediction = self._filter_item_by_user(user_id,filter_previous)
    score = self.model.predict(
        user_id,
        movies_for_prediction["movieId"].values.tolist(),
        item_features=self.item_features,
        # Passed through so a model fitted with user features is scored with
        # them; with None this matches omitting the argument.
        user_features=self.user_features
    )

    # assign() returns a new frame, so the score column is never written into
    # self.movies or into a slice of it (the source of SettingWithCopyWarning).
    ranked = movies_for_prediction.assign(recommendation_score=score)
    ranked = ranked.sort_values(by="recommendation_score",ascending=False)[:num_prediction]

    return ranked

In [44]:
lightFM_recommender = LightFMRecommendations(val_model,None,item_features,movies,ratings)
movies_for_prediction_F = lightFM_recommender.recommend_by_user_id(user_id=0,filter_previous=False,num_prediction=10)
movies_for_prediction_T = lightFM_recommender.recommend_by_user_id(0,True,10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [45]:
from lightfm.evaluation import auc_score

def calculate_auc_score(lightfm_model, interactions_matrix, 
                        question_features, professional_features,
                        train_interactions=None, num_threads=4): 
    """
    Measure the ROC AUC metric for a model. 
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model 
        A fitted lightfm model 
    interactions_matrix : 
        A lightfm interactions matrix to evaluate on
    question_features: 
        Item feature matrix (forwarded as `item_features`), or None
    professional_features: 
        User feature matrix (forwarded as `user_features`), or None
    train_interactions: optional
        Forwarded to `auc_score` as `train_interactions`; None keeps the
        previous behaviour.
    num_threads: int, optional
        Number of threads forwarded to `auc_score` (default 4).
        
    Returns
    -------
    The mean of the values returned by `auc_score` (a number, not a string)
    """
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        train_interactions=train_interactions,
        item_features=question_features, 
        user_features=professional_features, 
        num_threads=num_threads).mean()
    return score

calculate_auc_score(val_model, train_interactions, item_features, None)

0.99971306

In [46]:
calculate_auc_score(val_model, test_interactions, item_features, None)

0.9965533

In [49]:
### epoch 5
from lightfm.evaluation import precision_at_k

# %%time
# precision_at_k(val_model,train_interactions,item_features=item_features).mean()

In [50]:
### epoch 100
%%time
precision_at_k(val_model,train_interactions,item_features=item_features).mean()

CPU times: user 2min 12s, sys: 2.21 ms, total: 2min 12s
Wall time: 2min 12s


0.6262295

In [51]:
### epoch 100
%%time
precision_at_k(val_model,test_interactions, train_interactions,item_features=item_features).mean()

CPU times: user 2min 3s, sys: 1.18 ms, total: 2min 3s
Wall time: 2min 3s


0.24672131

In [52]:
lightFM_recommender = LightFMRecommendations(val_model,None,item_features,movies,ratings)
movies_for_prediction_F = lightFM_recommender.recommend_by_user_id(user_id=0,filter_previous=False,num_prediction=10)
movies_for_prediction_T = lightFM_recommender.recommend_by_user_id(0,True,10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [53]:
movies_for_prediction_F

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title,launch_year_vectors,title_vector,recommendation_score
1503,2027,Saving Private Ryan (1998),Action|Drama|War,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1998,Saving Private Ryan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.0071511901915073395, 0.06231992319226265, ...",82.218781
898,1195,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1980,Star Wars: Episode V - The Empire Strikes Back,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.006401954684406519, 0.15503838658332825, -0...",82.038666
506,587,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",1992,Aladdin,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.00826896633952856, 0.012394865974783897, 0...",82.032471
2250,2986,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,"[0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, ...",1988,Who Framed Roger Rabbit?,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.00496014766395092, 0.15055891871452332, 0.0...",81.845535
2078,2761,"Sixth Sense, The (1999)",Drama|Horror|Mystery,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, ...",1999,"Sixth Sense, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.039835866540670395, 0.13840249180793762, -0...",81.76873
1706,2293,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1998,Antz,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",81.648109
911,1209,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1983,Star Wars: Episode VI - Return of the Jedi,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.028825968503952026, 0.17170323431491852, -0...",81.468903
789,1031,Alice in Wonderland (1951),Adventure|Animation|Children|Fantasy|Musical,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ...",1951,Alice in Wonderland,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.010601167567074299, 0.11100053787231445, -0...",81.419182
863,1135,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1975,Monty Python and the Holy Grail,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.016194066032767296, 0.12340718507766724, -0...",81.401321
659,857,"Godfather, The (1972)",Crime|Drama,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",1972,"Godfather, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.05780700966715813, 0.1697556972503662, -0.0...",81.366302


In [54]:
movies_for_prediction_T

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title,launch_year_vectors,title_vector,recommendation_score
506,587,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",1992,Aladdin,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.00826896633952856, 0.012394865974783897, 0...",82.032471
2078,2761,"Sixth Sense, The (1999)",Drama|Horror|Mystery,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, ...",1999,"Sixth Sense, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.039835866540670395, 0.13840249180793762, -0...",81.76873
1706,2293,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1998,Antz,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",81.648109
659,857,"Godfather, The (1972)",Crime|Drama,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",1972,"Godfather, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.05780700966715813, 0.1697556972503662, -0.0...",81.366302
922,1220,"Godfather: Part II, The (1974)",Crime|Drama,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",1974,"Godfather: Part II, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.03514277562499046, 0.18574918806552887, -0....",81.349297
507,588,Terminator 2: Judgment Day (1991),Action|Sci-Fi,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1991,Terminator 2: Judgment Day,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0068153319880366325, 0.13086751103401184, -...",81.2799
1757,2354,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,"[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1998,"Bug's Life, A","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.045480877161026, 0.2499522566795349, 0.0298...",81.099991
1390,1906,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...,"[0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...",1998,Mulan,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0003962657938245684, 0.0002772041189018637,...",81.064568
1399,1917,Lethal Weapon 4 (1998),Action|Comedy|Crime|Thriller,"[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",1998,Lethal Weapon 4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.012654793448746204, 0.07857976108789444, 0...",81.037506
1623,2166,Blade (1998),Action|Horror|Thriller,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1998,Blade,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.008226502686738968, 0.053510621190071106, ...",81.030182


## Add Numerical Columns to LightFM

https://github.com/lyst/lightfm/issues/433

numerical data의 경우, 아래의 형태로 데이터를 투입

[(id, {col: weight, ...}), ..., (id, {col: weight, ...})] — where `id` is the user id (for user features) or the item id (for item features)

### add item features columns

      def fit(self, users, items, user_features=None, item_features=None):
          """
          Fit the user/item id and feature name mappings.
          Calling fit the second time will reset existing mappings.
          Parameters
          ----------
          users: iterable of user ids
          items: iterable of item ids
          user_features: iterable of user features, optional
          item_features: iterable of item features, optional
          """

          self._user_id_mapping = {}
          self._item_id_mapping = {}
          self._user_feature_mapping = {}
          self._item_feature_mapping = {}

          return self.fit_partial(users, items, user_features, item_features)

      def fit_partial(
          self, users=None, items=None, user_features=None, item_features=None
      ):
          """
          Fit the user/item id and feature name mappings.
          Calling fit the second time will add new entries to existing mappings.
          Parameters
          ----------
          users: iterable of user ids, optional
          items: iterable of item ids, optional
          user_features: iterable of user features, optional
          item_features: iterable of item features, optional
          """

          if users is not None:
              for user_id in users:
                  self._user_id_mapping.setdefault(user_id, len(self._user_id_mapping))

                  if self._user_identity_features:
                      self._user_feature_mapping.setdefault(
                          user_id, len(self._user_feature_mapping)
                      )

          if items is not None:
              for item_id in items:
                  self._item_id_mapping.setdefault(item_id, len(self._item_id_mapping))

                  if self._item_identity_features:
                      self._item_feature_mapping.setdefault(
                          item_id, len(self._item_feature_mapping)
                      )

          if user_features is not None:
              for user_feature in user_features:
                  self._user_feature_mapping.setdefault(
                      user_feature, len(self._user_feature_mapping)
                  )

          if item_features is not None:
              for item_feature in item_features:
                  self._item_feature_mapping.setdefault(
                      item_feature, len(self._item_feature_mapping)
                  )

### make item_features sparse matrix

      def build_item_features(self, data, normalize=True):
          """
          Build a item features matrix out of an iterable of the form
          (item id, [list of feature names]) or (item id, {feature name: feature weight}).
          Parameters
          ----------
          data: iterable of the form
              (item id, [list of feature names]) or (item id,
              {feature name: feature weight}).
              Item and feature ids will be translated to internal indices
              constructed during the fit call.
          normalize: bool, optional
              If true, will ensure that feature weights sum to 1 in every row.
          Returns
          -------
          feature matrix: CSR matrix (num items, num features)
              Matrix of item features.
          """

          builder = _FeatureBuilder(
              self._item_id_mapping,
              self._item_feature_mapping,
              self._item_identity_features,
              normalize,
              "item",
          )

          return builder.build(data)

In [55]:
len_title_vec = len(movies.iloc[0]["title_vector"])
title_vec_cols = [ "tv_{}".format(i) for i in range(len_title_vec)]
# title_vec_list = list(map(lambda x: [ {title_vec_cols[idx]:value} for idx,value in enumerate(x)] ,movies.title_vector.to_list()))
# item_vec_list = np.concatenate([np.array(gnr_vec_list), np.array(year_vec_list), np.array(title_vec_list)],axis=1)

In [68]:
def make_dict(list_x, vec_cols):
    """Label each element of ``list_x`` with the name at the same position in ``vec_cols``.

    Parameters
    ----------
    list_x : iterable
        Values to label.
    vec_cols : sequence
        Names looked up by position. It must be at least as long as
        ``list_x``; a shorter sequence raises ``IndexError``.

    Returns
    -------
    dict
        ``{vec_cols[i]: value}`` for every ``(i, value)`` in ``enumerate(list_x)``.
    """
    return {vec_cols[position]: value for position, value in enumerate(list_x)}

In [71]:
# Convert every movie's title vector into a {feature name: value} dict.
title_vec_list = [
    make_dict(title_vec, title_vec_cols)
    for title_vec in movies.title_vector.to_list()
]

In [56]:
# Names of the item features that are passed to dataset.fit below.
# Alternative kept for reference: extend an already defined list_if with the
# title-vector columns.
# list_if = list_if + title_vec_cols
# Only the title-vector columns are used; note that list_if refers to the same
# list object as title_vec_cols (no copy is made).
list_if = title_vec_cols

In [72]:
from lightfm.data import Dataset

# Register every integer id from 0 through the largest id present in the
# frames, so ids that never occur in the data still receive an internal index.
user_ids = list(range(ratings.userId.max() + 1))
movie_ids = list(range(movies.movieId.max() + 1))

# Build the id and feature mappings; list_if holds the item feature names.
dataset = Dataset()
dataset.fit(user_ids, movie_ids, item_features=list_if)

In [73]:
# Unpack the four mappings built by dataset.fit.
(
    user_id_map,
    user_feature_map,
    item_id_map,
    item_feature_map,
) = dataset.mapping()

In [75]:
# Pair each movie id with its feature dict, i.e. the
# (item id, {feature name: feature weight}) form taken by build_item_features.
item_tuple = [
    (movie_id, title_features)
    for movie_id, title_features in zip(movies.movieId, title_vec_list)
]

In [77]:
# Build the interactions and weights matrices from the "user_movie_rating"
# column (presumably (user id, movie id, rating) tuples prepared earlier in the
# notebook; confirm against the cell that creates the column).
interactions, weights = dataset.build_interactions(ratings["user_movie_rating"])

# normalize=False: feature weights are not rescaled to sum to 1 per row.
item_features = dataset.build_item_features(item_tuple, normalize=False)

In [78]:
# Show the sparse-matrix summary (shape, dtype, stored-element count).
item_features

<193609x193809 sparse matrix of type '<class 'numpy.float32'>'
	with 2142009 stored elements in Compressed Sparse Row format>

In [84]:
lightFM_trainer = TrainLightFM()

# Variant without a validation split, kept for reference:
# non_val_model = lightFM_trainer.fit(interactions, weights, user_features=None, item_features=item_features, cross_validation=False, no_components=150, learning_rate=0.05, loss="warp", random_state=101, verbose=True, num_threads=4, epochs=5)

# With cross_validation=True the result is unpacked into the model plus the
# train/test interactions and weights.
(
    val_model,
    train_interactions,
    test_interactions,
    train_weights,
    test_weights,
) = lightFM_trainer.fit(
    interactions,
    weights,
    user_features=None,
    item_features=item_features,
    cross_validation=True,
    no_components=150,
    learning_rate=0.05,
    loss="warp",
    random_state=101,
    verbose=True,
    num_threads=4,
    epochs=100,
)

Epoch: 100%|██████████| 100/100 [04:24<00:00,  2.64s/it]


In [85]:
# Recommender wrapping the model trained with the validation split.
lightFM_recommender = LightFMRecommendations(val_model, None, item_features, movies, ratings)

# Ten recommendations for user 0, first with filter_previous=False, then True.
movies_for_prediction_F = lightFM_recommender.recommend_by_user_id(
    user_id=0, filter_previous=False, num_prediction=10
)
movies_for_prediction_T = lightFM_recommender.recommend_by_user_id(
    user_id=0, filter_previous=True, num_prediction=10
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [86]:
# AUC score of val_model on the training interactions.
calculate_auc_score(val_model, train_interactions, item_features, None)

0.999658

In [88]:
# AUC score of val_model on the held-out test interactions.
calculate_auc_score(val_model, test_interactions, item_features, None)

0.9875042

In [89]:
### epoch 100
%%time
precision_at_k(val_model,train_interactions,item_features=item_features).mean()

CPU times: user 2min 54s, sys: 8.17 ms, total: 2min 54s
Wall time: 2min 55s


0.6034426

In [90]:
### epoch 100
%%time
precision_at_k(val_model,test_interactions, train_interactions,item_features=item_features).mean()

CPU times: user 2min 43s, sys: 8.15 ms, total: 2min 43s
Wall time: 2min 43s


0.21262297

In [91]:
# Recommender wrapping the model trained with the validation split.
lightFM_recommender = LightFMRecommendations(val_model, None, item_features, movies, ratings)

# Ten recommendations for user 0, first with filter_previous=False, then True.
movies_for_prediction_F = lightFM_recommender.recommend_by_user_id(
    user_id=0, filter_previous=False, num_prediction=10
)
movies_for_prediction_T = lightFM_recommender.recommend_by_user_id(
    user_id=0, filter_previous=True, num_prediction=10
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [92]:
# Recommendations for user 0 obtained with filter_previous=False.
movies_for_prediction_F

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title,launch_year_vectors,title_vector,recommendation_score
815,1072,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, ...",1971,Willy Wonka & the Chocolate Factory,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0025171497836709023, 0.11365289986133575, -...",2.989305
224,259,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1977,Star Wars: Episode IV - A New Hope,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.01621856726706028, 0.19248980283737183, 0.0...",2.7947
911,1209,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1983,Star Wars: Episode VI - Return of the Jedi,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.028825968503952026, 0.17170323431491852, -0...",2.615184
898,1195,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1980,Star Wars: Episode V - The Empire Strikes Back,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.006401954684406519, 0.15503838658332825, -0...",2.563305
984,1284,Heathers (1989),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1989,Heathers,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.400512
2250,2986,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,"[0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, ...",1988,Who Framed Roger Rabbit?,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.00496014766395092, 0.15055891871452332, 0.0...",2.328219
546,647,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1996,Mission: Impossible,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.006374205928295851, 0.11897297948598862, -0...",2.291911
1261,1675,Starship Troopers (1997),Action|Sci-Fi,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1997,Starship Troopers,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.01422899030148983, 0.020406311377882957, 0...",2.285181
1979,2627,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1999,Star Wars: Episode I - The Phantom Menace,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0254085510969162, 0.18372735381126404, -0.0...",2.230127
915,1213,Alien (1979),Horror|Sci-Fi,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1979,Alien,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.06481888145208359, 0.1306486278772354, 0.04...",2.170139


In [93]:
# Recommendations for user 0 obtained with filter_previous=True.
movies_for_prediction_T

Unnamed: 0,movieId,title,genres,genres_vectors,launch_year,movie_title,launch_year_vectors,title_vector,recommendation_score
984,1284,Heathers (1989),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1989,Heathers,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.400512
793,1035,Die Hard (1988),Action|Crime|Thriller,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",1988,Die Hard,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.011233877390623093, 0.1401021033525467, -0....",2.081971
1164,1543,"Lost World: Jurassic Park, The (1997)",Action|Adventure|Sci-Fi|Thriller,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1997,"Lost World: Jurassic Park, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.03216763958334923, 0.16337978839874268, -0....",2.081262
2097,2790,Airplane! (1980),Comedy,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1980,Airplane!,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.013136067427694798, 0.14313171803951263, -0...",2.024816
2110,2803,"Christmas Story, A (1983)",Children|Comedy,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1983,"Christmas Story, A","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.06508201360702515, 0.24179835617542267, 0.0...",1.87194
2027,2698,Arachnophobia (1990),Comedy|Horror,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1990,Arachnophobia,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.831845
1404,1922,There's Something About Mary (1998),Comedy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1998,There's Something About Mary,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.011143731884658337, 0.18981213867664337, 0...",1.807715
1033,1344,Carrie (1976),Drama|Fantasy|Horror|Thriller,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, ...",1976,Carrie,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.004675279371440411, 0.014128515496850014, ...",1.775203
1563,2099,Splash (1984),Comedy|Fantasy|Romance,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1984,Splash,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.699896
1158,1526,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi,"[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1997,"Fifth Element, The","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.04174690321087837, 0.14015807211399078, -0....",1.661327
