# Torch-FM을 통해 Factorization Machine 이해하기

## Reference
    * Paper
      https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf
    * well-formed paper description
      https://www.jefkine.com/recsys/2017/03/27/factorization-machines/
    * Torch-FM package
      https://github.com/rixwew/pytorch-fm
    * implementation and additional reference by pytorch
      https://www.kaggle.com/gennadylaptev/factorization-machine-implemented-in-pytorch




In [12]:
from google.colab import drive
drive.mount("/gdrive")

Mounted at /gdrive


## Torch-FM example implementation (using movie-lens 1M dataset)

In [13]:
ls -l /gdrive/MyDrive/colab/Factorization_Machine/data/

total 18931
-rw------- 1 root root   197979 Nov 19 04:30 links.csv
-rw------- 1 root root   494431 Nov 24 01:17 movies.csv
-rw------- 1 root root  2483723 Nov 24 01:17 ratings.csv
-rw------- 1 root root   118660 Nov 19 04:30 tags.csv
-rw------- 1 root root   238833 Dec  2 10:04 tag_tokenizer_bpe_100.model
-rw------- 1 root root      672 Dec  2 10:04 tag_tokenizer_bpe_100.vocab
-rw------- 1 root root   370999 Nov 27 05:52 tag_tokenizer_bpe_9000.model
-rw------- 1 root root   114127 Nov 27 05:52 tag_tokenizer_bpe_9000.vocab
-rw------- 1 root root   271684 Dec  2 10:04 tag_tokenizer_input_bpe_100.txt
-rw------- 1 root root   236678 Nov 27 05:52 tag_tokenizer_input_bpe_24000.txt
-rw------- 1 root root   271684 Nov 27 05:57 tag_tokenizer_input_bpe_9000.txt
-rw------- 1 root root   276402 Dec  2 10:04 tag_w2v_bpe_100.model
-rw------- 1 root root   613252 Dec  2 10:04 title_tokenizer_bpe_24000.model
-rw------- 1 root root   340369 Dec  2 10:04 title_tokenizer_bpe_24000.vocab
-rw------- 1 root

In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [15]:
dataset_name="movielens1M"
dataset_path = "/gdrive/MyDrive/colab/Factorization_Machine/data/ratings.csv"
model_name='fm'
epoch=100
learning_rate=0.001
batch_size=2048
weight_decay=1e-6
device = "cuda:0"

In [16]:
data = pd.read_csv(dataset_path, sep=",", engine="python", header=None)
data = data.to_numpy()[:, :3]
data

array([['userId', 'movieId', 'rating'],
       ['1', '1', '4.0'],
       ['1', '3', '4.0'],
       ...,
       ['610', '168250', '5.0'],
       ['610', '168252', '5.0'],
       ['610', '170875', '3.0']], dtype=object)

In [17]:
import numpy as np
import pandas as pd
import torch.utils.data


class MovieLens20MDataset(torch.utils.data.Dataset):
    """
    MovieLens 20M Dataset

    Data preparation
        treat samples with a rating less than 3 as negative samples
        3점 이하의 평가에 대해서는 0(negative sample)로 활용

    :param dataset_path: MovieLens dataset path

    Reference:
        https://grouplens.org/datasets/movielens
    """

    def __init__(self, dataset_path, sep=',', engine='c', header='infer'):
        data = pd.read_csv(dataset_path, sep=sep, engine=engine, header=header).to_numpy()[:, :3]
        self.items = data[:, :2].astype(np.int) - 1  # -1 because ID begins from 1
        self.targets = self.__preprocess_target(data[:, 2]).astype(np.float32)
        self.field_dims = np.max(self.items, axis=0) + 1
        self.user_field_idx = np.array((0, ), dtype=np.long)
        self.item_field_idx = np.array((1,), dtype=np.long)

    def __len__(self):
        return self.targets.shape[0]

    def __getitem__(self, index):
        return self.items[index], self.targets[index]

    def __preprocess_target(self, target):
        target[target <= 3] = 0
        target[target > 3] = 1
        return target


class MovieLens1MDataset(MovieLens20MDataset):
    """
    MovieLens 1M Dataset

    Data preparation
        treat samples with a rating less than 3 as negative samples

    :param dataset_path: MovieLens dataset path

    Reference:
        https://grouplens.org/datasets/movielens
    """

    def __init__(self, dataset_path):
        super().__init__(dataset_path, sep=',', engine='python', header=0)


In [18]:
def get_dataset(name, path):
    if name == 'movielens1M':
        return MovieLens1MDataset(path)
    elif name == 'movielens20M':
        return MovieLens20MDataset(path)
    elif name == 'criteo':
        return CriteoDataset(path)
    elif name == 'avazu':
        return AvazuDataset(path)
    else:
        raise ValueError('unknown dataset name: ' + name)

In [19]:
device = torch.device(device)
dataset = get_dataset(dataset_name, dataset_path)
train_length = int(len(dataset) * 0.8)
valid_length = int(len(dataset) * 0.1)
test_length = len(dataset) - train_length - valid_length
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset, (train_length, valid_length, test_length))

In [20]:
from torch.utils.data import DataLoader

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=8)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=8)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=8)

In [21]:
dataset.field_dims

array([   610, 193609])

In [22]:
iteration = iter(train_data_loader)
data = next(iteration)

In [23]:
data

[tensor([[   220,   6538],
         [   516,  45667],
         [   571,   1692],
         ...,
         [   293,   2495],
         [   248, 102715],
         [     9,  72997]]), tensor([1., 0., 0.,  ..., 0., 1., 0.])]

### Torch-FM code 분해 및 실행

#### fm.py

**FactorizationMachineModel**: Factorization Machine의 전반적인 실행을 위한 클래스

In [24]:
import torch

# from torchfm.layer import FactorizationMachine, FeaturesEmbedding, FeaturesLinear


class FactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of Factorization Machine.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        # user_id와 movie_id value에 대해 각각 embed_dim[K] 만큼의 공간으로 축소하고
        # [Batchsize, num(factors[user_id,movie_id]),embed_dim] 형태로 return 
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        # user_id와 movie_id value에 대해 각각 embed_dim[K] 만큼의 공간으로 축소하고 각 element 간에 합
        self.linear = FeaturesLinear(field_dims)
        # embedding 과 linear 값을 활용하여 Factorization Machine의 정의된 식 수행
        self.fm = FactorizationMachine(reduce_sum=True)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = self.linear(x) + self.fm(self.embedding(x))
        return torch.sigmoid(x.squeeze(1))


#### layer.py

**FeaturesLinear**: 각 user_id, movie_id를 embed_factor(K)에 대해 embedding하고 하나의 차원으로 결합

**FeaturesEmbedding**: 각 user_id, movie_id를 embed_factor(K)에 대해 embedding

**FactorizationMachine**: Paper에서 언급되는 $\hat{y}$에 대한 식으로 layer들을 활용하여 결과를 return 


In [25]:
class FeaturesLinear(torch.nn.Module):

    # field_dims = [610, 193609] in movielens 1M dataset
    def __init__(self, field_dims, output_dim=1):
        super().__init__()
        # user_id와 movie_id를 하나의 평면에 embedding 후 return output_dim
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        # w_0 가중치 in paper
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        # offset을 더하는 이유는 하나의 평면에 나타냄에 있어서 movie_id가 1부터 값이 시작되어, 
        # max(user_id) + movie_id 함으로써 하나의 독자적인 값으로 나타내기 위함.
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        # w_1, * x_1 + w_2 * x_2함으로써 각각의 변수에 대한 개별 weight에 global weight(w_0[bias])를 더함.
        return torch.sum(self.fc(x), dim=1) + self.bias


class FeaturesEmbedding(torch.nn.Module):

    # field_dims = [610, 193609] in movielens 1M dataset
    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        # [Batchsize, num_features, embed_dim]의 형태로 return
        return self.embedding(x)

# 논문에서 예측값 y를 구하기 위한 수식으로 계산하여 return
class FactorizationMachine(torch.nn.Module):

  def __init__(self, reduce_sum=True):
      super().__init__()
      self.reduce_sum = reduce_sum

  def forward(self, x):
      """
      :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
      """
      square_of_sum = torch.sum(x, dim=1) ** 2
      sum_of_square = torch.sum(x ** 2, dim=1)
      ix = square_of_sum - sum_of_square
      if self.reduce_sum:
          ix = torch.sum(ix, dim=1, keepdim=True)
      return 0.5 * ix

#### 실행

**FeaturesLinear**

In [26]:
fc = torch.nn.Embedding(sum(dataset.field_dims),1)
bias = torch.nn.Parameter(torch.zeros(1,))
offset = np.array((0, *np.cumsum(dataset.field_dims)[:-1]), dtype=np.long)

In [27]:
x = data[0] + data[0].new_tensor(offset).unsqueeze(0)

In [28]:
x

tensor([[   220,   7148],
        [   516,  46277],
        [   571,   2302],
        ...,
        [   293,   3105],
        [   248, 103325],
        [     9,  73607]])

In [29]:
fc(x)[0]

tensor([[2.2098],
        [0.9906]], grad_fn=<SelectBackward>)

In [30]:
torch.sum(fc(x),dim=1)

tensor([[ 3.2004],
        [ 0.4360],
        [ 1.5814],
        ...,
        [-1.5886],
        [-0.0238],
        [-2.7817]], grad_fn=<SumBackward1>)

In [31]:
bias

Parameter containing:
tensor([0.], requires_grad=True)

In [32]:
linear = torch.sum(fc(x),dim=1) + bias

In [33]:
linear

tensor([[ 3.2004],
        [ 0.4360],
        [ 1.5814],
        ...,
        [-1.5886],
        [-0.0238],
        [-2.7817]], grad_fn=<AddBackward0>)

****FeaturesEmbedding****

In [34]:
embedding = torch.nn.Embedding(sum(dataset.field_dims),10)
offset = np.array((0, *np.cumsum(dataset.field_dims)[:-1]), dtype=np.long)
torch.nn.init.xavier_uniform_(embedding.weight.data)

tensor([[-0.0020,  0.0037,  0.0046,  ...,  0.0014,  0.0033, -0.0035],
        [ 0.0030, -0.0053,  0.0013,  ...,  0.0020, -0.0054, -0.0020],
        [ 0.0022, -0.0051,  0.0045,  ...,  0.0024, -0.0011,  0.0015],
        ...,
        [ 0.0045,  0.0034,  0.0024,  ..., -0.0004,  0.0032,  0.0032],
        [ 0.0047,  0.0048,  0.0036,  ..., -0.0021,  0.0028,  0.0008],
        [-0.0007, -0.0053, -0.0052,  ..., -0.0004, -0.0005,  0.0031]])

In [35]:
x = data[0] + data[0].new_tensor(offset).unsqueeze(0)
x

tensor([[   220,   7148],
        [   516,  46277],
        [   571,   2302],
        ...,
        [   293,   3105],
        [   248, 103325],
        [     9,  73607]])

In [36]:
embed = embedding(x)

In [37]:
embed[0]

tensor([[ 0.0005, -0.0037, -0.0046,  0.0039,  0.0028,  0.0037, -0.0052,  0.0043,
          0.0037,  0.0055],
        [-0.0025, -0.0046, -0.0021, -0.0049,  0.0009, -0.0019, -0.0018,  0.0034,
          0.0014, -0.0051]], grad_fn=<SelectBackward>)

**FactorizationMachine**

In [38]:
def forward(x):
  # embedding 된 두개의 벡터를 위치에 맞춰서 합
  square_of_sum = torch.sum(x, dim=1) ** 2
  sum_of_square = torch.sum(x**2,dim=1)
  ix = square_of_sum - sum_of_square
  # ix = torch.sum(ix, dim=1, keepdim=True)
  return 0.5 * ix

In [39]:
fm = forward(embed)

In [40]:
fm[0]

tensor([-1.1803e-06,  1.6863e-05,  9.6194e-06, -1.8882e-05,  2.4525e-06,
        -7.2385e-06,  9.4319e-06,  1.4387e-05,  5.4123e-06, -2.8187e-05],
       grad_fn=<SelectBackward>)

In [41]:
fm_machine = linear + fm

In [42]:
torch.sigmoid(fm_machine.squeeze(1))

tensor([[0.9609, 0.9609, 0.9609,  ..., 0.9609, 0.9609, 0.9608],
        [0.6073, 0.6073, 0.6073,  ..., 0.6073, 0.6073, 0.6073],
        [0.8294, 0.8294, 0.8294,  ..., 0.8294, 0.8294, 0.8294],
        ...,
        [0.1696, 0.1696, 0.1696,  ..., 0.1696, 0.1696, 0.1696],
        [0.4941, 0.4941, 0.4941,  ..., 0.4941, 0.4941, 0.4941],
        [0.0583, 0.0583, 0.0583,  ..., 0.0583, 0.0583, 0.0583]],
       grad_fn=<SigmoidBackward>)

## Actual Implementation

In [43]:
import os

IS_COLAB = True

if IS_COLAB:
  project_path = "/gdrive/MyDrive/colab/Factorization_Machine/"
else:
  project_path = os.path.dirname(os.path.abspath("__file__"))

data_path = os.path.join(project_path,"data")

In [44]:
ls -l /gdrive/MyDrive/colab/Factorization_Machine/data

total 18931
-rw------- 1 root root   197979 Nov 19 04:30 links.csv
-rw------- 1 root root   494431 Nov 24 01:17 movies.csv
-rw------- 1 root root  2483723 Nov 24 01:17 ratings.csv
-rw------- 1 root root   118660 Nov 19 04:30 tags.csv
-rw------- 1 root root   238833 Dec  2 10:04 tag_tokenizer_bpe_100.model
-rw------- 1 root root      672 Dec  2 10:04 tag_tokenizer_bpe_100.vocab
-rw------- 1 root root   370999 Nov 27 05:52 tag_tokenizer_bpe_9000.model
-rw------- 1 root root   114127 Nov 27 05:52 tag_tokenizer_bpe_9000.vocab
-rw------- 1 root root   271684 Dec  2 10:04 tag_tokenizer_input_bpe_100.txt
-rw------- 1 root root   236678 Nov 27 05:52 tag_tokenizer_input_bpe_24000.txt
-rw------- 1 root root   271684 Nov 27 05:57 tag_tokenizer_input_bpe_9000.txt
-rw------- 1 root root   276402 Dec  2 10:04 tag_w2v_bpe_100.model
-rw------- 1 root root   613252 Dec  2 10:04 title_tokenizer_bpe_24000.model
-rw------- 1 root root   340369 Dec  2 10:04 title_tokenizer_bpe_24000.vocab
-rw------- 1 root

In [45]:
!pip install sentencepiece

from gensim.models import  Word2Vec as w2v
import sentencepiece as spm
from tqdm import tqdm_notebook

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 18.6MB/s eta 0:00:01[K     |▋                               | 20kB 18.0MB/s eta 0:00:01[K     |▉                               | 30kB 10.3MB/s eta 0:00:01[K     |█▏                              | 40kB 8.6MB/s eta 0:00:01[K     |█▌                              | 51kB 4.5MB/s eta 0:00:01[K     |█▊                              | 61kB 4.6MB/s eta 0:00:01[K     |██                              | 71kB 5.0MB/s eta 0:00:01[K     |██▍                             | 81kB 5.2MB/s eta 0:00:01[K     |██▋                             | 92kB 5.5MB/s eta 0:00:01[K     |███                             | 102kB 5.9MB/s eta 0:00:01[K     |███▎                            | 112kB 5.9MB/s eta 0:00:01[K     |███▌                

In [46]:
import pandas as pd
import numpy as np

csv_movies = os.path.join(data_path,"movies.csv")
csv_ratings = os.path.join(data_path,"ratings.csv")
csv_tags = os.path.join(data_path,"tags.csv")
csv_links =  os.path.join(data_path,"links.csv")

movies = pd.read_csv(csv_movies)
ratings = pd.read_csv(csv_ratings)
tags = pd.read_csv(csv_tags)
links = pd.read_csv(csv_links)

org_movies = movies.copy()
org_ratings = ratings.copy()
org_tags = tags.copy()
org_links = links.copy()

In [47]:
dict_mid_2_nid= {mid:nid for nid, mid in enumerate(movies.movieId.unique())}
dict_nid_2_mid = {nid:mid for mid,nid in dict_mid_2_nid.items()}

dict_uid_2_nid= {uid:nid for nid, uid in enumerate(ratings.userId.unique())}
dict_nid_2_uid = {nid:uid for uid,nid in dict_uid_2_nid.items()}

movies.loc[:,"m_nid"] = movies.movieId.apply(lambda x:dict_mid_2_nid[x])

ratings.loc[:,"u_nid"] = ratings.userId.apply(lambda x:dict_uid_2_nid[x])
ratings.loc[:,"m_nid"] = ratings.movieId.apply(lambda x:dict_mid_2_nid[x])

tags.loc[:,"u_nid"] = tags.userId.apply(lambda x:dict_uid_2_nid[x])
tags.loc[:,"m_nid"] = tags.movieId.apply(lambda x:dict_mid_2_nid[x])

links.loc[:,"m_nid"] = tags.movieId.apply(lambda x:dict_mid_2_nid[x])

In [48]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [49]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,2
2,1,6,4.0,964982224,0,5
3,1,47,5.0,964983815,0,43
4,1,50,5.0,964982931,0,46


In [50]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,u_nid,m_nid
0,2,60756,funny,1445714994,1,6801
1,2,60756,Highly quotable,1445714996,1,6801
2,2,60756,will ferrell,1445714992,1,6801
3,2,89774,Boxing story,1445715207,1,7697
4,2,89774,MMA,1445715200,1,7697


In [51]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId,m_nid
0,1,114709,862.0,6801.0
1,2,113497,8844.0,6801.0
2,3,113228,15602.0,6801.0
3,4,114885,31357.0,7697.0
4,5,113041,11862.0,7697.0


In [52]:
sorted_ratings = ratings.sort_values(by=["userId","timestamp"])
sorted_ratings['b4_timestamp'] = sorted_ratings.groupby(['userId'])['timestamp'].shift(1)
mc_sorted_ratings= sorted_ratings.groupby(['userId',"timestamp"])["m_nid"].count().to_frame(name="m_count")
sorted_ratings = pd.merge(left=sorted_ratings, right=mc_sorted_ratings, on=["userId","timestamp"], how="left")
sorted_ratings.loc[:,"b4_timestamp"] = sorted_ratings.b4_timestamp.fillna(0).astype("int64")
sorted_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count
0,1,804,4.0,964980499,0,632,0,2
1,1,1210,5.0,964980499,0,911,964980499,2
2,1,2018,5.0,964980523,0,1493,964980499,3
3,1,2628,4.0,964980523,0,1979,964980523,3
4,1,2826,4.0,964980523,0,2126,964980523,3


In [53]:
previous_timestamp = 0
counter = 0
former_movies = list()
now_movies = list()
movie_vectors = list()

for idx,row in tqdm_notebook(sorted_ratings.iterrows()):
  counter += 1
  m_nid = row["m_nid"].astype("int")
  m_count = row["m_count"]
  timestamp = row["timestamp"]
  b4_timestamp = row["b4_timestamp"]

  if previous_timestamp == 0 and counter <= m_count:
    now_movies.append(m_nid)
    movie_vector = np.zeros(len(dict_mid_2_nid)) 
    
    if counter == m_count:
      former_movies = now_movies
      previous_timestamp = timestamp
      now_movies = list()
      counter = 0
  
  elif previous_timestamp != 0 and counter <= m_count:
    now_movies.append(m_nid)
    movie_vector = np.zeros(len(dict_mid_2_nid))
    movie_vector[former_movies] = 1

    if counter == m_count:
      former_movies = now_movies
      previous_timestamp = timestamp
      now_movies = list()
      counter = 0

  movie_vectors.append(movie_vector)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [54]:
sorted_ratings.loc[:,"last_rate_vector"] = pd.Series(movie_vectors)
sorted_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector
0,1,804,4.0,964980499,0,632,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,1210,5.0,964980499,0,911,964980499,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,2018,5.0,964980523,0,1493,964980499,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,2628,4.0,964980523,0,1979,964980523,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,2826,4.0,964980523,0,2126,964980523,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [55]:
ratings = sorted_ratings.sort_values(by=["userId","movieId"]).copy()
ratings = ratings.reset_index(drop=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [56]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [57]:
unique_genres = np.unique(np.concatenate(np.array(movies.genres.apply(lambda x: np.array(x.split("|"))).tolist())))
dict_gid_2_gnr = { gid:genre for gid, genre in enumerate(unique_genres)} 
dict_gnr_2_gid = { genre:gid for gid, genre in dict_gid_2_gnr.items()}
unique_genres

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype='<U18')

In [58]:
gnr_vectors = list()
for gnrs in movies.genres.apply(lambda x: x.split("|")).tolist():
  gnr_vector = np.zeros(len(unique_genres))
  for gnr in gnrs:
    gnr_vector[dict_gnr_2_gid[gnr]] = 1
  gnr_vectors.append(gnr_vector)

In [59]:
movies.loc[:,"genres_vectors"] = pd.Series(gnr_vectors)
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [60]:
import re

def get_year(regex, string):
  try:
    found = re.search(regex,string).group(0)[1:-1]
  except AttributeError:
    found = ''
  return found

regex = '\([0-9]{4}\)'
movies.loc[:,"launch_year"] = movies.title.apply(lambda x:get_year(regex,x))

In [61]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995


In [62]:
movies.loc[movies.launch_year=="","launch_year"] = 0
movies.loc[:,"launch_year"] = movies.launch_year.astype("int")
movies.loc[movies.launch_year==0].head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year
6059,40697,Babylon 5,Sci-Fi,6059,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9031,140956,Ready Player One,Action|Sci-Fi|Thriller,9031,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9091,143410,Hyena Road,(no genres listed),9091,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9138,147250,The Adventures of Sherlock Holmes and Doctor W...,(no genres listed),9138,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
9179,149334,Nocturnal Animals,Drama|Thriller,9179,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0


In [63]:
movies.loc[movies.launch_year != 0,"movie_title"] = movies.loc[movies.launch_year!= 0,"title"].apply(lambda x:x[:-6].strip()) 
movies.loc[movies.launch_year== 0,"movie_title"] = movies.loc[movies.launch_year== 0,"title"]

In [64]:
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year,movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II


In [65]:
### 특수문자 제외 및 단어별 배열로 변경
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

vocab_size = 24000
method = "bpe"

input_file_path = os.path.join(data_path,"title_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"title_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"title_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
sentences = movies.movie_title.to_list()


with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    f.write(sentence+'\n')

### train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("title_tokenizer {} is generated".format(tokenizer_name))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            # if len(token) > 1:
            new_tokens.append(token)
        # if len(new_tokens) > 1:
        tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"title_w2v_{}_{}.model".format(method,vocab_size))
print("start train_title_w2v....")

size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("title_w2v {} is generated".format(w2v_name))

# ### get embedding

sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)

w2v_model = w2v.load(w2v_name)

sentence_embs = []
# # 학습이 안되는 벡터들에 대해서는 0값이 나으므로 0벡터로 input 처리 수행
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = []
  for p_word in sentence:
    try:
      word_embs.append(w2v_model.wv[p_word])
    except KeyError:
      pass
  if len(word_embs):
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(200).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))


title_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/title_tokenizer_bpe_24000 is generated
start train_title_w2v....
title_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/title_w2v_bpe_24000.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=9742.0), HTML(value='')))




In [66]:
movies.loc[:,"title_vector"] = pd.Series(sentence_embs)
movies.head()

Unnamed: 0,movieId,title,genres,m_nid,genres_vectors,launch_year,movie_title,title_vector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Toy Story,"[0.015534833073616028, 0.01623934507369995, 0...."
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,"[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1995,Jumanji,"[-0.0010245643788948655, 0.0012786060106009245..."
2,3,Grumpier Old Men (1995),Comedy|Romance,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Grumpier Old Men,"[0.003464460140094161, 0.030880093574523926, 0..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",1995,Waiting to Exhale,"[0.016978969797492027, 0.021812498569488525, 0..."
4,5,Father of the Bride Part II (1995),Comedy,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",1995,Father of the Bride Part II,"[0.06127961352467537, 0.027192719280719757, 0...."


In [67]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp,u_nid,m_nid
0,2,60756,funny,1445714994,1,6801
1,2,60756,Highly quotable,1445714996,1,6801
2,2,60756,will ferrell,1445714992,1,6801
3,2,89774,Boxing story,1445715207,1,7697
4,2,89774,MMA,1445715200,1,7697


In [68]:
from collections import OrderedDict
set_tag_user_movie = set(zip(tags.userId, tags.movieId))

dict_tag_string = OrderedDict()
for user, movie in set_tag_user_movie:
  list_tag = tags.loc[np.logical_and(tags.userId==user,tags.movieId==movie),"tag"].tolist()
  dict_tag_string[(user,movie)] = list_tag

dict_tag_string = OrderedDict(sorted(dict_tag_string.items()))

In [69]:
user_series = pd.Series(list(map(lambda x:x[0],dict_tag_string.keys())))
movie_series = pd.Series(list(map(lambda x:x[1],dict_tag_string.keys()))) 
tag_series = pd.Series(list(dict_tag_string.values()))

df_tag_list = pd.concat([user_series,movie_series,tag_series],axis=1)
df_tag_list.columns = ["userId","movieId","tags"]
df_tag_list.head()

Unnamed: 0,userId,movieId,tags
0,2,60756,"[funny, Highly quotable, will ferrell]"
1,2,89774,"[Boxing story, MMA, Tom Hardy]"
2,2,106782,"[drugs, Leonardo DiCaprio, Martin Scorsese]"
3,7,48516,[way too long]
4,18,431,"[Al Pacino, gangster, mafia]"


In [70]:
ratings = pd.merge(left=ratings, right=df_tag_list, on=["userId","movieId"], how="left")
ratings.loc[:,"tags"]= ratings.tags.fillna(" ")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector,tags
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",


In [71]:
### 특수문자 제외 및 단어별 배열로 변경
# list_title_frac= movies.movie_title.apply(lambda x:re.findall(regex,x)).to_list()

vocab_size = 100
method = "bpe"

input_file_path = os.path.join(data_path,"tag_tokenizer_input_{}_{}.txt".format(method,vocab_size))
tokenizer_name = os.path.join(data_path,"tag_tokenizer_{}_{}".format(method, vocab_size))
tokenizer_name_model = os.path.join(data_path,"tag_tokenizer_{}_{}.model".format(method, vocab_size))

### make_input4tokenizer
# sentences = list(map(lambda x:" ".join(x),list_title_frac))
tag_sentences = ratings.tags.to_list()

sentences = []
for tag_sentence in tag_sentences:
  sentence = " ".join(tag_sentence)
  sentences.append(sentence)

with open(input_file_path,'w',encoding='utf8') as f:
  for sentence in tqdm_notebook(sentences):
    tag_sentence = " ".join(sentence)
    f.write(tag_sentence+'\n')

### train_tokenizer
templates = ' --input={} \
    --pad_id=0 \
    --bos_id=1 \
    --eos_id=2 \
    --unk_id=3 \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage=1.0 \
    --model_type={}'

cmd = templates.format(input_file_path, tokenizer_name, vocab_size, method)
spm.SentencePieceTrainer.Train(cmd)
print("tag_tokenizer {} is generated".format(tokenizer_name))
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)


def get_tokens_from_sentences(sp, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokens = sp.EncodeAsPieces(sentence)
        new_tokens = []
        for token in tokens:
            token = token.replace("▁", "")
            # if len(token) > 1:
            new_tokens.append(token)
        # if len(new_tokens) > 1:
        tokenized_sentences.append(new_tokens)

    return tokenized_sentences


def get_tokens_from_sentence(sp, sentence):
    new_tokens = []
    tokens = sp.EncodeAsPieces(sentence)
    for token in tokens:
        token = token.replace("▁", "")
        if len(token) > 1:
            new_tokens.append(token)
    return new_tokens

tokenized_sentences = get_tokens_from_sentences(sp,sentences)

### train w2v
w2v_name = os.path.join(data_path,"tag_w2v_{}_{}.model".format(method,vocab_size))
print("start train_tag_w2v....")

size = 200
window =5
min_count = 2
workers = 8
sg = 1
hs = 1

model = w2v(tokenized_sentences,size=size,window=window,min_count=min_count,workers=workers,sg=sg,hs=hs)
model.save(w2v_name)
print("tag_w2v {} is generated".format(w2v_name))

# ### get embedding

sp = spm.SentencePieceProcessor()
sp.Load(tokenizer_name_model)

w2v_model = w2v.load(w2v_name)

sentence_embs = []
# # 학습이 안되는 벡터들에 대해서는 0값이 나으므로 0벡터로 input 처리 수행
for sentence in tqdm_notebook(tokenized_sentences):
  word_embs = []
  for p_word in sentence:
    try:
      word_embs.append(w2v_model.wv[p_word])
    except KeyError:
      pass
  if len(word_embs):
    p_emb = np.average(word_embs, axis=0).tolist()
  else:
    p_emb = np.zeros(200).tolist()
  sentence_embs.append(p_emb)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=100836.0), HTML(value='')))


tag_tokenizer /gdrive/MyDrive/colab/Factorization_Machine/data/tag_tokenizer_bpe_100 is generated
start train_tag_w2v....
tag_w2v /gdrive/MyDrive/colab/Factorization_Machine/data/tag_w2v_bpe_100.model is generated


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=100836.0), HTML(value='')))




In [72]:
ratings.loc[:,"tag_vector"] = pd.Series(sentence_embs)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,u_nid,m_nid,b4_timestamp,m_count,last_rate_vector,tags,tag_vector
0,1,1,4.0,964982703,0,0,964982681,4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,3,4.0,964981247,0,2,964981230,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,6,4.0,964982224,0,5,964982211,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,47,5.0,964983815,0,43,964983793,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,50,5.0,964982931,0,46,964982903,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [73]:
# user, movie, rating, movie_gnr_vec, movie_title_vec, launch_year, user_movie_tag_vector, last_rate_vector

target_ratings = ratings.loc[:,["u_nid","m_nid","tag_vector","last_rate_vector","rating"]]
target_movies = movies.loc[:,["m_nid","genres_vectors","title_vector","launch_year"]]

data = pd.merge(left=target_ratings, right=target_movies, on="m_nid", how="left")
data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,rating,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",1995


In [74]:
y_data = data.loc[:,"rating"]
X_data = data.drop("rating",axis=1)
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",1995
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",1995
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",1995
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",1995
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",1995


In [75]:
num_tag_vec = len(X_data["tag_vector"][0])
num_last_rate_vec = len(X_data["last_rate_vector"][0])
num_gnrs_vec = len(X_data["genres_vectors"][0])
num_title_vec = len(X_data["title_vector"][0])

print(num_tag_vec, num_last_rate_vec, num_gnrs_vec, num_title_vec)

200 9742 20 200


In [76]:
from sklearn.preprocessing import StandardScaler

launch_years = np.array(X_data.loc[X_data.launch_year != 0].launch_year.to_list()).reshape(-1,1)

scaler = StandardScaler()
scaled_year = scaler.fit_transform(launch_years)

In [77]:
X_data.loc[X_data.launch_year !=0, "launch_year"] = scaled_year

In [78]:
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",0.038646
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",0.038646
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",0.038646
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",0.038646
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",0.038646


In [79]:
X_data.loc[:, "user_id"] = X_data.u_nid.apply(lambda x: dict_nid_2_uid[x])
X_data.loc[:, "movie_id"] = X_data.m_nid.apply(lambda x: dict_nid_2_mid[x])

In [80]:
X_data.head()

Unnamed: 0,u_nid,m_nid,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year,user_id,movie_id
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",0.038646,1,1
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",0.038646,1,3
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",0.038646,1,6
3,0,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",0.038646,1,47
4,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",0.038646,1,50


In [81]:
X_columns = ["user_id", "movie_id","tag_vector","last_rate_vector","genres_vectors","title_vector","launch_year"]
X_data = X_data.loc[:,X_columns].copy()
X_data.head()

Unnamed: 0,user_id,movie_id,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year
0,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",0.038646
1,1,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",0.038646
2,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",0.038646
3,1,47,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",0.038646
4,1,50,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",0.038646


In [82]:
data = pd.concat([X_data,y_data],axis=1)
data.head()

Unnamed: 0,user_id,movie_id,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year,rating
0,1,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",0.038646,4.0
1,1,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",0.038646,4.0
2,1,6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",0.038646,4.0
3,1,47,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",0.038646,5.0
4,1,50,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",0.038646,5.0


In [83]:
# user_id, movie_id -1 for embedding
data.loc[:,"user_id"] = (data.user_id - 1).to_list()
data.loc[:,"movie_id"] = (data.movie_id - 1).to_list()
data.head()

Unnamed: 0,user_id,movie_id,tag_vector,last_rate_vector,genres_vectors,title_vector,launch_year,rating
0,0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.015534833073616028, 0.01623934507369995, 0....",0.038646,4.0
1,0,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.003464460140094161, 0.030880093574523926, 0...",0.038646,4.0
2,0,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[-0.00021089462097734213, 0.007862547412514687...",0.038646,4.0
3,0,46,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.1379885971546173, -0.026576576754450798, 0....",0.038646,5.0
4,0,49,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.12589919567108154, -0.008645707741379738, 0...",0.038646,5.0


In [84]:
data.iloc[0]["last_rate_vector"]

array([0., 0., 0., ..., 0., 0., 0.])

In [224]:
from torch.utils.data import Dataset

class MovieLensDataset(Dataset):
  def __init__(self, dataframe):
    super(MovieLensDataset, self).__init__()
    self.df = dataframe
    self.field_dim = np.array([self.df.user_id.max() + 1, self.df.movie_id.max() + 1])
    self.len_tag_vec = len(self.df.iloc[0]["tag_vector"])
    self.len_last_rate_vec = len(self.df.iloc[0]["last_rate_vector"])
    self.len_gnrs_vec = len(self.df.iloc[0]["genres_vectors"])
    self.len_title_vec = len(self.df.iloc[0]["title_vector"])

  def __getitem__(self, index):
    row = self.df.iloc[index]
    user_id = row["user_id"]
    movie_id = row["movie_id"]
    user_movie = torch.LongTensor([user_id, movie_id])
    tag_vector = torch.Tensor(row["tag_vector"])
    last_rate_vector = torch.Tensor(row["last_rate_vector"])
    genres_vectors = torch.Tensor(row["genres_vectors"])
    title_vector = torch.Tensor(row["title_vector"])
    # launch_year = torch.Tensor(np.array(row["launch_year"]))
    ratings = row["rating"]

    return (user_movie, tag_vector, last_rate_vector, genres_vectors, title_vector), ratings

  def __len__(self):
    return len(self.df)

In [241]:
data.rating

0         4.0
1         4.0
2         4.0
3         5.0
4         5.0
         ... 
100831    4.0
100832    5.0
100833    5.0
100834    5.0
100835    3.0
Name: rating, Length: 100836, dtype: float64

In [225]:
dataset = MovieLensDataset(data)

In [226]:
train_length = int(len(dataset) * 0.8)
valid_length = int(len(dataset) * 0.1)
test_length = len(dataset) - train_length - valid_length
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset, (train_length, valid_length, test_length))

In [227]:
from torch.utils.data import DataLoader

dummy_loader= DataLoader(train_dataset, batch_size = 10, shuffle=False)

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=8)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=8)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=8)

In [228]:
iterator = iter(dummy_loader)
dummy = next(iterator)

In [244]:
class FeaturesLinear(torch.nn.Module):

    def __init__(self, field_dims, output_dim=1):
        super().__init__()
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)

    def forward(self, x, fc1, fc2, fc3, fc4):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        fc = self.fc(x).squeeze(2)
        cat = torch.cat([fc,fc1,fc2,fc3,fc4],dim=1).unsqueeze(2)

        return torch.sum(cat, dim=1) + self.bias


class FactorizationMachineModel(nn.Module):
  def __init__(self, field_dims, embed_dim, len_tag_vec, len_last_rate_vec, len_gnrs_vec, len_title_vec):
    super(FactorizationMachineModel, self).__init__()
    self.fc1 = nn.Linear(len_tag_vec, 1)
    self.fc2 = nn.Linear(len_last_rate_vec, 1)
    self.fc3 = nn.Linear(len_gnrs_vec, 1)
    self.fc4 = nn.Linear(len_title_vec, 1)

    self.fc1_emb = nn.Linear(len_tag_vec, embed_dim)
    self.fc2_emb = nn.Linear(len_last_rate_vec, embed_dim)
    self.fc3_emb = nn.Linear(len_gnrs_vec, embed_dim)
    self.fc4_emb = nn.Linear(len_title_vec, embed_dim)

    self.linear = FeaturesLinear(field_dims)
    self.embedding = FeaturesEmbedding(field_dims,embed_dim)
    self.fm = FactorizationMachine(reduce_sum=True)

  def forward(self, x, tag_vec, last_rate_vec, gnrs_vec, title_vec):
    linear = self.linear(x, self.fc1(tag_vec), self.fc2(last_rate_vec), self.fc3(gnrs_vec), self.fc4(title_vec))
    embed = self.embedding(x, self.fc1_emb(tag_vec), self.fc2_emb(last_rate_vec), self.fc3_emb(gnrs_vec), self.fc4_emb(title_vec))
    fm = self.fm(embed)
    x = linear + fm
    return x.squeeze(1)

In [245]:
class FeaturesEmbedding(torch.nn.Module):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x, tag_vec, last_rate_vec, gnrs_vec, title_vec):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        cat = torch.cat([self.embedding(x),tag_vec.unsqueeze(1),last_rate_vec.unsqueeze(1),gnrs_vec.unsqueeze(1),title_vec.unsqueeze(1)],dim=1)

        return cat

In [246]:
class FactorizationMachine(torch.nn.Module):

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """

        num_factors = x.shape[1]
        dict_vector = {i:x[:,i,:].unsqueeze(1) for i in range(num_factors)}
        dict_ixs = dict()

        for i in range(num_factors):
          for j in range(i,num_factors):
            if i<j:
              x1 = dict_vector[i]
              x2 = dict_vector[j]
              cat = torch.cat([x1,x2],dim=1)
              square_of_sum= torch.sum(cat,dim=1) ** 2
              sum_of_square = torch.sum(cat ** 2, dim=1)
              ix = square_of_sum - sum_of_square
              if self.reduce_sum:
                ix = torch.sum(ix, dim=1, keepdim=True)
              dict_ixs[(i,j)] = 0.5 * ix

        tensor_ixs = torch.stack(list(dict_ixs.values()),dim=1)
        result = torch.sum(tensor_ixs,dim=1)

        return result

In [247]:
model = FactorizationMachineModel(dataset.field_dim, 10, dataset.len_tag_vec, dataset.len_last_rate_vec, dataset.len_gnrs_vec, dataset.len_title_vec)

In [252]:
model

FactorizationMachineModel(
  (fc1): Linear(in_features=200, out_features=1, bias=True)
  (fc2): Linear(in_features=9742, out_features=1, bias=True)
  (fc3): Linear(in_features=20, out_features=1, bias=True)
  (fc4): Linear(in_features=200, out_features=1, bias=True)
  (fc1_emb): Linear(in_features=200, out_features=10, bias=True)
  (fc2_emb): Linear(in_features=9742, out_features=10, bias=True)
  (fc3_emb): Linear(in_features=20, out_features=10, bias=True)
  (fc4_emb): Linear(in_features=200, out_features=10, bias=True)
  (linear): FeaturesLinear(
    (fc): Embedding(194219, 1)
  )
  (embedding): FeaturesEmbedding(
    (embedding): Embedding(194219, 10)
  )
  (fm): FactorizationMachine()
)

In [222]:
n_epochs = 10
learning_rate = 0.001
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [253]:
from tqdm import tqdm_notebook

model = model.to(device)

for epoch in range(n_epochs):
  model.train()
  total_loss = 0
  iterations = len(train_data_loader)
  for idx,(X_vectors, ratings) in enumerate(tqdm_notebook(train_data_loader)):
    user_movie, tag_vector, last_rate_vector, genres_vectors, title_vector = X_vectors
    user_movie = user_movie.to(device)
    tag_vector = tag_vector.to(device)
    last_rate_vector = last_rate_vector.to(device)
    genres_vectors = genres_vectors.to(device)
    title_vector = title_vector.to(device)
    ratings = ratings.float().to(device)

    y_preds = model(user_movie, tag_vector, last_rate_vector, genres_vectors, title_vector)
    loss = criterion(y_preds, ratings)
    model.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss / iterations

  print(f"{epoch}|{n_epochs} epochs; loss:{total_loss}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


0|10 epochs; loss:16.311203002929688


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


1|10 epochs; loss:16.311203002929688


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

KeyboardInterrupt: ignored