In [1]:
# http://files.grouplens.org/datasets/movielens/ml-20m.zip
import pandas as pd
from sklearn import model_selection
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader


df = pd.read_csv('./ml-20m/ratings.csv')
X = df[['userId', 'movieId']].values
Y = df[['rating']].values
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(X, Y, test_size=0.1)
train_dataset = TensorDataset(torch.LongTensor(train_X), torch.FloatTensor(train_Y))
test_dataset = TensorDataset(torch.LongTensor(test_X), torch.FloatTensor(test_Y))
train_loader = DataLoader(train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

In [2]:
from torch import nn


# 行列因子分解の内積をとる代わりにMLPを通して非線形化
class NeuralMatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        # ユーザー特徴量と商品特徴量をまとめて一つのベクトルにする
        out = torch.cat([user_feature, item_feature], 1)
        # まとめた特徴量ベクトルをMLPに入れる
        # 内積をしないので、dot((n, k),(k, m))の様な特徴量ベクトルの次元をそろえる必要がない
        # バッチノーマリゼーションのようなニューラルネットの訓練のテクニックがそのまま使える
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        return out.squeeze()

In [3]:
max_user, max_item = X.max(0)
max_user = int(max_user)
max_item = int(max_item)
net = NeuralMatrixFactorization(max_user+1, max_item+1)

In [4]:
from torch.autograd import Variable as V
from statistics import mean


def mae(x, y):
    # 平均絶対誤差を計算
    return (x - y).abs().mean()

def eval_net(net, loader, score_fn=mae):
    ys = []
    ypreds = []
    for x, y in loader:
        x = V(x)
        ys.append(y)
        ypred = net(x).data
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [5]:
import resource
from torch import optim


rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

# 普通のMFより良い精度が得られる
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()
for epoch in range(5):
    loss_log = []
    for x, y in train_loader:
        x = V(x)
        y = V(y)
        o = net(x)
        loss = loss_f(o, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.data[0])
    test_score = eval_net(net, test_loader)
    print(epoch, mean(loss_log), test_score, flush=True)

0 0.7535731178636376 0.6477937812666393
1 0.6907853181807415 0.6313660525415021
2 0.6695159246432062 0.6288411734874598
3 0.6546556788338106 0.6245222466788294
4 0.639868713125745 0.620215331755076


In [6]:
# ユーザー1、映画10、の評価を計算したい
query = (1, 10)
# LongTensorに変換し、batchの次元を付加
# 次元(2, )を(1, 2)に変換
query = torch.LongTensor(query).view(1, -1)
net.eval()
net(V(query))

Variable containing:
 3.8138
[torch.FloatTensor of size 1]

In [7]:
# あるユーザーに対する全映画の評価予測値を計算し、上位5つを取り出す
# 映画の数だけ(userId, movieId)のペアを作り、ネットワークに渡して評価値を作る
query = torch.stack([
    torch.zeros(max_item).fill_(1),
    torch.arange(1, max_item+1)
], 1).long()
scores, indices = torch.topk(net(V(query)), 5)
print(scores, indices)

Variable containing:
 4.3063
 4.2946
 4.2905
 4.2571
 4.2345
[torch.FloatTensor of size 5]
 Variable containing:
   109
  7152
   317
   355
 58558
[torch.LongTensor of size 5]



In [8]:
import csv
from sklearn.feature_extraction.text import CountVectorizer

# ジャンルの辞書を作る
with open('./ml-20m/movies.csv') as fp:
    reader = csv.DictReader(fp)
    def parse(d):
        movieId = int(d['movieId'])
        genres = d['genres']
        return movieId, genres
    data = [parse(d) for d in reader]

movieIds = [x[0] for x in data]
genres = [x[1] for x in data]
# CountVectorizerでBoWの特徴量を作る
cv = CountVectorizer(dtype='f4').fit(genres)
# ジャンルの数
num_genres = len(cv.get_feature_names())
# BoWの次元は(27278, 24)
it = cv.transform(genres).toarray()
it = (torch.FloatTensor(g) for g in it)
# keyがmovieIdでvalueがBoWのTensorを作る
genre_dict = dict(zip(movieIds, it))

In [9]:
from torch.utils.data import Dataset


def first(xs):
    it = iter(xs)
    return next(it)

# userId, movieId, ratingと一緒にジャンルBoWを返すDataset
class MovieLensDataset(Dataset):
    def __init__(self, x, y, genres):
        assert len(x) == len(y)
        self.x = x
        self.y = y
        self.genres = genres
        # ジャンル辞書にないmovieIdの時のダミーデータ(ゼロ行列)
        self.null_genre = torch.zeros_like(first(genres.values()))
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        movieId = x[1]
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

In [10]:
train_dataset = MovieLensDataset(
    torch.LongTensor(train_X),
    torch.FloatTensor(train_Y),
    genre_dict
)
test_dataset = MovieLensDataset(
    torch.LongTensor(test_X),
    torch.FloatTensor(test_Y),
    genre_dict
)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

In [11]:
from torch import nn


# ジャンル情報も織り込んだネットワークモデル
class NeuralMatrixFactorization2(nn.Module):
    def __init__(self, max_user, max_item, num_genres, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        # num_genres分だけ次元を増やす
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k + num_genres, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x, g):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        # ジャンルのBoWを特徴ベクトルに結合
        out = torch.cat([user_feature, item_feature, g], 1)
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        return out.squeeze()

In [12]:
def mae(x, y):
    return (x - y).abs().mean()

# ネットワークにuserId, movieId以外にジャンルのBoWを渡すようになったので、評価関数も変更
def eval_net(net, loader, score_fn=mae):
    ys = []
    ypreds = []
    for x, y, g in loader:
        x = V(x)
        g = V(g)
        ys.append(y)
        ypred = net(x, g).data
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [13]:
from torch import optim
from torch.autograd import Variable as V


net = NeuralMatrixFactorization2(max_user+1, max_item+1, num_genres)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    net.train()
    for x, y, g in train_loader:
        x = V(x)
        y = V(y)
        g = V(g)
        o = net(x, g)
        loss = loss_f(o, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.data[0])
    net.eval()
    test_score = eval_net(net, test_loader)
    print(epoch, mean(loss_log), test_score, flush=True)

0 0.7497978749735796 0.6420253083307167
1 0.6831190756403605 0.6290456645330885
2 0.6541487066790108 0.6190908115267977
3 0.6320047632670998 0.6140315166739249
4 0.6166021982057963 0.6112788813222539


In [14]:
# userIdとジャンルのみの情報からレコメンドが可能になる
# 例えば、ユーザーid=100に対してそれぞれのジャンル1つだけ含んだ映画のスコアを計算する

# 指定した一だけ1で、残りが0のTensorを返す関数
def make_genre_vector(i, max_len):
    g = torch.zeros(max_len)
    g[i] = 1
    return g

# num_genres分だけuserId=100とmvieId=0のTensorを作って結合する
query_genres = [make_genre_vector(i, num_genres) for i in range(num_genres)]
query_genres = torch.stack(query_genres, 1)
query = torch.stack([
    torch.LongTensor(num_genres).fill_(100),
    torch.LongTensor(num_genres).fill_(0)
], 1)
# スコアを計算
net(V(query), V(query_genres))

Variable containing:
 2.8454
 2.9914
 3.1460
 2.7222
 2.9945
 3.1822
 3.4193
 3.3634
 3.0947
 3.1323
 3.2765
 3.1129
 3.0448
 3.0545
 3.1136
 3.0758
 3.2091
 3.1114
 3.2770
 3.0088
 3.1299
 3.0486
 3.2824
 3.2219
[torch.FloatTensor of size 24]