In [1]:
# http://files.grouplens.org/datasets/movielens/ml-20m.zip
import pandas as pd
from sklearn import model_selection
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader


df = pd.read_csv('./ml-20m/ratings.csv')
X = df[['userId', 'movieId']].values
Y = df[['rating']].values
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(X, Y, test_size=0.1)
train_dataset = TensorDataset(torch.LongTensor(train_X), torch.FloatTensor(train_Y))
test_dataset = TensorDataset(torch.LongTensor(test_X), torch.FloatTensor(test_Y))
train_loader = DataLoader(train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

In [2]:
from torch import nn


class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.sum(user_feature*item_feature, 1)
        out = nn.functional.sigmoid(out)
        return out

In [3]:
max_user, max_item = X.max(0)
max_user = int(max_user)
max_item = int(max_item)
net = MatrixFactorization(max_user+1, max_item+1)

In [4]:
def mae(x, y):
    return (x - y).abs().mean()

def eval_net(net, loader, score_fn=mae):
    ys = []
    ypreds = []
    for x, y in loader:
        x = V(x)
        ys.append(y)
        ypred = net(x).data
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [5]:
import resource
from statistics import mean
from torch import nn
from torch import optim
from torch.autograd import Variable as V


rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1]))

opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()
for epoch in range(5):
    loss_log = []
    for x, y in train_loader:
        x = V(x)
        y = V(y)
        o = net(x)
        loss = loss_f(o, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.data[0])
    test_score = eval_net(net, test_loader)
    print(epoch, mean(loss_log), test_score, flush=True)

0 7.904827312680192 2.5382276043871506
1 7.4880929851201055 2.537697251916965
2 7.4868984495836 2.53764365159435
3 7.486615328666964 2.537608309091998
4 7.486444170493044 2.537587559740098


In [6]:
query = (1, 10)
query = torch.LongTensor(query).view(1, -1)
net(V(query))

Variable containing:
 1
[torch.FloatTensor of size 1]

In [7]:
query = torch.stack([
    torch.zeros(max_item).fill_(1),
    torch.arange(1, max_item+1)
], 1).long()
scores, indices = torch.topk(net(V(query)), 5)

In [8]:
class NeuralMatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.cat([user_feature, item_feature], 1)
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        return out.squeeze()

In [9]:
import csv
from sklearn.feature_extraction.text import CountVectorizer


with open('./ml-20m/movies.csv') as fp:
    reader = csv.DictReader(fp)
    def parse(d):
        movieId = int(d['movieId'])
        genres = d['genres']
        return movieId, genres
    data = [parse(d) for d in reader]

movieIds = [x[0] for x in data]
genres = [x[1] for x in data]
cv = CountVectorizer(dtype='f4').fit(genres)
num_genres = len(cv.get_feature_names())
it = cv.transform(genres).toarray()
it = (torch.FloatTensor(g) for g in it)
genre_dict = dict(zip(movieIds, it))

In [10]:
def first(xs):
    it = iter(xs)
    return next(it)


class MovieLensDataset(Dataset):
    def __init__(self, x, y, genres):
        assert len(x) == len(y)
        self.x = x
        self.y = y
        self.genres = genres
        self.null_genre = torch.zeros_like(first(genres.values()))
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        movieId = x[1]
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

NameError: name 'Dataset' is not defined

In [None]:
train_dataset = MovieLensDataset(
    torch.LongTensor(train_X),
    torch.FloatTensor(train_Y),
    genre_dict
)
test_dataset = MovieLensDataset(
    torch.LongTensor(test_X),
    torch.FloatTensor(test_Y),
    genre_dict
)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

In [None]:
class NeuralMatrixFactorization2(nn.Module):
    def __init__(self, max_user, max_item, num_genres, user_k=10, item_k=10, hidden_dim=50):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            nn.Linear(user_k + item_k + num_genres, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x, g):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        out = torch.cat([user_feature, item_feature, g], 1)
        out = self.mlp(out)
        out = nn.functional.sigmoid(out) * 5
        return out.squeeze()

In [None]:
def eval_net(net, loader, score_fn=mae):
    ys = []
    ypreds = []
    for x, y, g in loader:
        x = V(x, volatile=True)
        g = V(g, volatile=True)
        ys.append(y)
        ypred = net(x, g).data
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [None]:
net = NeuralMatrixFactorization2(max_user+1, max_item+1, num_genres)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    net.train()
    for x, y, g in train_loader:
        x = V(x)
        y = V(y)
        g = V(g)
        o = net(x, g)
        loss = loss_f(o, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.data[0])
    net.eval()
    test_score = eval_net(net, test_loader)
    print(epoch, mean(loss_log), test_score, flush=True)

In [None]:
def make_genre_vector(i, max_len):
    g = torch.zeros(max_len)
    g[i] = 1
    return g

query_genres = [make_genre_vector(i, num_genres) for i in range(num_genres) for i in range(num_genres)]
query_genres = torch.stach(query_genres, 1)
query = torch.stach([
    torch.LongTensor(num_genres).fill_(100),
    torch.LongTensor(num_genres).fill_(0)
], 1)
net(V(query), V(query_genres))