#### Tested with :
- Linux Ubuntu
- Python 3.5
- Cuda 8
- Conda package for PyTorch

In [1]:
from __future__ import print_function
import argparse

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

In [3]:
import numpy as np

In [4]:
# Set to False to force CPU execution even when a CUDA device is available.
PREFER_CUDA = True

In [5]:
# CUDA is used only when it is both requested (PREFER_CUDA) and reported as available by PyTorch.
use_cuda = PREFER_CUDA and torch.cuda.is_available()

In [6]:
# Warn when the effective device differs from the requested one
# (i.e. CUDA was requested but is not available on this machine).
if use_cuda != PREFER_CUDA:
    print('CUDA SETUP NOT AS EXPECTED')
else:
    print('OK')

OK


### Model assumption
- We assume all model variables are binary 0/1 valued
- We represent the X input vector as the sparse coding of its "1" indices
- Indices start at 1 so that we can reserve 0 for padding in the embeddings

In [7]:
# Example batch: 2 samples, each listing the indices of its 3 active features
X = Variable(
    torch.LongTensor([
        [11, 20, 4],
        [30, 10, 20],
    ])
)

In [8]:
class FactorizationMachine(nn.Module):
    """Second-order factorization machine over sparse binary features.

    Every sample is given as a row of active feature indices. For one row the
    model computes::

        logit = B + sum_i w_i + sum_{i<j} <v_i, v_j>

    where ``w_i`` is the linear weight and ``v_i`` the ``dim_embed``-dimensional
    factor vector of feature index ``i``. Index 0 is the padding index of both
    embedding tables, so it contributes nothing to the logit.

    Args:
        nb_features: number of rows of the embedding tables; valid indices are
            ``0 .. nb_features - 1`` (0 being reserved for padding).
        dim_embed: size of the factor vectors used for the pairwise term.
        isClassifier: when True ``forward`` returns ``sigmoid(logit)``
            (binary classifier), otherwise the raw logit (regression).
        withCuda: kept for backward compatibility and ignored. The bias is a
            registered parameter, so the usual ``model.cuda()`` / ``model.to()``
            call moves it together with the embeddings.
    """

    def __init__(self, nb_features, dim_embed=50, isClassifier=True, withCuda=True):
        super(FactorizationMachine, self).__init__()

        self.nb_features = nb_features
        self.dim_embed = dim_embed
        self.isClassifier = isClassifier # binary-classifier or regression

        # Stores the bias term. It must be an nn.Parameter (not a bare tensor)
        # so that it shows up in model.parameters() for the optimizer and
        # follows the module across devices.
        self.B = nn.Parameter(torch.randn(1))

        # Stores the weights for the linear terms
        self.embeddingL = nn.Embedding(nb_features, 1, padding_idx=0, max_norm=None, norm_type=2)

        # Stores the weights for the quadratic FM terms
        self.embeddingQ = nn.Embedding(nb_features, dim_embed, padding_idx=0, max_norm=None, norm_type=2)

    def forward(self, X):
        """Score a batch of samples.

        Args:
            X: integer tensor of shape (batch, nb_active) holding feature indices.

        Returns:
            Tensor of shape (batch,): probabilities when ``isClassifier`` is
            True, raw logits otherwise.
        """
        # The linear part: sum of the scalar weights of the active features
        logitL = self.embeddingL(X).squeeze(-1).sum(dim=1)  # (batch,)

        # The Quadratic-FM part using the O(kn) formulation from Steffen Rendle:
        #   sum_{i<j} <v_i, v_j> = 0.5 * sum_f [ (sum_i v_if)^2 - sum_i v_if^2 ]
        eQ = self.embeddingQ(X)                  # (batch, nb_active, dim_embed)
        sum_of_squares = eQ.mul(eQ).sum(dim=1)   # (batch, dim_embed)
        z = eQ.sum(dim=1)                        # sum across features
        square_of_sum = z.mul(z)                 # element-wise product
        logitFM = (square_of_sum - sum_of_squares).sum(dim=1) * 0.5  # (batch,)

        # Total logit; the (1,) bias broadcasts over the batch
        logit = logitL + logitFM + self.B

        if self.isClassifier:
            return torch.sigmoid(logit)
        return logit

In [9]:
# Binary classifier over 100 embedding rows (indices 0..99, 0 being padding)
model = FactorizationMachine(
    100,
    dim_embed=50,
    isClassifier=True,
    withCuda=use_cuda
)

In [10]:
# When CUDA is in use, the sample batch and the model must live on the same device.
if use_cuda:
    X = X.cuda()
    model.cuda()

In [11]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs
# forward() and also any registered hooks.
model(X)

Variable containing:
1.00000e-04 *
  0.0002
  5.1449
[torch.cuda.FloatTensor of size 2 (GPU 0)]

### Fitting the model to the data

In [12]:
learning_rate = 0.1
momentum = 0.9
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

# The model was built with isClassifier=True, so it returns one sigmoid
# probability per sample (shape (batch,)). Binary cross-entropy is the matching
# loss; it expects float targets of the same shape. nn.CrossEntropyLoss would
# instead require (batch, n_classes) unnormalised scores.
loss_function = nn.BCELoss()

In [22]:
# Switch to False to train on the quantile-encoded iris data instead of the toy batch.
USE_DUMMY_DATA = True

if USE_DUMMY_DATA:
    # One batch of 2 samples with 3 active feature indices each, and their 0/1 labels
    train_input = np.array([[[11, 20, 4], [30, 10, 20]]])
    train_target = np.array([[[1], [0]]])
else:
    # Imported here because these packages are only needed for this optional branch
    from sklearn.datasets import load_iris
    import pandas as pd

    iris = load_iris()

    # Only keep classes 0 and 1 for binary classification example
    keep = iris['target'] < 2
    data = iris['data'][keep]
    target = iris['target'][keep]

    # Quantile representation of the data : like a sparse vector representation.
    # Each column gets its own range of `qs` indices, starting at 1 so that
    # index 0 stays reserved for padding.
    offset = 1
    qs = 3
    cols = []
    for j in range(data.shape[1]):
        col = pd.qcut(data[:, j], qs, labels=False) + offset
        cols.append(col)
        offset += qs

    qdata = np.vstack(cols).T

    # Shuffle
    p = np.random.permutation(data.shape[0])
    # Add a leading batch axis so the arrays have the same layout as the dummy
    # data: (nb_batches, batch_size, nb_indices) and (nb_batches, batch_size, 1).
    # The whole data set forms a single batch.
    train_input = qdata[p][np.newaxis, :, :]
    train_target = target[p][np.newaxis, :, np.newaxis]

print(train_input.shape, train_target.shape)

(1, 2, 3) (1, 2, 1)


In [14]:
nb_batches = 1

def train(epoch):
    """Run one pass over the first ``nb_batches`` batches of the training arrays.

    Reads the notebook globals ``model``, ``optimizer``, ``train_input``,
    ``train_target`` and ``use_cuda``. The optimisation step itself is still
    disabled (see the note at the end of the loop body), so for now this only
    runs the forward pass and prints the target and output sizes.

    Args:
        epoch: index of the current epoch; only used by the disabled logging.
    """
    model.train()
    for batch_idx in range(nb_batches):
        data = torch.from_numpy(train_input[batch_idx, :].astype(np.int64))
        # Flatten the (batch_size, 1) labels to (batch_size,). reshape(-1) is used
        # instead of np.squeeze, which would collapse a single-sample batch to a
        # 0-d array.
        target = torch.from_numpy(train_target[batch_idx, :].astype(np.int64).reshape(-1))
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        #print(data.size())
        print(target.size())
        optimizer.zero_grad()
        output = model(data) # runs model.forward(data) through nn.Module.__call__ (hooks included)
        print(output.size())
        # INFO : still needs to debug these lines.
        # `loss_function` has to accept the (batch_size,) output and the
        # (batch_size,) target produced above before this can be enabled;
        # nn.CrossEntropyLoss expects (batch_size, n_classes) scores instead.
        #loss = loss_function(output, target)
        #loss.backward()
        #optimizer.step()
        #if ((batch_idx % 10 == 0) or (batch_idx == nb_batches-1)):
        #    print('Train Epoch: {} [{}]\tLoss: {:.6f}'.format(
        #        epoch, batch_idx * len(data), loss.data[0]))

In [15]:
# Run a single training epoch
for e in range(1):
    train(e)

torch.Size([2])
torch.Size([2])
