In [1]:
import numpy as np
import pandas as pd
from interactions_data import create_index
from sklearn.model_selection import train_test_split

In [2]:
path = '../data/movielens-small/'

In [3]:
df = pd.read_csv(path+'ratings.csv').drop('timestamp', axis=1)

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Encode IDs from 1-n

In [5]:
movie_ids = np.sort(df.movieId.unique())
user_ids = np.sort(df.userId.unique())

In [6]:
movie_encoder, movie_decoder = create_index(movie_ids)
user_encoder, user_decoder = create_index(user_ids)

In [7]:
df.movieId = df.movieId.replace(movie_encoder)
df.userId = df.userId.replace(user_encoder)

In [20]:
n_users = max(df.userId.unique())
n_items = max(df.movieId.unique())
max_score = max(df.rating)
min_score = min(df.rating)

In [21]:
n_users, n_items, min_score, max_score

(609, 9723, 0.5, 5.0)

### Remove movies with too few ratings

In [8]:
movie_counts = df.movieId.value_counts()
to_remove = movie_counts[movie_counts <= 5]

In [9]:
len(list(to_remove.index))

6456

### Create test set

In [10]:
train, test, _, _ = train_test_split(df, df,
                                     test_size = 0.1,
                                     random_state=42,
                                     stratify=df.userId)

In [11]:
train.shape

(90752, 3)

In [12]:
test.shape

(10084, 3)

In [13]:
train.to_csv(path+'train.csv', index=False)
test.to_csv(path+'test.csv', index=False)

# Skorch

In [14]:
X = df.iloc[:,:2].values
y = df.iloc[:,2].values
y = y.astype('float32')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=X[:,0])

In [16]:
y_train, y_test = y_train.reshape(-1, 1), y_test.reshape(-1, 1)

In [17]:
X_train.shape, y_train.shape

((80668, 2), (80668, 1))

In [22]:
import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm
from time import time

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [24]:
def get_emb(ni,nf):
    e = nn.Embedding(ni, nf)
    e.weight.data.uniform_(-0.01,0.01)
    return e

In [25]:
class EmbeddingNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.min_score, self.max_score = min_score, max_score

        # get user and item embeddings
        (self.u, self.i) = [get_emb(*o) for o in [
            (n_users, 10), (n_items, 10)]]
        self.lin1 = nn.Linear(20, 10)  # 10 hidden neurons
        self.lin2 = nn.Linear(10, 1)
        self.drop1 = nn.Dropout(0.05)  # dropout rate of 5%
        self.drop2 = nn.Dropout(0.15)  # dropout rate of 15%

    def forward(self, X, **kwargs):
        users, items = X[:,0], X[:,1]
        # concatenate embeddings to form first layer, add dropout
        x = self.drop1(torch.cat([self.u(users),self.i(items)], dim=1))
        # second layer with 10 hidden neurons and dropout
        x = self.drop2(F.relu(self.lin1(x)))
        # output layer with one neuron
        x = self.lin2(x)
        # add sigmoid activation function, but squeeze between min and max score
        return torch.sigmoid(x) * (self.max_score - self.min_score) + self.min_score

In [26]:
from skorch import NeuralNetRegressor

In [27]:
net = NeuralNetRegressor(
    EmbeddingNet,
    max_epochs=10,
    lr=0.1,
    device=device,
)

In [28]:
from skorch import NeuralNet

In [29]:
net.fit(X_train, y_train)

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        1.0773        1.0124  2.4917
      2        0.9248        0.8551  2.2755
      3        0.8244        0.8072  2.2669
      4        0.7705        0.7856  2.2609
      5        0.7373        0.7904  2.3832
      6        0.7164        0.7880  2.3672
      7        0.7018        0.7836  2.3603
      8        0.6890        0.7879  2.3249
      9        0.6795        0.7763  2.2999
     10        0.6732        0.7745  2.3459


<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=EmbeddingNet(
    (u): Embedding(609, 10)
    (i): Embedding(9723, 10)
    (lin1): Linear(in_features=20, out_features=10, bias=True)
    (lin2): Linear(in_features=10, out_features=1, bias=True)
    (drop1): Dropout(p=0.05)
    (drop2): Dropout(p=0.15)
  ),
)

In [30]:
predicted = net.predict(X_test)

In [35]:
np.sqrt(np.mean((predicted - y_test) ** 2))

0.87023157