Data taken from the Netflix Prize dataset hosted on Kaggle: https://www.kaggle.com/netflix-inc/netflix-prize-data/data

## Load Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

## 1. Data Cleaning

In [18]:
# Raw Netflix Prize shard 1. Lines are either a movie header ("<movie_id>:")
# or a "customer,rating,date" record; header lines load with NaN Rating/Date.
data1 = pd.read_csv(
    'netflix-prize-data/combined_data_1.txt',
    header=None,
    names=['Cust_Id', 'Rating', 'Date'],
    usecols=[0, 1, 2],
)

In [7]:
data1.head()

Unnamed: 0,Cust_Id,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [8]:
# Raw shard 2, read with the same three-column layout as combined_data_1.txt.
data2 = pd.read_csv(
    'netflix-prize-data/combined_data_2.txt',
    header=None,
    names=['Cust_Id', 'Rating', 'Date'],
    usecols=[0, 1, 2],
)

In [9]:
# Raw shard 3, read with the same three-column layout as combined_data_1.txt.
data3 = pd.read_csv(
    'netflix-prize-data/combined_data_3.txt',
    header=None,
    names=['Cust_Id', 'Rating', 'Date'],
    usecols=[0, 1, 2],
)

In [10]:
# Raw shard 4, read with the same three-column layout as combined_data_1.txt.
data4 = pd.read_csv(
    'netflix-prize-data/combined_data_4.txt',
    header=None,
    names=['Cust_Id', 'Rating', 'Date'],
    usecols=[0, 1, 2],
)

In [11]:
# Stack the four shards in file order. DataFrame.append re-copied the whole
# accumulated frame on every call and was removed in pandas 2.0, so build the
# combined frame with a single pd.concat. The per-file row labels are kept
# (exactly as append did); the reset_index cell that follows replaces them.
data = pd.concat([data1, data2, data3, data4])

In [12]:
# Every shard numbers its rows from 0, so the stacked frame has repeated row
# labels. Rebuild a unique 0..n-1 index; the previous labels are moved into a
# column named 'index'.
data = data.reset_index()

In [13]:
# Discard the old per-file row labels that reset_index() stored in 'index'.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run; the in-place version raised KeyError on a second run.
data = data.drop('index', axis=1, errors='ignore')

In [14]:
data.shape

(100498277, 3)

In [15]:
# Movie header rows are the rows without a rating. Keep only those rows and
# expose their row labels in an 'index' column (next to the boolean 'Rating'
# flag column).
df_nan = (
    data['Rating']
    .isnull()
    .loc[lambda is_missing: is_missing]
    .to_frame()
    .reset_index()
)

In [16]:
# Accumulator for the movie id of every rating row, filled by the cells below.
movie_np = []
# Movie ids start at 1, matching the first header line ("1:") of the raw data.
movie_id = 1

In [None]:
# The ratings of a movie sit between its header row and the next header row,
# so the gaps between consecutive header positions give the number of ratings
# per movie. Build all ids with one np.repeat instead of calling np.append once
# per movie, which re-copied the whole accumulated array on every iteration
# (quadratic in the number of ratings).
header_rows = df_nan['index'].values
ratings_per_movie = np.diff(header_rows) - 1
movie_ids = np.arange(movie_id, movie_id + len(ratings_per_movie))
movie_np = np.append(movie_np, np.repeat(movie_ids, ratings_per_movie))
# Leave movie_id pointing at the last movie, which has no following header and
# is therefore not covered by the gaps above.
movie_id += len(ratings_per_movie)
# The next cell labels the last movie and reads `i` as the position of the
# final header row (the value the original loop left behind); keep providing it.
i = header_rows[-1]

In [91]:
# The ratings of the last movie run from its header row to the end of the
# frame. Take that header position from df_nan directly rather than from a
# loop variable leaked by the previous cell, which is undefined when the data
# contains a single movie header.
last_header_row = df_nan['index'].iloc[-1]
temp = np.full((1, data.index[-1] - last_header_row), movie_id)
movie_np = np.append(movie_np, temp)

In [94]:
# Drop the movie header rows (the rows without a rating). The explicit copy
# makes `data` an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning.
data = data[pd.notnull(data['Rating'])].copy()
# movie_np holds one movie id per remaining rating row; cast it to integers.
data['Movie_Id'] = movie_np.astype(int)
# Cust_Id shares its raw column with header lines such as "1:", so it has to be
# converted to integers once those lines are gone.
data['Cust_Id'] = data['Cust_Id'].astype(int)

In [35]:
data.head()

Unnamed: 0,Cust_Id,Rating,Date
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26
5,823519,3.0,2004-05-03


In [20]:
# Keep the Date column in memory; section 2 re-attaches it to data_final.
dates = data['Date']

In [103]:
data.to_csv('data_all.csv', index=False)

## 2. Create Train/Test Data

In [38]:
data_final = pd.read_csv('data_all.csv')

In [39]:
data_final.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,1488844,3.0,1
1,822109,5.0,1
2,885013,4.0,1
3,30878,4.0,1
4,823519,3.0,1


In [40]:
len(data_final)

100480507

In [44]:
# NOTE(review): `dates` is only defined in the data-cleaning section, so this
# cell fails on a fresh kernel that starts from data_all.csv. TODO confirm
# whether data_all.csv already carries a Date column, making this redundant.
# `.values` assigns by position, ignoring the index of `dates`.
data_final['Date'] = dates.values

In [47]:
# Order the ratings by ascending Date so that the positional split below
# separates earlier ratings from later ones.
data_final = data_final.sort_values('Date')

In [51]:
len(data_final)*.8

80384405.60000001

In [52]:
# 80/20 positional split of the date-sorted ratings: the earliest 80% train the
# model and the most recent 20% evaluate it. The cut point is derived from the
# frame length instead of a hard-coded row count, so it stays correct if the
# input data changes (it evaluates to 80384405 for the full dataset).
split_at = int(len(data_final) * .8)
train = data_final[:split_at]
test = data_final[split_at:]

In [59]:
train.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id,Date
56431994,510180,4.0,10341,1999-11-11
9056171,510180,5.0,1798,1999-11-11
58698779,510180,3.0,10774,1999-11-11
48101611,510180,2.0,8651,1999-11-11
81893208,510180,2.0,14660,1999-11-11


In [57]:
train.to_csv('train_data.csv', index=False)

In [58]:
test.to_csv('test_data.csv', index=False)

## 3. Transforming Data into Tensor

In [15]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [16]:
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto contiguous integer ids.

    Ids are assigned in order of first appearance, starting at 0. The
    vocabulary is taken from `train_col` when it is given and from `col`
    itself otherwise; values of `col` missing from the vocabulary get -1.

    Returns a tuple (value -> id dict, array of encoded `col`, vocabulary size).
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [17]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    The "Cust_Id" and "Movie_Id" columns of a copy of `df` are replaced by
    integer ids produced by `proc_col`. If `train` is provided, `df` is encoded
    with the same encoding as `train`, and rows whose user or movie does not
    occur in `train` are dropped.

    Returns the encoded copy; `df` itself is not modified.
    """
    df = df.copy()
    # Encode both columns first and filter once at the end. Filtering inside the
    # loop made the second column assignment write into a filtered slice
    # (SettingWithCopyWarning) and created an extra intermediate copy.
    known = np.ones(len(df), dtype=bool)
    for col_name in ["Cust_Id", "Movie_Id"]:
        train_col = None if train is None else train[col_name]
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # proc_col marks values missing from the training vocabulary with -1.
        known &= col >= 0
    return df[known]

In [18]:
# The training ids define the encoding; test rows whose user or movie never
# appears in the training data are dropped by encode_data.
df_t_e = encode_data(train)
df_v_e = encode_data(test, train)

In [19]:
df_t_e.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id,Date
0,0,4.0,0,1999-11-11
1,0,5.0,1,1999-11-11
2,0,3.0,2,1999-11-11
3,0,2.0,3,1999-11-11
4,0,2.0,4,1999-11-11


### Transforming data into tensor

In [21]:
# DataFrame.as_matrix was deprecated in pandas 0.23 and removed in 1.0; select
# the columns and take .values instead. Features are an (N, 2) array of encoded
# user/movie ids, labels an (N, 1) array of ratings (same shapes as before).
train_tensor = TensorDataset(
    torch.from_numpy(df_t_e[['Cust_Id', 'Movie_Id']].values),
    torch.from_numpy(df_t_e[['Rating']].values),
)

In [22]:
# DataFrame.as_matrix was deprecated in pandas 0.23 and removed in 1.0; select
# the columns and take .values instead. Features are an (N, 2) array of encoded
# user/movie ids, labels an (N, 1) float32 tensor of ratings (as before).
test_tensor = TensorDataset(
    torch.from_numpy(df_v_e[['Cust_Id', 'Movie_Id']].values),
    torch.from_numpy(df_v_e[['Rating']].values).float(),
)

### Using data loader to avoid memory overload

In [57]:
# Number of ratings per mini-batch, shared by both loaders.
batch_size = 300000
# Training batches are drawn in a new random order on every pass.
train_loader = DataLoader(train_tensor, batch_size=batch_size, shuffle=True)
# for test we use shuffle=False
test_loader = DataLoader(test_tensor, batch_size=batch_size, shuffle=False) 

In [24]:
# Drop the references to the raw train/test frames so their memory can be
# reclaimed; the rest of the notebook works with the encoded frames and loaders.
train = None
test = None

## 4. Matrix Factorization

In [25]:
# Sizes of the embedding tables: the number of distinct encoded user ids and
# movie ids in the training data.
num_users = len(pd.unique(df_t_e['Cust_Id']))
num_items = len(pd.unique(df_t_e['Movie_Id']))
print(num_users, num_items)

405041 17424


In [26]:
class MF(nn.Module):
    """Matrix-factorization model: a rating is the dot product of a user
    embedding and an item embedding (no bias terms).
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Initialise both tables with small non-negative weights, U(0, 0.05).
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return one score per (user id, item id) pair in the batch."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        return torch.sum(user_vecs * item_vecs, dim=1)

In [27]:
model = MF(num_users, num_items, emb_size=100).cuda()

In [71]:
def train_epocs(train_loader, model, epochs=3, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` with Adam on MSE loss and report the loss after each epoch.

    Args:
        train_loader: iterable of (features, labels) batches; features[:, 0]
            holds user ids and features[:, 1] holds item ids.
        model: module called as model(user_ids, item_ids).
        epochs: number of passes over `train_loader`.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, targets are shaped (batch, 1) to match a model
            that returns a column; otherwise they are shaped (batch,).

    Prints the sample-weighted mean training loss of every epoch, then calls
    `test_loss(model, unsqueeze)` to print the test loss.
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    # Run on whatever device the model lives on instead of hard-coding CUDA.
    device = next(model.parameters()).device
    for i in range(epochs):
        # test_loss() switches the model to eval mode at the end of every epoch,
        # so training mode has to be re-enabled here; setting it once before the
        # loop left dropout disabled from the second epoch onwards.
        model.train()
        sum_loss = 0.0
        total = 0
        for features, labels in train_loader:
            batch = features.shape[0]
            features = features.to(device)
            # Flatten the labels of the batch into a float vector.
            targets = labels.view(-1).float().to(device)
            if unsqueeze:
                targets = targets.unsqueeze(1)
            y_hat = model(features[:, 0], features[:, 1])
            loss = F.mse_loss(y_hat, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += batch
            # Weight by batch size so a smaller last batch does not skew the mean.
            sum_loss += batch * loss.item()
        print("train loss %.3f" % (sum_loss/total))
        test_loss(model, unsqueeze)

In [73]:
def test_loss(model, unsqueeze=False):
    """Print the sample-weighted mean MSE of `model` over the global `test_loader`.

    Args:
        model: module called as model(user_ids, item_ids).
        unsqueeze: if True, targets are shaped (batch, 1) to match a model
            that returns a column; otherwise they are shaped (batch,).

    Side effect: the model is switched to eval mode and left in that mode.
    """
    model.eval()
    # Run on whatever device the model lives on instead of hard-coding CUDA.
    device = next(model.parameters()).device
    sum_loss = 0.0
    total = 0
    # No gradients are needed for evaluation; without no_grad every batch built
    # an autograd graph that was never used.
    with torch.no_grad():
        for features, labels in test_loader:
            batch = features.shape[0]
            features = features.to(device)
            # Flatten the labels of the batch into a float vector.
            targets = labels.view(-1).float().to(device)
            if unsqueeze:
                targets = targets.unsqueeze(1)
            y_hat = model(features[:, 0], features[:, 1])
            loss = F.mse_loss(y_hat, targets)
            total += batch
            # Weight by batch size so a smaller last batch does not skew the mean.
            sum_loss += batch * loss.item()
    print("test loss %.3f " % (sum_loss/total))

In [60]:
train_epocs(train_loader, model, epochs=3, lr=0.1, wd=1e-5)

train loss 1.473
test loss 3.417 
train loss 1.431
test loss 3.407 
train loss 1.434
test loss 3.391 


In [61]:
train_epocs(train_loader, model, epochs=5, lr=0.01, wd=1e-5)

train loss 1.227
test loss 3.091 
train loss 1.176
test loss 3.040 
train loss 1.173
test loss 3.034 
train loss 1.171
test loss 3.031 
train loss 1.170
test loss 3.028 


In [62]:
train_epocs(train_loader, model, epochs=5, lr=0.001, wd=1e-5)

train loss 1.150
test loss 3.132 
train loss 1.136
test loss 3.104 
train loss 1.131
test loss 3.085 
train loss 1.129
test loss 3.070 
train loss 1.127
test loss 3.060 


## 5. Neural Networks + Embeddings

In [63]:
class CollabFNet(nn.Module):
    """Collaborative-filtering network: the user and item embeddings are
    concatenated and passed through one hidden layer to a single output.
    """

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # The hidden layer sees both embeddings side by side.
        self.lin1 = nn.Linear(2 * emb_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(p=0.1)
        self.drop2 = nn.Dropout(p=0.0)

    def forward(self, u, v):
        """Return a (batch, 1) tensor with one output per (user, item) pair."""
        pair = torch.cat((self.user_emb(u), self.item_emb(v)), dim=1)
        hidden = self.drop1(F.relu(pair))
        hidden = self.drop2(F.relu(self.lin1(hidden)))
        return self.lin2(hidden)

In [64]:
model = CollabFNet(num_users, num_items, emb_size=100).cuda()

In [None]:
train_epocs(train_loader, model, epochs=5, lr=0.01, wd=1e-5, unsqueeze=True) 

train loss 0.867
test loss 0.922 
train loss 0.830
test loss 0.919 


In [76]:
test_loss(model, unsqueeze=True)

test loss 0.911 


We can see that the test loss (MSE) of the neural-network model, 0.911, is much lower than the lowest test loss reached by the matrix-factorization baseline, 3.028.