# AutoEncoder Meet Collaborative Filtering

- Collaborative Filtering을 위해 user-item matrix 만들기
- AutoEncoder 모델 구조 정의하기

* Training Deep AutoEncoder 논문은 [저자 코드](https://github.com/NVIDIA/DeepRecommender) 참고

## 논문 종류
- AutoRec
- Training Deep AutoEncoder
- Variational AutoEncoder

## 1. Data Loader

In [1]:
data_path = '../data/kmrd/kmr_dataset/datafile/kmrd-small'

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
def train_df, val_df, user_to_index, movie_to_index = read_data(data_path=data_path)(data_path):
    df = pd.read_csv(os.path.join(data_path,'rates.csv'))[:10000]
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=1234, shuffle=True)

    user_to_index = {original: idx for idx, original in enumerate(df.user.unique())}
    movie_to_index = {original: idx for idx, original in enumerate(df.movie.unique())}

    return train_df, val_df, user_to_index, movie_to_index

In [4]:
class KMRDdataset(Dataset):
    def __init__(self, df, user_to_index, movie_to_index, item_based=True):
        self.min_rating = min(df.rate)
        self.max_rating = max(df.rate)

        self.user = [user_to_index[u] for u in df.user.values]
        self.movie = [movie_to_index[m] for m in df.movie.values]
        self.rating = df.rate.values

        if item_based:
            input_tensor = torch.LongTensor([self.movie, self.user])
            self.data = torch.sparse.FloatTensor(input_tensor, torch.FloatTensor(self.rating),
                                             torch.Size([len(movie_to_index), len(user_to_index)])).to_dense()
        else:
            input_tensor = torch.LongTensor([self.user, self.movie])
            self.data = torch.sparse.FloatTensor(input_tensor, torch.FloatTensor(self.rating),
                                             torch.Size([len(user_to_index), len(movie_to_index)])).to_dense()


    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]


In [6]:
train_df, val_df, user_to_index, movie_to_index = read_data(data_path=data_path)

train_dataset = KMRDdataset(train_df, user_to_index, movie_to_index)
val_dataset = KMRDdataset(val_df, user_to_index, movie_to_index)

In [7]:
print(train_df.shape)
print(train_dataset.data[0].size())
print(val_df.shape)
print(val_dataset.data[0].size())

(8000, 4)
torch.Size([466])
(2000, 4)
torch.Size([466])


In [9]:
train_dataset

<__main__.KMRDdataset at 0x10f37b790>

In [10]:
print(len(list(user_to_index.keys())))

466


In [11]:
train_dataset.data[0]

tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0., 10.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0., 27.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  8.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  8.,  9.,  0., 10.,  0.,  9.,  0.,  0.,
         0.,  0.,  5.,  0.,  0.,  0.,  0., 10.,  0.,  0.,  0.,  9.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  9.,  0.,
         0.,  0.,  0.,  0.,  0., 10.,  0.,  1.,  0.,  0.,  0., 10.,  0.,  0.,
         0.,  0.,  0.,  0.,  9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  8.,  0.,  0., 10.,  0.,  0., 10.,  0.,  0.,  0.,
         0.,  0., 10.,  0.,  0.,  0.,  0.,  0.,  9.,  0.,  0., 1

In [12]:
list(user_to_index.keys())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [13]:
len(train_dataset)

532

In [17]:
len(list(movie_to_index.keys()))

532

In [18]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

## 2. Define AutoEncoder 

In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

In [20]:
class SimpleAutoEncoder(nn.Module):
    def __init__(self, num_inputs, num_hiddens, kind='sigmoid', dropout=None):
        super(SimpleAutoEncoder, self).__init__()
        # encoder -> hidden -> decoder
        # input -> hidden -> output
        # input -> hidden : encoder
        # hidden -> output = input : decoder
        self.encoder = nn.Sequential(nn.Linear(num_inputs, num_hiddens), self.activation(kind))
        self.decoder = nn.Sequential(nn.Linear(num_hiddens, num_inputs), self.activation(kind))  

    def activation(self, kind):
        if kind == 'selu':
            return nn.SELU()
        elif kind == 'relu':
            return nn.ReLU()
        elif kind == 'relu6':
            return nn.ReLU6()
        elif kind == 'sigmoid':
            return nn.Sigmoid()
        elif kind == 'tanh':
            return nn.Tanh()
        elif kind == 'elu':
            return nn.ELU()
        elif kind == 'lrelu':
            return nn.LeakyReLU()
        elif kind == 'none':
            return input
        else:
            raise ValueError('Unknown non-linearity type')

    def forward(self, x):
        return self.decoder(self.encoder(x))

In [24]:
class Encoder(nn.Module):
    def __init__(self, num_hiddens, num_layers, dropout=None, nn_type='diamond'):
        super(AutoEncoder, self).__init__()
        # input -> hidden -> output
        # input -> hidden(10) -> ... -> hidden(10) -> output = input
        self.encoder, self.decoder = self.generate_layers(num_hiddens, num_layers, dropout, nn_type)
    def forward(self, x):
        return self.decoder(self.encoder(x))
  
    def generate_layers(self, num_hiddens, num_layers, dropout=None, nn_type='diamond'):
        # hidden layers -> [50, 25, 12, 6, 12, 25, 50], [100 50 100] -> 100, 50, 60, 50 100 
        if nn_type == 'diamond':
            encoder_modules = []
            decoder_modules = []

            hidden_layers = []
            temp = num_hiddens
            for idx, x in enumerate(range(num_layers)):
                if idx == 0:
                    hidden_layers.append(temp)
                else:
                    hidden_layers.append(int(temp/2))
            temp = temp/2
            hidden_layers = [x for x in hidden_layers if x > 10]

          # encoder
            for idx, num_hidden in enumerate(hidden_layers):
                if idx < len(hidden_layers)-1:
                    encoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
                    encoder_modules.append(nn.Sigmoid())

          # decoder
            hidden_layers = list(reversed(hidden_layers))
            for idx, num_hidden in enumerate(hidden_layers):
                if idx < len(hidden_layers)-1:
                    decoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
                    decoder_modules.append(nn.Identity())

        # num_hidden = 50, num_layers = 3 ->  input_dim -> [50, 50, 50] -> output_dim = input_dim 
        elif nn_type == 'constant':
            hidden_layers = [num_hiddens] * num_layers
            for idx, enc in enumerate(hidden_layers):
                if idx < num_layers-1:
                    encoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
                    encoder_modules.append(nn.Sigmoid())
                    decoder_modules.append(nn.Linear(hidden_layers[idx], hidden_layers[idx+1], bias=True))
                    decoder_modules.append(nn.Identity())

        if dropout is not None:    
            encoder_modules = [x for y in (encoder_modules[i:i+2] + [nn.Dropout(dropout)] * (i < len(encoder_modules) - 1) 
                              for i in range(0, len(encoder_modules), 2)) for x in y]
            decoder_modules = [x for y in (decoder_modules[i:i+2] + [nn.Dropout(dropout)] * (i < len(decoder_modules) - 1)
                              for i in range(0, len(decoder_modules), 2)) for x in y]

        encoder = nn.Sequential(*encoder_modules)
        decoder = nn.Sequential(*decoder_modules)

        return encoder, decoder

## Train

In [25]:
num_users = len(user_to_index.keys())
num_movies = len(movie_to_index.keys())
print(num_users, num_movies)

466 532


In [26]:
model = SimpleAutoEncoder(num_inputs=num_users, num_hiddens=100, kind='selu')
model

SimpleAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=466, out_features=100, bias=True)
    (1): SELU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=466, bias=True)
    (1): SELU()
  )
)

In [27]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [28]:
def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.zeros_(m.bias)

model.apply(weights_init)

SimpleAutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=466, out_features=100, bias=True)
    (1): SELU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=466, bias=True)
    (1): SELU()
  )
)

In [29]:
train_dataset.data[0].size()

torch.Size([466])

In [30]:
# NVIDIA Recommender System 참고
def MSEloss(inputs, targets, size_average=False):
    mask = targets != 0
    num_ratings = torch.sum(mask.float())
    criterion = nn.MSELoss(reduction='sum' if not size_average else 'mean')
    return criterion(inputs * mask.float(), targets), Variable(torch.Tensor([1.0])) if size_average else num_ratings

In [31]:
model.train()
train_loss = 0
for idx, batch in enumerate(train_dataloader):
    optimizer.zero_grad()
    
    pred = model(batch)
    loss, num_ratings = MSEloss(pred, batch)    
    loss = torch.sqrt(loss / num_ratings)
    loss.backward()
    train_loss += loss.item() 
    optimizer.step()
    
    print(train_loss / (idx+1))

12.787084579467773
11.363252639770508
11.4984343846639
10.804795980453491
10.46843376159668
10.538475036621094
10.246515137808663
10.680930137634277
10.342405054304335


In [32]:
model.eval()
val_loss = 0
with torch.no_grad():
    for idx, batch in enumerate(val_dataloader):
        pred = model(batch)
        loss, num_ratings = MSEloss(pred, batch)
        loss = torch.sqrt(loss / num_ratings)
        val_loss += loss.item()

        print(val_loss/(idx+1))

8.686783790588379
8.305405378341675
8.46501366297404
8.456071496009827
8.275437927246093
8.160489161809286
8.385420594896589
8.340406596660614
8.268771436479357
8.208452606201172
8.209791963750666
8.172353307406107
8.166303964761587
8.322408369609288
8.31943057378133
8.29599916934967
8.292543355156393
8.27474331855774
8.267130425101833
8.282361435890198
8.349243913378034
8.393507805737583
8.378307031548541
8.31652836004893
8.316402015686036
8.371596042926495
8.395281509116844
8.433735540934972
8.403057114831332
8.407469669977823
8.400179509193666
8.441355720162392
8.442356181867195
8.434548953000236
