In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import os

In [3]:
# Column layout shared by the headerless, tab-separated ratings files.
col_names = ['user_id', 'movie_id', 'rating', 'timestamp']


def _load_ratings(path):
    """Read one tab-separated ratings file into a DataFrame labelled with ``col_names``."""
    return pd.read_csv(path, sep='\t', names=col_names)


# Full ratings file plus the "ua" train/test split files.
df = _load_ratings('../Data/MovieLens/ml-100k/u.data')
train_df = _load_ratings('../Data/MovieLens/ml-100k/ua.base')
test_df = _load_ratings('../Data/MovieLens/ml-100k/ua.test')

In [None]:
# Drop the unused `timestamp` column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-removed column.
train_df = train_df.drop(columns=['timestamp'], errors='ignore')
test_df = test_df.drop(columns=['timestamp'], errors='ignore')

In [4]:
# Number of distinct user ids and movie ids present in the full ratings frame.
n_users = df['user_id'].unique().size
n_items = df['movie_id'].unique().size

In [5]:
# R matrix: dense (n_users x n_items) table of training ratings, 0 where unrated.
# Ids are 1-based in the frame, hence the -1 when turning them into indices.
# The three columns are selected by name so the loop works whether or not the
# `timestamp` column has already been dropped (unpacking four values from
# train_df.values fails once it is gone).
R = torch.zeros((n_users, n_items))
for user_id, movie_id, rating in train_df[['user_id', 'movie_id', 'rating']].itertuples(index=False):
    R[user_id - 1, movie_id - 1] = rating

In [6]:
# Dense (n_users x n_items) table of held-out test ratings, 0 where unrated.
# Ids are 1-based in the frame, hence the -1 when turning them into indices.
# Columns are selected by name so the loop does not depend on whether the
# `timestamp` column is still present in test_df.
R_test = torch.zeros((n_users, n_items))
for user_id, movie_id, rating in test_df[['user_id', 'movie_id', 'rating']].itertuples(index=False):
    R_test[user_id - 1, movie_id - 1] = rating

In [25]:
# One-hot encode the user and movie ids, then append the raw id and rating
# columns so the final layout is:
#   [user_id_* one-hot | movie_id_* one-hot | user_id | movie_id | rating]
# Only the named source columns are used, which makes the cell idempotent
# (re-running it on the already-widened frame rebuilds the same layout) and
# independent of whether `timestamp` is still present.
# dtype is pinned to uint8 so the encoding stays 0/1 integers on pandas
# versions whose get_dummies default is bool.
train_dummy = pd.get_dummies(
    train_df[['user_id', 'movie_id']],
    columns=['user_id', 'movie_id'],
    dtype=np.uint8,
)
train_df = pd.concat(
    [train_dummy, train_df[['user_id', 'movie_id', 'rating']]],
    axis=1,
)

In [46]:
# Peek at positional column 2623; the recorded output shows it is the raw
# `user_id` column that follows the one-hot block.
train_df.iloc[:,2623]

0          1
1          1
2          1
3          1
4          1
        ... 
90565    943
90566    943
90567    943
90568    943
90569    943
Name: user_id, Length: 90570, dtype: int64

In [36]:
# Preview the first rows of the widened training frame.
train_df.head()

Unnamed: 0,user_id_1,user_id_2,user_id_3,user_id_4,user_id_5,user_id_6,user_id_7,user_id_8,user_id_9,user_id_10,...,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682,user_id,movie_id,rating
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,5
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,2,3
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,3,4
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,4,3
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,5,3


In [48]:
class PandasDataset(Dataset):
    """Map-style dataset over a one-hot-widened ratings DataFrame.

    The frame must contain one-hot columns named ``user_id_<n>`` and
    ``movie_id_<n>`` together with the raw ``user_id``, ``movie_id`` and
    ``rating`` columns. Columns are located by name rather than by fixed
    position, so the class does not depend on how many users or movies
    were one-hot encoded.

    Each sample is a dict with keys:
        'user'       -- one-hot user row
        'item'       -- one-hot movie row
        'embed_user' -- raw ``user_id`` value as stored in the frame
        'embed_item' -- raw ``movie_id`` value as stored in the frame
        'y'          -- ``rating`` value

    Raises:
        KeyError: if ``user_id``, ``movie_id`` or ``rating`` is missing.
        ValueError: if no ``user_id_*`` or ``movie_id_*`` columns are present.
    """

    def __init__(self, dataset):
        super().__init__()
        user_cols = [c for c in dataset.columns if str(c).startswith('user_id_')]
        item_cols = [c for c in dataset.columns if str(c).startswith('movie_id_')]
        if not user_cols or not item_cols:
            raise ValueError(
                "dataset must contain one-hot 'user_id_*' and 'movie_id_*' columns"
            )

        self.user = dataset.loc[:, user_cols]
        self.item = dataset.loc[:, item_cols]
        self.embed_user = dataset.loc[:, 'user_id']
        self.embed_item = dataset.loc[:, 'movie_id']
        self.y = dataset.loc[:, 'rating']

        self.user_value = self.user.values
        self.item_value = self.item.values
        # Stored under the same name __getitem__ relies on; the previous
        # `embed_value` spelling made every sample lookup raise AttributeError.
        self.embed_user_value = self.embed_user.values
        self.embed_item_value = self.embed_item.values
        self.y_value = self.y.values

        # Wrap each array as a tensor once (from_numpy shares memory) instead
        # of re-wrapping all five arrays on every __getitem__ call.
        self._tensors = {
            'user': torch.from_numpy(self.user_value),
            'item': torch.from_numpy(self.item_value),
            'embed_user': torch.from_numpy(self.embed_user_value),
            'embed_item': torch.from_numpy(self.embed_item_value),
            'y': torch.from_numpy(self.y_value),
        }

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.y_value)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        return {key: tensor[idx] for key, tensor in self._tensors.items()}
        

In [49]:
# Mini-batch size used by the training loader.
batch_size = 1000

# Wrap the widened training frame and serve it in shuffled mini-batches.
train_dataset = PandasDataset(train_df)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [50]:
# Number of latent factors: the embedding dimension passed to nn.Embedding in SVD.
k = 10

In [12]:
class SVD(nn.Module):
    """Biased matrix-factorisation rating model.

    For each (user, item) pair in a batch the prediction is

        mu + b_u + b_i + <p_u, q_i>

    where ``mu`` is a learnable global offset, ``b_u`` / ``b_i`` are per-user
    and per-item biases, and ``p_u`` / ``q_i`` are latent-factor vectors.

    Args:
        num_users: number of users. Defaults to the notebook-level ``n_users``.
        num_items: number of items. Defaults to the notebook-level ``n_items``.
        num_factors: latent dimension. Defaults to the notebook-level ``k``.
        global_mean: initial value of ``mu``. Defaults to the mean of
            ``train_df['rating']``.
    """

    def __init__(self, num_users=None, num_items=None, num_factors=None, global_mean=None):
        super().__init__()

        # Fall back to the notebook globals so the original `SVD()` call keeps working.
        num_users = n_users if num_users is None else num_users
        num_items = n_items if num_items is None else num_items
        num_factors = k if num_factors is None else num_factors
        if global_mean is None:
            global_mean = train_df.loc[:, 'rating'].mean()

        # float32 so mu matches the dtype of the embedding parameters
        # (np.mean would otherwise yield a float64 parameter).
        self.mu = nn.Parameter(torch.tensor(float(global_mean), dtype=torch.float32))  # scalar

        self.user_embedding = nn.Embedding(num_users, num_factors)
        self.item_embedding = nn.Embedding(num_items, num_factors)

        # Per-id bias tables; start at zero so the initial prediction is
        # mu plus the factor interaction only.
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user, item, emb_user, emb_item):
        """Predict ratings for a batch of (user, item) pairs.

        Args:
            user: one-hot user rows. Accepted for signature compatibility and
                not used; biases are looked up by id instead.
            item: one-hot item rows. Accepted for signature compatibility and
                not used.
            emb_user: integer tensor of 1-based user ids, shape (batch,).
            emb_item: integer tensor of 1-based item ids, shape (batch,).

        Returns:
            (output, reg): ``output`` holds one predicted rating per pair,
            shape (batch,); ``reg`` is a scalar equal to the sum of the norms
            of the factor and bias tables, to be added to the loss by the caller.
        """
        # Ids are 1-based in the ratings frames (same convention as the
        # R[user_id-1, movie_id-1] indexing), embeddings are 0-based.
        user_idx = emb_user.long() - 1
        item_idx = emb_item.long() - 1

        embeded_user = self.user_embedding(user_idx)  # batch x k
        embeded_item = self.item_embedding(item_idx)  # batch x k

        # Row-wise dot product: one interaction score per (user, item) pair.
        interaction = (embeded_user * embeded_item).sum(dim=-1)  # batch

        user_bias = self.user_bias(user_idx).squeeze(-1)  # batch
        item_bias = self.item_bias(item_idx).squeeze(-1)  # batch

        output = self.mu + user_bias + item_bias + interaction

        reg = (
            torch.norm(self.user_embedding.weight)
            + torch.norm(self.item_embedding.weight)
            + torch.norm(self.user_bias.weight)
            + torch.norm(self.item_bias.weight)
        )

        return output, reg
    

In [13]:
# Instantiate the model with its no-argument constructor.
model = SVD()

In [None]:
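# To use embeddings, it seems we need to one-hot encode the ids and feed in a weight for each one.
# Building the full matrix and computing on it makes it hard to add the bias terms.
# To follow the formula literally, the matrix approach seems to be the right one.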

In [None]:
# Building the R matrix requires knowing the user and movie indices, but after embedding there is no way to tell which item a row refers to.
# Writing it like a Factorization Machine would probably work.