In [1]:
import import_ipynb
from NonRegression_function import *

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import nn, optim
import tqdm

import csv
from sklearn.feature_extraction.text import CountVectorizer

importing Jupyter notebook from NonRegression_function.ipynb
importing Jupyter notebook from DataSet.ipynb


In [3]:
# Load the movie catalogue with `csv.DictReader` and build, for every movie,
# a bag-of-words (BoW) genre vector stored as a float32 tensor.
def parse(d):
    """Extract ``(movieId, genres)`` from one CSV row dict.

    ``movieId`` is converted to ``int``; ``genres`` is kept as the raw string.
    """
    return int(d["movieId"]), d["genres"]


# `newline=""` is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly. The encoding is explicit so the result does
# not depend on the platform locale (assumes the file is UTF-8 -- TODO confirm).
with open("ml-20m/movies.csv", encoding="utf-8", newline="") as fp:
    reader = csv.DictReader(fp)
    # Materialise every row while the file is open; nothing below needs `fp`.
    data = [parse(d) for d in reader]

movieIds = [x[0] for x in data]
genres = [x[1] for x in data]

# Fit the CountVectorizer vocabulary on the genre strings.
# NOTE(review): the default tokenizer splits on non-word characters, so a
# hyphenated genre name would become two tokens -- confirm this is acceptable.
cv = CountVectorizer(dtype="f4").fit(genres)
# `vocabulary_` exists on every scikit-learn version, whereas
# `get_feature_names()` was removed in scikit-learn 1.2.
num_genres = len(cv.vocabulary_)

# Build a dict whose key is the movieId and whose value is that movie's
# BoW genre vector (one dense row of the transformed matrix).
bow = cv.transform(genres).toarray()
genre_dict = {
    movie_id: torch.tensor(row, dtype=torch.float32)
    for movie_id, row in zip(movieIds, bow)
}
    

In [4]:
def first(xs):
    """Return the first element produced by iterating over ``xs``.

    Raises ``StopIteration`` when ``xs`` yields nothing.
    """
    return next(iter(xs))


class MovieLensDataset(Dataset):
    """Rating samples paired with the genre vector of the rated movie.

    Each item is a triple ``(x, y, g)`` where ``x`` is one row of the feature
    container (laid out as ``(userId, movieId)``), ``y`` is the matching
    target, and ``g`` is the genre tensor looked up by movieId.

    Args:
        x: indexable container of feature rows; ``x[i][1]`` must be the
            movieId of sample ``i``.
        y: indexable container of targets, same length as ``x``.
        genres: non-empty mapping from integer movieId to genre tensor.

    Raises:
        AssertionError: if ``x`` and ``y`` differ in length.
        ValueError: if ``genres`` is empty (no template for the fallback).
    """

    def __init__(self, x, y, genres):
        assert len(x) == len(y), "x and y must contain the same number of samples"
        if not genres:
            raise ValueError("genres must contain at least one entry")
        self.x = x
        self.y = y
        self.genres = genres

        # Fallback for movieIds missing from the genre dictionary: an all-zero
        # tensor with the same shape/dtype as the existing genre vectors.
        self.null_genre = torch.zeros_like(next(iter(genres.values())))

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(x, y, genre_vector)`` for the sample at ``idx``."""
        x = self.x[idx]
        y = self.y[idx]
        # x = (userId, movieId)
        # Convert to a plain int before the lookup: a tensor element hashes by
        # object identity, so using it directly as a key never matches the
        # integer keys and every sample would fall back to `null_genre`.
        movieId = int(x[1])
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

In [6]:
# Wrap the train and test splits as datasets. Features are cast to int64 and
# targets to float32; both datasets share the same genre lookup table.
# NOTE(review): train_X / train_Y / test_X / test_Y are not defined in this
# notebook's visible cells -- presumably they come from the star import above.
train_dataset = MovieLensDataset(
    x=torch.tensor(train_X, dtype=torch.int64),
    y=torch.tensor(train_Y, dtype=torch.float32),
    genres=genre_dict,
)

test_dataset = MovieLensDataset(
    x=torch.tensor(test_X, dtype=torch.int64),
    y=torch.tensor(test_Y, dtype=torch.float32),
    genres=genre_dict,
)

In [7]:
# Mini-batch loaders: training batches are reshuffled every epoch, evaluation
# batches keep the dataset order. Both use 4 worker processes.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1024,
    shuffle=True,
    num_workers=4,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=1024,
    shuffle=False,
    num_workers=4,
)

In [8]:
class NeuralMatrixFactorization2(nn.Module):
    def __init__(
        self, max_user, max_item, 
        num_genres, 
        user_k=10, item_k=10,
        hidden_dim=50
    ):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            # Increase dimension by 'num_genres'.
            nn.LInear(user_k + item_k + num_genres, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )