In [129]:
import import_ipynb
# from NonRegression_function import *
# from DataSet import *

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import nn, optim
import tqdm

from statistics import mean
import csv
from sklearn.feature_extraction.text import CountVectorizer



In [130]:
import pandas as pd
from sklearn import model_selection
# Apply and divide Train-Data and Test-Data.
df = pd.read_csv("ml-20m/ratings.csv")

In [131]:
# Variable 'X' is (userID, movieID) pair.
X = df[["userId", "movieId"]].values
Y = df[['rating']].astype(int)

Y = df[["rating"]].values

# df[["rating"]] = df[["rating"]].astype(int)
print("Format of Y : ", Y)
print("df : ", df)

# Divide Train-Data and Test-Data as 9 to 1.
train_X, test_X, train_Y, test_Y\
    = model_selection.train_test_split(X, Y, test_size=0.1)
# dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

Format of Y :  [[3.5]
 [3.5]
 [3.5]
 ...
 [3. ]
 [5. ]
 [2.5]]
df :            userId  movieId  rating   timestamp
0              1        2     3.5  1112486027
1              1       29     3.5  1112484676
2              1       32     3.5  1112484819
3              1       47     3.5  1112484727
4              1       50     3.5  1112484580
...          ...      ...     ...         ...
20000258  138493    68954     4.5  1258126920
20000259  138493    69526     4.5  1259865108
20000260  138493    69644     3.0  1260209457
20000261  138493    70286     5.0  1258126944
20000262  138493    71619     2.5  1255811136

[20000263 rows x 4 columns]


In [132]:
# Read 'CSV' file by using 'csv.DictReader'.
# Extract required-part.
with open("ml-20m/movies.csv", encoding="UTF8") as fp:
    reader = csv.DictReader(fp)
    def parse(d):
        movieId = int(d["movieId"])
        genres = d["genres"]
        return movieId, genres
    
    data = [parse(d) for d in reader]
    movieIds = [x[0] for x in data]
    genres = [x[1] for x in data]
    
    # Train CountVectorizer by matching data.
    cv = CountVectorizer(dtype="f4").fit(genres)
    num_genres = len(cv.get_feature_names())
    
    # Generate Tensor's dict that the key is movidId,
    # and the value equals to Bow.
    it = cv.transform(genres).toarray()
    it = (torch.tensor(g, dtype=torch.float32) for g in it)
    genre_dict = dict(zip(movieIds, it))
    

In [133]:
def first(xs):
    it = iter(xs)
    return next(it)


class MovieLensDataset(Dataset):
    def __init__(self, x, y, genres):
        assert len(x) == len(y)
        self.x = x
        self.y = y
        self.genres = genres
        
        # A dummy-data for movieId that does not have in Genre-dictionary.
        self.null_genre = torch.zeros_like(
            first(genres.values())
        )
    
    def __len__(self):
        return len(self.x)
    
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        # x = (userId, movieId)
        movieId = x[1]
        g = self.genres.get(movieId, self.null_genre)
        return x, y, g

In [134]:
train_dataset = MovieLensDataset(
    torch.tensor(train_X, dtype=torch.int64),
    torch.tensor(train_Y, dtype=torch.float32),
    genre_dict
)

test_dataset = MovieLensDataset(
    torch.tensor(test_X, dtype=torch.int64),
    torch.tensor(test_Y, dtype=torch.float32),
    genre_dict
)

In [135]:
train_loader = DataLoader(
    train_dataset, batch_size=1024, shuffle=True, num_workers=3
)
test_loader = DataLoader(
    test_dataset, batch_size=1024, num_workers=3
)

In [136]:
class NeuralMatrixFactorization2(nn.Module):
    def __init__(
        self, max_user, max_item, 
        num_genres, 
        user_k=10, item_k=10,
        hidden_dim=50
    ):
        super().__init__()
        self.user_emb = nn.Embedding(max_user, user_k, 0)
        self.item_emb = nn.Embedding(max_item, item_k, 0)
        self.mlp = nn.Sequential(
            # Increase dimension by 'num_genres'.
            nn.Linear(user_k + item_k + num_genres, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x, g):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)

In [137]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    
    # 'loader' returns Genre 'Bow'.
    for x, y, g in loader:
        x = x.to(device)
        g = g.to(device)
        ys.append(y)
        
        # Send not only 'userId' and 'movieId', but also 'Bow'.
        with torch.no_grad():
            ypred = net(x, g).to("cpu")
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score

In [138]:
net = NeuralMatrixFactorization2(
    max_user + 1, max_item + 1, num_genres
)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

In [144]:
net.to("cuda:0")
for epoch in range(5):
    loss_log = []
    net.train()
    
    for x, y, g in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        g = g.to("cuda:0")
        # o = net(x, g)
        
        loss = loss_f(x, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    
    net.eval()
    test_score = eval_net(net, test_loader, device="cuda:0")
    print(epoch, mean(loss_log, test_score.item(), flush=True))














  0%|          | 0/17579 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

BrokenPipeError: [Errno 32] Broken pipe

In [140]:
# Convert Tensor that the position equals to '1', 
# and the rest of tensor is '0'.
def make_genre_vector(i, max_len):
    g = torch.zeros(max_len)
    g[i] = 1
    return g

query_genres = [
    make_genre_vector(i, num_genres)
    for i in range(num_genres)
]
query_genres = torch.stack(query_genres, 1)

In [141]:
query = torch.stack([
    torch.empty(num_genres, dtype=torch.int64).fill_(100),
    torch.empty(num_genres, dtype=torch.int64).fill_(0)
], 1)

# Send to CPU(GPU).
query_genres = query_genres.to("cuda:0")
query = query.to("cuda:0")

In [142]:
# Calculating scores.
net(query, query_genres)