In [None]:
!pip install voila
!jupyter serverextension enable voila --sys-prefix

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import the dataset
import pandas as pd


In [None]:
# Open Colab's upload dialog; whatever files.upload() returns is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving rating.csv to rating.csv
Saving anime.csv to anime.csv


In [None]:
# Read both CSV files from /content; for the ratings file only the first
# three columns are loaded.
ratings_df = pd.read_csv('/content/rating.csv', usecols=[0, 1, 2])
anime_df = pd.read_csv('/content/anime.csv')

In [None]:
# Report the (rows, columns) shape of each dataframe.
print('The dimensions of anime dataframe are:', anime_df.shape)
print('The dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of anime dataframe are: (12294, 7)
The dimensions of ratings dataframe are: (7813737, 3)


In [None]:
# Preview the first rows of anime_df (one row per anime: id, name, genre, type, ...).
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
# Preview the first rows of ratings_df (columns: user_id, anime_id, rating).
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [None]:
# Reference dictionaries keyed by anime_id: id -> title and id -> genre string.
anime_by_id = anime_df.set_index('anime_id')
anime_names = anime_by_id['name'].to_dict()
anime_genres = anime_by_id['genre'].to_dict()

# Count the distinct users and anime that appear in the ratings table.
n_users = ratings_df['user_id'].unique().size
n_items = ratings_df['anime_id'].unique().size

# Summarise how large the dense user x anime matrix would be compared with
# the number of ratings actually present.
print("Number of unique users:", n_users)
print("Number of unique animes:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))

Number of unique users: 73515
Number of unique animes: 11200
The full rating matrix will have: 823368000 elements.
----------
Number of ratings: 7813737


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix-factorisation model.

    Every user index and every item index is looked up in its own embedding
    table; the score of a (user, item) pair is the dot product of the two
    embedding vectors.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        """Create the user and item embedding tables.

        Args:
            n_users: number of rows in the user embedding table.
            n_items: number of rows in the item embedding table.
            n_factors: width of each embedding vector (shared by both tables).
        """
        super().__init__()
        # Embedding tables act as lookup tables from an index to a factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive weights drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def _score(self, users, items):
        """Return the dot product of the user and item embeddings, pairwise."""
        # Reduce over the last (factor) axis so both batched and scalar
        # index tensors are supported.
        return (self.user_factors(users) * self.item_factors(items)).sum(dim=-1)

    def forward(self, data):
        """Score a batch of pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            Tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return self._score(users, items)

    def predict(self, user, item):
        """Score explicit user/item index tensors.

        The previous implementation forwarded two positional arguments to
        ``forward``, which only accepts one, so every call raised TypeError.

        Args:
            user: integer tensor of user indices.
            item: integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of scores with the same shape as ``user``.
        """
        return self._score(user, item)

In [None]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform data to machine learning readiness


class Loader(Dataset):
    """Torch dataset of (user index, anime index) -> rating samples.

    Raw user and anime IDs are re-mapped to contiguous 0-based indices (in
    order of first appearance) so they can be used directly as embedding rows.

    Attributes are exposed for translating between raw IDs and indices:
    ``userid2idx`` / ``idx2userid`` and ``movieid2idx`` / ``idx2movieid``.
    """

    def __init__(self, ratings=None):
        """Build the index mappings and the feature/target tensors.

        Args:
            ratings: DataFrame with ``user_id``, ``anime_id`` and ``rating``
                columns. When omitted, the notebook-level ``ratings_df`` is
                used, which matches the original zero-argument behaviour.
        """
        if ratings is None:
            ratings = ratings_df
        # Work on a copy so the caller's frame keeps its raw IDs.
        self.ratings = ratings.copy()

        # Distinct raw IDs, in order of first appearance.
        users = ratings['user_id'].unique()
        animes = ratings['anime_id'].unique()

        # Raw ID -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(animes)}

        # Contiguous index -> raw ID.
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace raw IDs by their indices. A dict-based map avoids calling a
        # Python lambda once per row.
        self.ratings['anime_id'] = ratings['anime_id'].map(self.movieid2idx)
        self.ratings['user_id'] = ratings['user_id'].map(self.userid2idx)

        # Select feature columns explicitly so column 0 is always the user
        # index and column 1 the anime index, regardless of frame layout.
        self.x = torch.tensor(self.ratings[['user_id', 'anime_id']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [None]:
# Training configuration: number of epochs and whether a GPU is available.
num_epochs = 2
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Build the factorisation model with 8 latent factors and show its layout.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the initial values of every trainable parameter.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is present.
model = model.cuda() if cuda else model


Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(73515, 8)
  (item_factors): Embedding(11200, 8)
)
user_factors.weight tensor([[0.0253, 0.0198, 0.0064,  ..., 0.0211, 0.0080, 0.0402],
        [0.0184, 0.0365, 0.0175,  ..., 0.0264, 0.0324, 0.0343],
        [0.0013, 0.0394, 0.0257,  ..., 0.0475, 0.0383, 0.0347],
        ...,
        [0.0337, 0.0315, 0.0401,  ..., 0.0153, 0.0198, 0.0044],
        [0.0369, 0.0159, 0.0354,  ..., 0.0312, 0.0200, 0.0342],
        [0.0494, 0.0101, 0.0431,  ..., 0.0392, 0.0473, 0.0429]])
item_factors.weight tensor([[0.0255, 0.0184, 0.0228,  ..., 0.0107, 0.0362, 0.0464],
        [0.0168, 0.0159, 0.0371,  ..., 0.0175, 0.0017, 0.0248],
        [0.0015, 0.0175, 0.0337,  ..., 0.0064, 0.0105, 0.0058],
        ...,
        [0.0084, 0.0314, 0.0444,  ..., 0.0144, 0.0212, 0.0126],
        [0.0112, 0.0076, 0.0181,  ..., 0.0078, 0.0199, 0.0455],
        [0.0409, 0.0150, 0.0102,  ..., 0.0487, 0.0210, 0.0358]])


In [None]:
# Mean-squared-error loss between predicted and actual ratings.
loss_fn = torch.nn.MSELoss()

In [None]:
# Adam optimizer over all model parameters, learning rate 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)



In [None]:
# Build the training dataset and wrap it in a shuffling loader that yields
# batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

In [None]:
# Train for `num_epochs` passes over the data and report the mean batch loss
# of each pass.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below must run on CPU too; previously it was nested under this `if`,
        # so a CPU run trained nothing and then divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model already returns one score per row, so no squeeze is
        # needed (squeezing would turn a size-1 batch into a 0-d tensor).
        outputs = model(x)
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/2 [00:00<?, ?it/s]

iter #0 Loss: 10.278628836514685
iter #1 Loss: 5.233722848141943


In [None]:
# Training has tuned the latent factors; print every trainable parameter and
# keep references to the weights. `uw` receives the first trainable parameter
# and `iw` any later one, `c` flags whether the first has been seen.
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[-0.0365, -0.0430, -0.0616,  ..., -0.0364, -0.0444, -0.0219],
        [-0.0248, -0.0065, -0.0257,  ..., -0.0168, -0.0106, -0.0085],
        [ 0.5261,  0.5620,  0.5501,  ...,  0.5693,  0.5608,  0.5593],
        ...,
        [ 0.0973,  0.0950,  0.1036,  ...,  0.0788,  0.0834,  0.0679],
        [ 0.6649,  0.6351,  0.6490,  ...,  0.6576,  0.6381,  0.6579],
        [ 0.1765,  0.1372,  0.1701,  ...,  0.1662,  0.1744,  0.1699]],
       device='cuda:0')
item_factors.weight tensor([[ 1.6151,  1.5643,  1.5486,  ...,  1.5444,  1.5321,  1.6319],
        [ 1.6614,  1.6330,  1.6778,  ...,  1.6395,  1.6507,  1.6585],
        [ 1.4712,  1.4978,  1.4872,  ...,  1.4767,  1.4775,  1.4810],
        ...,
        [-0.0552, -0.0322, -0.0191,  ..., -0.0491, -0.0424, -0.0509],
        [-0.0523, -0.0559, -0.0454,  ..., -0.0557, -0.0437, -0.0180],
        [ 0.1045,  0.0786,  0.0737,  ...,  0.1122,  0.0846,  0.0993]],
       device='cuda:0')


In [None]:
# Copy the item embedding weights to host memory as a NumPy array.
trained_anime_embeddings = model.item_factors.weight.data.cpu().numpy()

In [None]:
len(trained_anime_embeddings) # number of rows in the item embedding matrix

11200

In [None]:
from sklearn.cluster import KMeans
# Group the anime embedding vectors into 10 clusters; the fixed random_state
# keeps the clustering repeatable.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans = kmeans.fit(trained_anime_embeddings)



In [None]:
# For the first five clusters, list the ten most-rated anime.
# Ratings per anime are counted once up front; the previous version filtered
# the whole ratings table again for every single anime.
rating_counts = ratings_df['anime_id'].value_counts()

for cluster in range(5):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding row index back to the raw anime_id.
        movid = train_set.idx2movieid[movidx]
        rat_count = rating_counts.get(movid, 0)
        movs.append((anime_names.get(movid, "Unknown"), rat_count))
    # Most-rated titles first; keep the top ten.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\tName:", mov[0])
        print("\t\tNumber of Ratings:", mov[1])

Cluster #0
	Name: Tenkuu Danzai Skelter+Heaven
		Number of Ratings: 237
	Name: Hika Ryoujoku: Wana ni Hamatta Futari
		Number of Ratings: 33
	Name: Love Live! School Idol Project: μ&#039;s →NEXT LoveLive! 2014 - Endless Parade Encore Animation
		Number of Ratings: 29
	Name: Crayon Shin-chan Movie 13: Densetsu wo Yobu Buriburi 3 Pun Dai Shingeki
		Number of Ratings: 28
	Name: Gosenzo San&#039;e
		Number of Ratings: 27
	Name: Oyayubi Hime Monogatari
		Number of Ratings: 26
	Name: Kokka Kimigayo
		Number of Ratings: 26
	Name: Nami
		Number of Ratings: 25
	Name: Soushitsukyou
		Number of Ratings: 25
	Name: Dream Hazard: Akuma no Program
		Number of Ratings: 25
Cluster #1
	Name: School Days
		Number of Ratings: 12417
	Name: Dragon Ball GT
		Number of Ratings: 11079
	Name: Yosuga no Sora: In Solitude, Where We Are Least Alone.
		Number of Ratings: 6687
	Name: Kämpfer
		Number of Ratings: 6595
	Name: Naruto Movie 2: Dai Gekitotsu! Maboroshi no Chiteiiseki Dattebayo!
		Number of Ratings: 6296
