Here we want to make a simple recommender system to gauge the similarity between shows, users and to help us predict whether a user will enjoy a particular anime.

Import relevant libraries 

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as scp

In [2]:
anime = pd.read_csv('./anime.csv')

For this analysis we're only interest in finding recommendations for the TV category

In [3]:
anime = anime.loc[anime.type == 'TV']
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [4]:
rating = pd.read_csv('./rating.csv')
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


Join the two dataframes on the `anime_id` columns

In [5]:
merged = rating.merge(anime, left_on='anime_id', right_on='anime_id', suffixes=['_user', ''])
merged.head()

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
0,1,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [6]:
merged = merged[['user_id', 'name', 'rating_user']]
merged.head()

Unnamed: 0,user_id,name,rating_user
0,1,Naruto,-1
1,3,Naruto,8
2,5,Naruto,6
3,6,Naruto,-1
4,10,Naruto,-1


In [7]:
merged.rename(columns={'rating_user': 'user_rating'}, inplace=True)
merged.head()

Unnamed: 0,user_id,name,user_rating
0,1,Naruto,-1
1,3,Naruto,8
2,5,Naruto,6
3,6,Naruto,-1
4,10,Naruto,-1


In [8]:
merged = merged.loc[merged.user_rating != -1]
merged.head()

Unnamed: 0,user_id,name,user_rating
1,3,Naruto,8
2,5,Naruto,6
5,21,Naruto,8
6,28,Naruto,9
7,34,Naruto,9


In [9]:
merged.shape

(4364294, 3)

For computing reasons we're limiting the dataframe length to 10,000 users

In [10]:
merged = merged.loc[merged.user_id <= 10000]
merged.head()

Unnamed: 0,user_id,name,user_rating
1,3,Naruto,8
2,5,Naruto,6
5,21,Naruto,8
6,28,Naruto,9
7,34,Naruto,9


In [11]:
merged.shape

(582392, 3)

Let's split data into train and dev sets. Note, that it is not good train/dev/test split! But for simplicity we will do so.

In [12]:
from sklearn.model_selection import train_test_split

random_state = 314159

In [13]:
train_data, test_data = train_test_split(merged, test_size=0.1, random_state=random_state)
train_data.sort_index(inplace=True)
test_data.sort_index(inplace=True)
train_data.shape, test_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


((524152, 3), (58240, 3))

In [14]:
train_data.head()

Unnamed: 0,user_id,name,user_rating
1,3,Naruto,8
2,5,Naruto,6
5,21,Naruto,8
6,28,Naruto,9
7,34,Naruto,9


In [15]:
test_data.head()

Unnamed: 0,user_id,name,user_rating
33,118,Naruto,7
34,124,Naruto,7
35,125,Naruto,4
39,132,Naruto,9
43,148,Naruto,8


Create dictionaries from `user_id` to their index and vice versa.

In [16]:
user_to_idx = {user_id : idx for idx, user_id in enumerate(merged.user_id.unique())}
idx_to_user = {b : a for a, b in user_to_idx.items()}
len(user_to_idx)

9387

In [17]:
assert 9387 == len(user_to_idx)

Same for `name` represanting anime title.

In [18]:
anime_to_idx = {name : idx for idx, name in enumerate(merged.name.unique())}
idx_to_anime = {b : a for a, b in anime_to_idx.items()}

In [19]:
assert 2708 == len(anime_to_idx)

Let's map all into indices.

In [20]:
data = train_data.copy()
data['user_id'] = data['user_id'].apply(lambda x: user_to_idx[x])
data['name'] = data['name'].apply(lambda x: anime_to_idx[x])
data.head()

Unnamed: 0,user_id,name,user_rating
1,0,0,8
2,1,0,6
5,2,0,8
6,3,0,9
7,4,0,9


Our data needs to be in a sparse matrix format to be read by the following functions

In [21]:
def get_sparse(data):
    sp = scp.csr_matrix((data['user_rating'], (data['user_id'], data['name'])), shape = (len(user_to_idx), len(anime_to_idx)))
    return sp

In [22]:
train_sp = get_sparse(data)
train_sp

<9387x2708 sparse matrix of type '<class 'numpy.int64'>'
	with 524152 stored elements in Compressed Sparse Row format>

In [23]:
train_dense = train_sp.todense().A
train_dense

array([[8, 0, 0, ..., 0, 0, 0],
       [6, 1, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

http://www.machinelearning.ru/wiki/images/8/8a/Kitov-ML-eng-24-Recommender_systems.pdf

Baseline predictions:

$$
\begin{align*}
\hat{r}_{ui}=&\mu+b_u+b_i,\\
\mu=&\frac{1}{n}\sum_{u,i}r_{ui},\\
b_u=&\frac{1}{|I_u|+\alpha}\sum_{i\in{}I_u}(r_{ui} - \mu),\\
b_i=&\frac{1}{|U_i|+\beta}\sum_{u^\prime\in{}U_i}(r_{u^\prime{}i} - b_{u^\prime} - \mu),
\end{align*}
$$

where $\alpha=\beta\approx{}25$

Intuition:
- $b_u$ is how much higher user rates items than averag
- $b_i$ is how much item i is rated higher than average user ratin

In [24]:
alpha = beta = 25

In [25]:
import numpy.ma as ma

In [26]:
train_dense_masked = ma.masked_array(train_dense, mask=train_dense == 0, fill_value=0)
train_dense_masked

masked_array(
  data=[[8, --, --, ..., --, --, --],
        [6, 1, --, ..., --, --, --],
        [8, --, --, ..., --, --, --],
        ...,
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --]],
  mask=[[False,  True,  True, ...,  True,  True,  True],
        [False, False,  True, ...,  True,  True,  True],
        [False,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=0,
  dtype=int64)

In [27]:
mu = train_dense_masked.mean()
mu

7.882478746623117

In [28]:
bu = 1 / ((~train_dense_masked.mask).sum(1) + alpha) * (train_dense_masked - mu).sum(1)
bu.fill_value = 0
assert bu.shape == (len(user_to_idx), )
bu

masked_array(data=[-0.13464002632911784, -3.2868801654863473,
                   -0.3677503990118461, ..., 0.042981586668341655,
                   0.11981638913902837, 0.08144312512988013],
             mask=[False, False, False, ..., False, False, False],
       fill_value=0.0)

In [29]:
bi = 1 / ((~train_dense_masked.mask).sum(0) + beta) * ((train_dense_masked - mu - bu[:, np.newaxis]).sum(0))
bi.fill_value = 0
assert bi.shape == (len(anime_to_idx), )
bi

masked_array(data=[-0.09248308250339071, 0.37910272266658857,
                   -0.2704309573197862, ..., -0.00019449975560036615,
                   0.038267038705938096, -0.03865603821713883],
             mask=[False, False, False, ..., False, False, False],
       fill_value=0.0)

In [30]:
B = bu[..., None] + bi  # or np.outer(bu, bi)

In [31]:
baseline_predictions = mu + B
baseline_predictions

masked_array(
  data=[[7.655355637790608, 8.126941442960588, 7.477407762974213, ...,
         7.747644220538398, 7.786105758999938, 7.70918268207686],
        [4.503115498633379, 4.974701303803358, 4.325167623816983, ...,
         4.595404081381169, 4.633865619842708, 4.55694254291963],
        [7.42224526510788, 7.89383107027786, 7.244297390291485, ...,
         7.51453384785567, 7.552995386317209, 7.476072309394132],
        ...,
        [7.832977250788068, 8.304563055958047, 7.655029375971672, ...,
         7.925265833535859, 7.963727371997397, 7.88680429507432],
        [7.909812053258754, 8.381397858428734, 7.731864178442359, ...,
         8.002100636006546, 8.040562174468084, 7.963639097545006],
        [7.8714387892496065, 8.343024594419585, 7.693490914433211, ...,
         7.963727371997397, 8.002188910458935, 7.925265833535859]],
  mask=[[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., 

In [32]:
train_grby_user = train_data.groupby('user_id')
dev_grby_user = test_data.groupby('user_id')

In [87]:
# 16, 18, 19, 20
user = 16
train_grby_user.get_group(user)

Unnamed: 0,user_id,name,user_rating
279569,16,Angel Beats!,8
1072360,16,Death Note,9
2766645,16,Black★Rock Shooter (TV),8
2910593,16,Clannad,8
2958025,16,Clannad: After Story,9
3604181,16,Bokura ga Ita,9


In [88]:
dev_grby_user.get_group(user)

Unnamed: 0,user_id,name,user_rating
3175327,16,D.N.Angel,8


In [89]:
def line_on_baseline(user, top_k=10):
    user = user_to_idx[user]
    ratings = baseline_predictions[user].data
    anime_to_line_on = []
    for anime_idx in np.argsort(ratings)[::-1]:
        if ma.is_masked(ratings[anime_idx]):
            break
        anime_to_line_on.append((idx_to_anime[anime_idx], ratings[anime_idx]))
    return anime_to_line_on[:top_k]

In [90]:
line_on_baseline(16)

[('Gintama°', 9.517293177777349),
 ('Fullmetal Alchemist: Brotherhood', 9.415267932577757),
 ('Steins;Gate', 9.400509506491034),
 ('Gintama&#039;', 9.398658108811711),
 ('Gintama', 9.322705265060955),
 ('Hunter x Hunter (2011)', 9.284404659676813),
 ('Gintama&#039;: Enchousen', 9.271041719346364),
 ('Clannad: After Story', 9.202850703232642),
 ('Code Geass: Hangyaku no Lelouch R2', 9.118341885917944),
 ('Mushishi Zoku Shou 2nd Season', 9.116008279199168)]

This function calculates the weighted average of similar users to determine a potential rating for an input user and show

User-user:

$$
\hat{r}_{ui}=\frac{\sum_{u^\prime}sim(u,u^\prime)r_{u^\prime{}i}}{\sum_{u^\prime}|sim(u,u^\prime)|}
$$

or better

$$
\hat{r}_{ui}=\bar{r}_u+\frac{\sum_{u^\prime}sim(u,u^\prime)(r_{u^\prime{}i}-\bar{r}_{u^\prime})}{\sum_{u^\prime}|sim(u,u^\prime)|}
$$

Don't forget to normalize $R$, as

$$
R^\prime=r_{ui}-\mu-b_u-b_i.
$$

Useful heuristic: after $$\hat{r}_{u}\to{}\frac{\hat{r}_{u}}{\lVert{}r_u\rVert}$$ users, having rated few
items will impact items similarity more

These matrices show us the computed cosine similarity values between each user/user array pair and item/item array pair.

In [91]:
train_norm = train_dense_masked - baseline_predictions
train_norm

masked_array(
  data=[[0.344644362209392, --, --, ..., --, --, --],
        [1.4968845013666208, -3.974701303803358, --, ..., --, --, --],
        [0.5777547348921201, --, --, ..., --, --, --],
        ...,
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --]],
  mask=[[False,  True,  True, ...,  True,  True,  True],
        [False, False,  True, ...,  True,  True,  True],
        [False,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=0)

In [92]:
from sklearn.metrics.pairwise import cosine_similarity

In [93]:
user_similarities = cosine_similarity(train_norm)
item_similarities = cosine_similarity(train_norm.T)

In [94]:
assert user_similarities.shape == (len(user_to_idx), len(user_to_idx))
assert item_similarities.shape == (len(anime_to_idx), len(anime_to_idx))

This function will return the top 10 shows with the highest cosine similarity value

In [95]:
def top_anime(anime_name, top_k=10):
    anime_idx = anime_to_idx[anime_name]
    
    print('Top {} to: {}'.format(top_k, anime_name))
    for i, j in enumerate(np.argsort(item_similarities[anime_idx])[::-1][1:top_k + 1]):
        print('#{}: {}'.format(i + 1, idx_to_anime[j]))

In [96]:
top_anime('Naruto')

Top 10 to: Naruto
#1: Bleach
#2: Dragon Ball GT
#3: Dragon Ball Z
#4: Yu☆Gi☆Oh! Duel Monsters
#5: Death Note
#6: Sword Art Online
#7: Hello! Lady Lynn
#8: Fairy Tail
#9: Pokemon Advanced Generation
#10: Fullmetal Alchemist


In [97]:
train_norm

masked_array(
  data=[[0.344644362209392, --, --, ..., --, --, --],
        [1.4968845013666208, -3.974701303803358, --, ..., --, --, --],
        [0.5777547348921201, --, --, ..., --, --, --],
        ...,
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --]],
  mask=[[False,  True,  True, ...,  True,  True,  True],
        [False, False,  True, ...,  True,  True,  True],
        [False,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=0)

In [166]:
user_mean = [x.mean() for x in train_dense_masked]

def predicted_rating(item_name, user_id, N=50):
    item_idx = anime_to_idx[item_name]
    user_idx = user_to_idx[user_id]
    similar_users = np.argsort(user_similarities[user_idx])[::-1][1:N+1]
    ans = user_mean[user_idx]
    numerator = 0
    denominator = 0
    for other in similar_users:
        numerator += user_similarities[user_idx][other] * (baseline_predictions.data[other][item_idx] - user_mean[other])
        denominator += np.abs(user_similarities[user_idx][other])
    return ans + numerator / denominator
#     return  + train_norm[sim_users][]

In [167]:
predicted_rating('Gintama°', 16)

9.65005049663187

In [168]:
from tqdm import tqdm

def line_on_user_user(user, top_k=10, N=50):
    ratings = []
    for anime in anime_to_idx:
        ratings.append((anime_to_idx[anime], predicted_rating(anime, user, N)))
    ratings.sort()
    ratings = [x[1] for x in ratings]
    anime_to_line_on = []
    for anime_idx in np.argsort(ratings)[::-1][:top_k]:
        anime_to_line_on.append((idx_to_anime[anime_idx], ratings[anime_idx]))
            
    return anime_to_line_on

In [169]:
train_grby_user.get_group(user)

Unnamed: 0,user_id,name,user_rating
279569,16,Angel Beats!,8
1072360,16,Death Note,9
2766645,16,Black★Rock Shooter (TV),8
2910593,16,Clannad,8
2958025,16,Clannad: After Story,9
3604181,16,Bokura ga Ita,9


In [170]:
dev_grby_user.get_group(user)

Unnamed: 0,user_id,name,user_rating
3175327,16,D.N.Angel,8


In [171]:
top_k = 10
line_on_user_user(16, top_k=top_k)  # with N=1000 ok!

[('Gintama°', 9.65005049663187),
 ('Fullmetal Alchemist: Brotherhood', 9.548025251432279),
 ('Steins;Gate', 9.533266825345555),
 ('Gintama&#039;', 9.531415427666232),
 ('Gintama', 9.455462583915477),
 ('Hunter x Hunter (2011)', 9.417161978531334),
 ('Gintama&#039;: Enchousen', 9.403799038200885),
 ('Clannad: After Story', 9.335608022087161),
 ('Code Geass: Hangyaku no Lelouch R2', 9.251099204772464),
 ('Mushishi Zoku Shou 2nd Season', 9.248765598053689)]

Exercise: implement item-item approach.

Suppose that all users have some latent state and items too, i. e.

$$
R=UI^T,
$$

where $R\in\mathbb{R}^{u\times{}i}$, $U\in\mathbb{R}^{u\times{}k}$ and $I\in\mathbb{R}^{i\times{}k}$ with some small $k$. We could do this factorization using simple `SVD`.

In [172]:
u, s, vt = scp.linalg.svds(scp.csr_matrix(train_norm.data), k=100)
u = u.dot(np.diag(s))

In [173]:
def line_on_svd(user, top_k=10):
    ratings = []
    for anime in tqdm(anime_to_idx):
        rat = np.sum(u[user_to_idx[user]]*vt[:, anime_to_idx[anime]])
#         rat *= data_norm_fro
#         rat += bu[user_to_idx[user]]
#         rat += bi[anime_to_idx[anime]]
#         rat += mu
        ratings.append(rat)

    anime_to_line_on = []
    for anime_idx in np.argsort(ratings)[::-1]:
        if ma.is_masked(ratings[anime_idx]):
            continue
        if len(anime_to_line_on) == top_k: break
        if idx_to_anime[anime_idx] not in train_data.groupby('user_id').get_group(user).name.values:
            anime_to_line_on.append((idx_to_anime[anime_idx], ratings[anime_idx]))
            
    return anime_to_line_on

In [188]:
user = 16
train_grby_user.get_group(user)

Unnamed: 0,user_id,name,user_rating
279569,16,Angel Beats!,8
1072360,16,Death Note,9
2766645,16,Black★Rock Shooter (TV),8
2910593,16,Clannad,8
2958025,16,Clannad: After Story,9
3604181,16,Bokura ga Ita,9


In [189]:
dev_grby_user.get_group(user)

Unnamed: 0,user_id,name,user_rating
3175327,16,D.N.Angel,8


In [190]:
line_on_svd(user, top_k=top_k)



  0%|          | 0/2708 [00:00<?, ?it/s][A[A

100%|██████████| 2708/2708 [00:00<00:00, 110511.74it/s][A[A

[('Hanamaru Youchien', 0.17287213275212127),
 ('Koutetsushin Jeeg', 0.11483900690167936),
 ('Seikai no Monshou', 0.06673089066671181),
 ('Abenobashi Mahou☆Shoutengai', 0.06084253707577783),
 ('12-sai.: Chicchana Mune no Tokimeki', 0.05568670233659734),
 ('Rokujouma no Shinryakusha!? (TV)', 0.052266631880727464),
 ('Samurai Champloo', 0.05174380182175364),
 ('Arrow Emblem Grand Prix no Taka', 0.050091287319365824),
 ('Hipira-kun', 0.0455988473647383),
 ('Kamen no Maid Guy', 0.0449844947969244)]

In [191]:
line_on_user_user(user, top_k=top_k)

[('Gintama°', 9.65005049663187),
 ('Fullmetal Alchemist: Brotherhood', 9.548025251432279),
 ('Steins;Gate', 9.533266825345555),
 ('Gintama&#039;', 9.531415427666232),
 ('Gintama', 9.455462583915477),
 ('Hunter x Hunter (2011)', 9.417161978531334),
 ('Gintama&#039;: Enchousen', 9.403799038200885),
 ('Clannad: After Story', 9.335608022087161),
 ('Code Geass: Hangyaku no Lelouch R2', 9.251099204772464),
 ('Mushishi Zoku Shou 2nd Season', 9.248765598053689)]

Now we will use the library [`surprise`](https://surprise.readthedocs.io/en/stable/).

In [180]:
!pip install surprise
from surprise import Reader, Dataset

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise (from surprise)
[?25l  Downloading https://files.pythonhosted.org/packages/4d/fc/cd4210b247d1dca421c25994740cbbf03c5e980e31881f10eaddf45fdab0/scikit-surprise-1.0.6.tar.gz (3.3MB)
[K    100% |████████████████████████████████| 3.3MB 3.5MB/s ta 0:00:01
[?25hCollecting joblib>=0.11 (from scikit-surprise->surprise)
[?25l  Downloading https://files.pythonhosted.org/packages/0d/1b/995167f6c66848d4eb7eabc386aebe07a1571b397629b2eac3b7bebdc343/joblib-0.13.0-py2.py3-none-any.whl (276kB)
[K    100% |████████████████████████████████| 276kB 2.2MB/s ta 0:00:01
Building wheels for collected packages: scikit-surprise
  Running setup.py bdist_wheel for scikit-surprise ... [?25ldone
[?25h  Stored in directory: /home/kikos/.cache/pip/wheels/ec/c0/55/3a28eab06b53c220015063ebbdb81213cd3dcbb72c088251ec


In [181]:
# to load dataset from pandas df, we need `load_fromm_df` method in surprise lib
ratings_dict = {'itemID': list(train_data.name),
                'userID': list(train_data.user_id),
                'rating': list(train_data.user_rating)}
df = pd.DataFrame(ratings_dict)
df.head()

Unnamed: 0,itemID,rating,userID
0,Naruto,8,3
1,Naruto,6,5
2,Naruto,8,21
3,Naruto,9,28
4,Naruto,9,34


In [182]:
# A reader is still needed but only the rating_scale param is required.
# The Reader class is used to parse a file containing ratings.
reader = Reader(rating_scale=(1, 10))

In [183]:
# The columns must correspond to user id, item id and ratings (in that order).
data_sur = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

In [184]:
from collections import defaultdict

from surprise import SVD

# see https://surprise.readthedocs.io/en/stable/FAQ.html
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [185]:
# First train an SVD algorithm on the movielens dataset.
# data = Dataset.load_builtin('ml-100k')
trainset = data_sur.build_full_trainset()
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2c78eb4860>

In [186]:
user = 20
train_data.groupby('user_id').get_group(user)

Unnamed: 0,user_id,name,user_rating
275264,20,Seikon no Qwaser,10
472881,20,Seikon no Qwaser II,10
1550869,20,Mirai Nikki (TV),10
1773309,20,InuYasha,10
2022016,20,K-On!,10
3214361,20,Gravitation,9
3232257,20,Ouran Koukou Host Club,10
3834619,20,Maburaho,8
3837790,20,Peach Girl,8


In [187]:
dev_data.groupby('user_id').get_group(user)

NameError: name 'dev_data' is not defined

In [None]:
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test([(user, anime, 0) for anime in anime_to_idx])

In [None]:
top_k = 10
line_on_user_user(user, top_k)

In [None]:
line_on_svd(user, top_k)

In [None]:
top_n = get_top_n(predictions, n=top_k)
top_n[user]

Now we are going to do all stuff with NNs.

In [None]:
import keras.layers as L
from keras.models import Model
from keras.optimizers import Adam
import keras.backend as K

In [None]:
def build_animenet(user_dim, item_dim, latent_dim=32):
    input_user = L.Input(shape=(1,), name='user')
    input_item = L.Input(shape=(1,), name='item')
    emb_user = L.Embedding(user_dim, latent_dim)
    emb_item = L.Embedding(item_dim, latent_dim)

    user = emb_user(input_user)
    item = emb_item(input_item)
    
    dot = L.Multiply()([user, item])
    dot = L.Lambda(lambda x: K.sum(x, axis=-1), name='rating')(dot)
    
    model = Model(inputs=[input_user, input_item], outputs=[dot])
    model.compile(optimizer=Adam(), loss=['mse'])
    
    return model

In [None]:
animenet = build_animenet(len(user_to_idx), len(anime_to_idx))
animenet.summary()

In [None]:
# data_net = scp.coo_matrix(train_norm.data)
# df_net = pd.DataFrame(data=np.vstack([data_net.row, data_net.col, data_net.data]).T,
#                       columns=['user_id', 'name', 'user_rating'])
# df_net

In [None]:
def get_batch(data):
    user = np.array([user_to_idx[u] for u in data['user_id'].values])
    item = np.array([anime_to_idx[i] for i in data['name'].values])
    # user = data['user_id'].values
    # item = data['name'].values
    rating = data['user_rating'].values
    
    return {'user': user, 'item': item, 'rating': rating}

In [None]:
def batch_generator(data, shuffle=True, batch_size=32):
    while True:
        inds = np.arange(len(data))
        if shuffle:
            np.random.shuffle(inds)
            
        for start in range(0, len(data), batch_size):
            batch = get_batch(data.iloc[inds[start:start + batch_size]])

            yield batch, batch

In [None]:
epochs = 1
batch_size = 32

In [None]:
animenet.fit_generator(batch_generator(train_data, batch_size=batch_size),
                       steps_per_epoch=(len(train_data) - 1)//batch_size + 1,
                       epochs=epochs,
                       validation_data=batch_generator(dev_data, batch_size=batch_size),
                       validation_steps=(len(dev_data) - 1)//batch_size + 1)

In [None]:
def line_on_net(user, top_k=10):
    ratings = []
    for anime in tqdm(anime_to_idx):
        rat = animenet.predict_on_batch({'user': np.array([user_to_idx[3]]),
                                         'item': np.array([anime_to_idx['Naruto']])})[0][0]
        ratings.append(rat)

    anime_to_line_on = []
    for anime_idx in np.argsort(ratings)[::-1]:
        if ma.is_masked(ratings[anime_idx]):
            continue
        if len(anime_to_line_on) == top_k: break
        if idx_to_anime[anime_idx] not in train_data.groupby('user_id').get_group(user).name.values:
            anime_to_line_on.append((idx_to_anime[anime_idx], ratings[anime_idx]))
            
    return anime_to_line_on

In [None]:
line_on_net(16)

In [None]:
line_on_net(20)