# Goodbooks-10k Collaborative Genre Tagging
  
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/tf-goodbooks.ipynb)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

%matplotlib inline

## Load rating data

In [2]:
path = 'data/goodbooks-10k/'  # if the files are local
df = pd.read_csv(path+'ratings.csv')

In [3]:
df.head()

Unnamed: 0,user_id,book_id,rating
0,0,257,5
1,1,4080,4
2,1,259,5
3,1,9295,5
4,1,2317,3


Unnamed: 0,user_id,book_id,rating
0,0,257,5
1,1,4080,4
2,1,259,5
3,1,9295,5
4,1,2317,3


In [4]:
df.shape

(5976479, 3)

In [5]:
print('Number of users:', df.user_id.nunique())
print('Number of items:', df.book_id.nunique())
print("Min item rating:", df.rating.min())
print("Max item rating:", df.rating.max())
print("Mean item rating:", df.rating.mean())

Number of users: 53424
Number of items: 10000
Min item rating: 1
Max item rating: 5
Mean item rating: 3.9198655261735214


## Load book metadata
  - remove 10% as holdout test set

In [33]:
books = pd.read_csv(path+'books.csv').fillna(0)
books.iloc[:,-10:] = books.iloc[:,-10:].astype('int')
books.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
0,0,The Hunger Games,eng,0,0,1,1,1,1,1,0,1,1
1,1,Harry Potter and the Philosopher's Stone,eng,0,0,1,0,1,1,1,1,0,1
2,2,Twilight,en-US,0,0,0,1,1,1,1,0,0,1
3,3,To Kill a Mockingbird,eng,1,1,0,1,1,0,1,1,0,1
4,4,The Great Gatsby,eng,1,1,0,1,0,0,1,0,1,1


Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
0,0,The Hunger Games,eng,0,0,1,1,1,1,1,0,1,1
1,1,Harry Potter and the Philosopher's Stone,eng,0,0,1,0,1,1,1,1,0,1
2,2,Twilight,en-US,0,0,0,1,1,1,1,0,0,1
3,3,To Kill a Mockingbird,eng,1,1,0,1,1,0,1,1,0,1
4,4,The Great Gatsby,eng,1,1,0,1,0,0,1,0,1,1


In [14]:
from sklearn.model_selection import train_test_split

In [34]:
books, books_holdout = train_test_split(books, test_size=.1, random_state=42)

---
# Create baseline features
For each user, calculate average user bias - the average difference between the user's rating and the movie's average rating:

$$b_{u} = \dfrac{\sum_{j=1}^{n_u} (r_{uj} - \mu_i)}{n_u}$$

For each item, calculate the difference between its average rating and the average rating of all movies:

$$b_{i} = \dfrac{\sum_{k=1}^{n_i} (r_{ki})}{n_i} - \mu$$

Then, for each interaction, calculate the combined bias:

$$b_{ui} = \dfrac{b_u + b_i}{2}$$

In [16]:
from CGT import get_baseline
?get_baseline

[1;31mSignature:[0m [0mget_baseline[0m[1;33m([0m[0mdf[0m[1;33m,[0m [0mtrain_index[0m[1;33m,[0m [0mtest_index[0m[1;33m,[0m [0muser_col[0m[1;33m,[0m [0mitem_col[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Calculate baseline features from an explicit ratings dataset. Receives a dataframe
and returns train and test splits with added bias column and mean rating value.
User and item biases are calculated as average difference from global mean rating.
Baseline factors are only calculated from training observations, with users or
items that do not appear in train receiving the global average as default.

Args:
    df          : explicit ratings dataframe with columns userId, movieId and rating
    train_index : train index splits taken from KFold.splits()
    test_index  : test index splits taken from KFold.splits()
    
Returns:
    train, test : train/test splits of df, with added bias column
    global_mean : average rating of all training observat

---
# CGT model
**To do**:
  - Can we avoid re-training rating model on CV fold?
  - Create a grid search function / class

In [17]:
from CGT import compile_multigenre_model
?compile_multigenre_model

[1;31mSignature:[0m
[0mcompile_multigenre_model[0m[1;33m([0m[1;33m
[0m    [0mn_items[0m[1;33m,[0m[1;33m
[0m    [0mn_users[0m[1;33m,[0m[1;33m
[0m    [0mmin_rating[0m[1;33m,[0m[1;33m
[0m    [0mmax_rating[0m[1;33m,[0m[1;33m
[0m    [0mmean_rating[0m[1;33m,[0m[1;33m
[0m    [0mn_genres[0m[1;33m,[0m[1;33m
[0m    [0mn_latent[0m[1;33m,[0m[1;33m
[0m    [0mn_hidden_1[0m[1;33m,[0m[1;33m
[0m    [0mn_hidden_2[0m[1;33m,[0m[1;33m
[0m    [0mactivation[0m[1;33m=[0m[1;34m'relu'[0m[1;33m,[0m[1;33m
[0m    [0mdropout_1[0m[1;33m=[0m[1;36m0.2[0m[1;33m,[0m[1;33m
[0m    [0mdropout_2[0m[1;33m=[0m[1;36m0.2[0m[1;33m,[0m[1;33m
[0m    [0mrandom_seed[0m[1;33m=[0m[1;36m42[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mFile:[0m      c:\users\jleslie\documents\deep-collaborative-filtering\cgt.py
[1;31mType:[0m      function


# Classification report

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [19]:
books.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
4896,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,1,1,0
4782,4782,Метро 2033,ger,1,0,1,0,1,1,1,1,0,0
1496,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,0,0,1
1957,1957,Destined,eng,0,0,0,0,0,1,1,0,1,1
9171,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,0,0,0


Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
4896,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,1,1,0
4782,4782,Метро 2033,ger,1,0,1,0,1,1,1,1,0,0
1496,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,0,0,1
1957,1957,Destined,eng,0,0,0,0,0,1,1,0,1,1
9171,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,0,0,0


In [20]:
books.shape

(9000, 13)

In [21]:
# get baseline predictors for full dataset
train, _, _ = get_baseline(df, df.index, df.index, 'user_id', 'book_id')

# compile both models
model1, model2 = compile_multigenre_model(
    n_items=df.book_id.nunique(),
    n_users=df.user_id.nunique(),
    min_rating=df.rating.min(),
    max_rating=df.rating.max(),
    mean_rating=df.rating.mean(),
    n_genres=10,
    n_latent=200, 
    n_hidden_1=100,
    n_hidden_2=100,
    dropout_1=.25,
    dropout_2=.15
)

In [22]:
# train rating model
ratings = model1.fit(
    x=[train.user_id.values, train.book_id.values, train.bias.values],
    y=train.rating.values, 
    batch_size=2048,
    epochs=6,
    verbose=1,
    validation_split=.2
)

Train on 4781183 samples, validate on 1195296 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [23]:
# train genre model
genres = model2.fit(
    books.book_id.values, books.iloc[:,-10:].values,  # multi label
    batch_size=512, 
    epochs=6,
    validation_split=.2)

Train on 7200 samples, validate on 1800 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [24]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [25]:
y_test = books_holdout.iloc[:,-10:]
y_score = pd.DataFrame(model2.predict(books_holdout.book_id.values), columns=y_test.columns)
y_pred = y_score.round().astype('int')

In [26]:
print(classification_report(y_test, y_pred, target_names=y_test.columns))

               precision    recall  f1-score   support

        adult       0.66      0.99      0.79       660
adult-fiction       0.59      0.59      0.59       495
    adventure       0.55      0.30      0.38       342
    book-club       0.70      0.48      0.57       481
 contemporary       0.63      0.70      0.66       537
      fantasy       0.51      0.47      0.49       372
      fiction       0.90      1.00      0.95       901
      mystery       0.57      0.32      0.41       368
      romance       0.58      0.51      0.54       410
  young-adult       0.58      0.07      0.12       363

    micro avg       0.68      0.63      0.65      4929
    macro avg       0.63      0.54      0.55      4929
 weighted avg       0.66      0.63      0.61      4929
  samples avg       0.68      0.61      0.61      4929



## Save predictions

In [35]:
# add test flag
books['test'] = 0
books_holdout['test'] = 1

# combine train and test together
books = pd.concat((books, books_holdout), ignore_index=True).sort_values('book_id')

In [59]:
pred_df = pd.DataFrame(
    data=model2.predict(books.book_id.values).round().astype('int'),
    columns=[col+'_pred' for col in list(books.columns[-11:-1])]
)

pred_df = pd.concat(
    (books, pred_df),
    axis=1
)

pred_df.to_csv(path+'predictions.csv', index=False)

pred_df.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,...,adult_pred,adult-fiction_pred,adventure_pred,book-club_pred,contemporary_pred,fantasy_pred,fiction_pred,mystery_pred,romance_pred,young-adult_pred
0,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,...,1,0,1,1,0,0,1,0,0,0
1,4782,Метро 2033,ger,1,0,1,0,1,1,1,...,1,0,1,0,0,1,1,0,0,1
2,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,...,1,1,0,0,1,0,1,0,1,0
3,1957,Destined,eng,0,0,0,0,0,1,1,...,1,0,0,1,0,0,1,0,0,0
4,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,...,1,1,0,1,1,0,1,0,0,0


Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,...,adult_pred,adult-fiction_pred,adventure_pred,book-club_pred,contemporary_pred,fantasy_pred,fiction_pred,mystery_pred,romance_pred,young-adult_pred
0,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,...,1,0,1,1,0,0,1,0,0,0
1,4782,Метро 2033,ger,1,0,1,0,1,1,1,...,1,0,1,0,0,1,1,0,0,1
2,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,...,1,1,0,0,1,0,1,0,1,0
3,1957,Destined,eng,0,0,0,0,0,1,1,...,1,0,0,1,0,0,1,0,0,0
4,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,...,1,1,0,1,1,0,1,0,0,0


## Save embeddings

In [61]:
emb = model1.get_layer('embedding')

emb_df = pd.concat(
    (books, pd.DataFrame(emb.embeddings.numpy(), columns=['e'+str(i) for i in range(200)])),
    axis=1
)

emb_df.to_csv(path+'embeddings.csv', index=False)

emb_df.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,...,e190,e191,e192,e193,e194,e195,e196,e197,e198,e199
0,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,...,-0.138427,0.06235,-0.152912,-0.256688,0.055669,0.114496,-0.061739,0.083857,0.058608,0.071826
1,4782,Метро 2033,ger,1,0,1,0,1,1,1,...,0.070695,0.104773,0.066383,-0.160181,0.051208,0.237654,-0.244884,0.173077,0.002489,0.138566
2,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,...,0.09313,-0.010829,0.012914,0.067064,-0.035031,-0.128536,0.010186,0.096474,0.152025,0.008786
3,1957,Destined,eng,0,0,0,0,0,1,1,...,-0.18255,0.100834,0.007463,-0.00472,-0.11711,0.275701,0.002035,-0.161463,-0.189617,-0.171345
4,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,...,-0.041603,-0.034576,-0.027677,-0.123346,-0.071159,0.372377,-0.038627,0.012957,-0.207651,-0.242619


Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,...,e190,e191,e192,e193,e194,e195,e196,e197,e198,e199
0,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,...,-0.138427,0.06235,-0.152912,-0.256688,0.055669,0.114496,-0.061739,0.083857,0.058608,0.071826
1,4782,Метро 2033,ger,1,0,1,0,1,1,1,...,0.070695,0.104773,0.066383,-0.160181,0.051208,0.237654,-0.244884,0.173077,0.002489,0.138566
2,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,...,0.09313,-0.010829,0.012914,0.067064,-0.035031,-0.128536,0.010186,0.096474,0.152025,0.008786
3,1957,Destined,eng,0,0,0,0,0,1,1,...,-0.18255,0.100834,0.007463,-0.00472,-0.11711,0.275701,0.002035,-0.161463,-0.189617,-0.171345
4,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,...,-0.041603,-0.034576,-0.027677,-0.123346,-0.071159,0.372377,-0.038627,0.012957,-0.207651,-0.242619
