# Goodbooks-10k Collaborative Genre Tagging
  
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/tf-goodbooks.ipynb)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

%matplotlib inline

## Load rating data

In [2]:
# Directory that holds the goodbooks-10k files (relative path, for a local copy).
path = 'data/goodbooks-10k/'
# Explicit user/book ratings, one row per rating.
ratings_file = path + 'ratings.csv'
df = pd.read_csv(ratings_file)

In [3]:
# Preview the first rows of the ratings frame.
df.head()

Unnamed: 0,user_id,book_id,rating
0,0,257,5
1,1,4080,4
2,1,259,5
3,1,9295,5
4,1,2317,3


Unnamed: 0,user_id,book_id,rating
0,0,257,5
1,1,4080,4
2,1,259,5
3,1,9295,5
4,1,2317,3


In [4]:
# (rows, columns) of the ratings frame.
df.shape

(5976479, 3)

In [5]:
# Headline statistics of the ratings data, printed one per line as "label value".
summary_stats = [
    ('Number of users:', df.user_id.nunique()),
    ('Number of items:', df.book_id.nunique()),
    ("Min item rating:", df.rating.min()),
    ("Max item rating:", df.rating.max()),
    ("Mean item rating:", df.rating.mean()),
]
for label, value in summary_stats:
    print(label, value)

Number of users: 53424
Number of items: 10000
Min item rating: 1
Max item rating: 5
Mean item rating: 3.9198655261735214


## Load book metadata
  - remove 10% as holdout test set

In [6]:
# Book metadata is tab-separated; the file's first column is used as the index.
books_file = path + 'books.tsv'
books = pd.read_csv(books_file, sep='\t', index_col=0)
books.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
0,0,The Hunger Games,eng,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
1,1,Harry Potter and the Philosopher's Stone,eng,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
2,2,Twilight,en-US,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,3,To Kill a Mockingbird,eng,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
4,4,The Great Gatsby,eng,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
0,0,The Hunger Games,eng,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
1,1,Harry Potter and the Philosopher's Stone,eng,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
2,2,Twilight,en-US,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,3,To Kill a Mockingbird,eng,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
4,4,The Great Gatsby,eng,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 10% of the books as a test set (fixed random_state for a repeatable split).
# NOTE: `books` is rebound to the remaining 90%, so this cell is not idempotent --
# re-running it without reloading `books` splits the already-reduced frame again.
books, books_holdout = train_test_split(books, test_size=.1, random_state=42)

---
# Create baseline features
For each user, calculate the average user bias - the average difference between the user's rating and the book's average rating:

$$b_{u} = \dfrac{\sum_{j=1}^{n_u} (r_{uj} - \mu_j)}{n_u}$$

For each item, calculate the difference between its average rating and the average rating of all books:

$$b_{i} = \dfrac{\sum_{k=1}^{n_i} (r_{ki})}{n_i} - \mu$$

Then, for each interaction, calculate the combined bias:

$$b_{ui} = \dfrac{b_u + b_i}{2}$$

In [9]:
# get_baseline comes from the project-local CGT module. The `?name` line is
# IPython help syntax: it displays the function's signature and docstring.
from CGT import get_baseline
?get_baseline

[1;31mSignature:[0m [0mget_baseline[0m[1;33m([0m[0mdf[0m[1;33m,[0m [0mtrain_index[0m[1;33m,[0m [0mtest_index[0m[1;33m,[0m [0muser_col[0m[1;33m,[0m [0mitem_col[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Calculate baseline features from an explicit ratings dataset. Receives a dataframe
and returns train and test splits with added bias column and mean rating value.
User and item biases are calculated as average difference from global mean rating.
Baseline factors are only calculated from training observations, with users or
items that do not appear in train receiving the global average as default.

Args:
    df          : explicit ratings dataframe with columns userId, movieId and rating
    train_index : train index splits taken from KFold.splits()
    test_index  : test index splits taken from KFold.splits()
    
Returns:
    train, test : train/test splits of df, with added bias column
    global_mean : average rating of all training observat

---
# CGT model
**To do**:
  - Can we avoid re-training rating model on CV fold?
  - Create a grid search function / class

In [10]:
# compile_multigenre_model comes from the project-local CGT module. The `?name`
# line is IPython help syntax: it displays the function's signature (the
# function currently has no docstring).
from CGT import compile_multigenre_model
?compile_multigenre_model

[1;31mSignature:[0m
[0mcompile_multigenre_model[0m[1;33m([0m[1;33m
[0m    [0mn_items[0m[1;33m,[0m[1;33m
[0m    [0mn_users[0m[1;33m,[0m[1;33m
[0m    [0mmin_rating[0m[1;33m,[0m[1;33m
[0m    [0mmax_rating[0m[1;33m,[0m[1;33m
[0m    [0mmean_rating[0m[1;33m,[0m[1;33m
[0m    [0mn_genres[0m[1;33m,[0m[1;33m
[0m    [0mn_latent[0m[1;33m,[0m[1;33m
[0m    [0mn_hidden_1[0m[1;33m,[0m[1;33m
[0m    [0mn_hidden_2[0m[1;33m,[0m[1;33m
[0m    [0mactivation[0m[1;33m=[0m[1;34m'relu'[0m[1;33m,[0m[1;33m
[0m    [0mdropout_1[0m[1;33m=[0m[1;36m0.2[0m[1;33m,[0m[1;33m
[0m    [0mdropout_2[0m[1;33m=[0m[1;36m0.2[0m[1;33m,[0m[1;33m
[0m    [0mrandom_seed[0m[1;33m=[0m[1;36m42[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mFile:[0m      c:\users\jleslie\documents\deep-collaborative-filtering\cgt.py
[1;31mType:[0m      function


# Classification report

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [12]:
# Preview the training part of the book metadata (rows are shuffled by the split).
books.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
4896,4896,Second Grave on the Left,en-US,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4782,4782,Метро 2033,ger,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
1496,1496,The Borrowers Afield,,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1957,1957,Destined,eng,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
9171,9171,アンダーグラウンド [Andāguraundo],eng,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
4896,4896,Second Grave on the Left,en-US,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
4782,4782,Метро 2033,ger,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
1496,1496,The Borrowers Afield,,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1957,1957,Destined,eng,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
9171,9171,アンダーグラウンド [Andāguraundo],eng,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [13]:
# (rows, columns) of the training part of the book metadata.
books.shape

(9000, 13)

In [14]:
# get baseline predictors for full dataset
train, _, _ = get_baseline(df, df.index, df.index, 'user_id', 'book_id')

# compile both models
model1, model2 = compile_multigenre_model(
    n_items=df.book_id.nunique(),
    n_users=df.user_id.nunique(),
    min_rating=df.rating.min(),
    max_rating=df.rating.max(),
    mean_rating=df.rating.mean(),
    n_genres=10,
    n_latent=200, 
    n_hidden_1=100,
    n_hidden_2=100,
    dropout_1=.15,
    dropout_2=.15
)

In [16]:
# train rating model
ratings = model1.fit(
    x=[train.user_id.values, train.book_id.values, train.bias.values],
    y=train.rating.values, 
    batch_size=3072,
    epochs=6,
    verbose=1,
    validation_split=.2
)

Train on 4781183 samples, validate on 1195296 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [38]:
# train genre model
genres = model2.fit(
    books.book_id.values, books.iloc[:,-10:].values,  # multi label
    batch_size=512, 
    epochs=6,
    validation_split=.2)

Train on 5904 samples, validate on 1477 samples
Epoch 1/6
 512/5904 [=>............................] - ETA: 3s

InvalidArgumentError:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:x (model_1/dense_3/Sigmoid:0) = ] [[nan nan nan...]...] [y (metrics/AUC/Cast_1/x:0) = ] [0]
	 [[{{node metrics/AUC/assert_greater_equal/Assert/AssertGuard/else/_1/Assert}}]] [Op:__inference_distributed_function_26609]

Function call stack:
distributed_function


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# True genre flags for the held-out books: the last 10 columns of the holdout frame.
y_test = books_holdout.iloc[:,-10:]
# Predicted scores, labelled with the same rows and columns as y_test. Without an
# explicit index the frame would get a fresh 0..n-1 index, which does not line up
# with the (shuffled) holdout index in any label-aligned pandas operation such as
# `y_test == y_pred`.
y_score = pd.DataFrame(
    model2.predict(books_holdout.book_id.values),
    index=y_test.index,
    columns=y_test.columns,
)
# Round every score to the nearest integer to obtain hard predictions.
y_pred = y_score.round().astype('int')

In [None]:
# Per-genre precision / recall / F1 on the held-out books, one row per genre column.
genre_report = classification_report(
    y_test,
    y_pred,
    target_names=y_test.columns,
)
print(genre_report)

# To do:
  - save predictions
  - save embeddings

In [None]:
# Reload the complete book metadata (this rebinds `books`, which until here held
# only the training part of the split) and order the rows by book_id.
books_file = path + 'books.tsv'
books = pd.read_csv(books_file, sep='\t', index_col=0)
books = books.sort_values('book_id')
books.head()

In [None]:
# Fetch the layer named 'embedding' from the rating model.
# NOTE(review): presumably this is the book (item) embedding, since its rows are
# joined to `books` below -- confirm the layer name in CGT.py.
emb = model1.get_layer('embedding')

In [None]:
# Learned weights of the embedding layer as a 2-D array:
# one row per embedded id, one column per latent dimension.
emb_weights = emb.embeddings.numpy()
# Column names e0, e1, ... are derived from the matrix width instead of a
# hard-coded 200, so they stay correct if the latent size of the model changes.
emb_cols = ['e'+str(i) for i in range(emb_weights.shape[1])]

# Side-by-side join on the index: row i of the embedding matrix is paired with
# the row of `books` whose index label is i.
emb_df = pd.concat(
    (books, pd.DataFrame(emb_weights, columns=emb_cols)),
    axis=1
)

emb_df.head()

In [None]:
# Write the metadata + embedding table next to the input data; the frame's index
# is not written (index=False).
emb_df.to_csv(path+'embeddings.csv', index=False)