# Goodbooks-10k Collaborative Genre Tagging
  
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/James-Leslie/deep-collaborative-filtering/blob/master/tf-goodbooks.ipynb)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

%matplotlib inline

## Load rating data

In [2]:
path = 'data/goodbooks-10k/'  # if the files are local
df = pd.read_csv(path+'ratings.csv')

In [3]:
df.head()

Unnamed: 0,user_id,book_id,rating
0,0,257,5
1,1,4080,4
2,1,259,5
3,1,9295,5
4,1,2317,3


In [4]:
df.shape

(5976479, 3)

In [5]:
print('Number of users:', df.user_id.nunique())
print('Number of items:', df.book_id.nunique())
print("Min item rating:", df.rating.min())
print("Max item rating:", df.rating.max())
print("Mean item rating:", df.rating.mean())

Number of users: 53424
Number of items: 10000
Min item rating: 1
Max item rating: 5
Mean item rating: 3.9198655261735214


## Load book metadata
  - remove 10% as holdout test set

In [6]:
books = pd.read_csv(path+'books.csv').fillna(0)
books.iloc[:,-10:] = books.iloc[:,-10:].astype('int')
books.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
0,0,The Hunger Games,eng,0,0,1,1,1,1,1,0,1,1
1,1,Harry Potter and the Philosopher's Stone,eng,0,0,1,0,1,1,1,1,0,1
2,2,Twilight,en-US,0,0,0,1,1,1,1,0,0,1
3,3,To Kill a Mockingbird,eng,1,1,0,1,1,0,1,1,0,1
4,4,The Great Gatsby,eng,1,1,0,1,0,0,1,0,1,1


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
books, books_holdout = train_test_split(books, test_size=.1, random_state=42)

---
# Create baseline features
For each user, calculate average user bias - the average difference between the user's rating and the movie's average rating:

$$b_{u} = \dfrac{\sum_{j=1}^{n_u} (r_{uj} - \mu_i)}{n_u}$$

For each item, calculate the difference between its average rating and the average rating of all movies:

$$b_{i} = \dfrac{\sum_{k=1}^{n_i} (r_{ki})}{n_i} - \mu$$

Then, for each interaction, calculate the combined bias:

$$b_{ui} = \dfrac{b_u + b_i}{2}$$

In [9]:
from CGT import get_baseline
?get_baseline

[1;31mSignature:[0m [0mget_baseline[0m[1;33m([0m[0mdf[0m[1;33m,[0m [0mtrain_index[0m[1;33m,[0m [0mtest_index[0m[1;33m,[0m [0muser_col[0m[1;33m,[0m [0mitem_col[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Calculate baseline features from an explicit ratings dataset. Receives a dataframe
and returns train and test splits with added bias column and mean rating value.
User and item biases are calculated as average difference from global mean rating.
Baseline factors are only calculated from training observations, with users or
items that do not appear in train receiving the global average as default.

Args:
    df          : explicit ratings dataframe with columns userId, movieId and rating
    train_index : train index splits taken from KFold.splits()
    test_index  : test index splits taken from KFold.splits()
    
Returns:
    train, test : train/test splits of df, with added bias column
    global_mean : average rating of all training observat

---
# CGT model
**To do**:
  - Can we avoid re-training rating model on CV fold?
  - Create a grid search function / class

In [10]:
from CGT import compile_genre_model
?compile_genre_model

[1;31mSignature:[0m
[0mcompile_genre_model[0m[1;33m([0m[1;33m
[0m    [0mn_items[0m[1;33m,[0m[1;33m
[0m    [0mn_users[0m[1;33m,[0m[1;33m
[0m    [0mmin_rating[0m[1;33m,[0m[1;33m
[0m    [0mmax_rating[0m[1;33m,[0m[1;33m
[0m    [0mmean_rating[0m[1;33m,[0m[1;33m
[0m    [0mn_latent[0m[1;33m,[0m[1;33m
[0m    [0mn_hidden_1[0m[1;33m,[0m[1;33m
[0m    [0mn_hidden_2[0m[1;33m,[0m[1;33m
[0m    [0mactivation[0m[1;33m=[0m[1;34m'relu'[0m[1;33m,[0m[1;33m
[0m    [0mdropout_1[0m[1;33m=[0m[1;36m0.2[0m[1;33m,[0m[1;33m
[0m    [0mdropout_2[0m[1;33m=[0m[1;36m0.2[0m[1;33m,[0m[1;33m
[0m    [0mrandom_seed[0m[1;33m=[0m[1;36m42[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m <no docstring>
[1;31mFile:[0m      c:\users\jleslie\documents\deep-collaborative-filtering\cgt.py
[1;31mType:[0m      function


# Classification report

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [12]:
books.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult
4896,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,1,1,0
4782,4782,Метро 2033,ger,1,0,1,0,1,1,1,1,0,0
1496,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,0,0,1
1957,1957,Destined,eng,0,0,0,0,0,1,1,0,1,1
9171,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,0,0,0


In [13]:
books.shape

(9000, 13)

In [14]:
# get baseline predictors for full dataset
train, _, _ = get_baseline(df, df.index, df.index, 'user_id', 'book_id')

# compile both models
model1, model2 = compile_genre_model(
    n_items=df.book_id.nunique(),
    n_users=df.user_id.nunique(),
    min_rating=df.rating.min(),
    max_rating=df.rating.max(),
    mean_rating=train.rating.mean(),
    n_latent=200,
    n_hidden_1=100,
    n_hidden_2=100,
    dropout_1=.2,
    dropout_2=.2
)

In [15]:
# train rating model
ratings = model1.fit(
    x=[train.user_id.values, train.book_id.values, train.bias.values],
    y=train.rating.values, 
    batch_size=1024,
    epochs=5,
    verbose=1,
    validation_split=.2
)

Train on 4781183 samples, validate on 1195296 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
# train genre model
genres = model2.fit(
    books.book_id.values, books['adult-fiction'].values,
    batch_size=256, 
    epochs=5,
    validation_split=.2)

Train on 7200 samples, validate on 1800 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [18]:
y_test = books_holdout['adult-fiction']
y_score = pd.DataFrame(model2.predict(books_holdout.book_id.values))
y_pred = y_score.round().astype('int')

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.67      0.64       505
           1       0.63      0.59      0.61       495

    accuracy                           0.63      1000
   macro avg       0.63      0.63      0.63      1000
weighted avg       0.63      0.63      0.63      1000



In [20]:
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,337,168
1,205,290


## Save predictions

In [21]:
# add test flag
books['test'] = 0
books_holdout['test'] = 1

# combine train and test together
books = pd.concat((books, books_holdout), ignore_index=True).sort_values('book_id')

In [22]:
pred_df = pd.DataFrame(
    data=model2.predict(books.book_id.values).round().astype('int'),
    columns=['prediction']
)

pred_df = pd.concat(
    (books, pred_df),
    axis=1
)

pred_df[['book_id','adult-fiction','prediction','test']].to_csv(path+'predictions.csv', index=False)

pred_df.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,mystery,romance,young-adult,test,prediction
0,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,1,1,0,0,0
1,4782,Метро 2033,ger,1,0,1,0,1,1,1,1,0,0,0,0
2,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,0,0,1,0,0
3,1957,Destined,eng,0,0,0,0,0,1,1,0,1,1,0,0
4,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,0,0,0,0,1


## Save embeddings

In [24]:
emb = model1.get_layer('embedding')

emb_df = pd.concat(
    (books, pd.DataFrame(emb.embeddings.numpy(), columns=['e'+str(i) for i in range(200)])),
    axis=1
)

emb_df.to_csv(path+'embeddings.csv', index=False)

emb_df.head()

Unnamed: 0,book_id,original_title,language_code,adult,adult-fiction,adventure,book-club,contemporary,fantasy,fiction,...,e190,e191,e192,e193,e194,e195,e196,e197,e198,e199
0,4896,Second Grave on the Left,en-US,1,1,0,0,1,1,1,...,-0.050259,-0.074923,-0.037485,-0.246658,0.043654,0.193562,-0.109489,-0.058342,0.241099,-0.008333
1,4782,Метро 2033,ger,1,0,1,0,1,1,1,...,-0.076517,-0.014883,-0.282789,-0.151571,0.017308,0.170393,-0.010839,0.003244,0.136885,0.127962
2,1496,The Borrowers Afield,0,0,0,1,0,0,1,1,...,0.038115,0.049776,0.068584,0.071408,-0.052329,-0.039255,0.193202,-0.102509,0.017598,-0.010646
3,1957,Destined,eng,0,0,0,0,0,1,1,...,-0.244271,0.189629,-0.044946,0.003869,-0.33521,0.008915,0.145613,-0.284492,-0.154346,-0.272709
4,9171,アンダーグラウンド [Andāguraundo],eng,1,0,0,1,1,0,1,...,-0.15163,0.099969,-0.224902,0.045649,-0.03588,0.206805,-0.25653,0.005667,0.097728,-0.35897
