In [349]:
#libraries
import json
import pandas as pd
import numpy as np
import random
random.seed(100)
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model

## Recommendation System using Neural Network with Word Embeddings

### adapted from https://www.kaggle.com/willkoehrsen/neural-network-embedding-recommendation-system 

In [378]:
# Load the newline-delimited JSON dump of books: one JSON record per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank lines (e.g. a trailing newline) are skipped rather than
# handed to json.loads, which would raise on them.
with open("DataSet/found_books_filtered.ndjson", 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [351]:
# Drop Wikipedia meta pages: any record whose first field (book[0]) contains 'Wikipedia:'.
# Filter from `data`, the list produced by the load cell, so this cell does not
# depend on a `books` variable left over from an earlier kernel session.
books = [book for book in data if 'Wikipedia:' not in book[0]]
print(f'Found {len(books)} books.')

Found 37020 books.


In [352]:
# Forward lookup (book[0] -> position in `books`) and its inverse (position -> book[0])
book_index = dict((record[0], position) for position, record in enumerate(books))
index_book = dict((position, title) for title, position in book_index.items())

In [353]:
# Flatten the per-book link lists (book[2]) into a single list
from itertools import chain

wikilinks = list(chain.from_iterable(book[2] for book in books))

In [354]:
# Count occurrences of each link so the most frequently occurring wikilinks can be removed
from collections import Counter, OrderedDict
def count_items(l):
    """Tally the items of `l` and return the tallies, largest first.

    Returns an OrderedDict mapping each distinct item to its count, ordered by
    descending count. Items with equal counts keep the order in which they
    were first seen in `l`.
    """
    # most_common() with no argument returns every (item, count) pair sorted
    # by count, highest first
    return OrderedDict(Counter(l).most_common())

In [355]:
# De-duplicate the links within each book, then flatten across all books
unique_wikilinks = [link for book in books for link in set(book[2])]

In [356]:
# Lower-case every link, then count the number of occurrences of each one
lowered_links = (name.lower() for name in unique_wikilinks)
wikilinks = list(lowered_links)
del lowered_links  # temporary only; keep the notebook namespace unchanged
wikilink_counts = count_items(wikilinks)

In [357]:
# Ten most frequent links (wikilink_counts is ordered by descending count)
[(link, count) for link, count in wikilink_counts.items()][:10]

[('paperback', 8740),
 ('hardcover', 8648),
 ('wikipedia:wikiproject books', 6043),
 ('wikipedia:wikiproject novels', 6016),
 ('science fiction', 5665),
 ('english language', 4248),
 ('united states', 3063),
 ('novel', 2983),
 ('the new york times', 2742),
 ('fantasy', 2003)]

In [358]:
# Remove the most frequently occurring format / WikiProject links
to_remove = ['paperback','hardcover', 'wikipedia:wikiproject books', 'wikipedia:wikiproject novels', 'hardback', 'e-book']

# list.remove() deletes only the first match, so filter instead to drop
# every occurrence of each unwanted link.
wikilinks = [link for link in wikilinks if link not in to_remove]

for t in to_remove:
    # Supplying a default keeps the cell re-runnable: popping a key that is
    # already gone (or was never present) no longer raises KeyError.
    wikilink_counts.pop(t, None)

In [359]:
# Keep only links counted at least 4 times (i.e. more than 3)
links = [link for link, count in wikilink_counts.items() if count >= 4]

In [360]:
# Forward lookup (link -> position in `links`) and its inverse (position -> link)
link_index = dict(zip(links, range(len(links))))
index_link = dict((position, name) for name, position in link_index.items())

In [361]:
# (book index, link index) pairs: the positive examples of the training set
pairs = []

# Iterate through each book
for book in books:
    book_id = book_index[book[0]]
    # Iterate through the links in the book
    for link in book[2]:
        # link_index has exactly the entries of `links` as keys, so a dict
        # lookup answers "is this link kept?" without rescanning the list
        # for every link of every book.
        link_id = link_index.get(link.lower())
        if link_id is not None:
            pairs.append((book_id, link_id))

# Set form of the positives, used for fast membership tests when negative
# examples are sampled.
pairs_set = set(pairs)


In [362]:
#generator for train data

def generate_TrainData(pairs, n_positive = 50, negative_ratio = 1.0):
    """Endlessly yield training batches of (book, link) ids with 0/1 labels.

    Each batch holds `n_positive` pairs sampled from `pairs` (label 1) plus
    randomly drawn (book, link) combinations that are not in `pairs`
    (label 0), shuffled together.

    Parameters
    ----------
    pairs : list of (book_id, link_id) tuples
        The positive examples. Must contain at least `n_positive` entries,
        otherwise `random.sample` raises ValueError.
    n_positive : int
        Number of positive examples per batch.
    negative_ratio : int or float
        Negatives per positive. The batch size is
        int(n_positive * (1 + negative_ratio)).

    Yields
    ------
    ({'book': array, 'link': array}, labels)
        Three 1-D float arrays of length batch size.

    Notes
    -----
    Negative ids are drawn from range(len(books)) and range(len(links)),
    where `books` and `links` are module-level names read when the generator
    is first advanced.
    """
    # A float ratio (the default is 1.0) makes the product a float, and
    # np.zeros rejects a float as a shape, so coerce once up front.
    batch_size = int(n_positive * (1 + negative_ratio))

    neg_label = 0

    # Build the positive lookup from the argument itself so the generator
    # does not depend on a separately maintained global set.
    positive_pairs = set(pairs)

    n_books = len(books)
    n_links = len(links)

    # This creates a generator
    while True:

        # Allocate a fresh buffer per batch: the yielded arrays are views of
        # `batch`, so reusing one buffer would overwrite a batch that a
        # consumer (e.g. a prefetching training loop) still holds.
        batch = np.zeros((batch_size, 3))

        # choose positive examples
        for idx, (book_id, link_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (book_id, link_id, 1)

        # Negatives start right after the positives; set explicitly so the
        # index is defined even when n_positive is 0.
        idx = n_positive

        # Add negative examples
        while idx < batch_size:

            # random selection
            random_book = random.randrange(n_books)
            random_link = random.randrange(n_links)

            # Check to make sure this is not a positive example
            if (random_book, random_link) not in positive_pairs:

                # Add to batch and increment index
                batch[idx, :] = (random_book, random_link, neg_label)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'book': batch[:, 0], 'link': batch[:, 1]}, batch[:, 2]


In [363]:
# Draw one small batch (2 positives, 2 negatives per positive -> 6 rows) to
# sanity-check the generator's output format
next(generate_TrainData(pairs, n_positive = 2, negative_ratio = 2))

({'book': array([ 6895., 25757., 29814., 22162., 28410.,  7206.]),
  'link': array([  260., 22920., 11452.,  5588., 33217., 34924.])},
 array([1., 0., 0., 1., 0., 0.]))

In [364]:
def book_embedding_model(embedding_size = 50, classification = False):
    """Build and compile the two-input book / wikilink embedding network.

    A book id and a link id are each looked up in their own embedding table;
    the normalized dot product of the two vectors is passed through a single
    sigmoid unit, and the model is compiled with binary cross-entropy so it
    learns whether a link appears on a book's page.

    Parameters
    ----------
    embedding_size : int
        Length of each embedding vector.
    classification : bool
        Accepted but not read by this function; the sigmoid /
        binary-cross-entropy head is always built.

    Returns
    -------
    The compiled Keras model with inputs named 'book' and 'link'.
    """

    def embed(layer_name, vocabulary_size, ids):
        # One trainable vector per id; output shape is (None, 1, embedding_size)
        return Embedding(name = layer_name,
                         input_dim = vocabulary_size,
                         output_dim = embedding_size)(ids)

    # Each input carries a single id per example
    book_input = Input(name = 'book', shape = [1])
    link_input = Input(name = 'link', shape = [1])

    book_vector = embed('book_embedding', len(book_index), book_input)
    link_vector = embed('link_embedding', len(link_index), link_input)

    # Normalized dot product along the embedding axis -> shape (None, 1, 1)
    similarity = Dot(name = 'dot_product', normalize = True, axes = 2)([book_vector, link_vector])

    # Collapse to one number per example -> shape (None, 1)
    similarity = Reshape(target_shape = [1])(similarity)

    # Squash the similarity into a probability for the binary target
    probability = Dense(1, activation = 'sigmoid')(similarity)

    network = Model(inputs = [book_input, link_input], outputs = probability)
    network.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    return network

In [365]:
# Build the network with the default embedding size (50) and print its layer table
model = book_embedding_model()
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
book (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
link (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
book_embedding (Embedding)      (None, 1, 50)        1851000     book[0][0]                       
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        2087900     link[0][0]                       
____________________________________________________________________________________________

In [366]:
# Declare the generator for the training data.
# With 2000 positives and 2 negatives per positive, each batch has 6000 rows.
n_positive = 2000
gen = generate_TrainData(pairs, n_positive, negative_ratio = 2)

In [367]:
# Train the model.
# steps_per_epoch = len(pairs) // n_positive, so one epoch draws roughly as many
# positive examples as there are pairs (sampled at random, not a strict pass).
# NOTE(review): fit_generator is deprecated in newer TensorFlow/Keras releases in
# favour of Model.fit — confirm against the installed version before upgrading.
history = model.fit_generator(gen, epochs = 12, steps_per_epoch = len(pairs) // n_positive, verbose = 2)

Epoch 1/12
 - 28s - loss: 0.6528 - accuracy: 0.6438
Epoch 2/12
 - 29s - loss: 0.5396 - accuracy: 0.8087
Epoch 3/12
 - 28s - loss: 0.4094 - accuracy: 0.8593
Epoch 4/12
 - 30s - loss: 0.3548 - accuracy: 0.8779
Epoch 5/12
 - 30s - loss: 0.3190 - accuracy: 0.8991
Epoch 6/12
 - 31s - loss: 0.2968 - accuracy: 0.9105
Epoch 7/12
 - 29s - loss: 0.2726 - accuracy: 0.9222
Epoch 8/12
 - 29s - loss: 0.2542 - accuracy: 0.9299
Epoch 9/12
 - 29s - loss: 0.2364 - accuracy: 0.9380
Epoch 10/12
 - 31s - loss: 0.2476 - accuracy: 0.9293
Epoch 11/12
 - 29s - loss: 0.2177 - accuracy: 0.9443
Epoch 12/12
 - 27s - loss: 0.1970 - accuracy: 0.9537


In [373]:
# Save the trained model to disk.
# NOTE(review): despite the file name, model.save presumably writes the whole
# model (not only the weights) — confirm against the Keras version in use.
model.save('RS_weights.h5')

In [374]:
# Extract the learned book embeddings: the first (and, for an Embedding layer,
# presumably only) weight array of the 'book_embedding' layer, one row per book id
book_layer = model.get_layer('book_embedding')
book_weights = book_layer.get_weights()[0]

In [375]:
# Scale every embedding row to unit length so a dot product between rows is a cosine similarity
row_norms = np.linalg.norm(book_weights, axis = 1, keepdims = True)
book_weights = book_weights / row_norms
del row_norms  # temporary only; keep the notebook namespace unchanged

In [376]:
# Return Similar Books using Book Embedding weights
def recommend_books(name, weights, n = 10):
    """Print the `n` books whose embedding is most similar to that of `name`.

    Similarity is the dot product between the row of `weights` for `name` and
    every other row; this equals cosine similarity when the rows have unit
    length. The queried book itself is included in the ranking.

    Parameters
    ----------
    name : str
        Book title; must be a key of the module-level `book_index`.
    weights : 2-D array
        Embedding matrix with one row per book, indexed by the ids in `book_index`.
    n : int
        Number of rows to print. Nothing is printed when `n` is zero or negative.

    Raises
    ------
    KeyError
        If `name` is not in `book_index`.

    Prints one line per book, most similar first; returns None.
    """
    index = book_index
    reverseIndex = index_book

    if name not in index:
        raise KeyError(f'{name} not found in book_index')

    # `array[-0:]` is the whole array, so n == 0 would print every book and a
    # negative n would slice from the wrong end; treat both as "nothing to show".
    if n <= 0:
        return

    # Calculate dot product between book and all others
    dists = np.dot(weights, weights[index[name]])

    # Indexes of the n largest similarities, most similar first
    recommended = np.argsort(dists)[::-1][:n]

    # Pad titles to a common width so the similarity column lines up
    max_width = max(len(reverseIndex[r]) for r in recommended)

    for r in recommended:
        print(f'{reverseIndex[r]:{max_width}} Similarity:{str(round(dists[r],2))}')
        

In [377]:
# Show the 10 nearest neighbours of 'War and Peace' in the normalized embedding space
recommend_books('War and Peace', book_weights)

War and Peace             Similarity:1.0
Anna Karenina             Similarity:0.84
The Brothers Karamazov    Similarity:0.82
Crime and Punishment      Similarity:0.81
Demons (Dostoevsky novel) Similarity:0.8
The Idiot                 Similarity:0.77
The Master and Margarita  Similarity:0.77
Lord of the World         Similarity:0.76
Poor Folk                 Similarity:0.75
Cousin Bette              Similarity:0.74
