## Cypher Based Recommendations

In [1]:
import pandas as pd
import numpy as np
from neo4j import GraphDatabase
from scipy import sparse
import datetime as dt

uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

In [26]:
# Get all customer ID's and all products ID's to create sparse matrix

with driver.session() as session:
    query = """
    MATCH (c:Customer)
    RETURN c.id as id
    """
    
    results = session.run(query)
    customers = [record['id'] for record in results]
    
    
    query = """
    MATCH (p:Product)
    RETURN p.code as id
    """ 
    
    results = session.run(query)
    products = [record['id'] for record in results]
    
    

In [40]:
# Create mappings for sparse matrix
# Create mappings
all_product_to_idx = {}
all_idx_to_product = {}
for (idx, prod_id) in enumerate(products):
    all_product_to_idx[prod_id] = idx
    all_idx_to_product[idx] = prod_id
    
all_customer_to_idx = {}
all_idx_to_customer = {}
for (idx, cust_id) in enumerate(customers):
    all_customer_to_idx[cust_id] = idx
    all_idx_to_customer[idx] = cust_id

In [23]:
# Create lil matrix to incrementally build sparse matrix of recs
n_users = len(customers)
n_products = len(products)

rec_matrix = sparse.lil_matrix((n_users, n_products))

In [3]:
# Creating class to get recommendations for a user

import logging
import sys
from neo4j import GraphDatabase
from neo4j.exceptions import ServiceUnavailable

class NeoUserRecommender:

    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        # Don't forget to close the driver connection when you are finished with it
        self.driver.close()

    @staticmethod
    def enable_log(level, output_stream):
        handler = logging.StreamHandler(output_stream)
        handler.setLevel(level)
        logging.getLogger("neo4j").addHandler(handler)
        logging.getLogger("neo4j").setLevel(level)

    def get_recommended_items_for_user(self, customer_id):
        with self.driver.session(database='fabric') as session:
            # Write transactions allow the driver to handle retries and transient errors
            result = session.read_transaction(
                self._get_recommended_items_for_user, customer_id)
            return result[0]

    @staticmethod
    def _get_recommended_items_for_user(tx, customer_id):
        
        query = ("""
        CALL {
        // Return all purchases for a single customer
        USE fabric.neo4j
        MATCH (c:Customer)-[pur:PURCHASED]->(p:Product)
        WHERE  c.id = $customer_id
        RETURN p.code as prod_code
        }
        CALL {
        // For each of those purchases, return the 10 most similar
        // by image embedding
        USE fabric.products
        WITH prod_code
        MATCH (p1:Product)-[:SIMILAR]->(rec:Product)
        WHERE p1.code = prod_code
        RETURN rec.code as rec_code
        }
        CALL {
        // For each of these recommendations, filter out the ones
        // that don't come from a department or index previously
        // purchases from by the customer
        USE fabric.neo4j
        with rec_code
        MATCH (c:Customer)
        WHERE c.id = $customer_id
        MATCH (ind:Index)<-[:HAS_INDEX]-(rec:Product)-[:FROM_DEPARTMENT]->(d:Department)
        WHERE rec.code = rec_code
        AND EXISTS ((c)-[:PURCHASED]->(:Product)-[:FROM_DEPARTMENT]->(d))
        AND EXISTS ((c)-[:PURCHASED]->(:Product)-[:HAS_INDEX]->(ind))
        RETURN rec.code AS recommended_item
        }
        
        RETURN collect(recommended_item) as recommended_items
        """
        )
        result = tx.run(query, customer_id=customer_id)
        try:
            return [row['recommended_items'] for row in result]
        # Capture any errors along with the query and data for traceability
        except ServiceUnavailable as exception:
            logging.error(f"{query} raised an error: \n {exception}")
            raise
            
    
    def get_cypher_recs_for_user(self, customer_id):
        with self.driver.session(database='neo4j') as session:
            # Write transactions allow the driver to handle retries and transient errors
            result = session.read_transaction(
                self._get_cypher_recs_for_user, customer_id)
            return result[0]

    @staticmethod
    def _get_cypher_recs_for_user(tx, customer_id):

        query = """
        MATCH (c:Customer {id: $customer_id})-[:PURCHASED]->(p:Product)<-[:PURCHASED]-(:Customer)-[:PURCHASED]->(rec:Product) 
        WHERE id(p) <> id(rec)
        AND NOT EXISTS ((c)-[:PURCHASED]->(rec))
        WITH c.id as customer_id, rec, COUNT(rec) as score ORDER BY COUNT(rec) DESC LIMIT 12
        RETURN collect(rec.code) as recommended_items
        """

        result = tx.run(query, customer_id=customer_id)
        try:
            return [row['recommended_items'] for row in result]
        # Capture any errors along with the query and data for traceability
        except ServiceUnavailable as exception:
            logging.error(f"{query} raised an error: \n {exception}")
            raise

In [4]:
neo_recs = NeoUserRecommender(uri, 'neo4j', 'password')

In [5]:
%%timeit
recs = neo_recs.get_recommended_items_for_user('000097d91384a0c14893c09ed047a963c4fc6a5c021044eec603b323e8c82d1d')

104 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
cypher_rec = neo_recs.get_cypher_recs_for_user('000097d91384a0c14893c09ed047a963c4fc6a5c021044eec603b323e8c82d1d')

In [182]:
cypher_rec

['0565200032',
 '0624120004',
 '0706016001',
 '0673396002',
 '0610776002',
 '0372860001',
 '0720125001',
 '0158340001',
 '0548110002',
 '0562245001',
 '0573716012',
 '0372860002']

In [88]:
# For each customer, get recommendations and map them to the sparse matrix
for i, cust in enumerate(customers):
    
    customer_index = customer_to_idx[cust]
    
    recs = neo_recs.get_recommended_items_for_user(cust)
    rec_indices = [product_to_idx[rec] for rec in recs]
    
    rec_matrix[customer_index, rec_indices] = 1
    
    if i%10000 == 0:
        print(f'{i} customers processed')
#         to_save = rec_matrix.tocsr()
#         sparse.save_npz('rec_matrix.npz', to_save)
#         del(to_save)
        
rec_matrix = rec_matrix.tocsr()
#sparse.save_npz('rec_matrix.npz', rec_matrix)

0 customers processed
1000 customers processed
2000 customers processed
3000 customers processed
4000 customers processed
5000 customers processed
6000 customers processed
7000 customers processed
8000 customers processed
9000 customers processed
10000 customers processed
11000 customers processed
12000 customers processed
13000 customers processed
14000 customers processed
15000 customers processed
16000 customers processed
17000 customers processed
18000 customers processed
19000 customers processed
20000 customers processed
21000 customers processed
22000 customers processed
23000 customers processed
24000 customers processed
25000 customers processed
26000 customers processed
27000 customers processed
28000 customers processed
29000 customers processed
30000 customers processed
31000 customers processed
32000 customers processed
33000 customers processed
34000 customers processed
35000 customers processed
36000 customers processed
37000 customers processed
38000 customers processed

In [7]:
rec_matrix = sparse.load_npz('rec_matrix.npz')

Now we have recommendations for each user based on their purchased items we will use our ALS model to rank these items to get our best 12 items. If a user or item has been filtered out of our data due to not meeting the minimum criteria for purchases, we will revert to a cypher query to find items bought by those who also purchased a product.

We'll use our ALS model from previous blog post, check the post for how we trained it

In [9]:
import pandas as pd
purchases = pd.read_csv('/Users/grantbeasley/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-8e47f9b7-a408-4d53-915d-8e56e440f5d0/import/transactions_train.csv')

In [10]:
purchases['t_dat'] = pd.to_datetime(purchases['t_dat'])

In [11]:
def threshold_likes(df, user_min, article_min):
    n_users = df['customer_id'].unique().shape[0]
    n_items = df['article_id'].unique().shape[0]
    sparsity = float(df.shape[0]) / float(n_users*n_items) * 100
    print('Starting likes info')
    print('Number of users: {}'.format(n_users))
    print('Number of models: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    
    done = False
    while not done:
        starting_shape = df.shape[0]
        article_counts = df.groupby('customer_id')['article_id'].count()
        df = df[~df['customer_id'].isin(article_counts[article_counts < article_min].index.tolist())]
        user_counts = df.groupby('article_id')['customer_id'].count()
        df = df[~df['article_id'].isin(user_counts[user_counts < user_min].index.tolist())]
        ending_shape = df.shape[0]
        if starting_shape == ending_shape:
            done = True
    
    assert(df.groupby('customer_id')['article_id'].count().min() >= article_min)
    assert(df.groupby('article_id')['customer_id'].count().min() >= user_min)
    
    n_users = df['customer_id'].unique().shape[0]
    n_items = df['article_id'].unique().shape[0]
    sparsity = float(df.shape[0]) / float(n_users*n_items) * 100
    print('Ending likes info')
    print('Number of users: {}'.format(n_users))
    print('Number of models: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    return df

In [12]:
purchases_dense = threshold_likes(purchases, 5, 5)

Starting likes info
Number of users: 1362281
Number of models: 104547
Sparsity: 0.022%
Ending likes info
Number of users: 925154
Number of models: 91511
Sparsity: 0.036%


In [39]:
# Create mappings
als_product_to_idx = {}
als_idx_to_product = {}
for (idx, aid) in enumerate(purchases_dense['article_id'].unique().tolist()):
    als_product_to_idx[aid] = idx
    als_idx_to_product[idx] = aid
    
als_customer_to_idx = {}
als_idx_to_customer = {}
for (idx, uid) in enumerate(purchases_dense['customer_id'].unique().tolist()):
    als_customer_to_idx[uid] = idx
    als_idx_to_customer[idx] = uid

In [14]:
def map_ids(row, mapper):
    return mapper[row]

In [190]:
from scipy import sparse
import numpy as np

I = purchases_dense['customer_id'].apply(map_ids, args=[user_to_idx]).values
J = purchases_dense['article_id'].apply(map_ids, args=[article_to_idx]).values
V = np.ones(I.shape[0])
purchases_sparse = sparse.coo_matrix((V, (I, J)), dtype=np.float64)
purchases_sparse = purchases_sparse.tocsr()

In [16]:
def assign_confidence_recent(date):
    if date >= dt.date(2020,3,20):
        return 5
    elif (date >= dt.date(2020,3,20)) & (date < dt.date(2020,3,20)): 
        return 3
    else:
        return 1
    
purchases_dense['confidence_recent'] = purchases_dense['t_dat'].apply(assign_confidence_recent)

  if date >= dt.date(2020,3,20):
  elif (date >= dt.date(2020,3,20)) & (date < dt.date(2020,3,20)):


In [18]:
I = purchases_dense['customer_id'].apply(map_ids, args=[user_to_idx]).values
J = purchases_dense['article_id'].apply(map_ids, args=[article_to_idx]).values
confidence = purchases_dense['confidence_recent'].values 
purchases_with_confidence = sparse.coo_matrix((confidence, (I, J)), dtype=np.float64)
purchases_with_confidence = purchases_with_confidence.tocsr()

In [117]:
import implicit
als = implicit.als.AlternatingLeastSquares()
als.fit(purchases_with_confidence.T)

  0%|          | 0/15 [00:00<?, ?it/s]

Process:  
    1. Get overall user ID  
    2. Convert user ID to ALS matrix index  
    3. For all recs for a given user, get the ALS index (rec -> product_id -> als_product_index)  

In [221]:
# Create a matrix to hold the recommendations
final_recommendations = sparse.lil_matrix((n_users, n_products))
    
for i, cust in enumerate(customers):

    # Convert the customer_id to the index of all user/items
    all_user_id = all_customer_to_idx[cust]
    
    # Get all the purchases identidied by image similarity and previously written to the user/item matrix
    neo_user_recs = rec_matrix[all_user_id].indices
    
    # Convert all these product_ids to their respective indicies in the user/item matrix
    neo_recs_product_id = [all_idx_to_product[rec] for rec in neo_user_recs]

    # the dataset used to train the ALS model was filtered by num of purchases and sales
    # so first check that the customer was part of the training set
    if cust in als_customer_to_idx and len(neo_recs_product_id) > 0:
        
        # Identify the customers index in the als user/item matrix
        als_user_id = als_customer_to_idx[cust]
        
        # Identify the product index in the als user/item matrix
        als_prod_ids = [als_product_to_idx[int(rec)] for rec in neo_recs_product_id if int(rec) in als_product_to_idx.keys()]
        
        # Providing at least one of the items was part of the als training data - rank the items and take the top 12
        if len(als_prod_ids) > 0:
            ranked_recs = als.rank_items(als_user_id, purchases_with_confidence, selected_items=als_prod_ids)
            ranked_recs = [x[0] for x in ranked_recs[:12]]
            ranked_recs_prod_ids = [als_idx_to_product[x] for x in ranked_recs]
            ranked_recs_all_prod_ids = [all_product_to_idx[f'0{x}'] for x in ranked_recs_prod_ids]

            final_recommendations[all_user_id, ranked_recs_all_prod_ids] = 1

 
    # If we didn't find at least 12 recs, find the extra recs to add
    if final_recommendations.getrow(all_user_id).count_nonzero() < 12:
        
        
        num_to_insert = 12 - final_recommendations.getrow(all_user_id).count_nonzero()
        
        if len(rec_matrix[all_user_id].indices) == 0:
            continue
        
        extra_recs = np.random.choice(rec_matrix[all_user_id].indices, size=num_to_insert)

        final_recommendations[all_user_id, extra_recs] = 1
      
    if i % 10000 == 0:
        print(f'{i} customers processed')
        print(f'{n_users - i} customers remaining')
        
    
    
    

0 customers processed
1371980 customers remaining
10000 customers processed
1361980 customers remaining
20000 customers processed
1351980 customers remaining
30000 customers processed
1341980 customers remaining
40000 customers processed
1331980 customers remaining
50000 customers processed
1321980 customers remaining
60000 customers processed
1311980 customers remaining
70000 customers processed
1301980 customers remaining
80000 customers processed
1291980 customers remaining
90000 customers processed
1281980 customers remaining
100000 customers processed
1271980 customers remaining
110000 customers processed
1261980 customers remaining
120000 customers processed
1251980 customers remaining
130000 customers processed
1241980 customers remaining
140000 customers processed
1231980 customers remaining
150000 customers processed
1221980 customers remaining
180000 customers processed
1191980 customers remaining
190000 customers processed
1181980 customers remaining
200000 customers process

In [222]:
final_recommendations = final_recommendations.tocsr()

In [226]:
still_to_recommend = np.where(np.diff(final_recommendations.indptr) == 0)[0]

In [227]:
final_recommendations = final_recommendations.tolil()

for i, cust in enumerate(still_to_recommend):
    
    cust_id = all_idx_to_customer[cust]
    
    cypher_recs = neo_recs.get_cypher_recs_for_user(cust_id)
    recs = [all_product_to_idx[x] for x in cypher_recs]
    
    final_recommendations[cust ,recs] = 1
    
    if i % 1000 == 0:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000


In [230]:
final_recommendations = final_recommendations.tocsr()

In [231]:
sparse.save_npz('final_recommendations.npz', final_recommendations)

In [233]:
# Still some people - maybe no purchases in the original dataset
np.where(np.diff(final_recommendations.indptr) == 0)[0].shape

(9712,)

In [254]:
with open('submission.txt', 'w') as f:
    f.write('customer_id,prediction\n')

    for i,row in enumerate(final_recommendations):
        cust_id = all_idx_to_customer[i]
        rec_string = f'{cust_id},{" ".join([all_idx_to_product[rec] for rec in final_recommendations[i].indices])}\n'
        f.write(rec_string)

In [255]:
!wc -l submission.csv

 1371981 submission.csv
