In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sparse
import random
import implicit
import pickle

from apyori import apriori
from collections import defaultdict
from pandas.api.types import CategoricalDtype

In [2]:
full_df = pd.read_csv('../dataset/cleaned/combined_cleansed.csv')

## Apriori - Basket Analysis

- Both X and Y can be placed on the same shelf, so that buyers of one item would be prompted to buy the other.
- Promotional discounts could be applied to just one out of the two items.
- Advertisements on X could be targeted at buyers who purchase Y.
- X and Y could be combined into a new product, such as having Y in flavors of X.

In [3]:
def add_to_dict(x):
    prod_dict[x[0]].append(x[1])

In [4]:
prod_dict = defaultdict(list)

In [5]:
full_df[['order_id','product_name']].apply(add_to_dict, axis = 1);

In [6]:
purchase_list = list(prod_dict.values())

In [7]:
with open('../pickles/purchase_list.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(purchase_list, filehandle)

In [8]:
with open('../pickles/purchase_list.data', 'rb') as filehandle:
    # read the data as binary data stream
    purchase_list = pickle.load(filehandle)

In [9]:
len(purchase_list)

3214669

In [10]:
purchase_list = [x for x in purchase_list if len(x) > 1]

- Only keep orders which have more than 1 orders
- transaction with only 1 order will increase our total transaction count and will not be useful for our analysis

In [11]:
len(purchase_list)

3057605

- support = I want items that had been bought in at least 3000 times
- confidence = I want at least 20% of the transactions where the items are bought together when compared to the transactions when only the second item is bought
- lift = 2
- min length = I want at least 2 products in our rules

In [12]:
association_rules = apriori(purchase_list, min_support=3000/len(purchase_list), min_confidence=0.2, min_lift=2, min_length=2)
association_results = list(association_rules)

In [13]:
with open('../pickles/association_rules_01_percent_min_support.data', 'wb') as filehandle:
    pickle.dump(association_results, filehandle)

In [14]:
with open('../pickles/association_rules_01_percent_min_support.data', 'rb') as filehandle:
    # read the data as binary data stream
    association_results = pickle.load(filehandle)

In [15]:
len(association_results)

421

In [16]:
association_results[0][0]

frozenset({'Apple Honeycrisp Organic', 'Bag of Organic Bananas'})

- Support is the number of transactions that contain the item divide by the total number of transactions
    - I have 100 transactions, 10 of them contains jam = 10/100 = 0.1 = 10%
    
- Confidence = The likelihood that item B is bought when item A is bought
    - I have 100 transactions where only bread is bought, 20 of them contains both bread and jam, Confidence of bread→jam = 20/100 = 0.2 = 20%
    - The likelihood of buying jam when bread is purchased is 20%

- Lift = the increase in ratio of the sale of B when A is sold, it can be calculated by (Confidence A→B) / (Support B).
    - A higher lift means that the likelihood of the products being bought together is higher
    - A lift lesser than 1 means that the items are not likely to be bought together
    - A lift equals to 1 means that there are no association between both products
    - Lift(bread → jam) = (20/100) / (10/100) = 2
    - The likelihood of buying jam and bread together is 2 times more likely than just buying bread alone

In [17]:
result = []
for item in association_results:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    value0 = str(items[0])
    value1 = str(items[1])
    
    # second index for the inner listing
    value2 = str(item[1])[:7]
    
    value3 = str(item[2][0][2])[:7]
    value4 = str(item[2][0][3])[:7]
    
    rows = (value0, value1, value2, value3, value4)
    result.append(rows)

labels = ['Antecedent', 'Consequents', 'Support', 'Confidence', 'Lift']
product_suggestions = pd.DataFrame(result, columns = labels)

In [18]:
product_suggestions.shape[0]

421

In [19]:
product_suggestions.head()

Unnamed: 0,Antecedent,Consequents,Support,Confidence,Lift
0,Apple Honeycrisp Organic,Bag of Organic Bananas,0.00774,0.27941,2.26811
1,Apples,Bag of Organic Bananas,0.00101,0.25411,2.06274
2,Apples,Clementines,0.00131,0.3282,33.614
3,Banana,Asparation/Broccolini/Baby Broccoli,0.00198,0.36813,2.3923
4,Baby Cucumbers,Hass Avocados,0.00103,0.22707,14.0823


- Clementines appeared in 0.00131 of the total transactions (0.13%)
- The likelihood of someone buying Apples when they purchase Clementines is 0.32 (32%)
- Past transanctions shows that people are 33 times more likely to buy Apples and Clementines compared to just buying Apples

In [20]:
product_suggestions.sample(5)

Unnamed: 0,Antecedent,Consequents,Support,Confidence,Lift
208,Organic Tomato Cluster,Bag of Organic Bananas,0.00102,0.20763,2.63702
84,Organic Strawberries,Large Yellow Flesh Nectarine,0.00117,0.23448,2.72036
120,Organic Cucumber,Organic Hass Avocado,0.0057,0.2174,3.12255
184,Limes,Bag of Organic Bananas,0.00141,0.20703,2.62933
326,Banana,Strawberries,0.00143,0.31924,2.07461


In [21]:
product_suggestions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 5 columns):
Antecedent     421 non-null object
Consequents    421 non-null object
Support        421 non-null object
Confidence     421 non-null object
Lift           421 non-null object
dtypes: object(5)
memory usage: 16.6+ KB


## Collaborative Filtering

In [3]:
collaborative_df = full_df.groupby(['user_id', 'product_name', 'product_id'])['product_id'].agg('count').to_frame('purchase_count').reset_index()

In [4]:
# get a list of unique users
users = list(np.sort(collaborative_df['user_id'].unique()))
# get a list of unique products
products = list(collaborative_df['product_id'].unique())
# get a list of purchase count
purchase_count = list(collaborative_df['purchase_count'])

# get the row indices
cols = collaborative_df['user_id'].astype('category', CategoricalDtype(categories = users)).cat.codes
# get the column indices
rows = collaborative_df['product_id'].astype('category', CategoricalDtype(categories = products)).cat.codes

collaborative_sparse = sparse.csr_matrix((purchase_count, (rows, cols)), shape = (len(products), len(users)))

# purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

In [5]:
collaborative_sparse

<48422x206209 sparse matrix of type '<class 'numpy.int32'>'
	with 13266179 stored elements in Compressed Sparse Row format>

We have 206209 customers with 48422 items. lets check our sparcity

## Scaling

Centering sparse data would destroy the sparseness structure in the data, and thus rarely is a sensible thing to do. However, it can make sense to scale sparse inputs, especially if features are on different scales.

MaxAbsScaler and maxabs_scale were specifically designed for scaling sparse data, and are the recommended way to go about this

In [25]:
# get the number of possible interactions
matrix_size = collaborative_sparse.shape[0] * collaborative_sparse.shape[1]

# number of actual interactions
purchase_num = len(collaborative_sparse.nonzero()[0])
print('Sparcity is {}'.format(100*(1-(purchase_num/matrix_size))))

Sparcity is 99.86713961292403


for this to work we the maximum sparcity should be about 99.5% we are 0.3% above it, this may affect our result

## Alternating Least Squares

In [6]:
# closest to the power point

model = implicit.als.AlternatingLeastSquares(factors=350, regularization=0.1, iterations=40)
alpha_val = 40
data_conf = (collaborative_sparse * alpha_val).astype('double')
model.fit(data_conf)
user_items = data_conf.T.tocsr()



HBox(children=(IntProgress(value=0, max=40), HTML(value='')))




In [64]:
def get_recommendations(df, model, fitted, user):
    recommendations = model.recommend(user, fitted, N = 4, filter_already_liked_items = False)
    product_dict = dict(zip(full_df.product_id, full_df.product_name))
    
    print('Recommended items for user {} are: \n'.format(user))
    for i in recommendations:
        print(i[0], product_dict.get(i[0]), i[1])

In [88]:
get_recommendations(collaborative_sparse, model, user_items, 1)

Recommended items for user 1 are: 

33409 Sugar Free Cherry Gelatin Dessert 1.2291799
13517 Whole Wheat Bread 1.2027234
24332 Carrot Ginger With Coconut Soup 1.1276637
20493 All Natural Chocolate Hemp 1.0903381


In [89]:
get_recommendations(collaborative_sparse, model, user_items, 2)

Recommended items for user 2 are: 

426 2nd Foods Bananas 1.2574742
20319 Water Chestnuts, Sliced 1.1400969
45944 Devils Food Cake Mix 1.134639
39521 Premium Potatoes Organic Fingerling Medley 1.0907922


In [90]:
get_recommendations(collaborative_sparse, model, user_items, 5)

Recommended items for user 5 are: 

21353 Shredded Pizza Cheese 0.72977406
24342 Big Chewy Peanut Butter Chocolate Chip Granola 0.7140698
46544 Mild Tikka Curry Paste 0.70919603
22362 Original Rice Krispies Treats 0.7051064


In [91]:
get_recommendations(collaborative_sparse, model, user_items, 7)

Recommended items for user 7 are: 

17338 Vanilla Swiss Almond Ice Cream 0.9824994
30924 Cherry Real Italian Ice 0.9820426
33502 Double Cheese Baked Snack Mix 0.97929156
33275 Aloe Vera Soap Bar 0.97042596


In [92]:
get_recommendations(collaborative_sparse, model, user_items, 264)

Recommended items for user 264 are: 

12835 Good Start® Gentle for Supplementing Powder Infant Formula 1.016046
24231 Tahitian Vanilla Bean Gelato 0.99917996
21353 Shredded Pizza Cheese 0.9966327
20608 Grapeseed Oil 0.9963527


In [93]:
get_recommendations(collaborative_sparse, model, user_items, 3754)

Recommended items for user 3754 are: 

33502 Double Cheese Baked Snack Mix 1.2523721
29408 Mean Beans Spicy Green Bean Pickles 1.1054409
45339 Men's Refresh Dandruff Shampoo 1.0747702
11114 Whole Poppy Seed 1.0670176


This is a test

# Raw Codes

# test our recommender

## Testing our recommender system

This function will take in the original user-item matrix and "mask" a percentage of the original ratings where a
user-item interaction has taken place for use as a test set. The test set will contain all of the original ratings, 
while the training set replaces the specified percentage of them with a zero in the original ratings matrix. 

parameters: 

ratings - the original ratings matrix from which you want to generate a train/test set. Test is just a complete
copy of the original set. This is in the form of a sparse csr_matrix. 

pct_test - The percentage of user-item interactions where an interaction took place that you want to mask in the 
training set for later comparison to the test set, which contains all of the original ratings. 

returns:

training_set - The altered version of the original data with a certain percentage of the user-item pairs 
that originally had interaction set back to zero.

test_set - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
compares with the actual interactions.

user_inds - From the randomly selected user-item indices, which user rows were altered in the training data.
This will be necessary later when evaluating the performance via AUC.

This simple function will output the area under the curve using sklearn's metrics. 

parameters:

- predictions: your prediction output

- test: the actual target result you are comparing to

returns:

- AUC (area under the Receiver Operating Characterisic curve)

'''
This function will calculate the mean AUC by user for any user that had their user-item matrix altered. 

parameters:

training_set - The training set resulting from make_train, where a certain percentage of the original
user/item interactions are reset to zero to hide them from the model 

predictions - The matrix of your predicted ratings for each user/item pair as output from the implicit MF.
These should be stored in a list, with user vectors as item zero and item vectors as item one. 

altered_users - The indices of the users where at least one user/item pair was altered from make_train function

test_set - The test set constucted earlier from make_train function



returns:

The mean AUC (area under the Receiver Operator Characteristic curve) of the test set only on user-item interactions
there were originally zero to test ranking ability in addition to the most popular items as a benchmark.
'''


## Backup

Implicit weighted ALS taken from Hu, Koren, and Volinsky 2008. Designed for alternating least squares and implicit
feedback based collaborative filtering. 

parameters:

training_set - Our matrix of ratings with shape m x n, where m is the number of users and n is the number of items.
Should be a sparse csr matrix to save space. 

lambda_val - Used for regularization during alternating least squares. Increasing this value may increase bias
but decrease variance. Default is 0.1. 

alpha - The parameter associated with the confidence matrix discussed in the paper, where Cui = 1 + alpha*Rui. 
The paper found a default of 40 most effective. Decreasing this will decrease the variability in confidence between
various ratings.

iterations - The number of times to alternate between both user feature vector and item feature vector in
alternating least squares. More iterations will allow better convergence at the cost of increased computation. 
The authors found 10 iterations was sufficient, but more may be required to converge. 

rank_size - The number of latent features in the user/item feature vectors. The paper recommends varying this 
between 20-200. Increasing the number of features may overfit but could reduce bias. 

seed - Set the seed for reproducible results

returns:

The feature vectors for users and items. The dot product of these feature vectors should give you the expected 
"rating" at each point in your original matrix. 