# Recommendations Using Custom Similarity Functions <sup>1</sup>

## Imports

In [1]:
import numpy as np
import pandas as pd
from scipy.special import logsumexp

## Load Data

Only loading a subset of the original data set for proof of concept reasons.

In [2]:
# 80/20 split earlier
df_train = pd.read_csv('../Data/training_data_subset.csv')
df_test = pd.read_csv('../Data/testing_data_subset.csv')

In [3]:
df_train.head(2)

Unnamed: 0,category,title,also_buy,brand,rank,also_view,main_cat,price,asin,overall,verified,reviewerID,vote,style,for_testing
0,"['Grocery & Gourmet Food', 'Candy & Chocolate'...","YumEarth Organic Gummy Bears, 10 Count","['B008CC8UXC', 'B00C25LO8S', 'B073RWDCMD', 'B0...",YumEarth,"129,438 in Grocery & Gourmet Food (","['B008CC8UXC', 'B00C25LNWA', 'B008CC8ULY', 'B0...",Grocery,,B008B7JNRA,3.0,True,A35KP4ROS9KWPO,,"{'Size:': ' 10 Count', 'Style:': ' Natural Gum...",False
1,"['Grocery & Gourmet Food', 'Jams, Jellies & Sw...",Bell Plantation Powdered PB2 Bundle: 1 Peanut ...,"['B06W9N8X9H', 'B06X15V3DC', 'B01ENYJX3S', 'B0...",PB2,"1,214 in Grocery & Gourmet Food (",,Grocery,18.49,B00H9H56QA,5.0,True,AVAMZWS7AAI1S,,{'Size:': ' Pack of 2 (1 each flavor)'},False


In [4]:
df_test.head(2)

Unnamed: 0,category,title,also_buy,brand,rank,also_view,main_cat,price,asin,overall,verified,reviewerID,vote,style,for_testing
0,"['Grocery & Gourmet Food', 'Snack Foods', 'Bar...","Grocery &amp; Gourmet Food"" />","['B01MT0QDPO', 'B00NL17FE4', 'B01NBM9OJN', 'B0...",Nature Valley,"16,921 in Grocery & Gourmet Food (",,Grocery,18.04,B001E6GFR6,5.0,True,A2IUE299OONA73,,,True
1,"['Grocery & Gourmet Food', 'Snack Foods', 'Chi...",Gourmet Basics Smart Fries 4-Flavor Variety Pa...,"['B0763SHX4W', 'B0040FIHS8', 'B00FYR5HS4', 'B0...",Gourmet Basics,"53,167 in Grocery & Gourmet Food (",,Grocery,23.99,B003AZ2ECY,4.0,True,A38NO7J1TK4R1W,,,True


In [5]:
# Also, load the entire original dataset to make a user index.
# Note: If using the full train/test dataset several of the methods below can takes 2-6 hours to run.
df_original = pd.read_csv('../Data/eda_data.csv')

In [6]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1083170 entries, 0 to 1083169
Data columns (total 14 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   category    1083170 non-null  object 
 1   title       1083170 non-null  object 
 2   also_buy    926546 non-null   object 
 3   brand       1075197 non-null  object 
 4   rank        1039163 non-null  object 
 5   also_view   577060 non-null   object 
 6   main_cat    1081896 non-null  object 
 7   price       750231 non-null   float64
 8   asin        1083170 non-null  object 
 9   overall     1083170 non-null  float64
 10  verified    1083170 non-null  bool   
 11  reviewerID  1083170 non-null  object 
 12  vote        149247 non-null   float64
 13  style       559212 non-null   object 
dtypes: bool(1), float64(3), object(10)
memory usage: 108.5+ MB


### RMSE

In [7]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

### Evaluation method

In [8]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(df_test.reviewerID, df_test.asin)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = df_test.overall.values
    return compute_rmse(estimated, real)

### Create a user index
To retrieve information given a specific user_id in a more convenient way.

In [9]:
# The key features and ids from earlier analysis.
# Note that additional features could be included if desired; except for the target feature: 'overall'.
user_info = df_original[['title', 'also_buy', 'also_view', 'price', 'rank', 'asin', 'reviewerID', 'vote']]
user_info.set_index('reviewerID', inplace=True)
user_info.head(2)

Unnamed: 0_level_0,title,also_buy,also_view,price,rank,asin,vote
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A1J205ZK25TZ6W,Lipton Yellow Label Tea (loose tea) - 450g,"['B00886E4K0', 'B00CREXSHY', 'B001QTRGAQ', 'B0...","['B00CREXSHY', 'B001QTRGAQ', 'B000JSQK70', 'B0...",12.46,"30,937 in Grocery & Gourmet Food (",4639725043,8.0
ACOICLIJQYECU,Lipton Yellow Label Tea (loose tea) - 450g,"['B00886E4K0', 'B00CREXSHY', 'B001QTRGAQ', 'B0...","['B00CREXSHY', 'B001QTRGAQ', 'B000JSQK70', 'B0...",12.46,"30,937 in Grocery & Gourmet Food (",4639725043,9.0


### Euclidean 'similarity'

$$ sim(x,y) = \frac{1}{1 + \sqrt{\sum (x - y)^2}}$$

In [10]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

### Cosine similarity

$$ sim(x,y) = \frac{(x . y)}{\sqrt{(x . x) (y . y)}} $$

In [11]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

### Pearson correlation

$$ sim(x,y) = \frac{(x - \bar x).(y - \bar y)}{\sqrt{(x - \bar x).(x - \bar x) * (y - \bar y)(y - \bar y)}} $$

In [12]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(logsumexp(s1_c ** 2)) * np.sum(logsumexp(s2_c ** 2)))

### Jaccard similarity

$$ sim(x,y) = \frac{(x . y)}{(x . x) + (y . y) - (x . y)} $$

In [13]:
def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

def binjaccard(s1, s2):
    dotp = s1.index.intersection(s2.index).size
    return dotp / (s1.sum() + s2.sum() - dotp)

In [14]:
class CollaborativeSimilarityRecommendation:
    """ Collaborative filtering using a custom sim(u,u'). """

    def __init__(self, similarity=None):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = df_train.pivot_table('overall', index='asin', columns='reviewerID')
        self._similarity = similarity
        
    @property
    def similarity(self):
        return self._similarity
    
    @similarity.setter
    def similarity(self, value):
        self._similarity = value
        
    def estimate(self, user_id, product_id):
        """ Ratings weighted by custom similarity. """
        
        user_condition = df_train.reviewerID != user_id
        movie_condition = df_train.asin == product_id
        ratings_by_others = df_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 4.0
        
        ratings_by_others.set_index('reviewerID', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.overall
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: self.similarity(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'overall': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.overall, weights=ratings_sims.sim)
        

In [15]:
reco = CollaborativeSimilarityRecommendation(pearson)
print('RMSE for Pearson: %s' % evaluate(reco.estimate))

KeyError: 'A2IUE299OONA73'

## TODO: Troubleshooting

Neither the test or train datasets have this reviewerID so I wasn't sure where it is coming from.
However, it is in the original dataframe (see below).

I was able to get this class to work in the tutorial code after refactoring several items to Python 3.
But, I'm doing things a little differently than the tutorial for the user_info table because the tutorial used the user database to set the index.
I cannot do that because the review dataset only has the reviewerID and overall rating that is being used here.
All the other features of interest are in the product database; so I'm trying to use the combined

Possible areas where the problem may be:
- The process may be too different with all the refactored code.
- May need to reset the database's index. I've tried resetting both columns and rows unsuccessfully.
- There are duplicate reviewerIDs in certain cases. 
When I set the user_info to use this as the index it could be losing some of the data.
If this is the case, probably need to reference the reviewerID in a different way for the comparisons.
- Could have transposed or mixed up something in the class after I copied it from the tutorial.
If this is the case a second pair of eyes could help.

In [19]:
user_info[user_info.index == 'A2IUE299OONA73']

Unnamed: 0_level_0,title,also_buy,also_view,price,rank,asin,vote
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A2IUE299OONA73,Splenda--No Calorie Sweetener Value Pack--1000...,"['B00J0EKV3I', 'B00451ZJB0', 'B00F3SQBVQ', 'B0...","['B002FJM46C', 'B0014CWT6G', 'B01IKPTL5Q', 'B0...",24.66,"2,053 in Grocery & Gourmet Food (",B000F3N7AC,2.0
A2IUE299OONA73,"M&amp;M's Peanut Candy, 62 Ounce","['B072J938RF', 'B00I332L6M', 'B00CFVDIKG', 'B0...","['B00CFVDIKG', 'B078BQZ5BP', 'B01MCWEDU7', 'B0...",17.52,"32,540 in Grocery & Gourmet Food (",B000NMCEJK,
A2IUE299OONA73,"York Peppermint Patties, 175-Count Changemaker...","['B00374XTQI', 'B005CULDQI', 'B008DQI0UK', 'B0...","['B00374XTQI', 'B008DQI0UK', 'B016WNGJZ8', 'B0...",,"177,795 in Grocery & Gourmet Food (",B000NME632,
A2IUE299OONA73,"Austin, Cookies and Crackers, Variety Pack, 68...","['B01GQ5WNC0', 'B008GVJ9S4', 'B00HFC2E82', 'B0...",,10.19,849 in Grocery & Gourmet Food (,B000P6MSOU,
A2IUE299OONA73,"Grocery &amp; Gourmet Food"" />","['B01MT0QDPO', 'B00NL17FE4', 'B01NBM9OJN', 'B0...",,18.04,"16,921 in Grocery & Gourmet Food (",B001E6GFR6,
A2IUE299OONA73,"TWIZZLERS Pull 'n' Peel Licorice Candy, Cherry...","['B005CUM6CS', 'B0029JU0ZU', 'B007F1JP26', 'B0...",,26.49,"104,525 in Grocery & Gourmet Food (",B001EQ4AKU,
A2IUE299OONA73,MILKY WAY Milk Chocolate Fun Size Candy Bars 1...,"['B0029J6J14', 'B015NCA6VM', 'B01MRQKCX6', 'B0...",,42.25,"252,442 in Grocery & Gourmet Food (",B001M074O2,
A2IUE299OONA73,Lance Fresh Sandwich Crackers Variety Pack - 3...,"['B008GVJ9S4', 'B00BXGB4NW', 'B00HFC2E82', 'B0...","['B00BXGB4NW', 'B073MJDJD7', 'B00MMHVIUK', 'B0...",17.39,"4,941 in Grocery & Gourmet Food (",B001PAS5GK,
A2IUE299OONA73,"York Peppermint Patties, 175-Count Changemaker...","['B00374XTQI', 'B016WNGJZ8', 'B000NME632', 'B0...","['B00374XTQI', 'B008DQI0UK', 'B016WNGJZ8', 'B0...",,"85,327 in Grocery & Gourmet Food (",B005CULDQI,
A2IUE299OONA73,Bulk No Calorie Sweetener - 1000 Yellow Packet...,"['B00R56L7TA', 'B00DY4J6YU', 'B07J9RGGC8', 'B0...",,20.99,"21,824 in Grocery & Gourmet Food (",B00PL99TT8,


In [20]:
df_test[df_test.index == 'A2IUE299OONA73']

Unnamed: 0,category,title,also_buy,brand,rank,also_view,main_cat,price,asin,overall,verified,reviewerID,vote,style,for_testing


In [21]:
df_train[df_train.index == 'A2IUE299OONA73']

Unnamed: 0,category,title,also_buy,brand,rank,also_view,main_cat,price,asin,overall,verified,reviewerID,vote,style,for_testing


## Summary
- TODO

## References
1) Unata 2015 [Hands-on with PyData: How to Build a Minimal Recommendation Engine](https://www.youtube.com/watch?v=F6gWjOc1FUs).  