# Recommendations Using Custom Similarity Functions <sup>1</sup>

## Imports

In [26]:
import numpy as np
import pandas as pd
from scipy.special import logsumexp

## Load Data

Only loading a subset of the original data set for proof of concept reasons.

In [27]:
# 80/20 split earlier
df_train = pd.read_csv('../Data/training_data_subset.csv')
df_test = pd.read_csv('../Data/testing_data_subset.csv')

In [28]:
df_train.head(2)

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,vote,style,for_testing
0,"['Grocery & Gourmet Food', 'Sauces, Gravies & ...",['Sriracha chili sauce made from sun ripened c...,"Huy Fong Sriracha Chili Sauce, 28 Ounce Bottle...","['B001E5DZZM', 'B003NROMC4', 'B00U9VTL5U', 'B0...",Huy Fong,"145,292 in Grocery & Gourmet Food (","['B001E5DZZM', 'B008AV5HLS', 'B00U9VTL5U', 'B0...",Grocery,,B00BT7C9R0,"{'Shipping Weight:': '11.4 pounds', 'ASIN: ': ...",5.0,True,A3FYXMWYC9KUCK,I have been using Sriracha for several years n...,This stuff is great!,,,False
1,"['Grocery & Gourmet Food', 'Breakfast Foods', ...",['belVita Chocolate Breakfast Biscuits are lig...,"belVita Chocolate Breakfast Biscuits, 5 Count ...","['B00QF27JL0', 'B01BNIN5ZO', 'B01FLPFPOY', 'B0...",Belvita,"19,427 in Grocery & Gourmet Food (","['B01COWTO4O', 'B01FLPFPOY', 'B00QF27JL0', 'B0...",Grocery,,B00IO2DO2W,"{'Shipping Weight:': '4.1 pounds', 'Domestic S...",5.0,True,A2OWR2PL3DLWS4,My daughter is a Belvita addict. She likes al...,Delciious,,,False


In [29]:
df_test.head(2)

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,vote,style,for_testing
0,"['Grocery & Gourmet Food', 'Produce', 'Fresh V...","['<div class=""aplus""> <div class=""three-fourth...","Organic Green Cabbage, 1 Head",,produce aisle,,,Grocery,,B000P6H29Q,{'\n Product Dimensions: \n ': '7.5 x 6....,5.0,True,A1NKRXSU63EA4M,Hugh and delicious,Five Stars,,,True
1,"['Grocery & Gourmet Food', 'Cooking & Baking',...",['Light & Fluffy. Just add water. Made with re...,"Krusteaz Complete Pancake Mix, Buttermilk, 32 oz","['B000R32RJC', 'B07CX6LN8T', 'B000PXZZQG', 'B0...",Krusteaz,,"['B00DXGGSBI', 'B00CEMP2Z0', 'B00BP2RY42', 'B0...",Grocery,,B000QCLEB6,{'\n Product Dimensions: \n ': '6.1 x 2....,5.0,True,A3TR0FIT13SSVN,Great flavor and surprisingly fluffy out of th...,Surprisingly good :),6.0,,True


In [30]:
# Also, load the entire original dataset to make a user index.
# Note: If using the full train/test dataset several of the methods below can takes 2-6 hours to run.
df_original = pd.read_csv('../Data/eda_data.csv')

In [31]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086548 entries, 0 to 1086547
Data columns (total 18 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   category     1086548 non-null  object 
 1   description  993137 non-null   object 
 2   title        1086548 non-null  object 
 3   also_buy     929733 non-null   object 
 4   brand        1078563 non-null  object 
 5   rank         1042485 non-null  object 
 6   also_view    578911 non-null   object 
 7   main_cat     1085268 non-null  object 
 8   price        752804 non-null   float64
 9   asin         1086548 non-null  object 
 10  details      1086497 non-null  object 
 11  overall      1086548 non-null  float64
 12  verified     1086548 non-null  bool   
 13  reviewerID   1086548 non-null  object 
 14  reviewText   1086175 non-null  object 
 15  summary      1086335 non-null  object 
 16  vote         149486 non-null   float64
 17  style        561516 non-null   object 
dtypes:

### RMSE

In [32]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

### Evaluation method

In [33]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(df_test.reviewerID, df_test.asin)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = df_test.overall.values
    return compute_rmse(estimated, real)

### Create a user index
To retrieve information given a specific user_id in a more convenient way.

In [34]:
# The key features and ids from earlier analysis.
# Note that additional features could be included if desired; except for the target feature: 'overall'.
user_info = df_original[['title', 'also_buy', 'also_view', 'price', 'rank', 'asin', 'reviewerID', 'vote', 'reviewText', 'summary']]
user_info.set_index('reviewerID', inplace=True)
user_info.head(2)

Unnamed: 0_level_0,title,also_buy,also_view,price,rank,asin,vote,reviewText,summary
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A1J205ZK25TZ6W,Lipton Yellow Label Tea (loose tea) - 450g,"['B00886E4K0', 'B00CREXSHY', 'B001QTRGAQ', 'B0...","['B00CREXSHY', 'B001QTRGAQ', 'B000JSQK70', 'B0...",12.46,"30,937 in Grocery & Gourmet Food (",4639725043,8.0,I make the best brewed iced tea with this yell...,Best for brewed iced tea.
ACOICLIJQYECU,Lipton Yellow Label Tea (loose tea) - 450g,"['B00886E4K0', 'B00CREXSHY', 'B001QTRGAQ', 'B0...","['B00CREXSHY', 'B001QTRGAQ', 'B000JSQK70', 'B0...",12.46,"30,937 in Grocery & Gourmet Food (",4639725043,9.0,I have recently started drinking hot tea again...,Not Bad for iced Tea


In [35]:
user_info[user_info.index == 'A2HF7X0UUO13I5']

Unnamed: 0_level_0,title,also_buy,also_view,price,rank,asin,vote,reviewText,summary
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A2HF7X0UUO13I5,"Red Star Bread Machine Yeast, 4 oz","['155832156X', 'B00DVWWS7C', 'B00BC3WVNS', 'B0...","['B00DVWWS7C', 'B0001CXUHW', 'B00BC3WVNS', 'B0...",,"25,911 in Grocery & Gourmet Food (",B002CIP40E,,works great,Five Stars
A2HF7X0UUO13I5,"Hodgson Mill Best For Bread Flour, 32-Ounce (...","['B00C3XU688', 'B00BC3WVNS', 'B0000CEQ6H', 'B0...",,32.47,"45,196 in Grocery & Gourmet Food (",B004IMYB1K,,Good flour for the bread machine. No complain...,Good quality flour.
A2HF7X0UUO13I5,"White Lily Unbleached Bread Flour, 5 Pound","['B00BC3WVNS', 'B009UP5DQC', 'B00RP3EACY', 'B0...","['B006E5E8O4', 'B005QQ1MB4', 'B00KQ12BQ4', 'B0...",,"100,574 in Grocery & Gourmet Food (",B005QQ1L10,,the most delicious bread I have tasted so far.,Five Stars
A2HF7X0UUO13I5,"Stur - Classic Variety Pack, Natural Water Enh...","['B0092H2K6E', 'B01A2L9PHU', 'B003QZNABC', 'B0...",,19.95,290 in Grocery & Gourmet Food (,B00C1LXBFC,,"good flavor, right sweetener. Great for low c...",Four Stars
A2HF7X0UUO13I5,Jell-O Sugar Free Gelatin Sampler (Bundle of 8...,"['B00DJZPV84', 'B00JD8WU3U', 'B004B9MUT0', 'B0...",,14.99,"8,971 in Grocery & Gourmet Food (",B00DJZUPHQ,,I'd like it better if they use Stevia instead ...,Three Stars
A2HF7X0UUO13I5,Stur Fruit Punch Water Flavor 1.62 ounce (Single),"['B01MATERKP', 'B01MAT7ZC4', 'B01M311CXS', 'B0...",,7.35,"31,570 in Grocery & Gourmet Food (",B00JS8XK52,,so glad they make these. Great for the Atkins...,Five Stars


In [36]:
df_test[df_test.index == 'A2HF7X0UUO13I5']

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,vote,style,for_testing


In [37]:
df_train[df_train.index == 'A2HF7X0UUO13I5']

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,vote,style,for_testing


### Euclidean 'similarity'

$$ sim(x,y) = \frac{1}{1 + \sqrt{\sum (x - y)^2}}$$

In [38]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

### Cosine similarity

$$ sim(x,y) = \frac{(x . y)}{\sqrt{(x . x) (y . y)}} $$

In [39]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

### Pearson correlation

$$ sim(x,y) = \frac{(x - \bar x).(y - \bar y)}{\sqrt{(x - \bar x).(x - \bar x) * (y - \bar y)(y - \bar y)}} $$

In [40]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(logsumexp(s1_c ** 2)) * np.sum(logsumexp(s2_c ** 2)))

### Jaccard similarity

$$ sim(x,y) = \frac{(x . y)}{(x . x) + (y . y) - (x . y)} $$

In [41]:
def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

def binjaccard(s1, s2):
    dotp = s1.index.intersection(s2.index).size
    return dotp / (s1.sum() + s2.sum() - dotp)

In [42]:
class CollaborativeSimilarityRecommendation:
    """ Collaborative filtering using a custom sim(u,u'). """

    def __init__(self, similarity=None):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = df_train.pivot_table('overall', index='asin', columns='reviewerID')
        self._similarity = similarity
        
    @property
    def similarity(self):
        return self._similarity
    
    @similarity.setter
    def similarity(self, value):
        self._similarity = value
        
    def estimate(self, user_id, product_id):
        """ Ratings weighted by custom similarity. """
        
        user_condition = df_train.reviewerID != user_id
        movie_condition = df_train.asin == product_id
        ratings_by_others = df_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 4.0
        
        ratings_by_others.set_index('reviewerID', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.overall
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: self.similarity(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'overall': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.overall, weights=ratings_sims.sim)
        

In [43]:
reco = CollaborativeSimilarityRecommendation(pearson)
print('RMSE for Pearson: %s' % evaluate(reco.estimate))

KeyError: 'A2HF7X0UUO13I5'

## Summary
- TODO

## References
1) Unata 2015 [Hands-on with PyData: How to Build a Minimal Recommendation Engine](https://www.youtube.com/watch?v=F6gWjOc1FUs).  