# Modeling Recommendation Engine v2 <sup>[1]</sup>

## Imports

In [None]:
import logging
import numpy as np
import pandas as pd
from scipy.special import logsumexp

## Load Data

In [None]:
# Load the training and testing sets; the 80/20 split was done earlier.
df_train = pd.read_csv('../Data/training_data.csv')
df_test = pd.read_csv('../Data/testing_data.csv')

In [None]:
# Preview the first two rows of the training set.
df_train.head(2)

In [None]:
# Preview the first two rows of the testing set.
df_test.head(2)

In [None]:
# Also load the full dataset so that users can be indexed for easier comparisons later.
# To avoid data leakage, keep it separate from df_train / df_test and do not mix
# analysis across the different DataFrames.
df = pd.read_csv('../Data/eda_data.csv')

In [None]:
# subset of data didn't make a difference
# df_train_subset = df_train.iloc[np.random.choice(df_train.index, size=10000, replace=False)]
# print(df_train_subset.shape)
# print(df_train.reviewerID.nunique())
# print(df_train.asin.nunique())

### Create a user index
Index the data by `reviewerID` so that the information for a specific user can be retrieved more conveniently.

In [None]:
# The key features and ids from earlier analysis.
# Note that additional features could be included if desired; except for the target feature: 'overall'.
# set_index is chained (not inplace) so the index is set on a fresh frame rather than
# on a column subset of df, which avoids pandas' SettingWithCopyWarning.
user_info = df[
    ['title', 'also_buy', 'rank', 'asin', 'reviewerID', 'reviewText', 'summary']
].set_index('reviewerID')
user_info.head(2)

In [None]:
# Boolean Series indexed by reviewerID: True where the reviewer has more than one review.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
user_ids_larger_1 = df.reviewerID.value_counts(sort=False) > 1
# Keep only the reviewer ids (the index labels) for which the mask is True.
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index
len(user_ids_larger_1)

In [None]:
# Keep only the rows written by reviewers with more than one review.
# df still has the default integer index from read_csv, so the reviewer ids must be
# matched against the 'reviewerID' column; label-based .loc on the index would raise KeyError.
df = df[df.reviewerID.isin(user_ids_larger_1)]
print(df.shape)

## Evaluation functions

Used to test your `estimate` method.

In [None]:
def compute_rmse(y_pred, y_true):
    """Return the Root Mean Squared Error between predictions and true values.

    Both arguments must support element-wise subtraction (e.g. numpy arrays).
    """
    residuals = y_pred - y_true
    mean_squared_error = np.mean(np.power(residuals, 2))
    return np.sqrt(mean_squared_error)

In [None]:
def evaluate(estimate_f):
    """Score an estimator on the test set and return its RMSE.

    `estimate_f` is called as estimate_f(reviewerID, asin) for every row of
    df_test; the predictions are compared against df_test.overall.
    """
    predictions = np.array([
        estimate_f(reviewer_id, product_id)
        for reviewer_id, product_id in zip(df_test.reviewerID, df_test.asin)
    ])
    actual_ratings = df_test.overall.values
    return compute_rmse(predictions, actual_ratings)

## Several Custom Similarity Functions

### Euclidean 'similarity'

$$ sim(x,y) = \frac{1}{1 + \sqrt{\sum (x - y)^2}}$$

In [None]:
def euclidean(s1, s2):
    """Return the Euclidean 'similarity' 1 / (1 + distance) of two pd.Series."""
    squared_gaps = (s1 - s2) ** 2
    distance = np.sqrt(np.sum(squared_gaps))
    return 1 / (1 + distance)

### Cosine similarity

$$ sim(x,y) = \frac{(x . y)}{\sqrt{(x . x) (y . y)}} $$

In [None]:
def cosine(s1, s2):
    """Return the cosine similarity of two pd.Series: (s1 . s2) / sqrt((s1 . s1) (s2 . s2))."""
    dot_product = np.sum(s1 * s2)
    squared_norms = np.sum(s1 ** 2) * np.sum(s2 ** 2)
    return dot_product / np.sqrt(squared_norms)

### Pearson correlation

$$ sim(x,y) = \frac{(x - \bar x).(y - \bar y)}{\sqrt{(x - \bar x).(x - \bar x) * (y - \bar y).(y - \bar y)}} $$

In [None]:
def pearson(s1, s2):
    """Take two pd.Series objects and return their Pearson correlation.

    Each series is centered on its own mean; the result is the dot product of
    the centered series divided by the square root of the product of their
    sums of squares. If either centered series sums to zero squares (e.g. a
    constant series), the division yields NaN.
    """
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    # The denominator is a plain sum of squared deviations; wrapping it in
    # logsumexp (as before) does not produce a correlation coefficient.
    denominator = np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))
    return np.sum(s1_c * s2_c) / denominator

### Jaccard similarity

$$ sim(x,y) = \frac{(x . y)}{(x . x) + (y . y) - (x . y)} $$

In [None]:
def jaccard(s1, s2):
    """Return the (generalized) Jaccard similarity of two pd.Series.

    Computed as (s1 . s2) / ((s1 . s1) + (s2 . s2) - (s1 . s2)).
    """
    overlap = np.sum(s1 * s2)
    union = np.sum(s1 ** 2) + np.sum(s2 ** 2) - overlap
    return overlap / union

def binjaccard(s1, s2):
    """Return a Jaccard-style ratio based on the index labels shared by two pd.Series.

    The numerator is the number of labels present in both indexes; the
    denominator is s1.sum() + s2.sum() minus that count.
    NOTE(review): presumably meant for series whose values are all 1 (one entry
    per item present) - confirm against callers.
    """
    shared_labels = s1.index.intersection(s2.index)
    n_shared = shared_labels.size
    return n_shared / (s1.sum() + s2.sum() - n_shared)

In [None]:
class CollaborativeSimilarityRecommendation:
    """Collaborative filtering using a custom user-user similarity sim(u, u').

    A rating for (user, product) is estimated from the ratings that other
    users in df_train gave to the same product, weighted by how similar each
    of those users' rating profiles is to the target user's profile.
    """

    def __init__(self, similarity=None, default_rating=4.0):
        """Prepare the data structures used for estimation.

        Args:
            similarity: callable taking two profile pd.Series and returning a
                number; larger means more similar. It can also be assigned
                later through the `similarity` property.
            default_rating: value returned by `estimate` when no other user in
                df_train has rated the product.
        """
        try:
            # One column per reviewer, one row per product, values are ratings.
            self.all_user_profiles = df_train.pivot_table(
                values='overall', index='asin', columns='reviewerID')
        except IndexError:
            logging.exception('Could not build the user profile table from df_train')
            # Keep the attribute defined so that estimate() degrades to the
            # plain mean of other users' ratings instead of failing later
            # with an AttributeError.
            self.all_user_profiles = pd.DataFrame()

        self._similarity = similarity
        self.default_rating = default_rating

    @property
    def similarity(self):
        """The similarity callable applied to pairs of user profiles."""
        return self._similarity

    @similarity.setter
    def similarity(self, value):
        self._similarity = value

    def estimate(self, user_id, product_id):
        """Estimate a rating as the similarity-weighted mean of others' ratings.

        Args:
            user_id: reviewerID of the user the rating is estimated for.
            product_id: asin of the product to rate.

        Returns:
            - `default_rating` if no other user in df_train rated the product;
            - the unweighted mean of the other users' ratings if the user has
              no profile in the training data, or if no other user has a
              strictly positive similarity to the user;
            - otherwise the average of the other users' ratings weighted by
              their similarity to the user.
        """
        is_other_user = df_train.reviewerID != user_id
        is_same_product = df_train.asin == product_id
        ratings_by_others = df_train.loc[is_other_user & is_same_product]
        if ratings_by_others.empty:
            return self.default_rating

        # Non-inplace set_index: the filtered frame is a slice of df_train and
        # mutating it in place triggers pandas' SettingWithCopyWarning.
        ratings_by_others = ratings_by_others.set_index('reviewerID')
        their_ratings = ratings_by_others.overall

        # Cold start: a user absent from the training profiles cannot be
        # compared with anyone, so fall back to the plain mean.
        if user_id not in self.all_user_profiles.columns:
            return their_ratings.mean()

        their_profiles = self.all_user_profiles[their_ratings.index]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(
            lambda profile: self.similarity(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'overall': their_ratings})
        # Only positively similar users contribute; NaN similarities are
        # dropped by this comparison as well.
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        return np.average(ratings_sims.overall, weights=ratings_sims.sim)
        

In [None]:
# Build a recommender that uses the Pearson similarity and print its RMSE
# as computed by evaluate().
reco = CollaborativeSimilarityRecommendation(pearson)
print('RMSE for Pearson: %s' % evaluate(reco.estimate))

## References
1) Unata 2015 [Hands-on with PyData: How to Build a Minimal Recommendation Engine](https://www.youtube.com/watch?v=F6gWjOc1FUs).  