# Parameters

In [1]:
subset_name = 'CDs_and_Vinyl'
min_amount_product_mentions = 20
min_amount_user_mentions = 20 

# Imports

In [2]:
import gzip
import json
import numpy as np
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Data

In [3]:
def get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions, test_size=0.1, random_state=42):    
    data = []
    with gzip.open(os.path.join('data', f'meta_{subset_name}.json.gz')) as f:
        for l in f:
            data.append(json.loads(l.strip()))
    meta_df = pd.DataFrame.from_dict(data)
    meta_df = meta_df[['asin', 'title']]
    meta_df = meta_df.drop_duplicates()  
    
    data = []
    with gzip.open(os.path.join('data', f'{subset_name}.json.gz')) as f:
        for l in f:
            data.append(json.loads(l.strip()))
    rating_df = pd.DataFrame.from_dict(data)
    rating_df = rating_df[['asin', 'reviewerID', 'overall', 'reviewTime']]
    
    rating_df = rating_df.drop_duplicates()    
    rating_df = rating_df[rating_df['asin'].map(rating_df['asin'].value_counts()) >= min_amount_product_mentions]
    rating_df = rating_df[rating_df['reviewerID'].map(rating_df['reviewerID'].value_counts()) >= min_amount_user_mentions]
    
    X = rating_df.copy()
    y = rating_df['overall']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)
        
    return X_train, X_test, y_train, y_test, meta_df

In [4]:
X_train, X_test, y_train, y_test, meta_df = get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions)

In [5]:
print(X_train.shape)
print(X_test.shape)

(407553, 4)
(45284, 4)


In [6]:
X_train.head()

Unnamed: 0,asin,reviewerID,overall,reviewTime
2140084,B0067FGYGQ,A1N5FSCYN4796F,4.0,"03 22, 2012"
2812507,B000000YGA,AI43VKPN5NF7D,5.0,"10 12, 2014"
1957854,B0030BYWKU,A200C7YQJ45LRR,3.0,"04 3, 2010"
1706107,B0014XCMV2,AI83XP5L7OMAU,5.0,"07 13, 2008"
2638225,B00R55U1UW,AC6UTDXWZTIEH,4.0,"02 17, 2015"


In [7]:
meta_df.head()

Unnamed: 0,asin,title
0,1393774,Songs for the Shepherd
1,1501348,Lift Him Up With Ron Kenoly VHS
2,1515209,I Love You
3,5072298,Hymns: 16 Classic Hymns for Children
4,5121515,Christmas Carols


# Lazy Getter 

In [8]:
user_product_matrix = None
def get_user_product_matrix(df):
    global user_product_matrix
    if not isinstance(user_product_matrix, pd.DataFrame):
         user_product_matrix = pd.crosstab(X_train.reviewerID, X_train.asin, X_train.overall, aggfunc='max')
    return user_product_matrix

user_cos_similarity_matrix = None
def get_user_cos_similarity_matrix(user_product_matrix):
    global user_cos_similarity_matrix
    if not isinstance(user_cos_similarity_matrix, pd.DataFrame):
        cos_similarity = cosine_similarity(user_product_matrix.copy().fillna(0))
        user_cos_similarity_matrix = pd.DataFrame(cos_similarity, index=user_product_matrix.index)
        user_cos_similarity_matrix.columns = user_product_matrix.index
    return user_cos_similarity_matrix

user_pea_similarity_matrix = None
def get_user_pea_similarity_matrix(user_product_matrix):
    global user_pea_similarity_matrix
    if not isinstance(user_pea_similarity_matrix, pd.DataFrame):
        pea_similarity = np.corrcoef(user_product_matrix.copy().fillna(0))
        user_pea_similarity_matrix = pd.DataFrame(pea_similarity, index=user_product_matrix.index)
        user_pea_similarity_matrix.columns = user_product_matrix.index
    return user_cos_similarity_matrix

# Base Model

In [9]:
class BaseModel:
    
    def __init__(self, X_train, X_test, y_train, y_test, meta_df):
        self._X_train = X_train
        self._y_train = y_train
        self._X_test = X_test
        self._y_test = y_test
        
        self._user_product_matrix = get_user_product_matrix(X_train)
                
        self._meta_df = meta_df
        self._all_product_ids = pd.concat([X_train, X_test])['asin'].unique()
        
    def predict(self, user_id, product_id):
        return 2.5
    
    def rmse_on_test_set(self, predicted_ratings):
        return np.sqrt(mean_squared_error(self._y_test, predicted_ratings))
    
    def mae_on_test_set(self, predicted_ratings):
        return mean_absolute_error(self._y_test, predicted_ratings)
    
    def score_on_test_set(self):
        user_product_pairs =  zip(self._X_test['reviewerID'], self._X_test['asin'])
        predicted_ratings = np.array([self.predict(user, product) for (user, product) in user_product_pairs])
        
        rmse = self.rmse_on_test_set(predicted_ratings)
        mae = self.mae_on_test_set(predicted_ratings)
        
        print(f'RMSE: {rmse}')
        print(f'MAE: {mae}')
        
        return rmse, mae
    
    def get_recommendations(self, user_id, k=5):
        predicted_ratings = [(product, self.predict(user_id, product)) for product in self._all_product_ids]
        sort_by_ratings = sorted(predicted_ratings, key=lambda predicted_ratings: (predicted_ratings[1], predicted_ratings[0]), reverse=True)
        return [pair[0] for pair in sort_by_ratings[:k]]
    
    def display_recommendations(self, user_id, k=5):
        self._display_user_info(user_id)
        
        rec_ids = self.get_recommendations(user_id, k)
        
        for (i, rec_id) in enumerate(rec_ids):
            print(f'Recommondation {i+1}:')
            self._display_product(rec_id)
            print('')
            
    def _display_user_info(self, user_id, k=5):
        print(f'User {user_id} has previously enjoyed:')
        best_rated_products = self._user_product_matrix.loc[user_id][~self._user_product_matrix.loc[user_id].isnull()].sort_values(ascending=False).head(5).index.values
        for product_id in best_rated_products:
            self._display_product(product_id)
        print('')
        
    def _display_product(self, product_id):
        reindexed_meta_df = self._meta_df.set_index('asin')
        if(product_id in reindexed_meta_df.index.values):
            product = reindexed_meta_df.loc[product_id]
            print(f'Title: : {product["title"]}')
        else:
            print(f'No metadata for product with id {product_id} found')
        

In [10]:
base_model = BaseModel(X_train, X_test, y_train, y_test, meta_df)
base_model.score_on_test_set()

RMSE: 2.1118730903697203
MAE: 1.9897756381945058


(2.1118730903697203, 1.9897756381945058)

# Weighted Average

In [12]:
class WeightedAverageModel(BaseModel):
    
    def __init__(self, X_train, X_test, y_train, y_test, meta_df, similarity='cosine'):
        BaseModel.__init__(self, X_train, X_test, y_train, y_test, meta_df)
        
        if similarity == 'cosine': 
            self._similarity = get_user_cos_similarity_matrix(self._user_product_matrix)
        elif similarity == 'pearson':
            self._similarity = get_user_pea_similarity_matrix(self._user_product_matrix)
        else:
            raise Exception
    
    def predict(self, user_id, product_id):
        if product_id in self._user_product_matrix:
            sim_scores = self._similarity[user_id] 
            ratings_scores = self._user_product_matrix[product_id] 

            index_not_rated = ratings_scores[ratings_scores.isnull()].index
            ratings_scores = ratings_scores.dropna()
            sim_scores = sim_scores.drop(index_not_rated)

            if sim_scores.sum() != 0:
                return np.dot(ratings_scores, sim_scores)/sim_scores.sum()
        
        return 2.5

In [13]:
weighted_average_model = WeightedAverageModel(X_train, X_test, y_train, y_test, meta_df)
weighted_average_model.score_on_test_set()

RMSE: 1.0881908494922603
MAE: 0.7775115402863946


(1.0881908494922603, 0.7775115402863946)

# KNN

In [14]:
class KnnModel(BaseModel):
    
    def __init__(self, X_train, X_test, y_train, y_test, meta_df, similarity='cosine', k=10):
        BaseModel.__init__(self, X_train, X_test, y_train, y_test, meta_df)
        
        self._k = k        
        
        if similarity == 'cosine': 
            cos_similarity = cosine_similarity(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(cos_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        elif similarity == 'pearson':
            pea_similarity = np.corrcoef(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(pea_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        else:
            raise Exception
    
    def _knn_filtered(self, user_id, product_id, k):
        return self._similarity[user_id][~np.isnan(self._user_product_matrix[product_id])].sort_values(ascending=False).head(k)
    
    def predict(self, user_id, product_id):
        if product_id in self._user_product_matrix:
            neighbours = self._knn_filtered(user_id, product_id, self._k)

            if not len(neighbours):
                return 2.5
            
            ratings = self._user_product_matrix[product_id][neighbours.index.values].to_numpy().astype(float)
            weights = neighbours.values.astype(float)
            
            if weights.sum() != 0:
                return np.dot(ratings, weights)/weights.sum()
        return 2.5

In [15]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, meta_df, k=10)
knn_model.score_on_test_set()

RMSE: 1.09160312713944
MAE: 0.7741288942303893


(1.09160312713944, 0.7741288942303893)

In [16]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, meta_df, k=7)
knn_model.score_on_test_set()

RMSE: 1.096815033741887
MAE: 0.7744033588942375


(1.096815033741887, 0.7744033588942375)

In [17]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, meta_df, k=5)
knn_model.score_on_test_set()

RMSE: 1.106599279533919
MAE: 0.7769100476571354


(1.106599279533919, 0.7769100476571354)

In [18]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, meta_df, k=3)
knn_model.score_on_test_set()

RMSE: 1.134017119333312
MAE: 0.7849504376685776


(1.134017119333312, 0.7849504376685776)

In [19]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, meta_df, k=1)
knn_model.score_on_test_set()

RMSE: 1.2996117084359562
MAE: 0.8355379383446692


(1.2996117084359562, 0.8355379383446692)

In [20]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, meta_df, 'pearson', k=10)
knn_model.score_on_test_set()

RMSE: 21.67586206074521
MAE: 0.903924472946585


(21.67586206074521, 0.903924472946585)

# Display Recommendations

In [23]:
weighted_average_model.display_recommendations('A1N5FSCYN4796F')

User A1N5FSCYN4796F has previously enjoyed:
Title: : Different Shades of Blue
Title: : Croweology
No metadata for product with id B008V0OLOW found
Title: : Lifeline
Title: : Sloe Gin

Recommondation 1:
Title: : The Complete Concert by the Sea

Recommondation 2:
Title: : Deathless

Recommondation 3:
No metadata for product with id B00DGL3IKY found

Recommondation 4:
Title: : asin
B0000DFZZV    Silver Jubilee
B0000DFZZV    Silver Jubilee
Name: title, dtype: object

Recommondation 5:
Title: : Phil Spector: Back to Mono

