# Parameters

In [1]:
subset_name = 'CDs_and_Vinyl'
min_amount_product_mentions = 20
min_amount_user_mentions = 20 

# Imports

In [2]:
import gzip
import json
import numpy as np
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Data

In [3]:
def get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions, test_size=0.1, random_state=42):    
    data = []
    with gzip.open(os.path.join('data', 'raw', f'meta_{subset_name}.json.gz')) as f:
        for l in f:
            data.append(json.loads(l.strip()))
    meta_df = pd.DataFrame.from_dict(data)
    meta_df = meta_df[['asin', 'title']]
    
    data = []
    with gzip.open(os.path.join('data', 'raw', f'{subset_name}.json.gz')) as f:
        for l in f:
            data.append(json.loads(l.strip()))
    rating_df = pd.DataFrame.from_dict(data)
    rating_df = rating_df[['asin', 'reviewerID', 'overall', 'reviewTime']]
    
    rating_df = rating_df.drop_duplicates()    
    rating_df = rating_df[rating_df['asin'].map(rating_df['asin'].value_counts()) >= min_amount_product_mentions]
    rating_df = rating_df[rating_df['reviewerID'].map(rating_df['reviewerID'].value_counts()) >= min_amount_user_mentions]
    
    X = rating_df.copy()
    y = rating_df['overall']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)
        
    return X_train, X_test, y_train, y_test

In [4]:
X_train, X_test, y_train, y_test = get_data(subset_name, min_amount_product_mentions, min_amount_user_mentions)

In [5]:
print(X_train.shape)
print(X_test.shape)

(407553, 4)
(45284, 4)


In [6]:
X_train.head()

Unnamed: 0,asin,reviewerID,overall,reviewTime
2140084,B0067FGYGQ,A1N5FSCYN4796F,4.0,"03 22, 2012"
2812507,B000000YGA,AI43VKPN5NF7D,5.0,"10 12, 2014"
1957854,B0030BYWKU,A200C7YQJ45LRR,3.0,"04 3, 2010"
1706107,B0014XCMV2,AI83XP5L7OMAU,5.0,"07 13, 2008"
2638225,B00R55U1UW,AC6UTDXWZTIEH,4.0,"02 17, 2015"


# Base Model

In [11]:
class BaseModel:
    
    def __init__(self, X_train, X_test, y_train, y_test):
        self._X_train = X_train
        self._y_train = y_train
        self._X_test = X_test
        self._y_test = y_test 
        
    def predict(self, user_id, product_id):
        return 2.5
    
    def rmse_on_test_set(self, predicted_ratings):
        return np.sqrt(mean_squared_error(self._y_test, predicted_ratings))
    
    def mae_on_test_set(self, predicted_ratings):
        return mean_absolute_error(self._y_test, predicted_ratings)
    
    def score_on_test_set(self):
        user_product_pairs =  zip(self._X_test['reviewerID'], self._X_test['asin'])
        predicted_ratings = np.array([self.predict(user, product) for (user, product) in user_product_pairs])
        
        rmse = self.rmse_on_test_set(predicted_ratings)
        mae = self.mae_on_test_set(predicted_ratings)
        
        print(f'RMSE: {rmse}')
        print(f'MAE: {mae}')
        
        return rmse, mae

In [12]:
base_model = BaseModel(X_train, X_test, y_train, y_test)
base_model.score_on_test_set()

RMSE: 2.1118730903697203
MAE: 1.9897756381945058


(2.1118730903697203, 1.9897756381945058)

# Weighted Average

In [16]:
class WeightedAverageModel(BaseModel):
    
    def __init__(self, X_train, X_test, y_train, y_test, similarity='cosine'):
        BaseModel.__init__(self, X_train, X_test, y_train, y_test)
        
        self._user_product_matrix = pd.crosstab(X_train.reviewerID, X_train.asin, X_train.overall, aggfunc='max')
        
        if similarity == 'cosine': 
            cos_similarity = cosine_similarity(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(cos_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        elif similarity == 'pearson':
            pea_similarity = np.corrcoef(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(pea_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        else:
            raise Exception
    
    def predict(self, user_id, product_id):
        if product_id in self._user_product_matrix:
            sim_scores = self._similarity[user_id] 
            ratings_scores = self._user_product_matrix[product_id] 

            index_not_rated = ratings_scores[ratings_scores.isnull()].index
            ratings_scores = ratings_scores.dropna()
            sim_scores = sim_scores.drop(index_not_rated)

            if sim_scores.sum() != 0:
                return np.dot(ratings_scores, sim_scores)/sim_scores.sum()
        
        return 2.5

In [14]:
weighted_average_model = WeightedAverageModel(X_train, X_test, y_train, y_test)
weighted_average_model.score_on_test_set()

RMSE: 1.0881908494922603
MAE: 0.7775115402863946


(1.0881908494922603, 0.7775115402863946)

In [17]:
weighted_average_model = WeightedAverageModel(X_train, X_test, y_train, y_test, 'pearson')
weighted_average_model.score_on_test_set()

RMSE: 21.447975090551065
MAE: 0.931401432802154


(21.447975090551065, 0.931401432802154)

# KNN

In [18]:
class KnnModel(BaseModel):
    
    def __init__(self, X_train, X_test, y_train, y_test, similarity='cosine', k=10):
        BaseModel.__init__(self, X_train, X_test, y_train, y_test)
        
        self._k = k        
        
        self._user_product_matrix = pd.crosstab(X_train.reviewerID, X_train.asin, X_train.overall, aggfunc='max')
        
        if similarity == 'cosine': 
            cos_similarity = cosine_similarity(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(cos_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        elif similarity == 'pearson':
            pea_similarity = np.corrcoef(self._user_product_matrix.copy().fillna(0))
            self._similarity = pd.DataFrame(pea_similarity, index=self._user_product_matrix.index)
            self._similarity.columns = self._user_product_matrix.index
        else:
            raise Exception
    
    def _knn_filtered(self, user_id, product_id, k):
        return self._similarity[user_id][~np.isnan(self._user_product_matrix[product_id])].sort_values(ascending=False).head(k)
    
    def predict(self, user_id, product_id):
        if product_id in self._user_product_matrix:
            neighbours = self._knn_filtered(user_id, product_id, self._k)

            if not len(neighbours):
                return 2.5
            
            ratings = self._user_product_matrix[product_id][neighbours.index.values].to_numpy().astype(float)
            weights = neighbours.values.astype(float)
            
            if weights.sum() != 0:
                return np.dot(ratings, weights)/weights.sum()
        return 2.5

In [19]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, k=10)
knn_model.score_on_test_set()

RMSE: 1.09160312713944
MAE: 0.7741288942303893


(1.09160312713944, 0.7741288942303893)

In [20]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, k=7)
knn_model.score_on_test_set()

RMSE: 1.096815033741887
MAE: 0.7744033588942375


(1.096815033741887, 0.7744033588942375)

In [21]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, k=5)
knn_model.score_on_test_set()

RMSE: 1.106599279533919
MAE: 0.7769100476571354


(1.106599279533919, 0.7769100476571354)

In [22]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, k=3)
knn_model.score_on_test_set()

RMSE: 1.134017119333312
MAE: 0.7849504376685776


(1.134017119333312, 0.7849504376685776)

In [23]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, k=1)
knn_model.score_on_test_set()

RMSE: 1.2996117084359562
MAE: 0.8355379383446692


(1.2996117084359562, 0.8355379383446692)

In [25]:
knn_model = KnnModel(X_train, X_test, y_train, y_test, 'pearson', k=10)
knn_model.score_on_test_set()

RMSE: 21.67586206074521
MAE: 0.903924472946585


(21.67586206074521, 0.903924472946585)