In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
from feather import read_dataframe
%matplotlib inline
from functools import lru_cache
import time
from tqdm import tqdm_notebook
import logging

In [2]:
complete = read_dataframe("user_book_ratings.feather",).pivot_table("Book-Rating",
                                                                    index="User-ID",
                                                                    columns="Book-Title")

In [3]:
complete.head(10)

Book-Title,Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback)),"Q-Space (Star Trek The Next Generation, Book 47)","Q-Zone (Star Trek The Next Generation, Book 48)",""" Lamb to the Slaughter and Other Stories (Penguin 60s S.)","""A"" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)","""O"" Is for Outlaw","""Surely You're Joking, Mr. Feynman!"": Adventures of a Curious Character","""The Happy Prince"" and Other Stories (Penguin Popular Classics)",'Salem's Lot,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,...,Zodiac: The Eco-Thriller,"Zoey Phillips (Girlfriends) (Harlequin Superromance, No. 1020)",Zombies of the Gene Pool,Zoya,Zoya's Story: An Afghan Woman's Struggle for Freedom,ZwÃ?Â¶lf.,e,iI Paradiso Degli Orchi,one hundred years of solitude,stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_100004,,,,,,,,,,,...,,,,,,,,,,
User_100009,,,,,,,,,,,...,,,,,,,,,,
User_10001,,,,,,,,,,,...,,,,,,,,,,
User_100010,,,,,,,,,,,...,,,,,,,,,,
User_100038,,,,,,,,,,,...,,,,,,,,,,
User_100053,,,,,,,,,,,...,,,,,,,,,,
User_100066,,,,,,,,,,,...,,,,,,,,,,
User_100088,,,,,,,,,,,...,,,,,,,,,,
User_100098,,,,,,,,,,,...,,,,,,,,,,
User_100115,,,,,,,,,,,...,,,,,,,,,,


In [4]:
complete.memory_usage().sum() / 1024 ** 3 # GB

1.2238024920225143

In [5]:
############# Utilities #############


def compute_evaluation_metric(y_true, y_pred, metric="MSE"):
    """
        Given two vectors computes the selected
        metric over non-missing values.
    """
    
    # This works only on the common elements
    # missing values are ignored
    diff = np.subtract(y_true, y_pred)
    
    # Count NaNs in the difference, check for NaNs and 
    # invert that, finally count non-zeros 
    # https://stackoverflow.com/a/21778195/6655150
    N = np.count_nonzero(~np.isnan(diff))
    
    if metric == "MSE":
        distance = (1 / N) * np.nansum( np.power(diff, 2) )
    elif metric == "MAE":
        distance = (1 / N) * np.nansum( np.abs(diff) )
    elif metric == "RMSE":
        distance = np.sqrt((1 / N) * np.nansum( np.power(diff, 2) ))
    elif metric == "R2":
        y_mean = np.nanmean(y_true)
        diffs1 = np.nansum(np.square(y_true - y_mean))
        diffs2 = np.nansum(np.square(y_pred - y_true))
        distance = 1 - diffs2 / diffs1
        
    return distance


def cos_sim(user, others):
    """
        This is nothing more than the implementation of cosine similarity
        in a way to handle (ignore) missing values. It is about 2-3x faster
        than replacing NaNs with zeros and then using scipy's or sklearn's
        functions. It assumes the input vectors are `atleast_2D`.
    """
    
    # add some numbers for numeric stability
    sum_sq = np.nansum(np.multiply(user, others), axis=1)

    magnitudes = np.multiply(np.sqrt(np.nansum(np.square(user))),
                             np.sqrt(np.nansum(np.square(others), axis=1))) 
    
    # calculate cosing similarity and add 
    # a small amount to avoid division by 0
    return np.divide(sum_sq+1e-7, magnitudes+1e-3).reshape((user.shape[0], -1))

def pearson_sim(user, others):
    """
        Similarly to the calculation of cosine similarity, this custom approach
        is faster (~1.5x) than filling the whole matrix (this time with the row
        mean) and then using `scipy.stats.stats.pearsonr` with a list comprehension
        iterating over all the rows.
    """
    
    user_mean = np.nanmean(user)
    others_means = np.nanmean(others, axis=1, keepdims=True)

    diff1 = np.subtract( user, user_mean ) 
    diff2 = np.subtract( others, others_means )

    var1 = np.sqrt( np.nansum( np.power(diff1, 2) ) )
    var2 = np.sqrt( np.nansum( np.power(diff2, 2), axis=1, 
                               keepdims=True) )

    num = np.nansum( np.multiply( diff1, diff2 ), axis=1,
                     keepdims=True )
    denom = np.multiply( var1, var2 )
    
    return np.divide(num, (denom + 1e-3)).reshape((user.shape[0], -1))


def similirity_evaluation(user, others, method="pearson"):
    """
        Given two vectors, or a vector and list of vectors, computes
        the selected similarity metric(s). Make sure both vectors
        are 2D.
    """

    if method == "pearson":
        return pearson_sim(user, others)
    
    elif method == "cosine":
        return cos_sim(user, others)

# mycls . lazypropINST . __get__ myclsFUNC

class lazyproperty:
    
    def __init__(self, func):
        self.func = func
        
    def __get__(self, instance, cls):
#         if instance is None:
#             print("First run")
#             return self
#         else:
        value = self.func(instance)
        setattr(instance, self.func.__name__, value)
        return value
    
############# Functions used for actual prediction #############

class BaseRecommender:
    """
        This showcases the general pattern: subclasses need only
        implement _get_item_sims and _get_user_sims which calculate
        item and user similarities for a given input string
    """
    
    def __init__(self, df, *args, **kwargs):
        self.df = df
        
        # so we know where the missing values are
        self.nan_mask = df.isna() 
        
    def _get_user(self, i):
        """Get row i; should be the same dtype as RowIndex"""
        return np.atleast_2d(self.df.loc[i].values)

    def _get_item(self, j):
        """Get row j from the transponsed matrix, aka item j"""
        return np.atleast_2d(self.df.loc[:,j].values)
    
    # TODO: Better function signatures and more details on
    # how to overwrite them        
    def _get_item_sims(self, *args, **kwargs):
        raise NotImplementedError
    
    def _get_user_sims(self, *args, **kwargs):
        raise NotImplementedError
    
    def find_similar_items(self, item_name=None, metric="cosine"):
        """
            item_name: str or list of str, columns of dataframe
            metric: str, one of pearson / cosine
        
            Given an item and a dataframe (optionally a similarity metric)
            this function finds the most similar items.
        """
        
        if item_name is not None:
            sims = self._get_item_sims(item_name, metric)
        else:
            raise Exception("Invalid user or vector input.")
            
        results = pd.DataFrame([self.df.columns, sims.ravel()],
                               index=["Items", "Similarity scores"]).T
        
        return results.sort_values("Similarity scores", ascending=False).reset_index(drop=True)
    
    
    def find_similar_users(self, username=None, metric="pearson"):
        """
            username: str, index of dataframe
            metric: str, one of pearson / cosine
        
            Given a username (optionally a similarity metric) or a vector
            this function finds the most similar users.
        """
        
        if username is not None:
            sims = self._get_user_sims(username, metric)
        else:
            raise Exception("Invalid user input.")

        results = pd.DataFrame([self.df.index, sims.ravel()],
                               index=["Users", "Similarity scores"]).T
        
        return results.sort_values("Similarity scores", ascending=False).reset_index(drop=True)
    
class LowMemRecommender(BaseRecommender):
    """
        Does not fill missing values, does not precompute anything,
        and does not store excessive arrays. Inference is usually
        slower but calculations are more accurate due to better
        handling of NaNs (does not assume NaNs to be zero).
    """


    def _get_user_sims(self, username, metric="cosine"):
        user = self._get_user(username) # get vectors
        return similirity_evaluation(user, self.df, metric) # calculate similarity
    
    def _get_item_sims(self, item_name, metric="cosine"):
        item = self._get_item(item_name) # get vectors
        return similirity_evaluation(item, self.df.T, metric) # calculate similarity
    
    
    ## THIS MAY NOT BE WORKING CORRECTLY FOR ITEMS
    ## TODO: Move to base class
    def predict_user_ratings(self, username=None, metric="cosine", *, vector=None):
        """
            username: str, index of dataframe
            metric: str, one of pearson / cosine
            vector: vector with shape (n_items,) or (1,n_items), ignored
                    if username is also specified
        
            Given a certain username or a vector, it tries
            to predict missing values for each user/row by taking into account
            the user(s) mean and their similarity with other users.
        """
        
        if username is not None:
            user = self._get_user(username)
        elif vector is not None:
            # make sure the shape is correct
            user = np.atleast_2d(vector)
        else:
            raise Exception("Invalid user or vector input.")

        # use numpy to calculate the mean ignoring NaNs
        user_mean = np.nanmean(user)
        others_means =  np.nanmean(self.df, axis=1, keepdims=True)

        diff2 = np.subtract(self.df, others_means)

        similarities = similirity_evaluation(user, self.df, metric)

        num = np.nansum( np.multiply(similarities.T, diff2), axis=0 )
        denom = np.nansum(similarities)

        return np.add(user_mean, np.divide(num, denom + 1e-4))
    
    
class FastRecommender(BaseRecommender):
    """
        This implementation stores a copy of the original (unless copy=False)
        as well as a mask of missing values
    """
    
    def __init__(self, df, copy=True):
        
        super().__init__(df)
        
        if copy:
            # in case we need to reset
            self._orig_df = df.copy() 
        
    @lazyproperty
    def user_similarities(self):
        logging.info("Computing user similarity matrix...")
        return np.corrcoef(complete.fillna(0).values + 1e-3) # numerical stability
    
    @lazyproperty
    def item_similarities(self):
        logging.info("Computing item similarity matrix...")
        return np.corrcoef(complete.T.fillna(0).values + 1e-3) # numerical stability
    
    def _get_user_index(self, i):
        """Get row i; should be the same dtype as RowIndex"""
        return complete.index.get_loc(i)

    def _get_item_index(self, j):
        """Get row j from the transponsed matrix, aka item j"""
        return complete.columns.get_loc(j)
    
    def _get_item_sims(self, item, *args, **kwargs):
        idx = self._get_item_index(item)
        return self.item_similarities[idx]
    
    def _get_user_sims(self, user, *args, **kwargs):
        idx = self._get_user_index(user)
        return self.user_similarities[idx]
    
    
# Cheating
def Recommender(df, low_memory=False):
    
    if low_memory:
        return LowMemRecommender(df)
    else:
        return FastRecommender(df)
        

In [28]:
start = time.time()
rec = Recommender(complete) # instant fetch, slow start (~30 sec)
end = time.time()

end - start

1.5803501605987549

In [29]:
rec.find_similar_users("User_8", "cosine")

Unnamed: 0,Users,Similarity scores
0,User_8,1
1,User_67544,0.398428
2,User_116866,0.311722
3,User_219008,0.275284
4,User_263325,0.134649
5,User_11676,0.0180414
6,User_16759,2.82467e-15
7,User_129002,2.82467e-15
8,User_87846,2.82467e-15
9,User_220626,2.82467e-15


In [30]:
rec.find_similar_users("User_8", "cosine")

Unnamed: 0,Users,Similarity scores
0,User_8,1
1,User_67544,0.398428
2,User_116866,0.311722
3,User_219008,0.275284
4,User_263325,0.134649
5,User_11676,0.0180414
6,User_16759,2.82467e-15
7,User_129002,2.82467e-15
8,User_87846,2.82467e-15
9,User_220626,2.82467e-15


In [63]:
rec.find_similar_items("Harry Potter and the Chamber of Secrets (Book 2)")

Unnamed: 0,Items,Similarity scores
0,Harry Potter and the Chamber of Secrets (Book 2),1
1,Harry Potter and the Prisoner of Azkaban (Book 3),0.57121
2,Harry Potter and the Goblet of Fire (Book 4),0.562486
3,Harry Potter and the Sorcerer's Stone (Book 1),0.443944
4,Harry Potter and the Order of the Phoenix (Boo...,0.350768
5,Harry Potter and the Sorcerer's Stone (Harry P...,0.334566
6,The Fellowship of the Ring (The Lord of the Ri...,0.148811
7,The Hobbit: or There and Back Again,0.130225
8,Dr. Seuss's A B C (I Can Read It All by Myself...,0.117636
9,Ramona the Pest (Ramona Quimby (Paperback)),0.115795


In [11]:
complete.loc[["User_8", "User_67544"]].dropna(thresh=2, axis=1)

Book-Title,Clara Callan
User-ID,Unnamed: 1_level_1
User_8,5.0
User_67544,8.0


In [12]:
from spotlight.interactions import Interactions
from spotlight.evaluation import rmse_score

#### This doesn't work as expected

In [13]:
user_itempreds = predict_user_ratings(i1, complete)
test = pd.DataFrame(np.vstack((user_itempreds, i1, complete.columns))).T.dropna(thresh=3, axis=0)
test.sort_values(1, ascending=False).head(10)

Unnamed: 0,0,1,2
2039,5.10612,5,Clara Callan
10488,1.42583,0,The Kitchen God's Wife


In [14]:
from sklearn.decomposition import NMF, TruncatedSVD, dict_learning
import time

In [174]:
def decompose_matrix(matrix, decomposer, evaluate=False):
    """
        matrix: 2D numpy array or pandas dataframe
        decomposer: sklearn class that performs the decomposition.
                    The decomposer can be any object but it is expected
                    to implement an API similar to sklearn with a
                    `fit_transform` method and a `components_` attribute.
        evaluate: bool, whether to also evaluate the model.
    
        Decomposes a matrix with the given method and returns the two
        decomposed vectors. If evaluate=True then this also returns an
        evaluation of the decomposition.
        
        Since we don't want to treat missing values as zeros for evaluation
        but we need numbers to perform the decomposition. One way to test
        this is to add one to all the existent values, then replace NaNs
        with zeros, perform the matrix decomposition, reconstruct the matrix,
        and finally subtract one and calculate the metrics. For the resulting
        decomposition remember to subtract 1 from every future reconstruction,
        and also do min-max scaling (since recostruction may have negative or
        too-high values).        
    """
    
    matrix = (matrix+int(evaluate)).fillna(0).values

    # perform the decomposition
    User = decomposer.fit_transform(matrix)
    Item = decomposer.components_
    
    if evaluate:
        # Reconstruct the matrix and replace negatives with NaNs & clip values
        reconstructed = User @ Item - 1
        reconstructed[reconstructed<0] = np.nan
        reconstructed[reconstructed>10] = 10
    
        # R_squared, pearson correlation, and cosine similarity between 
        # the reconstructed and original matrices
        r2 = compute_evaluation_metric(matrix, reconstructed, "R2")
        rmse = compute_evaluation_metric(matrix, reconstructed, "RMSE")
        corr = similirity_evaluation(reconstructed.ravel(), matrix.ravel())[0][0]
        cosine = similirity_evaluation(reconstructed.ravel(), matrix.ravel(), "cosine")[0][0]
    
        print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))
        return User, Item
    
    return User, Item



In [175]:
decomposer = TruncatedSVD(100)
U, I = decompose_matrix(complete, decomposer, evaluate=True)



RSq: 0.8009940018759327
RMSE: 3.916204465187512
Pearson: 0.3103366760774409
Cosine similarity: 0.4744121995336806


In [16]:
# find rows with common ratings with i2
ff3 = complete.T[((~np.isnan(i2)) & (~np.isnan(complete))).any(0)].T
ff3

Book-Title,A Patchwork Planet (Ballantine Reader's Circle),A Time to Kill,Along Came a Spider (Alex Cross Novels),An Italian Affair,Big Cherry Holler: A Big Stone Gap Novel (Ballantine Reader's Circle),Big Stone Gap : A Novel,Bread Alone : A Novel,Clara Callan,Dating Big Bird,Durable Goods,...,The Horse Whisperer,The Hours: A Novel,The Pilot's Wife : A Novel Tag: Author of the Weight of Water (Oprah's Book Club (Hardcover)),The Poisonwood Bible,The Red Tent (Bestselling Backlist),The Winner,True to Form : A Novel,Winter Solstice,Wuthering Heights (Penguin Classics),Young Wives
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_100004,,,,,,,,,,,...,,,,,,,,,,
User_100009,,,,,,,,,,,...,,,,,,,,,,
User_10001,,,,,,,,,,,...,,,,,,,,,,
User_100010,,,,,,,,,,,...,,,,,,,,,,
User_100038,,,,,,,,,,,...,,,,,,,,,,
User_100053,,,,,,,,,,,...,,,,,0.0,,,,,
User_100066,,,,,,,,,,,...,,,,,,,,,,
User_100088,,0.0,,,,,,,,,...,,,,,,,,,,
User_100098,,,,,,,,,,,...,,,,,,,,,,
User_100115,,,,,,,,,,,...,,,,,,,,,,


In [17]:
ff3.shape

(12490, 38)

In [18]:
# drop rows and columns with less than 5 common ratings
nnans3 = ff3.dropna(thresh=2, axis=1).dropna(thresh=2, axis=0)
nnans3

Book-Title,A Patchwork Planet (Ballantine Reader's Circle),A Time to Kill,Along Came a Spider (Alex Cross Novels),An Italian Affair,Big Cherry Holler: A Big Stone Gap Novel (Ballantine Reader's Circle),Big Stone Gap : A Novel,Bread Alone : A Novel,Clara Callan,Dating Big Bird,Durable Goods,...,The Horse Whisperer,The Hours: A Novel,The Pilot's Wife : A Novel Tag: Author of the Weight of Water (Oprah's Book Club (Hardcover)),The Poisonwood Bible,The Red Tent (Bestselling Backlist),The Winner,True to Form : A Novel,Winter Solstice,Wuthering Heights (Penguin Classics),Young Wives
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_10030,,0.0,,,,,,,,,...,,,,,0.0,,,,,
User_100459,,10.0,8.0,,,,,,,,...,0.0,0.0,,10.0,0.0,,,4.0,,
User_100846,,,,,,,,,,,...,,,,,,8.0,,,,
User_101209,,,,,,,,,,,...,0.0,,,,,,,,,
User_101305,,,,,,,,,,0.0,...,,,,0.0,5.0,,,,,
User_101876,,,,,0.0,,,,,,...,0.0,,,,,8.0,,,,
User_102275,,,,,,,,,,,...,,0.0,,,,,,,,
User_102647,,10.0,10.0,,,,,,,,...,,,,,,,,,,
User_102967,,0.0,0.0,,,8.0,,,,,...,0.0,,0.0,,,0.0,,,,
User_10314,,,,,,,9.0,,,,...,,,,,9.0,,,,,


In [19]:
nnans3.shape

(1065, 38)

In [20]:
# Find the similarity between the new vector and the candidates
i2_short = nnans3.loc[:,"A Patchwork Planet (Ballantine Reader's Circle)"]
similirity_evaluation(i2_short, nnans3.T, "cosine").round(2)

array([[1.  , 0.  , 0.05, 0.21, 0.04, 0.  , 0.  , 0.25, 0.18, 0.11, 0.16,
        0.08, 0.08, 0.08, 0.03, 0.13, 0.06, 0.31, 0.  , 0.  , 0.05, 0.41,
        0.1 , 0.  , 0.04, 0.  , 0.  , 0.06, 0.08, 0.05, 0.11, 0.06, 0.09,
        0.05, 0.12, 0.05, 0.05, 0.12]])

In [21]:
similirity_evaluation(i2_short, nnans3.loc[:,"An Italian Affair"].T,
                      "cosine").round(2)

array([[0.21]])

In [22]:
nnans3.shape, i2_short.shape

((1065, 38), (1065,))

In [23]:
# check original (only common values) to see if 0.44 correlation appears plausible 
complete.loc[:,["An Italian Affair",
                "A Patchwork Planet (Ballantine Reader's Circle)"]].dropna(thresh=2, axis=0)

Book-Title,An Italian Affair,A Patchwork Planet (Ballantine Reader's Circle)
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
User_11400,8.0,7.0
User_194600,7.0,6.0


## Experimenting with tensorflow for decomposition

This didn't seem to work very well.

In [24]:
from tqdm import tqdm

In [25]:
import tensorflow as tf

In [26]:
sess = tf.Session()

In [27]:
mask = (complete * 0 + 1).fillna(0).values
mask

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
complete.shape

(12490, 13150)

In [38]:
n = 50

InputTensor = tf.placeholder(tf.float32, shape=(None, complete.shape[1]))

UserTensor = tf.Variable(tf.random_normal((complete.shape[0],n), dtype=tf.float32),
                         dtype=tf.float32)
ItemTensor = tf.Variable(tf.random_normal((n, complete.shape[1]), dtype=tf.float32),
                              dtype=tf.float32)

# a mask of missing values (true when present, false when nan)
Mask = tf.constant(mask.astype(np.float32))
reg_lambda = tf.constant(0.5)

norms = tf.add(tf.sqrt(tf.reduce_sum(tf.square(UserTensor))),
              tf.sqrt(tf.reduce_sum(tf.square(ItemTensor))))

# Using the mask, we force the loss to be computed only on known values
mul = tf.multiply(Mask, tf.matmul(UserTensor, ItemTensor,))
diffs = tf.subtract(InputTensor, mul)
squares = tf.square(diffs)
loss = tf.add(tf.reduce_sum(squares), tf.multiply(reg_lambda, norms))

opt = tf.train.AdamOptimizer(learning_rate=0.1).minimize(loss)
sess.run(tf.global_variables_initializer())


In [39]:
for i in range(200):
    o, l = sess.run([opt, loss], feed_dict={InputTensor: complete.fillna(0).astype(np.float32).values})
    print(f"Epoch: {i}, Loss: {l:.2f}")

Epoch: 0, Loss: 28692564.00
Epoch: 1, Loss: 19756440.00
Epoch: 2, Loss: 13771133.00
Epoch: 3, Loss: 9887452.00
Epoch: 4, Loss: 7423917.50
Epoch: 5, Loss: 5876934.00
Epoch: 6, Loss: 4894670.00
Epoch: 7, Loss: 4243662.50
Epoch: 8, Loss: 3777818.75
Epoch: 9, Loss: 3412460.25
Epoch: 10, Loss: 3103042.00
Epoch: 11, Loss: 2828696.25
Epoch: 12, Loss: 2580980.75
Epoch: 13, Loss: 2357169.25
Epoch: 14, Loss: 2156456.50
Epoch: 15, Loss: 1977980.00
Epoch: 16, Loss: 1820148.38
Epoch: 17, Loss: 1680752.12
Epoch: 18, Loss: 1557328.12
Epoch: 19, Loss: 1447465.12
Epoch: 20, Loss: 1349021.88
Epoch: 21, Loss: 1260267.75
Epoch: 22, Loss: 1179893.62
Epoch: 23, Loss: 1106914.25
Epoch: 24, Loss: 1040566.75
Epoch: 25, Loss: 980211.19
Epoch: 26, Loss: 925260.31
Epoch: 27, Loss: 875173.12
Epoch: 28, Loss: 829446.44
Epoch: 29, Loss: 787622.88
Epoch: 30, Loss: 749296.81
Epoch: 31, Loss: 714084.06
Epoch: 32, Loss: 681610.88
Epoch: 33, Loss: 651505.44
Epoch: 34, Loss: 623425.81
Epoch: 35, Loss: 597124.62
Epoch: 36,

In [40]:
U = sess.run(UserTensor)
I = sess.run(ItemTensor)

In [41]:
(U @ I).max(), (U @ I).min()

(93.80846, -98.86373)

In [42]:
reconstructed = U @ I

r2 = compute_evaluation_metric(complete, reconstructed, "R2")
rmse = compute_evaluation_metric(complete, reconstructed, "RMSE")
corr = similirity_evaluation(reconstructed.ravel(), complete.values.ravel())[0][0]
cosine = similirity_evaluation(reconstructed.ravel(), complete.values.ravel(), "cosine")[0][0]

print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))

RSq: 0.9923013192470189
RMSE: 0.33378412402625257
Pearson: 0.020808072746854676
Cosine similarity: 0.02528973086618768


## This seems to be working nicely

In [43]:
reconstructed = reconstructed.clip(0,10)

r2 = compute_evaluation_metric(complete*mask, reconstructed*mask, "R2")
rmse = compute_evaluation_metric(complete*mask, reconstructed*mask, "RMSE")
corr = similirity_evaluation(reconstructed.ravel()*mask.ravel(),
                             complete.values.ravel()*mask.ravel())[0][0]
cosine = similirity_evaluation(reconstructed.ravel()*mask.ravel(),
                               complete.values.ravel()*mask.ravel(), "cosine")[0][0]

print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))

RSq: 0.9938639148548395
RMSE: 0.29799104246490116
Pearson: 0.8159394505510064
Cosine similarity: 0.9979132318557788


In [47]:
xx = 9
preds = pd.DataFrame([complete.iloc[xx].values, reconstructed[xx], complete.columns], 
             index=["User", "Preds", "Items"]).T.sort_values("User", ascending=False)
preds

Unnamed: 0,User,Preds,Items
5624,10,9.98883,Living the Simple Life: A Guide to Scaling Dow...
10050,10,10,The Five People You Meet in Heaven
12853,9,8.99655,Who Moved My Cheese? An Amazing Way to Deal wi...
2830,9,8.98362,Divine Secrets of the Ya-Ya Sisterhood : A Novel
7520,8,7.99427,RAMONA FOREVER (Ramona Quimby (Paperback))
10617,6,6.01415,The Lilac Bus: Stories
9107,6,5.97508,Term Limits
4486,6,6.00489,Honourable Schoolboy
5143,0,0.0591984,Kaleidoscope
10569,0,0,The Last Time They Met : A Novel


In [53]:
preds[preds["User"].isna()].sort_values("Preds", ascending=False)

Unnamed: 0,User,Preds,Items
6470,,10,My Little Blue Dress
5119,,10,Just Me and My Dad (Golden Look-Look Book)
11304,,10,The Scar
12397,,10,Under the Lake
1449,,10,Bluebeard
3548,,10,First Avenue
5133,,10,Justice for Some
10417,,10,The Inheritor
7216,,10,Pessimisms: Famous (and not so famous) Observa...
9282,,10,The Ballad of Typhoid Mary


In [58]:
def get_reconstructed_ratings(user, Users, Items, df):
    
    user = np.atleast_2d(user)
    
    vec = user @ Items
    
    preds = pd.DataFrame([complete.iloc[xx].values, reconstructed[xx], complete.columns], 
                         index=["User", "Preds", "Items"]).T
    return preds[preds["User"].isna()].sort_values("Preds", ascending=False)

In [59]:
get_reconstructed_ratings(i1, U, I, complete)

ValueError: shapes (1,13150) and (50,13150) not aligned: 13150 (dim 1) != 50 (dim 0)

In [61]:
U.shape, I.shape

((12490, 50), (50, 13150))

In [125]:
xxx = complete.iloc[0].fillna(0).values.reshape((1,-1))
((I @ xxx.T).T @ I).shape

(1, 13150)

In [126]:
xxx.shape, I.shape, U.shape

((1, 13150), (50, 13150), (12490, 50))

In [131]:
(xxx @ I.T) @ U.T

array([[ 464.14839094,  334.77402609, -237.11318784, ...,  -90.62886511,
          68.11662654,   73.31544791]])

In [132]:
U @ (I @ xxx.T)

array([[ 464.14839094],
       [ 334.77402609],
       [-237.11318784],
       ...,
       [ -90.62886511],
       [  68.11662654],
       [  73.31544791]])

In [139]:
((xxx @ I.T)).shape

(1, 50)

In [134]:
(U @ I).shape

(12490, 13150)

In [138]:
(xxx @ (U@I)).shape

ValueError: shapes (1,13150) and (12490,13150) not aligned: 13150 (dim 1) != 12490 (dim 0)

In [140]:
tsvd.components_.shape

(10, 13150)

In [141]:
U.shape

(12490, 50)

In [143]:
xxx.shape

(1, 13150)

In [156]:
similirity_evaluation(xxx @ I.T, U).shape

(1, 12490)

In [151]:
similirity_evaluation(xxx @ I.T, U).max()

0.5084103157007126

In [152]:
similirity_evaluation(xxx @ I.T, U).min()

-0.463650119000801

In [153]:
np.percentile(similirity_evaluation(xxx @ I.T, U), 90)

0.20640708205877015

In [155]:
(similirity_evaluation(xxx @ I.T, U)<0.25).mean()

0.944275420336269

In [117]:
complete.iloc[0].fillna(0).values

array([0., 0., 0., ..., 0., 0., 0.])

In [119]:
tsvd.transform(complete.iloc[0].fillna(0).values.reshape((1,-1)))

array([[ 1.90254889,  1.95334065, -3.24689092,  2.88255786, -6.49299172,
        -1.20928313,  0.58378095,  1.14808991, -0.04961669, -0.21718322]])

## Experimenting with apriori and association rules

This seems to work great.

## Nearest Neighbor method (might be interesting with better vector representations)