In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
from feather import read_dataframe
%matplotlib inline

In [2]:
complete = read_dataframe("user_book_ratings.feather",).pivot_table("Book-Rating",
                                                                    index="User-ID",
                                                                    columns="Book-Title")

In [3]:
complete.head(10)

Book-Title,Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback)),"Q-Space (Star Trek The Next Generation, Book 47)","Q-Zone (Star Trek The Next Generation, Book 48)",""" Lamb to the Slaughter and Other Stories (Penguin 60s S.)","""A"" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)","""O"" Is for Outlaw","""Surely You're Joking, Mr. Feynman!"": Adventures of a Curious Character","""The Happy Prince"" and Other Stories (Penguin Popular Classics)",'Salem's Lot,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,...,Zodiac: The Eco-Thriller,"Zoey Phillips (Girlfriends) (Harlequin Superromance, No. 1020)",Zombies of the Gene Pool,Zoya,Zoya's Story: An Afghan Woman's Struggle for Freedom,ZwÃ?Â¶lf.,e,iI Paradiso Degli Orchi,one hundred years of solitude,stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_100004,,,,,,,,,,,...,,,,,,,,,,
User_100009,,,,,,,,,,,...,,,,,,,,,,
User_10001,,,,,,,,,,,...,,,,,,,,,,
User_100010,,,,,,,,,,,...,,,,,,,,,,
User_100038,,,,,,,,,,,...,,,,,,,,,,
User_100053,,,,,,,,,,,...,,,,,,,,,,
User_100066,,,,,,,,,,,...,,,,,,,,,,
User_100088,,,,,,,,,,,...,,,,,,,,,,
User_100098,,,,,,,,,,,...,,,,,,,,,,
User_100115,,,,,,,,,,,...,,,,,,,,,,


In [4]:
complete.memory_usage().sum() / 1024 ** 3

1.2238024920225143

In [5]:
############# Utilities #############

def get_user(i, df):
    """Get row i; should be the same dtype as RowIndex"""
    return df.loc[i].values


def get_item(j, df):
    """Get row j from the transponsed matrix, aka item j"""
    return df.loc[:,j].values


def compute_evaluation_metric(y_true, y_pred, metric="MSE"):
    """Given two vectors computes the selected metric."""
    
    # This works only on the common elements
    # missing values are ignored
    diff = np.subtract(y_true, y_pred)
    
    # Count NaNs in the difference, check for NaNs and 
    # invert that, finally count non-zeros 
    # https://stackoverflow.com/a/21778195/6655150
    N = np.count_nonzero(~np.isnan(diff))
    
    if metric == "MSE":
        distance = (1 / N) * np.nansum( np.power(diff, 2) )
    elif metric == "MAE":
        distance = (1 / N) * np.nansum( np.abs(diff) )
    elif metric == "RMSE":
        distance = np.sqrt((1 / N) * np.nansum( np.power(diff, 2) ))
    elif metric == "R2":
        y_mean = np.nanmean(y_true)
        diffs1 = np.nansum(np.square(y_true - y_mean))
        diffs2 = np.nansum(np.square(y_pred - y_true))
        distance = 1 - diffs2 / diffs1
        
    return distance


def cos_sim(user, others):
    """
        This is nothing more than the implementation of cosine similarity
        in a way to handle (ignore) missing values. It is about 2-3x faster
        than replacing NaNs with zeros and then using scipy's or sklearn's
        functions. It assumes the input vectors are `atleast_2D`.
    """
    
    # add some numbers for numeric stability
    sum_sq = np.nansum(np.multiply(user, others), axis=1)

    magnitudes = np.multiply(np.sqrt(np.nansum(np.square(user))),
                             np.sqrt(np.nansum(np.square(others), axis=1))) 
    
    # calculate cosing similarity and add 
    # a small amount to avoid division by 0
    return np.divide(sum_sq+1e-7, magnitudes+1e-3).reshape((user.shape[0], -1))

def pearson_sim(user, others):
    """
        Similarly to the calculation of cosine similarity, this custom approach
        is faster (~1.5x) than filling the whole matrix (this time with the row
        mean) and then using `scipy.stats.stats.pearsonr` with a list comprehension
        iterating over all the rows.
    """
    
    user_mean = np.nanmean(user)
    others_means = np.nanmean(others, axis=1, keepdims=True)

    diff1 = np.subtract( user, user_mean ) 
    diff2 = np.subtract( others, others_means )

    var1 = np.sqrt( np.nansum( np.power(diff1, 2) ) )
    var2 = np.sqrt( np.nansum( np.power(diff2, 2), axis=1, 
                               keepdims=True) )

    num = np.nansum( np.multiply( diff1, diff2 ), axis=1,
                     keepdims=True )
    denom = np.multiply( var1, var2 )
    
    return np.divide(num, (denom + 1e-3)).reshape((user.shape[0], -1))


def similirity_evaluation(user, others, method="pearson"):
    """
        Given two vectors, or a vector and list of vectors, computes
        the selected similarity metric(s).
    """
    
    # make sure they are in the correct shape
    user = np.atleast_2d(user)
    others = np.atleast_2d(others)

    if method == "pearson":
        return pearson_sim(user, others)
    
    elif method == "cosine":
        return cos_sim(user, others)


############# Functions used for actual prediction #############

    
def find_similar_users(user, df, metric="pearson"):
    """
        Given a user and a dataframe (optionally a similarity metric)
        this function finds the most similar users.
    """
    
    sims = similirity_evaluation(user, df, metric)
    results = pd.DataFrame([df.index, sims.ravel()],
                           index=["Users", "Similarity scores"]).T
    return results.sort_values("Similarity scores", ascending=False).reset_index(drop=True)


def find_similar_items(item, df, metric="cosine"):
    """
        Given an item and a dataframe (optionally a similarity metric)
        this function finds the most similar items.
    """
    sims = similirity_evaluation(item, df.T, metric)
    results = pd.DataFrame([df.columns, sims.ravel()],
                           index=["Items", "Similarity scores"]).T
    return results.sort_values("Similarity scores", ascending=False).reset_index(drop=True)

    
## THIS MAY NOT BE WORKING CORRECTLY
def predict_user_ratings(user, others):
    
    # make sure the shape is correct
    user = np.atleast_2d(user)
    others = np.atleast_2d(others)

    # use numpy to calculate the mean ignoring NaNs
    user_mean = np.nanmean(user)
    others_means =  np.nanmean(others, axis=1, keepdims=True)
    
    diff2 = np.subtract( others, others_means )
    
    similarities = similirity_evaluation(user, others, "cosine")
    
    num = np.nansum( np.multiply(similarities.T, diff2), axis=0 )
    denom = np.nansum(similarities)
            
    return np.add(user_mean, np.divide(num, denom + 1e-4))

In [6]:
i1 = get_user("User_8", complete)
i2 = get_user("User_11400", complete)
i3 = get_user(["User_11400", "User_67544"], complete)
i4 = get_user("User_67544", complete)

j1 = get_item(' Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback))', complete)
j2 = get_item('"A" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)', complete)
j3 = get_item(['Zodiac: The Eco-Thriller', 
               'Zoey Phillips (Girlfriends) (Harlequin Superromance, No. 1020)',], complete)
j4 = get_item(['\'Salem\'s Lot', '"O" Is for Outlaw', 
              ' Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback))'], complete)
j5 = get_item('Harry Potter and the Chamber of Secrets (Book 2)',complete)

(compute_evaluation_metric(i1, i2, "RMSE"), compute_evaluation_metric(i1, i2, "MAE"),
 compute_evaluation_metric(i1, i2, "MSE"))

(5.0, 5.0, 25.0)

In [7]:
# Similarities between this item and all others
sims = similirity_evaluation(get_item('Harry Potter and the Chamber of Secrets (Book 2)',complete),
                      complete.T, "cosine")
sims

array([[4.41018016e-11, 4.14264303e-11, 9.01698410e-11, ...,
        2.90427559e-11, 4.05922560e-11, 2.91007505e-02]])

In [8]:
# Pick 5 random users and check whether the model works
# better than predicting the mean
idx = np.random.choice(complete.index, 5)
for user in idx:
    u = get_user(user, complete)
    user_itempreds = predict_user_ratings(u, complete)
        
    dummy_scorer = compute_evaluation_metric(u, np.ones(u.shape)*np.nanmean(u))
    model_scorer = compute_evaluation_metric(u, user_itempreds)
    
    print(f"{user}:\n\tDummy: {dummy_scorer}\n\tModel: {model_scorer}\n")

User_199016:
	Dummy: 17.346938775510207
	Model: 14.909697781245558

User_38262:
	Dummy: 16.0
	Model: 7.365435706194869

User_231132:
	Dummy: 10.31
	Model: 7.512381408952878

User_239554:
	Dummy: 9.884297520661155
	Model: 8.245968484275094

User_275650:
	Dummy: 0.0
	Model: 8.342767556273507e-05



In [9]:
find_similar_items(j5, complete, "cosine").head(10)

Unnamed: 0,Items,Similarity scores
0,Harry Potter and the Chamber of Secrets (Book 2),1.0
1,Harry Potter and the Prisoner of Azkaban (Book 3),0.578975
2,Harry Potter and the Goblet of Fire (Book 4),0.569977
3,Harry Potter and the Sorcerer's Stone (Book 1),0.451425
4,Harry Potter and the Order of the Phoenix (Boo...,0.361031
5,Harry Potter and the Sorcerer's Stone (Harry P...,0.346424
6,The Fellowship of the Ring (The Lord of the Ri...,0.160708
7,The Hobbit: or There and Back Again,0.135877
8,Dr. Seuss's A B C (I Can Read It All by Myself...,0.119482
9,Ramona the Pest (Ramona Quimby (Paperback)),0.119221


In [10]:
find_similar_users(i1, complete, "cosine").head(10)

Unnamed: 0,Users,Similarity scores
0,User_8,0.99996
1,User_67544,0.398504
2,User_116866,0.311829
3,User_219008,0.275409
4,User_263325,0.134908
5,User_11676,0.0200369
6,User_65322,0.0001
7,User_46133,0.0001
8,User_201634,0.0001
9,User_243200,0.0001


In [11]:
complete.loc[["User_8", "User_67544"]].dropna(thresh=2, axis=1)

Book-Title,Clara Callan
User-ID,Unnamed: 1_level_1
User_8,5.0
User_67544,8.0


In [84]:
from spotlight.interactions import Interactions
from spotlight.evaluation import rmse_score

In [None]:
rmse_score()

#### This doesn't work as expected

In [110]:
user_itempreds = predict_user_ratings(i1, complete)
test = pd.DataFrame(np.vstack((user_itempreds, i1, complete.columns))).T.dropna(thresh=3, axis=0)
test.sort_values(1, ascending=False).head(10)

Unnamed: 0,0,1,2
3572,3.42708,10,Five Quarters of the Orange
10050,3.89349,9,The Five People You Meet in Heaven
4015,3.90966,8,Good in Bed
7448,3.11904,8,Prodigy
778,3.15,8,An Italian Affair
6954,3.2346,8,Open House
11888,3.31879,8,The Winner
5076,3.12627,8,Joy School
2467,3.15577,8,Dating Big Bird
4031,3.18008,8,Gracie: A Love Story


In [11]:
from sklearn.decomposition import NMF, TruncatedSVD, dict_learning
import time

In [12]:
def decompose_matrix(matrix, decomposer, evaluate=False):
    """
        matrix: 2D numpy array or pandas dataframe
        decomposer: sklearn class that performs the decomposition.
                    The decomposer can be any object but it is expected
                    to implement an API similar to sklearn with a
                    `fit_transform` method and a `components_` attribute.
        evaluate: bool, whether to also evaluate the model.
    
        Decomposes a matrix with the given method and returns the two
        decomposed vectors. If evaluate=True then this also returns an
        evaluation of the decomposition.
        
        Since we don't want to treat missing values as zeros for evaluation
        but we need numbers to perform the decomposition. One way to test
        this is to add one to all the existent values, then replace NaNs
        with zeros, perform the matrix decomposition, reconstruct the matrix,
        and finally subtract one and calculate the metrics. For the resulting
        decomposition remember to subtract 1 from every future reconstruction,
        and also do min-max scaling (since recostruction may have negative or
        too-high values).        
    """
    
    matrix = (matrix+int(evaluate)).fillna(0).values

    # perform the decomposition
    User = decomposer.fit_transform(matrix)
    Item = decomposer.components_
    
    if evaluate:
        # Reconstruct the matrix and replace negatives with NaNs
        reconstructed = User @ Item - 1
        reconstructed[reconstructed<0] = np.nan
        reconstructed[reconstructed>10] = 10
    
        # R_squared, pearson correlation, and cosine similarity between 
        # the reconstructed and original matrices
        r2 = compute_evaluation_metric(matrix, reconstructed, "R2")
        rmse = compute_evaluation_metric(matrix, reconstructed, "RMSE")
        corr = similirity_evaluation(reconstructed.ravel(), matrix.ravel())[0][0]
        cosine = similirity_evaluation(reconstructed.ravel(), matrix.ravel(), "cosine")[0][0]
    
        print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))
        return User, Item
    
    return User, Item


def recreate_matrix(U, I, ):
    """
        Given User and Item decomposed vectors, it performs the necessary
        processing needed after `decompose_matrix`. It returns the full
        reconstructed matrix. It places upper and lower bounds upon ratings
        by clipping extremes rather than min/max scaling (as this seemed to
        be the best performing approach).
    """
    
    return (U @ I - 1).clip(0,10)

In [69]:
# find rows with common ratings with i2
ff3 = complete.T[((~np.isnan(i2)) & (~np.isnan(complete))).any(0)].T
ff3

Book-Title,A Patchwork Planet (Ballantine Reader's Circle),A Time to Kill,Along Came a Spider (Alex Cross Novels),An Italian Affair,Big Cherry Holler: A Big Stone Gap Novel (Ballantine Reader's Circle),Big Stone Gap : A Novel,Bread Alone : A Novel,Clara Callan,Dating Big Bird,Durable Goods,...,The Horse Whisperer,The Hours: A Novel,The Pilot's Wife : A Novel Tag: Author of the Weight of Water (Oprah's Book Club (Hardcover)),The Poisonwood Bible,The Red Tent (Bestselling Backlist),The Winner,True to Form : A Novel,Winter Solstice,Wuthering Heights (Penguin Classics),Young Wives
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_100004,,,,,,,,,,,...,,,,,,,,,,
User_100009,,,,,,,,,,,...,,,,,,,,,,
User_10001,,,,,,,,,,,...,,,,,,,,,,
User_100010,,,,,,,,,,,...,,,,,,,,,,
User_100038,,,,,,,,,,,...,,,,,,,,,,
User_100053,,,,,,,,,,,...,,,,,0.0,,,,,
User_100066,,,,,,,,,,,...,,,,,,,,,,
User_100088,,0.0,,,,,,,,,...,,,,,,,,,,
User_100098,,,,,,,,,,,...,,,,,,,,,,
User_100115,,,,,,,,,,,...,,,,,,,,,,


In [70]:
ff3.shape

(12490, 38)

In [71]:
# drop rows and columns with less than 5 common ratings
nnans3 = ff3.dropna(thresh=2, axis=1).dropna(thresh=2, axis=0)
nnans3

Book-Title,A Patchwork Planet (Ballantine Reader's Circle),A Time to Kill,Along Came a Spider (Alex Cross Novels),An Italian Affair,Big Cherry Holler: A Big Stone Gap Novel (Ballantine Reader's Circle),Big Stone Gap : A Novel,Bread Alone : A Novel,Clara Callan,Dating Big Bird,Durable Goods,...,The Horse Whisperer,The Hours: A Novel,The Pilot's Wife : A Novel Tag: Author of the Weight of Water (Oprah's Book Club (Hardcover)),The Poisonwood Bible,The Red Tent (Bestselling Backlist),The Winner,True to Form : A Novel,Winter Solstice,Wuthering Heights (Penguin Classics),Young Wives
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_10030,,0.0,,,,,,,,,...,,,,,0.0,,,,,
User_100459,,10.0,8.0,,,,,,,,...,0.0,0.0,,10.0,0.0,,,4.0,,
User_100846,,,,,,,,,,,...,,,,,,8.0,,,,
User_101209,,,,,,,,,,,...,0.0,,,,,,,,,
User_101305,,,,,,,,,,0.0,...,,,,0.0,5.0,,,,,
User_101876,,,,,0.0,,,,,,...,0.0,,,,,8.0,,,,
User_102275,,,,,,,,,,,...,,0.0,,,,,,,,
User_102647,,10.0,10.0,,,,,,,,...,,,,,,,,,,
User_102967,,0.0,0.0,,,8.0,,,,,...,0.0,,0.0,,,0.0,,,,
User_10314,,,,,,,9.0,,,,...,,,,,9.0,,,,,


In [72]:
nnans3.shape

(1065, 38)

In [74]:
# Find the similarity between the new vector and the candidates
i2_short = nnans3.loc[:,"A Patchwork Planet (Ballantine Reader's Circle)"]
similirity_evaluation(i2_short, nnans3.T, "cosine").round(2)

array([[1.  , 0.  , 0.05, 0.21, 0.04, 0.  , 0.  , 0.25, 0.18, 0.11, 0.16,
        0.08, 0.08, 0.08, 0.03, 0.13, 0.06, 0.31, 0.  , 0.  , 0.05, 0.41,
        0.1 , 0.  , 0.04, 0.  , 0.  , 0.06, 0.08, 0.05, 0.11, 0.06, 0.09,
        0.05, 0.12, 0.05, 0.05, 0.12]])

In [78]:
similirity_evaluation(i2_short, nnans3.loc[:,"An Italian Affair"].T,
                      "cosine").round(2)

array([[0.21]])

In [76]:
nnans3.shape, i2_short.shape

((1065, 38), (1065,))

In [79]:
# check original (only common values) to see if 0.44 correlation appears plausible 
complete.loc[:,["An Italian Affair",
                "A Patchwork Planet (Ballantine Reader's Circle)"]].dropna(thresh=2, axis=0)

Book-Title,An Italian Affair,A Patchwork Planet (Ballantine Reader's Circle)
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
User_11400,8.0,7.0
User_194600,7.0,6.0


## Experimenting with tensorflow for decomposition

This didn't seem to work very well.

In [13]:
from tqdm import tqdm

In [14]:
import tensorflow as tf

In [15]:
sess = tf.Session()

In [100]:
mask = (complete * 0 + 1).values
mask

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [31]:
n = 50

InputTensor = tf.placeholder(tf.float32, shape=complete.shape)

UserTensor = tf.Variable(tf.random_normal((complete.shape[0],n), dtype=tf.float32),
                         dtype=tf.float32)
ItemTensor = tf.Variable(tf.random_normal((n, complete.shape[1]), dtype=tf.float32),
                              dtype=tf.float32)

# a mask of missing values (true when present, false when nan)
Mask = tf.constant(mask.astype(np.float32))

# Using the mask, we force the loss to be computed only on known values
mul = tf.multiply(Mask, tf.matmul(UserTensor, ItemTensor,))
diffs = tf.subtract(InputTensor, mul)
squares = tf.square(diffs)
loss = tf.reduce_sum(squares)

opt = tf.train.AdamOptimizer(learning_rate=0.1).minimize(loss)
sess.run(tf.global_variables_initializer())


In [129]:
for i in range(100):
    o, l = sess.run([opt, loss], feed_dict={InputTensor: complete.fillna(0).astype(np.float32).values})
    print(f"Epoch: {i}, Loss: {l:.2f}")

Epoch: 0, Loss: 14289.63
Epoch: 1, Loss: 14227.79
Epoch: 2, Loss: 14164.59
Epoch: 3, Loss: 14100.28
Epoch: 4, Loss: 14036.40
Epoch: 5, Loss: 13973.60
Epoch: 6, Loss: 13911.76
Epoch: 7, Loss: 13850.11
Epoch: 8, Loss: 13787.98
Epoch: 9, Loss: 13726.62
Epoch: 10, Loss: 13667.57
Epoch: 11, Loss: 13609.78
Epoch: 12, Loss: 13550.91
Epoch: 13, Loss: 13490.33
Epoch: 14, Loss: 13429.59
Epoch: 15, Loss: 13370.40
Epoch: 16, Loss: 13313.06
Epoch: 17, Loss: 13256.37
Epoch: 18, Loss: 13199.00
Epoch: 19, Loss: 13141.23
Epoch: 20, Loss: 13084.20
Epoch: 21, Loss: 13027.37
Epoch: 22, Loss: 12969.55
Epoch: 23, Loss: 12911.79
Epoch: 24, Loss: 12856.25
Epoch: 25, Loss: 12802.81
Epoch: 26, Loss: 12749.26
Epoch: 27, Loss: 12694.53
Epoch: 28, Loss: 12639.84
Epoch: 29, Loss: 12586.69
Epoch: 30, Loss: 12534.84
Epoch: 31, Loss: 12482.73
Epoch: 32, Loss: 12429.13
Epoch: 33, Loss: 12374.80
Epoch: 34, Loss: 12321.97
Epoch: 35, Loss: 12271.06
Epoch: 36, Loss: 12220.21
Epoch: 37, Loss: 12168.66
Epoch: 38, Loss: 12117

In [130]:
U = sess.run(UserTensor)
I = sess.run(ItemTensor)

In [131]:
(U @ I).max(), (U @ I).min()

(132.1411, -142.38553)

In [132]:
reconstructed = U @ I

r2 = compute_evaluation_metric(complete, reconstructed, "R2")
rmse = compute_evaluation_metric(complete, reconstructed, "RMSE")
corr = similirity_evaluation(reconstructed.ravel(), complete.values.ravel())[0][0]
cosine = similirity_evaluation(reconstructed.ravel(), complete.values.ravel(), "cosine")[0][0]

print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))

RSq: 0.9983887317828947
RMSE: 0.15270084527897196
Pearson: 0.020078781351281643
Cosine similarity: 0.024350265111531315


## This seems to be working nicely

In [133]:
r2 = compute_evaluation_metric(complete*mask, reconstructed*mask, "R2")
rmse = compute_evaluation_metric(complete*mask, reconstructed*mask, "RMSE")
corr = similirity_evaluation(reconstructed.ravel()*mask.ravel(),
                             complete.values.ravel()*mask.ravel())[0][0]
cosine = similirity_evaluation(reconstructed.ravel()*mask.ravel(),
                               complete.values.ravel()*mask.ravel(), "cosine")[0][0]

print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))

RSq: 0.9983887317828947
RMSE: 0.15270084527897196
Pearson: 0.9991940731590394
Cosine similarity: 0.999451652817667


In [134]:
xx = 7
pd.DataFrame([complete.iloc[xx].values, reconstructed[xx]], 
             index=["User", "Preds"]).T.sort_values("User", ascending=False)

Unnamed: 0,User,Preds
13025,10.0,9.999001
2175,10.0,9.998142
7629,10.0,9.996739
8932,10.0,10.001343
6701,10.0,9.954445
9527,10.0,9.998348
6169,10.0,9.976924
9586,10.0,10.002548
9741,10.0,9.997175
9751,10.0,10.061455


In [135]:
reconstructed -= reconstructed.min()
reconstructed /= reconstructed.max() / 10

r2 = compute_evaluation_metric(complete*mask, reconstructed*mask, "R2")
rmse = compute_evaluation_metric(complete*mask, reconstructed*mask, "RMSE")
corr = similirity_evaluation(reconstructed.ravel()*mask.ravel(),
                             complete.values.ravel()*mask.ravel())[0][0]
cosine = similirity_evaluation(reconstructed.ravel()*mask.ravel(),
                               complete.values.ravel()*mask.ravel(), "cosine")[0][0]

print("RSq: {}\nRMSE: {}\nPearson: {}\nCosine similarity: {}".format(r2, rmse, corr, cosine))

RSq: -0.4229778117400018
RMSE: 4.537917383782371
Pearson: 0.999194068761257
Cosine similarity: 0.5866675388091684


In [136]:
xx = 7
pd.DataFrame([complete.iloc[xx].values, reconstructed[xx]], 
             index=["User", "Preds"]).T.sort_values("User", ascending=False)

Unnamed: 0,User,Preds
13025,10.0,5.550810
2175,10.0,5.550779
7629,10.0,5.550728
8932,10.0,5.550896
6701,10.0,5.549188
9527,10.0,5.550787
6169,10.0,5.550007
9586,10.0,5.550940
9741,10.0,5.550745
9751,10.0,5.553086


## Experimenting with apriori and association rules

This seems to work great.

## Nearest Neighbor method (might be interesting with better vector representations)