In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
from feather import read_dataframe
%matplotlib inline

In [117]:
complete = read_dataframe("user_book_ratings.feather",).pivot_table("Book-Rating",
                                                                    index="User-ID",
                                                                    columns="Book-Title")

In [118]:
complete.head(10)

Book-Title,Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback)),"Q-Space (Star Trek The Next Generation, Book 47)","Q-Zone (Star Trek The Next Generation, Book 48)",""" Lamb to the Slaughter and Other Stories (Penguin 60s S.)","""A"" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)","""O"" Is for Outlaw","""Surely You're Joking, Mr. Feynman!"": Adventures of a Curious Character","""The Happy Prince"" and Other Stories (Penguin Popular Classics)",'Salem's Lot,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,...,Zodiac: The Eco-Thriller,"Zoey Phillips (Girlfriends) (Harlequin Superromance, No. 1020)",Zombies of the Gene Pool,Zoya,Zoya's Story: An Afghan Woman's Struggle for Freedom,ZwÃ?Â¶lf.,e,iI Paradiso Degli Orchi,one hundred years of solitude,stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User_100004,,,,,,,,,,,...,,,,,,,,,,
User_100009,,,,,,,,,,,...,,,,,,,,,,
User_10001,,,,,,,,,,,...,,,,,,,,,,
User_100010,,,,,,,,,,,...,,,,,,,,,,
User_100038,,,,,,,,,,,...,,,,,,,,,,
User_100053,,,,,,,,,,,...,,,,,,,,,,
User_100066,,,,,,,,,,,...,,,,,,,,,,
User_100088,,,,,,,,,,,...,,,,,,,,,,
User_100098,,,,,,,,,,,...,,,,,,,,,,
User_100115,,,,,,,,,,,...,,,,,,,,,,


In [119]:
complete.memory_usage().sum() / 1024 ** 3

1.2238024920225143

In [147]:
def get_user(i, df):
    """Get row i; should be the same dtype as RowIndex"""
    return df.loc[i].values

def get_item(j, df):
    """Get row j from the transponsed matrix, aka item j"""
    return df.loc[:,j].values

def compute_evaluation_metric(y_true, y_pred, metric="MSE"):
    """Given two vectors computes the selected metric."""
    
    # This works only on the common elements
    # missing values are ignored
    diff = np.subtract(y_true, y_pred)
    
    # Count NaNs in the difference, check for NaNs and 
    # invert that, finally count non-zeros 
    # https://stackoverflow.com/a/21778195/6655150
    N = np.count_nonzero(~np.isnan(diff))
    
    if metric == "MSE":
        distance = (1 / N) * np.nansum( np.power(diff, 2) )
    elif metric == "MAE":
        distance = (1 / N) * np.nansum( np.abs(diff) )
    elif metric == "RMSE":
        distance = np.sqrt((1 / N) * np.nansum( np.power(diff, 2) ))
    return distance


def cos_sim(user, others):
    """
        This is nothing more than the implementation of cosine similarity
        in a way to handle (ignore) missing values. It is about 2-3x faster
        than replacing NaNs with zeros and then using scipy's or sklearn's
        functions. It assumes the input vectors are `atleast_2D`.
    """
    
    # add some numbers for numeric stability
    sum_sq = np.nansum(np.multiply(user, others), axis=1)

    magnitudes = np.multiply(np.sqrt(np.nansum(np.square(user))),
                             np.sqrt(np.nansum(np.square(others), axis=1))) 
    
    # calculate cosing similarity and add 
    # a small amount to avoid division by 0
    return np.divide(sum_sq+1e-7, magnitudes+1e-3).reshape((user.shape[0], -1))

def pearson_sim(user, others):
    """
        Similarly to the calculation of cosine similarity, this custom approach
        is faster (~1.5x) than filling the whole matrix (this time with the row
        mean) and then using `scipy.stats.stats.pearsonr` with a list comprehension
        iterating over all the rows.
    """
    
    user_mean = np.nanmean(user)
    others_means = np.nanmean(others, axis=1, keepdims=True)

    diff1 = np.subtract( user, user_mean ) 
    diff2 = np.subtract( others, others_means )

    var1 = np.sqrt( np.nansum( np.power(diff1, 2) ) )
    var2 = np.sqrt( np.nansum( np.power(diff2, 2), axis=1, 
                               keepdims=True) )

    num = np.nansum( np.multiply( diff1, diff2 ), axis=1,
                     keepdims=True )
    denom = np.multiply( var1, var2 )
    
    return np.divide(num, (denom + 1e-3)).reshape((user.shape[0], -1))

def similirity_evaluation(user, others, method="pearson"):
    """
        Given two vectors, or a vector and list of vectors, computes
        the selected similarity metric(s).
    """
    
    # make sure they are in the correct shape
    user = np.atleast_2d(user)
    others = np.atleast_2d(others)

    if method == "pearson":
        return pearson_sim(user, others)
    
    elif method == "cosine":
        return cos_sim(user, others)
    
def predict_user_ratings(user, others):
    
    # make sure the shape is correct
    user = np.atleast_2d(user)
    others = np.atleast_2d(others)

    # use numpy to calculate the mean ignoring NaNs
    user_mean = np.nanmean(user)
    others_means =  np.nanmean(others, axis=1, keepdims=True)
    
    diff2 = np.subtract( others, others_means )
    similarities = similirity_evaluation(user, others, "cosine")
    
    num = np.nansum( np.multiply(similarities.T, diff2), axis=0 )
    denom = np.nansum(similarities)
            
    return np.add(user_mean, np.multiply(num, 1/(denom + 1e-4)))

In [142]:
def find_similar_users(user, df, metric="pearson"):
    """
        Given a user and a dataframe (optionally a similarity metric)
        this function finds users
    """
    
    sims = similirity_evaluation(user, df, metric)
    results = pd.DataFrame([df.index, sims.ravel()],
                           index=["Users", "Similarity scores"]).T
    return results.sort_values("Similarity scores", ascending=False).reset_index(drop=True)

def find_similar_items(item, df, metric="cosine"):
    sims = similirity_evaluation(item, df.T, metric)
    results = pd.DataFrame([df.columns, sims.ravel()],
                           index=["Items", "Similarity scores"]).T
    return results.sort_values("Similarity scores", ascending=False).reset_index(drop=True)

In [129]:
i1 = get_user("User_8", complete)
i2 = get_user("User_11400", complete)
i3 = get_user(["User_11400", "User_67544"], complete)
i4 = get_user("User_67544", complete)

j1 = get_item(' Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback))', complete)
j2 = get_item('"A" is for Alibi : A Kinsey Millhone Mystery (A Kinsey Millhone Mystery)', complete)
j3 = get_item(['Zodiac: The Eco-Thriller', 
               'Zoey Phillips (Girlfriends) (Harlequin Superromance, No. 1020)',], complete)
j4 = get_item(['\'Salem\'s Lot', '"O" Is for Outlaw', 
              ' Murder of a Sleeping Beauty (Scumble River Mysteries (Paperback))'], complete)
j5 = get_item('Harry Potter and the Chamber of Secrets (Book 2)',complete)

(compute_evaluation_metric(i1, i2, "RMSE"), compute_evaluation_metric(i1, i2, "MAE"),
 compute_evaluation_metric(i1, i2, "MSE"))

(5.0, 5.0, 25.0)

In [124]:
sims = similirity_evaluation(get_item('Harry Potter and the Chamber of Secrets (Book 2)',complete),
                      complete.T, "cosine")
sims

array([[4.41018016e-11, 4.14264303e-11, 9.01698410e-11, ...,
        2.90427559e-11, 4.05922560e-11, 2.91007505e-02]])

In [169]:
user_itempreds = predict_user_ratings(i2, complete)
sims = similirity_evaluation(i2, complete)

In [170]:
test = pd.DataFrame(np.vstack((user_itempreds, i2, complete.columns))).T
test

Unnamed: 0,0,1,2
0,3.05291,,Murder of a Sleeping Beauty (Scumble River My...
1,3.05475,,"Q-Space (Star Trek The Next Generation, Book 47)"
2,3.04952,,"Q-Zone (Star Trek The Next Generation, Book 48)"
3,3.05561,,""" Lamb to the Slaughter and Other Stories (Pen..."
4,3.05374,,"""A"" is for Alibi : A Kinsey Millhone Mystery (..."
5,3.03652,,"""O"" Is for Outlaw"
6,3.05637,,"""Surely You're Joking, Mr. Feynman!"": Adventur..."
7,3.04657,,"""The Happy Prince"" and Other Stories (Penguin ..."
8,3.05444,,'Salem's Lot
9,3.04748,,...AND THE HORSE HE RODE IN ON : THE PEOPLE V....


In [177]:
for user in complete.index[:5]:
    u = get_user(user, complete)
    user_itempreds = predict_user_ratings(u, complete)
    sims = similirity_evaluation(u, complete)
    
    dummy_scorer = compute_evaluation_metric(u, np.ones(u.shape)*np.nanmean(u))
    model_scorer = compute_evaluation_metric(u, user_itempreds)
    
    print(f"{user}:\n\tDummy: {dummy_scorer}\n\tModel: {model_scorer}\n")

User_100004:
	Dummy: 23.140495867768596
	Model: 18.278362220126084

User_100009:
	Dummy: 16.53333333333333
	Model: 15.091299868063858

User_10001:
	Dummy: 13.88888888888889
	Model: 7.741781488302394

User_100010:
	Dummy: 0.0
	Model: 6.521531911340406e-06

User_100038:
	Dummy: 0.0
	Model: 3.7623095072846116e-05



In [143]:
find_similar_items(j5, complete, "cosine")

Unnamed: 0,Items,Similarity scores
0,Harry Potter and the Chamber of Secrets (Book 2),1
1,Harry Potter and the Prisoner of Azkaban (Book 3),0.578975
2,Harry Potter and the Goblet of Fire (Book 4),0.569977
3,Harry Potter and the Sorcerer's Stone (Book 1),0.451425
4,Harry Potter and the Order of the Phoenix (Boo...,0.361031
5,Harry Potter and the Sorcerer's Stone (Harry P...,0.346424
6,The Fellowship of the Ring (The Lord of the Ri...,0.160708
7,The Hobbit: or There and Back Again,0.135877
8,Dr. Seuss's A B C (I Can Read It All by Myself...,0.119482
9,Ramona the Pest (Ramona Quimby (Paperback)),0.119221


In [144]:
%%timeit
find_similar_items(j5, complete, "cosine")

2.5 s ± 11.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [133]:
%%timeit
find_similar_items(j5, complete, "pearson")

4.54 s ± 42.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [140]:
find_similar_users(i1, complete, "cosine")

Unnamed: 0,Users,Similarity scores
11487,User_8,0.99996
10849,User_67544,0.398504
814,User_116866,0.311829
5755,User_219008,0.275409
8030,User_263325,0.134908
808,User_11676,0.0200369
10734,User_65322,0.0001
9782,User_46133,0.0001
4889,User_201634,0.0001
7006,User_243200,0.0001


In [134]:
%%timeit
find_similar_users(i1, complete, "cosine")

2.49 s ± 7.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [135]:
%%timeit
find_similar_users(i1, complete, "pearson")

4.41 s ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### User-based tests:

In [173]:
similirity_evaluation(i1, i3, "pearson")

array([[-0.09138566,  0.13498689]])

In [174]:
similirity_evaluation(i1, i3, "cosine")

array([[6.62261793e-10, 3.98504416e-01]])

In [74]:
%%timeit
cos_sim(i1, i3)

148 µs ± 2.54 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [75]:
%%timeit
pearson_sim(i1, i3)

500 µs ± 2.73 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [170]:
%%timeit
similirity_evaluation(i1, i3, "cosine")

157 µs ± 1.22 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [77]:
%%timeit
similirity_evaluation(i1, i3, "pearson")

509 µs ± 2.07 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Item-based tests:

In [211]:
similirity_evaluation(j2, j4, "pearson")

array([[-0.06359102,  0.02868645,  0.00793028]])

In [212]:
similirity_evaluation(j2, j4, "cosine")

array([[1.16175623e-10, 8.09315160e-11, 1.24607297e-02]])

In [161]:
%%timeit
similirity_evaluation(j2, j4, "cosine")

635 µs ± 8.63 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [81]:
%%timeit
similirity_evaluation(j2, j4, "pearson")

1.36 ms ± 23 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [90]:
a = similirity_evaluation(i1, complete)

In [91]:
a[1:][np.argmax(a[1:])]

array([0.2022545])

In [92]:
np.argmax(a[1:])

11326

In [93]:
complete.iloc[[0, 11326+1]].dropna(thresh=1, axis=1)

ISBN,0002005018,0140119906,0312265840,0312306326,0316780375,0399135782,0440236738,0679770151,0767905385,0786866586,0821749528
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
8,5.0,,,,,0.0,,,,,
252282,,9.0,6.0,3.0,0.0,0.0,0.0,7.0,0.0,5.0,0.0


In [96]:
# find rows with common ratings with i2
ff3 = complete.T[((~np.isnan(i2)) & (~np.isnan(complete))).any(0)].T
ff3

ISBN,0002005018,0060084405,0060175532,0060512822,0060958022,0140126562,0140430016,0151008116,0312195516,0312274920,...,0671021001,067166641X,0679449434,0743225325,074341134X,0743418174,0786868716,1551664992,1552781542,1853261912
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,0.0,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,7.0,,,,,,,,,
383,,,,,,,,,,,...,,,,,,,,,,
384,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
408,,,,,,,,,,,...,,,,,,,,,,
424,,,,,,,,,,,...,6.0,,,,,,,,,


In [97]:
ff3.shape

(12490, 38)

In [252]:
# drop rows and columns with less than 5 common ratings
nnans3 = ff3.dropna(thresh=2, axis=1).dropna(thresh=2, axis=0)
nnans3

ISBN,0002005018,0060084405,0060175532,0060512822,0060958022,0140126562,0140430016,0151008116,0312195516,0312274920,...,0671021001,067166641X,0679449434,0743225325,074341134X,0743418174,0786868716,1551664992,1552781542,1853261912
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,7.0,,,,,,,,,
882,,,,,,,,,0.0,0.0,...,,,,,,,,,,
1424,,,,7.0,,,,,,,...,,,,,,,0.0,,,
1848,,,,,,,,,,,...,,,,,,,10.0,,,
2110,,,,,,,,5.0,,,...,,,,,,,0.0,,,
2333,,,,,,,,,9.0,,...,,,,,,,,,,
2766,,,,,8.0,,,,,,...,,,,,,,,,,
3363,,,,,0.0,,,,,,...,,,,,,0.0,,,,
3373,,,,,0.0,,,,,,...,,,,,,,,,,
4017,,,,,,,,,,,...,,,,,,,0.0,,,


In [253]:
nnans3.shape

(883, 38)

In [296]:
# Find the similarity between the new vector and the candidates
i2_short = nnans3.loc[:,"0151008116"]
similirity_evaluation(i2_short, nnans3.T, "cosine").round(2)

array([[0.11, 0.  , 0.  , 0.07, 0.11, 0.06, 0.  , 1.  , 0.18, 0.02, 0.12,
        0.07, 0.05, 0.09, 0.  , 0.  , 0.03, 0.06, 0.18, 0.18, 0.1 , 0.  ,
        0.01, 0.05, 0.05, 0.03, 0.08, 0.  , 0.08, 0.  , 0.07, 0.04, 0.12,
        0.12, 0.13, 0.13, 0.14, 0.  ]])

In [255]:
similirity_evaluation(i2_short, nnans3.loc[:,"0151008116"].T, "cosine").round(2)

array([[0.11]])

In [299]:
nnans3.shape, i2_short.shape

((883, 38), (883,))

In [256]:
# check original (only common values) to see if 0.44 correlation appears plausible 
complete.loc[:,["0060958022","0151008116"]].dropna(thresh=2, axis=0)

ISBN,0060958022,0151008116
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1
11400,10.0,6.0
11676,3.0,6.0
43246,0.0,0.0
98741,0.0,0.0
113519,8.0,9.0
114868,0.0,0.0
139742,0.0,0.0
172742,0.0,0.0
201290,0.0,0.0
264317,0.0,0.0


In [275]:
ii1 = complete.dropna(thresh=2, axis=0).T.values
pearsonr(ii1[0,:], ii1[1,:])

(nan, 1.0)

In [277]:
ii1.shape

(14755, 11931)

In [326]:
predict_i(i1, complete.values)

array([5.10557647, 2.5       , 2.3248965 , ..., 2.5       , 2.5       ,
       2.5       ])

In [327]:
predict_i(i2, complete.values)

array([3.00031253, 3.06117812, 3.04107743, ..., 3.05825516, 3.06124779,
       3.05160399])

In [328]:
predict_i(i4, complete)

array([6.76274148, 6.625     , 6.61740402, ..., 6.625     , 6.64395034,
       6.63900064])

In [329]:
pj1 = predict_i(get_item("0060958022"), complete.T.values)
pj1

array([3.28665504, 3.28701045, 3.28753367, ..., 3.28563866, 3.28397332,
       3.28902673])

In [330]:
pj1.max(), pj1.min()

(5.749083253619088, 3.0239473786652513)

In [331]:
pj2 = predict_i(get_item("0151008116"), complete.T.values)
pj2

array([4.17182472, 4.17182336, 4.17366063, ..., 4.16760712, 4.17522016,
       4.1735431 ])

In [332]:
pj2.max(), pj2.min()

(6.678546628117268, 3.9005324537219246)

In [333]:
pj2 = predict_i(get_item("0151008116"), complete.T.values)
pj2

array([4.17182472, 4.17182336, 4.17366063, ..., 4.16760712, 4.17522016,
       4.1735431 ])

In [334]:
preds = predict_i(i2_short, nnans3.T)
preds

array([3.25771254, 3.15944676, 3.28837306, 3.60032211, 3.69521197,
       3.59819321, 3.45268945, 3.01250799, 3.4767577 , 3.07625759,
       3.07192041, 3.65018902, 3.03111267, 2.30878096, 3.81809197,
       3.1903585 , 3.16149781, 4.00726237, 2.18888988, 3.20429946,
       3.24694719, 3.71515639, 3.93581135, 3.55492262, 3.23461876,
       2.36959207, 3.39089289, 3.06199426, 3.17407433, 3.70178995,
       3.2847156 , 3.80698116, 5.03280303, 3.28568041, 6.26741684,
       3.2514888 , 3.59133727, 3.63270423, 2.22733659, 3.73486387,
       3.62608575, 3.40048946, 3.14417427, 2.88790841, 3.28241253,
       3.5056749 , 3.15691557, 3.04741394, 3.83755428, 7.42219818,
       3.32384496, 3.30654764, 3.43129087, 3.1835838 , 3.74798546,
       3.17888678, 3.33788524, 3.16149781, 3.42508768, 3.53569926,
       3.64214881, 2.55892388, 3.49219069, 3.1907591 , 3.31094932,
       3.78381863, 3.2478728 , 3.72279069, 3.63115602, 2.74138293,
       3.3205056 , 3.47809287, 3.59898504, 3.7230413 , 3.52081

In [335]:
preds.max(), preds.mean(), preds.min(), preds.std()

(7.422198182599594, 3.3698630136986303, 1.6851247135632899, 0.4790006614074269)

In [336]:
preds.argmax() # who is the "closest user" (in essense, who is the user most likely to rate this well)

49

In [337]:
nnans3.iloc[49]

ISBN
0002005018     NaN
0060084405     NaN
0060175532     NaN
0060512822     NaN
0060958022     NaN
0140126562     NaN
0140430016     NaN
0151008116    10.0
0312195516     9.0
0312274920     NaN
0312305060     8.0
0312978383     6.0
0316789089     NaN
0345445848     0.0
0345445856     NaN
0375504036     NaN
0375506039     NaN
0375724850     NaN
0375727345     9.0
0380723085    10.0
0385333412     NaN
0425116840     0.0
0440211727     0.0
0440222656     9.0
0446364193     8.0
0446606324     0.0
0449003981     NaN
0553582658     0.0
0671021001    10.0
067166641X     NaN
0679449434     NaN
0743225325     NaN
074341134X    10.0
0743418174     9.0
0786868716     8.0
1551664992     NaN
1552781542     NaN
1853261912     NaN
Name: 16795, dtype: float64

In [338]:
nnans3.iloc[49]["0151008116"] # seems legit

10.0

In [339]:
similirity_evaluation(get_item("0151008116"), get_item("0380723085"), "cosine")

array([[0.11914448]])