In [3]:
from scipy.sparse import rand, random
from scipy import stats
from scipy import sparse
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
%load_ext memory_profiler

In [70]:
# Create random user data with ratings 0-5
user_data = np.random.random_integers(0, 5, (10,5))
user_data = pd.DataFrame(user_data)

print('done here')
print(user_data.shape)

done here
(10, 5)


  user_data = np.random.random_integers(0, 5, (10,5))


# Original

In [71]:
# Testing original version

# Cosine similarity function
def similarity_cosine(vec_x, vec_y):
    return np.dot(vec_x, vec_y) / (vec_length(vec_x) * vec_length(vec_y))

# Vector length function
def vec_length(vector):
    return np.sqrt(np.dot(vector, vector))

# Pearson similarity function
def similarity_pearson(vec_x, vec_y):
    vec_x_normalized = vec_x - vec_x.mean()
    vec_y_normalized = vec_y - vec_y.mean()
    return np.dot(vec_x_normalized, vec_y_normalized) / (vec_length(vec_x_normalized) * vec_length(vec_y_normalized))

# CF function
def recommendItemCF(user_data, item_similarity_matrix, item_columns):
    
    user_records = np.sign(user_data[item_columns].values)
    user_preference = np.divide(user_data[item_columns], 
                                user_data[item_columns].sum(axis = 1).to_frame()).fillna(0).values # Normalized
    user_results_dict = {}
    for i,idx in tqdm(enumerate(user_data.index)):
        item_list = user_records[i]
        item_weights = user_preference[i]
        user_results_dict[idx] = pd.Series(np.sum(item_list * item_similarity_matrix * item_weights, axis = 0))
        #user_results_dict[idx] = pd.Series(np.sum(item_list.reshape(-1,1) * item_similarity_matrix * item_weights , axis = 0))
    return pd.DataFrame({key:value.to_dict() for key,value in user_results_dict.items()})


# Similarity functino
def item_similarity(matrix):
    sim_matrix = np.diag(np.ones(len(matrix)))
    for i in range(len(matrix)):
        for j in range(i+1, len(matrix)):
            sim_matrix[i][j] = sim_matrix[j][i] = similarity_cosine(matrix[i], matrix[j])
    return sim_matrix

In [72]:
# Time test of original version
start = time.time()
res1 = recommendItemCF(user_data = user_data, item_similarity_matrix = item_similarity(user_data), item_columns = user_data.columns)
print(time.time() - start)

KeyError: 5

In [53]:
# Memory test of original version
%%memit
res1 = recommendItemCF(user_data = user_data, item_similarity_matrix = item_similarity(user_data), item_columns = user_data.columns)

  user_preference = np.divide(user_data[item_columns],
1000it [00:08, 123.93it/s]


peak memory: 312.30 MiB, increment: 122.71 MiB


# Einstein

In [73]:
# Updated version using einstein summation from numpy 

def recommendItemCF_z(user_data, item_columns):
    user_records = np.sign(user_data[item_columns])
    user_preference = np.divide(user_data[item_columns], 
                                user_data[item_columns].sum(axis = 1).to_frame()).fillna(0).values
    
    return pd.DataFrame(np.einsum("ij,jk,ij->ji", user_records.T, cosine_similarity(user_data).T, user_preference.T))

In [74]:
# Time test for Einstein version
start = time.time()
resz = recommendItemCF_z(user_data = user_data, item_columns = user_data.columns)
print(time.time() - start)

0.0485990047454834


  user_preference = np.divide(user_data[item_columns],


In [56]:
# Memory test for Einstein version
%%memit
resz = recommendItemCF_z(user_data = user_data, item_columns = user_data.columns)

  user_preference = np.divide(user_data[item_columns],


peak memory: 234.25 MiB, increment: 7.76 MiB


# Update1

In [57]:
# additional version test for optimization, using sklearns cosine similarity instead
def recommendItemCF_o(user_data, item_similarity_matrix, item_columns):
    user_records = np.sign(user_data[item_columns])
    user_preference = np.divide(user_data[item_columns], 
                                user_data[item_columns].sum(axis = 1).to_frame()).fillna(0).values
    user_results_dict = {}
    
    for i,idx in tqdm(enumerate(user_data.index)):
        item_list = user_records[i]
        item_weights = user_preference[i]
        user_results_dict[idx] = dict(enumerate(np.sum(item_list.values * item_similarity_matrix * item_weights, axis = 0).flatten(), 1))
        #user_results_dict[idx] = dict(enumerate(np.sum(item_list.reshape(-1,1) * item_similarity_matrix * item_weights , axis = 0).flatten(), 1))
    return pd.DataFrame(user_results_dict)


def item_similarity_o(matrix):
    return cosine_similarity(matrix)

In [58]:
# Time test
start = time.time()
res2 = recommendItemCF_o(user_data = user_data, item_similarity_matrix = item_similarity_o(user_data), item_columns = user_data.columns)
print(time.time() - start)

  user_preference = np.divide(user_data[item_columns],
1000it [00:08, 121.94it/s]


8.769530057907104


In [59]:
# Memory test
%%memit
res2 = recommendItemCF_o(user_data = user_data, item_similarity_matrix = item_similarity_o(user_data), item_columns = user_data.columns)

  user_preference = np.divide(user_data[item_columns],
1000it [00:06, 147.76it/s]


peak memory: 355.46 MiB, increment: 162.80 MiB


# Update1\*

In [75]:
# additional version utilizing numpy arrays rather than pandas where possible 

def recommendItemCF_o2(user_data, item_similarity_matrix, item_columns):
    user_records = np.nan_to_num(np.sign(user_data[item_columns]),0).T
    user_preference = np.nan_to_num(
        np.divide(user_data[item_columns].values, user_data[item_columns].sum(axis = 1).values.reshape(-1,1))
        ,0).T
    user_results_dict = {}
    for i,idx in tqdm(enumerate(user_data.index)):
        item_list = user_records[i]
        item_weights = user_preference[i]
        user_results_dict[idx] = dict(
            enumerate(
                np.sum(item_list * item_similarity_matrix * item_weights, axis = 1),
                1
            )
        )

    return pd.DataFrame(user_results_dict)


def item_similarity_o2(matrix):
    return cosine_similarity(matrix)

In [76]:
# Time test
start = time.time()
res3 = recommendItemCF_o2(user_data = user_data, item_similarity_matrix = item_similarity_o2(user_data), item_columns = user_data.columns)
print(time.time() - start)

5it [00:00, 5097.60it/s]


IndexError: index 5 is out of bounds for axis 0 with size 5

In [62]:
# Memory test
%%memit
res3 = recommendItemCF_o2(user_data = user_data, item_similarity_matrix = item_similarity_o2(user_data), item_columns = user_data.columns)

1000it [00:06, 148.69it/s]


peak memory: 351.54 MiB, increment: 142.41 MiB


ORIGINAL OUTPUT

In [67]:
# Comparing outputs -- res1
res1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.072100,0.546056,1.357055,1.393925,0.263707,0.811989,0.000000,1.117406,1.411092,0.000000,...,0.282218,1.374968,0.858639,0.538992,0.549987,0.272163,0.825311,1.071682,1.403028,1.384939
1,0.810548,0.275226,1.094384,0.000000,1.063320,1.364211,1.082419,1.126402,1.422453,1.361530,...,1.137962,0.000000,0.000000,0.543331,0.831623,1.097417,0.831956,1.350388,0.565730,0.279218
2,1.062445,0.811707,1.344833,0.552549,1.306660,0.268225,0.000000,0.276836,1.398384,0.267698,...,0.000000,1.362585,0.283636,0.267069,1.362585,1.078848,1.363131,0.531015,0.000000,0.274493
3,0.265841,0.541605,0.000000,0.276513,0.261558,0.805372,0.266256,0.554150,1.399592,1.071719,...,1.399592,0.545505,1.135522,0.267300,0.818257,0.269945,0.818585,0.797211,1.113275,0.549461
4,1.314612,0.803490,1.331220,1.093911,1.293433,0.265510,0.526666,0.548067,0.830537,1.324943,...,0.000000,1.079033,0.561529,0.793096,0.269758,0.533963,1.079465,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.534753,0.000000,0.812263,0.556221,0.526138,0.540017,0.000000,0.836027,1.407679,0.538956,...,0.844607,0.000000,0.285521,0.537688,0.000000,1.357524,1.372191,0.000000,0.839781,0.000000
996,1.321322,0.000000,1.338015,1.099495,1.300035,0.266866,0.529354,0.275432,0.000000,0.000000,...,0.834776,0.813406,0.564395,0.531430,0.542271,1.341723,0.271244,1.056646,1.383344,1.092407
997,0.000000,1.100007,0.546746,1.404003,0.000000,0.000000,1.351921,0.844113,1.421293,0.544168,...,0.000000,1.107927,0.000000,1.085777,1.384908,1.370654,1.385463,0.809572,1.413172,0.000000
998,1.064310,1.355220,0.269439,0.000000,1.047163,1.343482,1.065972,0.554643,0.280168,0.536337,...,0.840503,0.818987,1.136534,0.000000,1.091982,0.000000,0.546210,1.329869,0.000000,1.099901


EINSTEIN OUTPUT

In [78]:
# User data refresh
user_data

Unnamed: 0,0,1,2,3,4
0,0,5,2,4,0
1,5,0,4,0,0
2,4,2,2,3,1
3,3,5,3,5,0
4,0,5,4,5,4
5,5,4,4,3,4
6,3,0,4,1,5
7,0,4,5,0,2
8,0,3,2,4,2
9,5,3,3,1,2


In [77]:
# Einstein results
resz

Unnamed: 0,0,1,2,3,4
0,0.0,3.030548,1.212219,2.424439,0.0
1,3.125374,0.0,2.500299,0.0,0.0
2,2.58459,1.292295,1.292295,1.938442,0.646147
3,1.443384,2.40564,1.443384,2.40564,0.0
4,0.0,2.094086,1.675269,2.094086,1.675269
5,2.056409,1.645127,1.645127,1.233846,1.645127
6,1.486589,0.0,1.982118,0.49553,2.477648
7,0.0,2.457733,3.072167,0.0,1.228867
8,0.0,1.976172,1.317448,2.634896,1.317448
9,2.737458,1.642475,1.642475,0.547492,1.094983


UPDATE1-MT OUTPUT

In [65]:
# Update1 results
res2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
1,1.072662,0.546342,1.357767,1.394657,0.263845,0.000000,0.000000,0.000000,1.411833,0.000000,...,0.282367,0.000000,0.859090,0.000000,0.000000,0.272306,0.825744,0.000000,1.403765,0.000000
2,0.803472,0.272823,1.084831,0.000000,1.054038,1.352301,1.072970,1.116569,0.000000,0.000000,...,1.128028,0.000000,0.000000,0.538588,0.824363,0.000000,0.000000,1.338599,0.560791,0.000000
3,1.061668,0.811114,1.343851,0.000000,1.305705,0.268029,0.000000,0.276633,1.397362,0.267503,...,0.000000,1.361589,0.283428,0.266874,1.361589,1.078060,1.362134,0.530627,0.000000,0.274293
4,0.263997,0.000000,0.000000,0.274595,0.259744,0.000000,0.264409,0.550306,1.389884,1.064285,...,1.389884,0.541721,1.127646,0.000000,0.812582,0.268073,0.812907,0.791681,0.000000,0.000000
5,1.354166,0.827665,1.371273,1.126824,1.332350,0.000000,0.542512,0.564557,0.855526,0.000000,...,0.000000,1.111499,0.000000,0.816959,0.000000,0.550029,1.111944,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0.534751,0.000000,0.812260,0.556219,0.526136,0.540014,0.000000,0.836024,0.000000,0.000000,...,0.844604,0.000000,0.285520,0.537686,0.000000,1.357518,1.372186,0.000000,0.000000,0.000000
997,1.336736,0.000000,1.353623,1.112320,1.315200,0.269978,0.535529,0.278645,0.000000,0.000000,...,0.844514,0.822894,0.570979,0.537629,0.000000,1.357374,0.274408,1.068972,1.399480,0.000000
998,0.000000,1.095836,0.544673,1.398679,0.000000,0.000000,1.346795,0.840913,0.000000,0.000000,...,0.000000,1.103726,0.000000,1.081660,1.379657,0.000000,1.380209,0.806503,1.407813,0.000000
999,1.058793,1.348195,0.000000,0.000000,0.000000,0.000000,1.060447,0.551769,0.278716,0.533557,...,0.836147,0.814741,1.130643,0.000000,1.086322,0.000000,0.543378,1.322976,0.000000,1.094200


UPDATE1-MP OUTPUT

In [69]:
# Update1* results
res3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
1,0.700222,0.717291,0.682139,0.689024,0.676947,0.679560,0.685164,0.699780,0.698082,0.655478,...,0.672053,0.721218,0.683092,0.676792,0.671008,0.695760,0.684282,0.714668,0.693705,0.671723
2,0.698687,0.716004,0.681217,0.688107,0.675990,0.679805,0.684806,0.699273,0.696816,0.653510,...,0.669875,0.721350,0.683463,0.676531,0.670684,0.694102,0.682155,0.714616,0.693289,0.670820
3,0.693153,0.710478,0.675537,0.681713,0.670771,0.673156,0.678929,0.693259,0.691028,0.648738,...,0.664260,0.714944,0.677013,0.670582,0.664707,0.688613,0.677290,0.708004,0.685532,0.665751
4,0.689422,0.705668,0.671197,0.678008,0.666783,0.669443,0.675215,0.689791,0.688014,0.644534,...,0.661208,0.710679,0.673716,0.666171,0.660873,0.684465,0.673101,0.704659,0.682059,0.661062
5,0.705968,0.725141,0.689237,0.695882,0.684398,0.686367,0.692372,0.706682,0.705205,0.660947,...,0.678308,0.729159,0.690351,0.684257,0.677010,0.702413,0.691151,0.721758,0.701102,0.678761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0.697462,0.715960,0.680439,0.687244,0.674879,0.678573,0.683541,0.697929,0.695554,0.652175,...,0.669320,0.719923,0.682470,0.675188,0.668937,0.694244,0.681979,0.713470,0.690792,0.670564
997,0.697804,0.715315,0.680289,0.687059,0.675475,0.678379,0.683431,0.698077,0.695899,0.652691,...,0.669722,0.719421,0.681474,0.675515,0.668824,0.694302,0.681433,0.713878,0.690882,0.669896
998,0.701808,0.720536,0.683419,0.691069,0.678392,0.682219,0.687417,0.702102,0.699701,0.656476,...,0.673771,0.724746,0.685900,0.679752,0.674077,0.697070,0.686316,0.717446,0.696014,0.673781
999,0.691420,0.708022,0.672516,0.680801,0.667494,0.670912,0.677143,0.691696,0.689251,0.646542,...,0.662228,0.712390,0.674987,0.668687,0.662168,0.686737,0.675739,0.706620,0.684061,0.663885
