# Recommender System

### Replication of lecture examples

In [9]:
import numpy as np
import pandas as pd

### Cosine similarity:
$$sim(\mu, v) = \frac{\sum_{i\in I_{\mu v}}r_{\mu i}r_{v i}}{\sqrt{\sum_{i\in I_{\mu v}}r_{\mu i}^2}\sqrt{\sum_{i\in I_{\mu v}}r_{v i}^2}}$$

function cos_sim () <br>
vec1 \& vec2 two input vectors <br>
output: cosine similarity

In [10]:
def cos_sim(vec1, vec2):
    vec1 = (vec2+1e-5)/(vec2+1e-5)*vec1
    vec2 = (vec1+1e-5)/(vec1+1e-5)*vec2
    num = np.sum(vec1*vec2)
    den = np.sqrt(np.sum(vec1**2))*np.sqrt(np.sum(vec2**2))
    return num/den

### Pearson similarity:
$$sim(\mu, v) = \frac{\sum_{i\in I_{\mu v}}(r_{\mu i}-\bar{r}_\mu)(r_{v i}- \bar{r}_v)}{\sqrt{\sum_{i\in I_{\mu v}}(r_{\mu i}-\bar{r}_\mu)^2}\sqrt{\sum_{i\in I_{\mu v}}(r_{v i}- \bar{r}_v)^2}}$$

function pear_sim () <br>
vec1 \& vec2 two input vectors <br>
output: pearson similarity

In [11]:
def pear_sim(vec1, vec2):
    c_vec1 = vec1 - np.mean(vec1)
    c_vec2 = vec2 - np.mean(vec2)
    c_vec1 = (c_vec2+1e-5)/(c_vec2+1e-5)*c_vec1
    c_vec2 = (c_vec1+1e-5)/(c_vec1+1e-5)*c_vec2
    num = np.sum(c_vec1*c_vec2)
    den = (np.sum(c_vec1**2))**0.5*(np.sum(c_vec2**2))**0.5
    return num/den

function get_simiarity matrix () <br>
input: <br>
df: dataframe <br>
sim_measure: similarity measurement, either 'cosine' OR 'pearson'<br>
output: similarity matrix

In [12]:
def get_similarity_matrix(df, sim_measure = 'perason'):
    nb_users = df.shape[0]
    similarity_matrix = np.zeros((nb_users, nb_users))
    if sim_measure == 'cosine':
        for i in range(nb_users):
            for j in range(nb_users):
                similarity_matrix[i,j] = cos_sim(df.iloc[i], df.iloc[j])
    elif sim_measure == 'pearson':
        for i in range(nb_users):
            for j in range(nb_users):
                similarity_matrix[i,j] = pear_sim(df.iloc[i], df.iloc[j])
    else:
        print('please input correct similarity measurements')
    return similarity_matrix

Find top k nearest neighours <br>
input:<br>
similarities<br>
k<br>

In [13]:
def get_TopK_neighbors(similarities, TopK):
    sorted_neighours = np.argsort(similarities)[::-1]
    return sorted_neighours[1:TopK+1]

### Basic prediction
$$\hat{r_{ui}} = \frac{\sum_{v\in N^k_i(\mu)}sim(\mu, v)r_{vi}}{|\sum_{v\in N^k_i(\mu)}sim(\mu, v)|}$$
<br>
**function basic_estimate()**<br>
input: <br>
similarity: $sim(\mu, v)$ <br>
rating:$r_{vi}$<br>
output: estimate

In [14]:
def basic_estimate(similarities, rating):
    estimate = np.sum(similarities*rating)/np.sum(abs(similarities))
    return estimate

### Mean-centered prediction
$$\hat{r_{ui}} = \bar{r}_\mu + \frac{\sum_{v\in N^k_i(\mu)}sim(\mu, v)(r_{vi}-\bar{r}_v)}{|\sum_{v\in N^k_i(\mu)}sim(\mu, v)|}$$

**function mean_centered_estimate()**<br>
input: <br>
similarity: $sim(\mu, v)$ <br>
rating:$r_{vi}$<br>
mean
output: estimate

In [15]:
def mean_centered_estimate(similarities, rating, item_mean, mean):
    num = np.sum(similarities*(rating-mean))
    den = np.sum(abs(similarities))
    estimate = item_mean + num/den
    return estimate

### Find single estimation for particular user and item

**function get_single_estimate()**<br>
Find single estimation for particular user and item
input: <br>
df: dataframe <br>
user: user of interest<br>
item: item of interest <br>
TopK: k number of neighbours for collabrating <br>
sim_measure: simiarlity measurement, either 'pearson' OR 'cosine'<br>
method: estimation method, either 'basic' OR 'mean_centered' <br>
output: estimate

In [16]:
def get_single_estimate(df, user, item, TopK, sim_measure = 'pearson', method = 'mean_centered'):
    mean_rating = np.mean(df, axis = 1)
    sim_matrix = get_similarity_matrix(df, sim_measure = sim_measure)
    sim = sim_matrix[user]
    TopK_ngbrs_idx = get_TopK_neighbors(sim, TopK)
    TopK_ngbrs_sim = sim[TopK_ngbrs_idx]
    TopK_ngbrs_rating = df.iloc[TopK_ngbrs_idx, item]
    TopK_ngbrs_mean_rating = mean_rating[TopK_ngbrs_idx]
    if method == 'basic':
        estimate = basic_estimate(TopK_ngbrs_sim,TopK_ngbrs_rating)
    elif method == 'mean_centered':
        estimate = mean_centered_estimate(TopK_ngbrs_sim, TopK_ngbrs_rating, mean_rating[user], TopK_ngbrs_mean_rating)
    else:
        print('please input correct estimation method')
    return estimate

**function get_estimated_matrix()**<br>
Find estimation for the whole user-item matrix <br>
input: <br>
df: dataframe <br>
TopK: k number of neighbours for collabrating <br>
sim_measure: simiarlity measurement, either 'pearson' OR 'cosine'<br>
method: estimation method, either 'basic' OR 'mean_centered' <br>
output: estimate

In [17]:
def get_estimated_matrix(df, TopK, sim_measure, method):
    estimated_matrix = np.zeros((df.shape))
    nb_users, nb_items = df.shape[0], df.shape[1]
    mean_rating = np.mean(df, axis = 1)
    similarity_matrix = get_similarity_matrix(df, sim_measure = 'pearson')
    for i in range(nb_users):
        for j in range(nb_items):
            estimated_matrix[i,j] = get_single_estimate(df, i, j, TopK, sim_measure = sim_measure, method = method)
    return pd.DataFrame(estimated_matrix)

### Data Generation

In [18]:
d = {'item0': [np.nan, 7, 6, 1, 1], 'item1': [3, 6, 7, 2, np.nan], 
      'item2': [3, 7, np.nan, 2, 1], 'item3': [1, 4, 4, 3, 2],
      'item4': [1, 5, 3, 3, 3], 'item5': [np.nan, 4, 4, 4, 3]}
df = pd.DataFrame(data=d).astype('float32')

In [19]:
df

Unnamed: 0,item0,item1,item2,item3,item4,item5
0,,3.0,3.0,1.0,1.0,
1,7.0,6.0,7.0,4.0,5.0,4.0
2,6.0,7.0,,4.0,3.0,4.0
3,1.0,2.0,2.0,3.0,3.0,4.0
4,1.0,,1.0,2.0,3.0,3.0


In [20]:
cos_sim(df.iloc[0], df.iloc[1])

0.9561829

In [21]:
pear_sim(df.iloc[0], df.iloc[1])

0.8944271909999159

In [22]:
get_single_estimate(df, user = 0, item = 0,TopK=2, sim_measure='pearson', method='basic')

6.487984325166911

In [23]:
get_single_estimate(df, user = 0, item = 5, TopK=2, sim_measure='pearson', method='mean_centered')

0.8584108747239225

In [24]:
get_estimated_matrix(df, TopK=2, sim_measure='pearson', method='mean_centered')

Unnamed: 0,0,1,2,3,4,5
0,3.346395,3.370427,2.731976,0.858411,0.83438,0.858411
1,6.027116,7.027116,6.060737,4.587853,4.148589,5.148589
2,5.441189,5.586271,6.01373,3.586271,4.01373,4.158811
3,1.409795,1.50774,1.951027,2.860822,3.860822,3.409795
4,1.224972,1.258343,1.258343,2.741657,2.741657,2.775028


## <span style="color:red"> *Can you replicate this procedure for item-base approach?* </span>

## Extension: SVD Method

In [27]:
from sklearn.decomposition import TruncatedSVD

#### transform pandas data frame to numpy array

In [28]:
X = np.array(df)
X

array([[nan,  3.,  3.,  1.,  1., nan],
       [ 7.,  6.,  7.,  4.,  5.,  4.],
       [ 6.,  7., nan,  4.,  3.,  4.],
       [ 1.,  2.,  2.,  3.,  3.,  4.],
       [ 1., nan,  1.,  2.,  3.,  3.]], dtype=float32)

#### Find the missing index

In [29]:
missing_idx = np.isnan(X)
missing_idx

array([[ True, False, False, False, False,  True],
       [False, False, False, False, False, False],
       [False, False,  True, False, False, False],
       [False, False, False, False, False, False],
       [False,  True, False, False, False, False]])

#### Copy the original arrary X and initialize the missing value with 2

In [30]:
X1 = X.copy()
X1[missing_idx] = 2
X1

array([[2., 3., 3., 1., 1., 2.],
       [7., 6., 7., 4., 5., 4.],
       [6., 7., 2., 4., 3., 4.],
       [1., 2., 2., 3., 3., 4.],
       [1., 2., 1., 2., 3., 3.]], dtype=float32)

#### Apply the trucated SVD with number of components set as 2

In [31]:
svd = TruncatedSVD(n_components=2, n_iter=10, random_state=1)
svd.fit(X1)
W = svd.transform(X1)
H = svd.components_

In [32]:
W

array([[ 5.0392003, -0.679335 ],
       [13.606015 , -1.4701223],
       [11.012465 , -0.407228 ],
       [ 5.8194103,  2.9146538],
       [ 4.723533 ,  2.317932 ]], dtype=float32)

In [33]:
H

array([[ 0.46895048,  0.502429  ,  0.38339925,  0.3361569 ,  0.35501695,
         0.3763203 ],
       [-0.53193945, -0.19476764, -0.29994512,  0.31165352,  0.38700145,
         0.58501345]], dtype=float32)

#### Get the reformed matrix X2 through the production of two factorized matrice W \& H

In [34]:
X2 = np.matmul(W,H)
X2

array([[2.7245004 , 2.6641529 , 2.1357887 , 1.4822448 , 1.5260978 ,
        1.4989332 ],
       [7.162564  , 7.122389  , 5.657492  , 4.1155868 , 4.2614264 ,
        4.2601786 ],
       [5.380921  , 5.612296  , 4.344317  , 3.575002  , 3.7520137 ,
        3.9059803 ],
       [1.178596  , 2.3561604 , 1.3569214 , 2.864597  , 3.1939645 ,
        3.895074  ],
       [0.98210377, 1.9217819 , 1.1157467 , 2.31024   , 2.5739775 ,
        3.1335828 ]], dtype=float32)

#### Repeat earilier step by replacing the missing values with those values in newly reformed matrix X2

In [35]:
X3 = X.copy()
X3[missing_idx] = X2[missing_idx]
X3

array([[2.7245004, 3.       , 3.       , 1.       , 1.       , 1.4989332],
       [7.       , 6.       , 7.       , 4.       , 5.       , 4.       ],
       [6.       , 7.       , 4.344317 , 4.       , 3.       , 4.       ],
       [1.       , 2.       , 2.       , 3.       , 3.       , 4.       ],
       [1.       , 1.9217819, 1.       , 2.       , 3.       , 3.       ]],
      dtype=float32)

In [36]:
svd = TruncatedSVD(n_components=2, n_iter=10, random_state=1)
svd.fit(X3)
W3 = svd.transform(X3)
H3 = svd.components_

In [37]:
X4 = np.matmul(W3,H3)

In [38]:
X4

array([[3.058597 , 2.8461556, 2.6325283, 1.3299459, 1.2590867, 1.1737914],
       [6.872852 , 6.9315777, 6.2484818, 4.171023 , 4.2655015, 4.3520336],
       [5.9590497, 6.010938 , 5.418298 , 3.6185925, 3.7009687, 3.776499 ],
       [1.1539471, 2.139154 , 1.6550188, 2.851798 , 3.3292387, 3.8499036],
       [0.9103511, 1.7034409, 1.3154991, 2.2847703, 2.6689286, 3.0879116]],
      dtype=float32)

In [39]:
X5 = X.copy()
X5[missing_idx] = X4[missing_idx]
X5

array([[3.058597 , 3.       , 3.       , 1.       , 1.       , 1.1737914],
       [7.       , 6.       , 7.       , 4.       , 5.       , 4.       ],
       [6.       , 7.       , 5.418298 , 4.       , 3.       , 4.       ],
       [1.       , 2.       , 2.       , 3.       , 3.       , 4.       ],
       [1.       , 1.7034409, 1.       , 2.       , 3.       , 3.       ]],
      dtype=float32)

......

......

## <span style="color:red"> *Can you write down a iteration loop for this procedure and set your own stoping criterion?* </span>