In [None]:
import numpy as np
import pandas as pd

In [None]:
def cosine_similarity(x: np.ndarray, y: np.ndarray) -> float:
  """Return the cosine similarity between the vectors x and y.

  Computed as dot(x, y) / (norm(x) * norm(y)).

  Args:
    x: First vector.
    y: Second vector, same length as x.

  Returns:
    The cosine of the angle between x and y. When either vector has zero
    norm the ratio is undefined; 0.0 is returned in that case instead of
    dividing by zero (which would give nan and a RuntimeWarning).
  """
  denominator = np.linalg.norm(x) * np.linalg.norm(y)

  # A zero-norm vector has no direction: report "no similarity"
  if denominator == 0:
    return 0.0

  cosine_sim = np.dot(x, y) / denominator

  return cosine_sim

In [None]:
def array_centering(v: np.ndarray, eps: float = 1e-6) -> np.ndarray:
  """Mean-center the positive entries of v, leaving the other entries as-is.

  Only entries greater than zero take part: their mean is subtracted from
  each of them and `eps` is added. Entries that are zero or negative are
  returned unchanged. The input itself is never modified.

  Args:
    v: Array-like of numbers.
    eps: Small offset added to every centered entry. An entry equal to the
      mean therefore ends up at `eps` rather than exactly 0.

  Returns:
    A new floating-point array with the same shape as v.
  """
  # Work on a floating-point copy: with an integer array the centered
  # values would be truncated when written back.
  v = np.array(v)
  if not np.issubdtype(v.dtype, np.floating):
    v = v.astype(float)
  positive = v > 0

  # Nothing to center (also avoids taking the mean of an empty selection)
  if not positive.any():
    return v

  # Replace each positive entry by its deviation from the positive-entry mean
  v[positive] = v[positive] - np.mean(v[positive]) + eps

  return v

def centered_cosine_similarity(x: np.array, y: np.array):
  """Cosine similarity of x and y after passing each through array_centering."""
  return cosine_similarity(array_centering(x), array_centering(y))

In [None]:
# Ratings matrix: one row per user (U1..U5), one column per item (I1..I7).
# Each list holds one item's ratings in user order.
ratings = pd.DataFrame(
    {
        'I1': [4,   5,   0,   4.1,  1],
        'I2': [0,   4.5, 0,   3,    4],
        'I3': [0,   4,   0,   0,    0],
        'I4': [4.7, 0,   1.5, 4.9,  2.5],
        'I5': [1,   0,   5,   0,    3.8],
        'I6': [0,   0,   4,   0,    1],
        'I7': [0,   0,   0,   3,    5],
    },
    index=['U1', 'U2', 'U3', 'U4', 'U5'],
    dtype=float,
)

# Display the matrix
ratings

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7
U1,4.0,0.0,0.0,4.7,1.0,0.0,0.0
U2,5.0,4.5,4.0,0.0,0.0,0.0,0.0
U3,0.0,0.0,0.0,1.5,5.0,4.0,0.0
U4,4.1,3.0,0.0,4.9,0.0,0.0,3.0
U5,1.0,4.0,0.0,2.5,3.8,1.0,5.0


In [None]:
# Raw cosine similarity between the rating rows of users U1 and U2
x, y = ratings.loc[['U1', 'U2']].values
cos_sim = cosine_similarity(x=x, y=y)
print(f"Cosine similarity between U1 and U2 is {cos_sim:.2f}.")

Cosine similarity between U1 and U2 is 0.41.


In [None]:
# Raw cosine similarity between the rating rows of users U1 and U3
x, y = ratings.loc[['U1', 'U3']].values
cos_sim = cosine_similarity(x=x, y=y)
print(f"Cosine similarity between U1 and U3 is {cos_sim:.2f}.")

Cosine similarity between U1 and U3 is 0.29.


In [None]:
# Raw cosine similarity between the rating rows of users U1 and U4
x, y = ratings.loc[['U1', 'U4']].values
cos_sim = cosine_similarity(x=x, y=y)
print(f"Cosine similarity between U1 and U4 is {cos_sim:.2f}.")

Cosine similarity between U1 and U4 is 0.82.


In [None]:
# Raw cosine similarity between the rating rows of users U1 and U5
x, y = ratings.loc[['U1', 'U5']].values
cos_sim = cosine_similarity(x=x, y=y)
print(f"Cosine similarity between U1 and U5 is {cos_sim:.2f}.")

Cosine similarity between U1 and U5 is 0.39.


In [None]:
# Centered cosine similarity between the rating rows of users U1 and U2
x, y = ratings.loc[['U1', 'U2']].values
cos_sim = centered_cosine_similarity(x=x, y=y)
print(f"Centralized cosine similarity between U1 and U2 is {cos_sim:.2f}.")

Centralized cosine similarity between U1 and U2 is 0.20.


In [None]:
# Centered cosine similarity between the rating rows of users U1 and U3
x, y = ratings.loc[['U1', 'U3']].values
cos_sim = centered_cosine_similarity(x=x, y=y)
print(f"Centralized cosine similarity between U1 and U3 is {cos_sim:.2f}.")

Centralized cosine similarity between U1 and U3 is -0.89.


In [None]:
# Centered cosine similarity between the rating rows of users U1 and U4
x, y = ratings.loc[['U1', 'U4']].values
cos_sim = centered_cosine_similarity(x=x, y=y)
print(f"Centralized cosine similarity between U1 and U4 is {cos_sim:.2f}.")

Centralized cosine similarity between U1 and U4 is 0.44.


In [None]:
# Centered cosine similarity between the rating rows of users U1 and U5
x, y = ratings.loc[['U1', 'U5']].values
cos_sim = centered_cosine_similarity(x=x, y=y)
print(f"Centralized cosine similarity between U1 and U5 is {cos_sim:.2f}.")

Centralized cosine similarity between U1 and U5 is -0.39.


In [None]:
def estimate_rating(ratings,
                    user_index=0,
                    item_index=0,
                    k=2,
                    similarity=centered_cosine_similarity,
                    aggregation='mean',
                    default_rating=3.,
                    verbose=True):
  """
  Estimate one entry of `ratings` with k-nearest-neighbour collaborative
  filtering.

  Orientation: neighbours are always ROWS of `ratings`. `item_index` is a
  row position and `user_index` is a column position, so this is Item-Item
  filtering when rows are items and columns are users. With a users x items
  frame the rows being compared are users; pass `ratings.T` to swap.

  A rating of 0 is treated as "not rated". Candidate neighbours are the rows
  other than `item_index` whose entry in column `user_index` is greater
  than 0.

  Args:
    ratings: pandas DataFrame of numeric ratings, addressed by position.
    user_index: Column position of the entry to estimate.
    item_index: Row position of the entry to estimate.
    k: Maximum number of neighbours to aggregate.
    similarity: Callable invoked as `similarity(x=row_a, y=row_b)` on two
      row arrays; larger values mean more similar.
    aggregation: 'mean' averages the neighbours' ratings; 'wmean' weights
      each rating by the absolute value of its similarity.
    default_rating: Value returned when no estimate can be computed (no
      neighbour available, or the aggregation evaluates to nan).
    verbose: When True, print the prediction.

  Returns:
    The estimated rating.

  Raises:
    ValueError: If `aggregation` is neither 'mean' nor 'wmean'.
  """

  # Fail fast, before any similarity is computed
  if aggregation not in ('mean', 'wmean'):
    raise ValueError(f"{aggregation} is an invalid value for aggregation!")

  # Rating vector of the target row
  item_rating = ratings.iloc[item_index].values

  # Candidate neighbours: the other rows that have a rating in this column
  rated = np.flatnonzero(ratings.iloc[:, user_index].values > 0)
  candidates = rated[rated != item_index]

  # Similarity is only needed for the candidates; rows without a rating in
  # this column can never be selected.
  similarities = np.array([similarity(x=item_rating, y=ratings.iloc[i].values)
                           for i in candidates], dtype=float)

  # Positions (within candidates) of the k most similar rows; the stable
  # sort keeps tied rows in their original order
  top = np.argsort(-similarities, kind='stable')[:k]
  k_closest = candidates[top]

  # Aggregation
  ratings_k_closest = ratings.iloc[k_closest, user_index]
  if k_closest.size == 0:
    prediction = np.nan
  elif aggregation == 'mean':
    prediction = np.mean(ratings_k_closest)
  else:
    weights = np.abs(similarities[top])
    total_weight = np.sum(weights)
    # A zero (or nan) total weight leaves the weighted mean undefined
    if total_weight > 0:
      prediction = np.dot(weights, ratings_k_closest) / total_weight
    else:
      prediction = np.nan

  # Handles singularity
  prediction = prediction if not np.isnan(prediction) else default_rating

  # Verbose
  if verbose:
    print(f"Prediction for user_index={user_index}, item_index={item_index}," \
          f"k={k}, aggregation={aggregation} is: {prediction:.2f}")
  return prediction

In [None]:
# Estimate the rating of user U1 for item I2.
# estimate_rating reads item_index as the row and user_index as the column,
# so item_index=0 picks row U1 and user_index=1 picks column I2.
estimate_rating(
    ratings,
    user_index=1,
    item_index=0,
    aggregation='mean',
)

Prediction for user_index=1, item_index=0,k=2, aggregation=mean is: 3.75


3.75

In [None]:
# Estimate the rating of user U1 for item I1 (row 0, column 0).
# That entry is already rated 4.0 in the matrix, so the estimate can be
# compared with the known value.
estimate_rating(
    ratings,
    user_index=0,
    item_index=0,
    aggregation='mean',
)

Prediction for user_index=0, item_index=0,k=2, aggregation=mean is: 4.55


4.55

In [None]:
# Hold out the known rating at row 0 / column 0, predict it, and measure the
# error. With a single held-out value the RMSE equals the absolute error.

# Training copy with the held-out entry set to 0 (not rated)
ratings_train = ratings.copy()
ratings_train.iloc[0, 0] = 0.

# Real value
y_true = ratings.iloc[0, 0]

# Prediction
y_pred = estimate_rating(
    ratings_train,
    user_index=0,
    item_index=0,
    aggregation='mean',
)

# RMSE
rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
print(f"FC test RMSE with mean aggregation: {rmse: .2f}")

Prediction for user_index=0, item_index=0,k=2, aggregation=mean is: 4.55
FC test RMSE with mean aggregation:  0.55


In [None]:
# Baseline estimate for the pair (user U1, item I2): r = b_u + b_i - mu.
# Every average is taken over rated entries only (0 means "not rated").

# b_u: mean of the ratings given by U1 (row 0), divided by U1's own count
b_u = ratings.iloc[0].sum() / (ratings.iloc[0] > 0).sum()
# b_i: mean of the ratings received by I2
b_i = ratings['I2'].sum() / (ratings['I2'] > 0).sum()
# mu: mean of all ratings in the matrix
mu = ratings[ratings > 0].sum().sum() / (ratings > 0).sum().sum()
r = b_u + b_i - mu

print(f'b_u = {b_u: .2f}')
print(f'b_i = {b_i: .2f}')
print(f'mu = {mu: .2f}')
print(f'r = {r: .2f}')

b_u =  3.23
b_i =  3.83
mu =  3.47
r =  3.59
