Se determina el flujo completo que debe pasar el dato original hasta la obtención de las métricas de evaluación.

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

def generate_ratings_matrix(data_rows: np.ndarray):
    r = np.zeros((943, 1682))
    r[:] = np.nan
    for i in data_rows:
        r[i[0] - 1][i[1] - 1] = i[2]
    return r

# Carga de datos de entrenamiento y test (el test es un 10% del entrenamiento en nuestro caso)
train_ratings = pd.read_csv("../data/train.csv")
test_ratings = pd.read_csv("../data/test.csv")

In [4]:
print(train_ratings.head(3))

   user_id  item_id  rating  timestamp
0       85      423       4  879454046
1      290       71       5  880473667
2      152      692       5  880149963


In [5]:
print(test_ratings.head(3))

   user_id  item_id  rating  timestamp
0      301       39       3  882076292
1      288      121       2  886893063
2      234      614       3  892334609


# Normalización de los datos

En este caso por ejemplo usamos Z Score.

In [6]:
class ZScoreNormalization:
    def __init__(self, user: bool = True):
        self.user = user
        self.means = []
        self.stds = []

    def transform(self, r: np.ndarray):
        n_u, n_i = r.shape
        if self.user:
            self.means = np.hstack([np.reshape(np.nanmean(r, axis=1), (-1, 1))] * n_i)
            self.stds = np.hstack([np.reshape(np.nanstd(r, axis=1), (-1, 1))] * n_i)
        else:
            self.means = np.vstack([np.reshape(np.nanmean(r, axis=0), (1, -1))] * n_u)
            self.stds = np.vstack([np.reshape(np.nanstd(r, axis=0), (1, -1))] * n_u)

        return (r - self.means) / self.stds

    def reverse_transform(self, r: np.ndarray):
        # We use hadamard product
        return np.multiply(self.stds, r) + self.means

Convertimos los ratings de entrenamiento a una matriz de ratings.

In [7]:
r = generate_ratings_matrix(train_ratings.values)

In [8]:
print(r)

[[ 5.  3.  4. ... nan nan nan]
 [ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan  5. nan ... nan nan nan]]


In [9]:
r_centered = ZScoreNormalization().transform(r)

In [10]:
print(r_centered)

[[ 1.11297179 -0.50180231  0.30558474 ...         nan         nan
          nan]
 [ 0.23036965         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 ...
 [ 0.98149546         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan  1.27155178         nan ...         nan         nan
          nan]]


# Cálculo del coeficiente de correlación de Pearson entre usuarios

In [11]:
def pearson_correlation(u: np.ndarray, v: np.ndarray):
    """
    Computes the pearson correlation between two vectors
    Args:
        u: First vector
        v: Second vector

    Returns:
        The pearson correlation value
    """

    u_mean = np.nanmean(u)
    v_mean = np.nanmean(v)

    mean_deviation = (u - u_mean) * (v - v_mean)
    num = np.nansum(mean_deviation)

    # We can get the common elements from mean_deviation
    common_indices = np.argwhere(~ np.isnan(mean_deviation))
    u_common = u[common_indices]
    v_common = v[common_indices]

    den = np.sqrt(np.sum(np.square(u_common - u_mean)) * np.sum(np.square(v_common - v_mean)))
    return num / den

In [21]:
sim_matrix = np.zeros((943, 943))

for i, u in tqdm(enumerate(r_centered)):
    for j, v in enumerate(r_centered):
        sim_matrix[i][j] = pearson_correlation(u, v)

  return num / den
943it [02:50,  5.54it/s]


No vale hacerlo con el .corr() de pandas porque la lógica es distinta al calcular las desviaciones estándar.

In [34]:
sim_matrix

array([[ 1.        ,  0.21493829,  0.13931314, ...,  0.22516616,
        -0.15267653,  0.08732096],
       [ 0.21493829,  1.        , -0.21903385, ..., -0.29904328,
         0.203545  ,  0.14623625],
       [ 0.13931314, -0.21903385,  1.        , ...,  0.83481571,
         0.31565668,  0.12403473],
       ...,
       [ 0.22516616, -0.29904328,  0.83481571, ...,  1.        ,
        -0.48163175,  0.07110123],
       [-0.15267653,  0.203545  ,  0.31565668, ..., -0.48163175,
         1.        ,  0.38047957],
       [ 0.08732096,  0.14623625,  0.12403473, ...,  0.07110123,
         0.38047957,  1.        ]])

Guardamos la distancia para futuros procesos.

In [51]:
with open("./sim_matrix.npy", "wb") as f:
    np.save(f, sim_matrix)

In [53]:
with open("./sim_matrix.npy", "rb") as f:
    sim_matrix = np.load(f)

Seleccionamos el número adecuado de vecinos cercanos al usuario activo (si lo hacemos para usuarios). Después, computamos el weighted rating. 