In [6]:
# Exercise 2

import numpy as np
from scipy.sparse import csr_matrix

def centered_cosine_sim(vector_x, vector_y):
    # Convert to dense arrays and replace NaNs with zeros
    vector_x = vector_x.toarray().flatten()
    vector_y = vector_y.toarray().flatten()
    # Center the vectors by subtracting the mean of non-zero entries
    x_mean = np.mean(vector_x[vector_x != 0])
    y_mean = np.mean(vector_y[vector_y != 0])
    centered_x = vector_x - x_mean
    centered_y = vector_y - y_mean
    # Compute the centered cosine similarity
    similarity = np.dot(centered_x, centered_y) / (np.linalg.norm(centered_x) * np.linalg.norm(centered_y))
    return similarity

In [7]:
def fast_centered_cosine_sim(matrix, vector):
    # Convert the matrix and vector to dense arrays
    matrix = matrix.toarray()
    vector = vector.toarray().flatten()
    # Center the vector
    vector_mean = np.mean(vector[vector != 0])
    centered_vector = vector - vector_mean
    # Center each row in the matrix
    matrix_centered = matrix - np.nan_to_num(matrix.mean(axis=1, keepdims=True))
    # Calculate cosine similarities
    dot_product = matrix_centered.dot(centered_vector)
    norms = np.linalg.norm(matrix_centered, axis=1) * np.linalg.norm(centered_vector)
    similarities = dot_product / norms
    return similarities

In [8]:
import numpy as np
from scipy.sparse import csr_matrix

# Test for case b.1
def test_centered_cosine_sim_case_1():
    k = 100
    vector_x = csr_matrix(np.array([i + 1 for i in range(k)]))
    vector_y = csr_matrix(np.array([k - i for i in range(k)]))
    similarity = centered_cosine_sim(vector_x, vector_y)
    print("Similarity (case 1):", similarity)

# Test for case b.2
def test_centered_cosine_sim_case_2():
    k = 100
    c_values = [2, 3, 4, 5, 6]
    vector_x = np.array([np.nan if (i % 10 == 0 and i // 10 in c_values) else i + 1 for i in range(k)])
    vector_x_sparse = csr_matrix(np.nan_to_num(vector_x, nan=0))
    vector_y = csr_matrix(np.array([k - i for i in range(k)]))
    similarity = centered_cosine_sim(vector_x_sparse, vector_y)
    print("Similarity (case 2):", similarity)

# Run tests
test_centered_cosine_sim_case_1()
test_centered_cosine_sim_case_2()

Similarity (case 1): -1.0000000000000002
Similarity (case 2): -0.9479121771961365
