In [2]:
import pandas as pd
import numpy as np
from typing import Callable
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
def load_data(file, sep='\t'):
    return pd.read_csv(f'./lfm-challenge-data/{file}', delimiter=sep)

In [4]:
users = load_data('lfm-challenge.user')
items = load_data('lfm-challenge.item')
inter_train = load_data('lfm-challenge.inter_train')
inter_test = load_data('lfm-challenge.inter_test')
test_users = pd.read_csv(f'./lfm-challenge-data/test_indices.txt')['users'].values

n_users = users['user_id'].values.size
n_items = items.index.values.size

In [5]:
def create_interaction_matrix(users, items, inter, threshold=1, binary=False):
    interaction_matrix = np.zeros((n_users, n_items), dtype=np.int8)
    
    for user in range(n_users):
        interacted_items = inter.loc[inter['user_id'] == user, 'item_id'].values
        rate_of_items = inter.loc[inter['user_id'] == user, 'listening_events'].values
        
        for item in range(interacted_items.size):
            rating = rate_of_items[item]
            if binary:
                rating = 0 if rating < threshold else 1
            
            interaction_matrix[user, interacted_items[item]] = rating
    
    return interaction_matrix

In [6]:
interaction_matrix = create_interaction_matrix(users, items, inter_train, binary=True)
test_interaction_matrix = create_interaction_matrix(users, items, inter_test, binary=True)

In [12]:
def create_item_knn(interaction_matrix, n_neighbors=5):
    # Convert the numpy array to a sparse matrix
    interaction_csr = csr_matrix(interaction_matrix)

    # Initialize the model
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    model_knn.fit(interaction_csr.T)  # Fit the model on the transpose of the matrix (items as rows)

    return model_knn

In [13]:
def make_recommendations(user_id, model_knn, interaction_matrix, n_recommendations=10):
    # Get the interactions of the user
    user_interactions = interaction_matrix[user_id]

    # Get the indices of the items that the user has interacted with
    interacted_indices = np.where(user_interactions > 0)[0]

    # Initialize an array to hold the distances and indices
    distances, indices = [], []

    # Get the distances and indices of the nearest items to the ones the user has interacted with
    for idx in interacted_indices:
        dist, ind = model_knn.kneighbors(interaction_matrix[idx].reshape(1, -1), n_neighbors=n_recommendations+1)
        distances.append(dist)
        indices.append(ind)

    # Flatten the distances and indices arrays
    distances = np.asarray(distances).flatten()
    indices = np.asarray(indices).flatten()

    # Get the top N recommendations
    recommendations = indices[np.argsort(distances)[:n_recommendations]]

    return recommendations


In [14]:
model_knn = create_item_knn(interaction_matrix)

In [15]:
recommendations = {}
for i, user_id in enumerate(test_users):
    print(f"{i}/100", end="\r")
    recommendations[user_id] = make_recommendations(user_id, model_knn, interaction_matrix)

0/100

ValueError: X has 10000 features, but NearestNeighbors is expecting 9699 features as input.