In [29]:
import pandas as pd
import numpy as np
from typing import Callable
from sklearn.metrics.pairwise import cosine_similarity

# Load Data

In [30]:
def load_data(file, sep='\t'):
    return pd.read_csv(f'./lfm-challenge-data/{file}', delimiter=sep)

In [31]:
users = load_data('lfm-challenge.user')
items = load_data('lfm-challenge.item')
inter_train = load_data('lfm-challenge.inter_train')
inter_test = load_data('lfm-challenge.inter_test')
test_users = pd.read_csv(f'./lfm-challenge-data/test_indices.txt')['users'].values

n_users = users['user_id'].values.size
n_items = items.index.values.size

In [40]:
test_users

array([4251, 9092, 6483, 4517, 4353, 7505, 1504, 3152, 1606, 6897, 1771,
       4815, 4173, 7909, 3592, 6689, 8063, 1954, 8530, 9346, 2202, 8896,
       8598, 1247, 1572, 8070, 7687, 1849, 7330, 1367, 2340, 5343, 6779,
       5069, 5256, 2810, 8733, 7546, 6189, 7438,  900, 8155, 5282, 3762,
       4289, 4618, 6097,  657, 7312, 2211, 6274, 8691, 6594, 3554, 4318,
       4493, 8899, 4947, 7072, 7183, 3882, 3577, 7421,  744, 6172, 6617,
       7970,  836, 2684,  956, 9359, 7305,  427, 8231, 6749, 4235, 8257,
       1895, 3522,  620,  919, 4820, 9368, 4227, 3030, 7789, 6084, 6538,
       4554, 7514, 3982, 3426, 7198, 1272, 6637, 3195,  870, 5462, 4347,
       8060])

# Interaction Matrix

In [32]:
def create_interaction_matrix(users, items, inter, threshold=1, binary=False):
    interaction_matrix = np.zeros((n_users, n_items), dtype=np.int8)
    
    for user in range(n_users):
        interacted_items = inter.loc[inter['user_id'] == user, 'item_id'].values
        rate_of_items = inter.loc[inter['user_id'] == user, 'listening_events'].values
        
        for item in range(interacted_items.size):
            rating = rate_of_items[item]
            if binary:
                rating = 0 if rating < threshold else 1
            
            interaction_matrix[user, interacted_items[item]] = rating
    
    return interaction_matrix

In [33]:
interaction_matrix = create_interaction_matrix(users, items, inter_train, binary=True)
test_interaction_matrix = create_interaction_matrix(users, items, inter_test, binary=True)

# ItemKNN

# Get Item Similarities

In [18]:
def jaccard_score(a: np.array, b: np.array) -> float:
    """
    a, b: - vectors of the same length corresponding to the two items

    returns: float - jaccard similarity score for a and b
    """
    score = None

    # TODO: YOUR IMPLEMENTATION
    intersection = np.logical_and(a, b).sum()
    union = np.logical_or(a, b).sum()
    score = intersection / union

    return float(score)

In [19]:
def calculate_sim_scores(similarity_measure: Callable[[int, int], float],
                         inter: np.array,
                         target_vec: np.array) -> np.array:
    """
    similarity_measure: Callable - function that measures similarity, use your jaccard_score function from above
    inter: np.array - interaction matrix - calculate similarity between each item and the target item (see below)
    target_vec: np.array - target item vector
    
    returns: np.array - similarities between every item from <inter> and <target_vec> in the respective order
    """

    item_similarities = None

    # TODO: YOUR IMPLEMENTATION
    def func(col, target):
        return similarity_measure(col, target)
    
    item_similarities = np.apply_along_axis(func, axis=0, arr=inter, target=target_vec)

    return np.array(item_similarities)

In [36]:
# here we pass your jaccard_score function as "similarity_measure" parameter
item_sims = calculate_sim_scores(similarity_measure=jaccard_score, inter=interaction_matrix, target_vec=interaction_matrix[:, 0])
item_sims

array([1.        , 0.02702703, 0.        , ..., 0.        , 0.        ,
       0.        ])

# Estimate User Item Scores

In [37]:
def get_user_item_score(sim_scores_calculator: Callable[[Callable, np.array, np.array], np.array],
                        inter: np.array,
                        target_user: int,
                        target_item: int,
                        n: int = 2) -> float:
    """
    sim_scores_calculator: Callable - function that calculates similarities, using calculate_sim_scores
                                      from above, already defined in the next cell
    inter: np.array - interaction matrix
    target_user: target user id
    target_item: int - target item id
    n: int - n closest neighbors to consider for the score prediction
    
    returns: float - mean of similarity scores = user-item 'fitness' score
    """

    item_similarities_mean = None

    # TODO: YOUR IMPLEMENTATION.
    consumed_items = np.where(inter[target_user] == 1)[0]
    new_inter = np.delete(inter[:, consumed_items], target_user, axis=0)
    target_vector = np.delete(inter, target_user, axis=0)[:, target_item]
    
    item_similarities_mean = np.mean(np.sort(sim_scores_calculator(new_inter, target_vector))[::-1][:n])
    

    return item_similarities_mean


In [39]:
# you need to pass a "sim_scores_calculator" function into the "get_user_item_score" function,
# but "calculate_sim_scores" also takes a similarity measure function as parameter.
# The similarity measure is not necessarily present inside the "get_user_item_score" function
# Ideally you want to provide the similarity measure together with the "calculate_sim_scores" function

# The following line of code is one possible way to "bind" parameters to a function: you can now use the "sim_score_calc" function as parameter,
# which will always use your "jaccard_score" function as first parameter for "calculate_sim_scores" and passes through the other parameters
# This procedure is a way to keep your functions generic, you can now simply change your similarity measure via the
# function calls without needing to change the function bodies themselves
#
# Another advantage for you is that when we test your solution, we are going to pass our own implementations into your functions
# That means if you made a mistake in Task 1, you will still be able to get full points for consequent tasks if you did everything else correctly
# So make sure that your functions work independently from your other implemented functions by using the code we provide in this cell

def sim_score_calc(inter, target_vec): return calculate_sim_scores(jaccard_score, inter, target_vec)


# TODO: Fill in the missing parameters
item_sim = get_user_item_score(sim_score_calc, interaction_matrix, 0, 1, 10)

print(item_sim)

0.026919362626750362


# Recommender

In [25]:
def recTopK(user_item_scorer: Callable[[Callable, np.array, int, int], float],
            inter_matr: np.array,
            user: int,
            top_k: int,
            n: int) -> (np.array, np.array):
    '''
    user_item_scorer: Callable - wrapper function that calculates user-item score, using get_user_item_score function
                                 from above, already defined in the next cell
    inter_matr: np.array - interaction matrix
    user: int -  user_id
    top_k: int - expected length of the resulting list
    n: int - number of neighbors to consider
    
    returns - array of recommendations (sorted in the order of descending scores) & array of corresponding scores
    '''

    top_rec = None
    scores = None

    # TODO: YOUR IMPLEMENTATION.
    unseen_items = np.where(inter_matr[user] == 0)[0]
    scores = []
    
    for item in unseen_items:
        score = user_item_scorer(inter_matr, user, item, n)
        scores.append(score)
      
    top_rec = unseen_items[np.argsort(scores)[::-1][:top_k]]
    scores = np.array(np.sort(scores)[::-1][:top_k])

    return np.array(top_rec), np.array(scores)

In [27]:
# see Task 2 for an explanation of the following function definition
def user_item_scorer(inter, target_user, target_item, n): return get_user_item_score(sim_score_calc, inter,
                                                                                     target_user, target_item, n)


# TODO: Fill in the missing parameters
rec_item_cf, scores_item_cf = recTopK(user_item_scorer, interaction_matrix, 0, 10, 4)

In [45]:
def recommend():
    pred = []
    
    for i, user_id in enumerate(test_users):
        print(f"{i}/{len(test_users)}")
        rec_item_cf, scores_item_cf = recTopK(user_item_scorer, interaction_matrix, user_id, 10, 4)
        pred.append(rec_item_cf)

    return np.array(pred)
        
predictions = recommend()

0/100
1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100


In [47]:
np.save('my_array.npy', predictions)

In [49]:
def get_ndcg_score(predictions: np.ndarray, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    predictions - np.ndarray - predictions of the recommendation algorithm for each user.
    test_interaction_matrix - np.ndarray - test interaction matrix for each user.
    topK - int - topK recommendations should be evaluated.
    
    returns - average ndcg score over all users.
    """
    n_users = predictions.shape[0]
    discounts = np.log2(np.arange(2, topK+2)) # discounts for positions 1 to topK (0-indexed)
    ndcg_scores = np.zeros(n_users)

    for user in range(n_users):
        top_items = predictions[user]
        relevant_items = test_interaction_matrix[user].nonzero()[0]
        if len(relevant_items) == 0:
            continue

        # calculate DCG
        dcg = 0
        for i, item in enumerate(top_items):
            if item in relevant_items:
                dcg += 1 / discounts[i]

        # calculate IDCG
        n_relevant = min(topK, len(relevant_items))
        idcg = np.sum(1 / discounts[:n_relevant]) # it is 1 for each relevant item in ideal case
        
        ndcg_scores[user] = dcg / idcg

    return np.mean(ndcg_scores)


In [50]:
score = get_ndcg_score(predictions, test_interaction_matrix)
score

0.002200917662980802

In [48]:
print("Recommendations with Item CF: ", rec_item_cf)
print("With Scores: ", scores_item_cf)
print("-" * 75)

Recommendations with Item CF:  [   2  184 1626 2061 2249 2242 1304 2070    5  177]
With Scores:  [0.06069266 0.05301205 0.05008259 0.04966753 0.04560784 0.04372179
 0.04112483 0.0405     0.03960521 0.03960249]
---------------------------------------------------------------------------
