In [3]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import torch
from torch import nn
from torch.autograd import Variable

# Load Data

In [4]:
def load_data(file, sep='\t', data_dir='./lfm-challenge-data'):
    """Read one delimited file of the challenge data set into a DataFrame.

    file     - file name inside ``data_dir`` (e.g. 'lfm-challenge.user');
    sep      - column delimiter, a tab by default;
    data_dir - directory holding the data files; the default is the location
               that used to be hard-coded, so existing calls are unaffected.

    returns - pandas DataFrame with the parsed file content.
    """
    return pd.read_csv(f'{data_dir}/{file}', delimiter=sep)

In [5]:
# Challenge tables: users, items and the train/test interaction logs
# (all tab-separated, the default of load_data).
users = load_data('lfm-challenge.user')
items = load_data('lfm-challenge.item')
inter_train = load_data('lfm-challenge.inter_train')
inter_test = load_data('lfm-challenge.inter_test')
# test_indices.txt is comma-separated, so the delimiter is passed explicitly;
# going through load_data avoids repeating the data directory here.
test_users = load_data('test_indices.txt', sep=',')['users'].values

# Dimensions of the user x item matrices built further down.
n_users = users['user_id'].values.size
n_items = items.index.values.size

# Data Aggregation

### Set item_id as a column

In [6]:
# Expose the row index of the item table as an explicit 'item_id' column,
# then preview the first rows.
items['item_id'] = items.index.to_numpy()
items.head(5)

Unnamed: 0,artist,track,country,item_id
0,Wovenwar,Confession,US,0
1,Mike Shinoda,Ghosts,US,1
2,System of a Down,Lonely Day,US,2
3,Three Days Grace,Painkiller,CA,3
4,Muse,Pressure,GB,4


In [7]:
# def get_artist_genre(artist_name, client_id, client_secret):
#     # Set up spotipy with your app credentials
#     sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))
    
#     # Search for the artist
#     result = sp.search(artist_name, type="artist")
    
#     # Get the first artist from the search results
#     artist = result['artists']['items'][0]

#     return artist['genres']

# # Usage (credentials must come from the environment, never from the notebook;
# # requires `import os`):
# client_id = os.environ["SPOTIPY_CLIENT_ID"]
# client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]

In [8]:
# def safe_get_genre(artist):
#     try:
#         return get_artist_genre(artist, client_id, client_secret)
#     except:
#         return []

# items['genre'] = items['artist'].apply(safe_get_genre)


# Interaction Matrix

In [9]:
def create_interaction_matrix(users, items, inter, threshold=1, binary=False):
    """Build a dense user x item matrix from an interaction log.

    users     - DataFrame with one row per user; user ids are used as row
                indices, i.e. expected to be 0..len(users)-1;
    items     - DataFrame with one row per item; item ids are used as column
                indices, i.e. expected to be 0..len(items)-1;
    inter     - DataFrame with 'user_id', 'item_id' and 'listening_events' columns;
    threshold - minimum number of listening events that counts as a positive
                interaction (only used when ``binary`` is True);
    binary    - if True store 0/1 flags, otherwise the raw listening_events.

    returns - np.ndarray of shape (len(users), len(items)). dtype is int8 for
              binary matrices and int32 otherwise, so raw counts above 127 no
              longer wrap around.

    Rows of ``inter`` whose user_id is outside 0..len(users)-1 are ignored.
    If a (user, item) pair occurs several times, the last row wins.
    """
    # Size the matrix from the arguments instead of notebook-level globals.
    n_rows, n_cols = len(users), len(items)
    interaction_matrix = np.zeros((n_rows, n_cols),
                                  dtype=np.int8 if binary else np.int32)

    # Fancy-index assignment gives no ordering guarantee for repeated indices,
    # so duplicates are resolved explicitly (keep the last occurrence).
    pairs = inter.drop_duplicates(subset=['user_id', 'item_id'], keep='last')

    user_ids = pairs['user_id'].to_numpy()
    in_range = (user_ids >= 0) & (user_ids < n_rows)
    rows = user_ids[in_range].astype(np.intp)
    cols = pairs['item_id'].to_numpy()[in_range].astype(np.intp)
    ratings = pairs['listening_events'].to_numpy()[in_range]

    if binary:
        # 0 below the threshold, 1 otherwise.
        ratings = np.where(ratings < threshold, 0, 1)

    # One vectorised write replaces the per-user scans of the whole log.
    interaction_matrix[rows, cols] = ratings
    return interaction_matrix

In [10]:
# Binary train / test matrices (default threshold of one listening event).
interaction_matrix = create_interaction_matrix(
    users=users, items=items, inter=inter_train, binary=True)
test_interaction_matrix = create_interaction_matrix(
    users=users, items=items, inter=inter_test, binary=True)

# POP Recommender

In [6]:
def recTopKPop(inter_matr: np.ndarray,
               user: int,
               top_k: int) -> np.ndarray:
    '''
    Recommend the globally most popular items the user has not interacted with.

    inter_matr - user x item interaction matrix (np.ndarray); any non-zero
                 entry counts as an interaction;
    user - user_id, integer (row index into inter_matr);
    top_k - expected length of the resulting list;

    returns - array of at most top K popular items that the user has never seen
              (sorted in the order of descending popularity; equally popular
              items keep ascending item-id order);
    '''
    # Any non-zero entry marks a seen item, so the filter also works for
    # count matrices, not only for 0/1 matrices.
    seen_items = np.flatnonzero(inter_matr[user])

    # Popularity = column sum over all users, restricted to unseen items.
    popularity = pd.Series(np.sum(inter_matr, axis=0)).drop(seen_items)

    # A stable sort makes the order of ties deterministic.
    ranked = popularity.sort_values(ascending=False, kind='mergesort')

    return ranked.index[:top_k].to_numpy()

In [7]:
# Top-10 global-popularity recommendations for every user.
# users_size was never defined in the notebook (NameError on a fresh kernel),
# so it is derived here from the user table.
users_size = users['user_id'].values.size
predictions = []

for user in range(users_size):
    print(f'User {user}/{users_size}', end='\r')
    top_10 = recTopKPop(inter_matr=interaction_matrix, user=user, top_k=10)
    predictions.append(top_10)

predictions = np.array(predictions)

User 9698/9699

## Test Predictions

In [8]:
def get_ndcg_score(predictions: np.ndarray, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    Average NDCG@topK of the recommendations over all users (binary relevance).

    predictions - np.ndarray - predictions of the recommendation algorithm for each user
                  (one row of item ids per user, best item first).
    test_interaction_matrix - np.ndarray - test interaction matrix for each user;
                  non-zero entries mark the relevant items.
    topK - int - topK recommendations should be evaluated.

    returns - average ndcg score over all users. Users without any relevant
              test item keep a score of 0 and are still part of the average.
    """
    n_users = predictions.shape[0]
    # log2(rank + 1) for ranks 1..topK
    discounts = np.log2(np.arange(2, topK + 2))
    ndcg_scores = np.zeros(n_users)

    for user in range(n_users):
        # Only the first topK recommendations are scored; without the slice a
        # wider prediction row would index past the end of `discounts`.
        top_items = predictions[user][:topK]
        relevant_items = test_interaction_matrix[user].nonzero()[0]
        if len(relevant_items) == 0:
            continue

        # DCG: every recommended relevant item contributes 1 / log2(rank + 1).
        hits = np.isin(top_items, relevant_items)
        dcg = np.sum(hits / discounts[:len(top_items)])

        # IDCG: all relevant items (at most topK) placed at the top ranks.
        n_relevant = min(topK, len(relevant_items))
        idcg = np.sum(1 / discounts[:n_relevant])

        ndcg_scores[user] = dcg / idcg

    return np.mean(ndcg_scores)


In [9]:
# One row per user; the row count comes from the data instead of the
# hard-coded (9699, 10), and the list length is inferred.
predictions = predictions.reshape((n_users, -1))

In [10]:
# NDCG (default topK=10) of the current predictions against the test matrix.
score = get_ndcg_score(predictions=predictions,
                       test_interaction_matrix=test_interaction_matrix)
score

0.027628057179704013

# POP Recommender by Country

In [37]:
def recTopKPopByCountry(inter_matr: np.ndarray,
               user: int,
               top_k: int,
               users: pd.DataFrame) -> np.ndarray:
    '''
    Recommend the items most popular among users from the same country as
    `user`, skipping items the user has already interacted with.

    inter_matr - user x item interaction matrix (np.ndarray); any non-zero
                 entry counts as an interaction;
    user - user_id, integer (row index into inter_matr);
    top_k - expected length of the resulting list;
    users: pandas Dataframe consisting of user information for all users,
           requires "user_id" and "country" columns

    returns - array of at most top K popular items that the user has never seen
              (sorted in the order of descending popularity; equally popular
              items keep ascending item-id order);
    '''
    # Any non-zero entry marks a seen item, so count matrices work as well.
    seen_items = np.flatnonzero(inter_matr[user])

    country = users.loc[users['user_id'] == user, 'country'].values[0]
    if pd.isna(country):
        # A missing country never compares equal to anything, so the filter
        # below would select no users; fall back to global popularity.
        country_matr = inter_matr
    else:
        same_country_ids = users.loc[users['country'] == country, 'user_id'].values
        country_matr = inter_matr[same_country_ids]

    # Popularity within the selected users, restricted to unseen items.
    popularity = pd.Series(np.sum(country_matr, axis=0)).drop(seen_items)
    ranked = popularity.sort_values(ascending=False, kind='mergesort')

    return ranked.index[:top_k].to_numpy()

In [38]:
# Top-10 country-level popularity recommendations for every user.
# users_size was never defined in the notebook (NameError on a fresh kernel),
# so it is derived here from the user table.
users_size = users['user_id'].values.size
predictions = []

for user in range(users_size):
    print(f'User {user}/{users_size}', end='\r')
    top_10 = recTopKPopByCountry(inter_matr=interaction_matrix, user=user, top_k=10, users=users)
    predictions.append(top_10)

predictions = np.array(predictions)

User 9698/9699

In [40]:
# Sanity check: one row per user, one column per recommended item.
np.shape(predictions)

(9699, 10)

In [41]:
# NDCG (default topK=10) of the current predictions against the test matrix.
score = get_ndcg_score(predictions=predictions,
                       test_interaction_matrix=test_interaction_matrix)
score

0.031679933604007816

In [32]:
# Display the item table (the rendered output is truncated to its first and last rows).
items

Unnamed: 0,artist,track,country,item_id
0,Wovenwar,Confession,US,0
1,Mike Shinoda,Ghosts,US,1
2,System of a Down,Lonely Day,US,2
3,Three Days Grace,Painkiller,CA,3
4,Muse,Pressure,GB,4
...,...,...,...,...
9995,Northstar,The Pornographer's Dream,US,9995
9996,The New Regime,A Way To Feel Again,US,9996
9997,Soda Stereo,En Remolinos (SEP7IMO DIA),AR,9997
9998,Grace Lightman,An Ordinary Life,GB,9998


# Neural Network