In [1]:
import pandas as pd


df = pd.read_csv('data/normalized_filtered_user_listening.csv', usecols=lambda column: column not in ['Unnamed: 0'])

# Initialize an empty list to store your dictionaries
dataset_dicts = []

# Iterate over the DataFrame rows
for index, row in df.iterrows():
    # Construct a dictionary for each row and append to the list
    row_dict = {
        "user_id": row["user_id"],
        "track_id": row["track_id"],
        "playcount": row["normalized_playcount"]
    }
    dataset_dicts.append(row_dict)

In [2]:
import numpy as np

musicdf=pd.read_csv('data/music_info.csv')
musicdf.head()

from scipy.sparse import csr_matrix

features_list = [
    'year',
    'duration_ms',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo'
]

item_features_array = musicdf[features_list].values
item_features_sparse = csr_matrix(item_features_array, dtype=np.float32)
item_features_sparse

<50683x11 sparse matrix of type '<class 'numpy.float32'>'
	with 549394 stored elements in Compressed Sparse Row format>

In [3]:
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['user_id'] for x in dataset_dicts),
            (x['track_id'] for x in dataset_dicts))

(interactions, weights) = dataset.build_interactions(((x['user_id'], x['track_id'])
                                                      for x in dataset_dicts))

print(repr(interactions))



<692376x28597 sparse matrix of type '<class 'numpy.int32'>'
	with 3651141 stored elements in COOrdinate format>


In [4]:
from lightfm import LightFM

model = LightFM(loss='logistic')
model.fit(interactions,item_features=item_features_sparse)

<lightfm.lightfm.LightFM at 0x1d40ea4fbb0>

In [36]:
from lightfm.evaluation import auc_score

train_auc = auc_score(model, interactions, item_features=item_features_sparse).mean()

In [5]:
# interactions

# from scipy.sparse import coo_matrix


# # To convert to a dense matrix (if the matrix is not too large):
# dense_matrix = interactions.todense()

# # To list non-zero entries without converting to dense:
# non_zero_entries = zip(interactions.row, interactions.col, interactions.data)
# for entry in non_zero_entries:
#     print(entry)

In [8]:
music_tracks=musicdf['track_id'].values

In [9]:
_, item_embeddings = model.get_item_representations(item_features_sparse)

from annoy import AnnoyIndex

factors = item_embeddings.shape[1] # Length of item vector that will be indexed
annoy_idx = AnnoyIndex(factors)  
for i in range(item_embeddings.shape[0]):
    v = item_embeddings[i]
    annoy_idx.add_item(i, v)

annoy_idx.build(10) # 10 trees
annoy_idx.save('music_annoytree.ann')

  annoy_idx = AnnoyIndex(factors)


True

In [10]:
def music(track_id,music_df):
    related_music=music_df[music_df['track_id']==track_id]
    # print(f'{related_music.name} by {related_music.artist}')
    music_name = related_music['name'].iloc[0]
    artist_name = related_music['artist'].iloc[0]
    return f'{music_name} by {artist_name}'

def nearest_movies_annoy(track_id, index,music_df, n=10, print_output=True):
    nn = index.get_nns_by_item(track_id, 10)


    track_id=music_tracks[track_id]
    
    if print_output:
        print('Closest to %s : \n' % music(track_id,music_df))
    titles = [music(music_tracks[i],music_df) for i in nn]
    if print_output:
        print("\n".join(titles))
    
nearest_movies_annoy(13, annoy_idx, musicdf)

Closest to Chop Suey! by System of a Down : 

Empty by The Cranberries
Origins by Tennis
Deed I Do by Diana Krall
Stand Up (And Be Counted) by Venom
Mr. Torture by Helloween
Gryning by Finntroll
Here Comes All The People by Liars
Loucura by Mariza
Threw It Away by Soil
What Do You Want by Jerrod Niemann


# User Related Recommendation

In [12]:
import numpy as np

# Assuming 'item_embeddings' is your array of embeddings
normalized_embeddings = np.array([emb / np.linalg.norm(emb) if np.linalg.norm(emb) != 0 else emb for emb in item_embeddings])


In [13]:
norms = np.linalg.norm(normalized_embeddings, axis=1)
max_norm = norms.max()
extra_dimension = np.sqrt(max_norm ** 2 - norms ** 2)
norm_data = np.append(normalized_embeddings, extra_dimension.reshape(norms.shape[0], 1), axis=1)

#First an Annoy index:

user_factors = norm_data.shape[1]
annoy_member_idx = AnnoyIndex(user_factors)  # Length of item vector that will be indexed

for i in range(norm_data.shape[0]):
    v = norm_data[i]
    annoy_member_idx.add_item(i, v)
    
annoy_member_idx.build(10)


  annoy_member_idx = AnnoyIndex(user_factors)  # Length of item vector that will be indexed


True

In [14]:
normalized_embeddings[0]

array([ 0.37305456, -0.46896148,  0.3143484 ,  0.15208659, -0.4772629 ,
       -0.35165125, -0.02878246,  0.10749982, -0.2103113 , -0.33302584],
      dtype=float32)

In [15]:
_, user_embeddings = model.get_user_representations()

In [33]:
# annoy_member_idx.get_nns_by_vector()
# np.append(user_embeddings[4], 0)
annoy_member_idx.get_nns_by_vector(np.append(user_embeddings[3], 0), n=5, search_k=1000000, include_distances=True)

([25337, 15659, 40535, 40152, 46740],
 [1.1076070070266724,
  1.1076135635375977,
  1.107615351676941,
  1.107618808746338,
  1.1076197624206543])

In [19]:
def sample_recommendation(user_ids, model, data, music_info, n_items=10, print_output=True):
    n_users, n_items = data.shape
    music_tracks=musicdf['track_id'].values
    for user_id in user_ids:
        known_positives = music_tracks[interactions.tocsr()[user_id].indices]
        print(known_positives)
        top_items = [music_tracks[i] for i in annoy_member_idx.get_nns_by_vector(np.append(user_embeddings[user_id], 0), 5)]
        print(annoy_member_idx.get_nns_by_vector(np.append(user_embeddings[user_id], 0), 5))
        print(top_items)
        if print_output == True:
            print("User %s" % user_id)
            print("     Known positives:")

            for x in known_positives[:3]:
                print("        %s" % music(x,music_info))

            print("     Recommended:")

            for x in top_items[:3]:
                print("        %s" % music(x,music_info))

In [29]:
sample_recommendation([13202], model, interactions, musicdf, print_output=True)

['TRJENQH128E078E00F']
[25337, 15659, 40535, 40152, 46740]
['TRDAOJL128F932C383', 'TRFFCRE128F4298E8C', 'TRDZBOR128F42893C9', 'TRXIABC128F932E7DD', 'TRXLVNH128F92F9A04']
User 13202
     Known positives:
        Respectable by The Rolling Stones
     Recommended:
        Dopesmoker by Sleep
        45:33 by LCD Soundsystem
        dlp 3 by William Basinski
