In [3]:
"""This module features functions and classes to manipulate data for the
collaborative filtering algorithm.
"""

from pathlib import Path

import scipy
import pandas as pd


def str_to_index(df,user_id='user_id',track_id='track_id'):
    # Create a mapping from user IDs and track IDs to numeric indices
    user_id_to_index = {user_id: i for i, user_id in enumerate(df.user_id.unique())}
    track_id_to_index = {track_id: i for i, track_id in enumerate(df.track_id.unique())}

    # Replace the strings with numeric indices
    df['user'] = df.user_id.map(user_id_to_index)
    df['track'] = df.track_id.map(track_id_to_index)

    df.set_index(["user", "track"], inplace=True)

    return df

def load_user_artists(user_artists_file: Path) -> scipy.sparse.csr_matrix:
    """Load the user artists file and return a user-artists matrix in csr
    fromat.
    """
    user_artists = str_to_index(pd.read_csv(user_artists_file).drop(['Unnamed: 0'], axis=1))
    # user_artists.set_index(["user_id", "track_id"], inplace=True)
    coo = scipy.sparse.coo_matrix(
        (
            user_artists.normalized_playcount.astype(float),
            (
                user_artists.index.get_level_values(0),
                user_artists.index.get_level_values(1),
            ),
        )
    )
    return coo.tocsr()


class ArtistRetriever:
    """The ArtistRetriever class gets the artist name from the artist ID."""

    def __init__(self):
        self._artists_df = None

    def track_mapping(self,artists_df,interactin_df):
        track_id_to_index = {track_id: i for i, track_id in enumerate(interactin_df.track_id.unique())}
        artists_df['track']=artists_df['track_id'].map(track_id_to_index)
        return artists_df

    def get_artist_name_from_id(self, artist_id: int) -> str:
        return self._artists_df.loc[artist_id, "name"]
    def get_track_name_from_id(self, artist_id: int) -> str:
        return self._artists_df.loc[artist_id, "artist"]

    def load_artists(self, artists_file: Path, interaction_file: Path) -> None:
        """Load the artists file and stores it as a Pandas dataframe in a
        private attribute.
        """
        artists_df = pd.read_csv(artists_file)
        interaction_df=pd.read_csv(interaction_file)
        artists_df=self.track_mapping(artists_df,interaction_df)
        artists_df = artists_df.set_index("track")
        self._artists_df = artists_df



In [8]:
user_artists_matrix = load_user_artists(
    Path("../data/normalized_filtered_user_listening.csv")
)
print(user_artists_matrix)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	0.5
  (3, 4)	1.0
  (3, 5)	0.25
  (4, 6)	0.3333333333333333
  (4, 7)	1.0
  (4, 8)	0.3333333333333333
  (4, 9)	0.6666666666666666
  (5, 10)	0.6666666666666666
  (5, 11)	0.3333333333333333
  (5, 12)	0.6666666666666666
  (5, 13)	0.3333333333333333
  (5, 14)	0.3333333333333333
  (5, 15)	0.6666666666666666
  (5, 16)	0.6666666666666666
  (5, 17)	1.0
  (5, 18)	0.3333333333333333
  (5, 19)	0.3333333333333333
  (5, 20)	0.3333333333333333
  (6, 21)	0.25
  (6, 22)	0.25
  (6, 23)	1.0
  (6, 24)	0.25
  :	:
  (692373, 197)	0.1666666666666666
  (692373, 198)	0.1666666666666666
  (692373, 199)	0.3333333333333333
  (692373, 201)	0.1666666666666666
  (692373, 287)	0.5
  (692373, 391)	0.1666666666666666
  (692373, 393)	0.1666666666666666
  (692373, 574)	0.1666666666666666
  (692373, 634)	0.3333333333333333
  (692373, 635)	0.1666666666666666
  (692373, 684)	1.0
  (692373, 792)	0.6666666666666666
  (692373, 1071)	0.1666666666666666
  (692373, 2047)	0.5
  (69237

In [49]:
artist_retriever = ArtistRetriever()
music_path="../data/music_info.csv"
df_path="../data/normalized_filtered_user_listening.csv"
artist_retriever.load_artists(Path(music_path),Path(df_path))
track_id=1
artist = artist_retriever.get_artist_name_from_id(track_id)
print(artist)

Sea Lion


In [4]:
"""This module features the ImplicitRecommender class that performs
recommendation using the implicit library.
"""


from pathlib import Path
from typing import Tuple, List

import implicit
import scipy

# from musiccollaborativefiltering.data import load_user_artists, ArtistRetriever


class ImplicitRecommender:
    """The ImplicitRecommender class computes recommendations for a given user
    using the implicit library.

    Attributes:
        - artist_retriever: an ArtistRetriever instance
        - implicit_model: an implicit model
    """

    def __init__(
        self,
        artist_retriever: ArtistRetriever,
        implicit_model: implicit.recommender_base.RecommenderBase,
    ):
        self.artist_retriever = artist_retriever
        self.implicit_model = implicit_model

    def fit(self, user_artists_matrix: scipy.sparse.csr_matrix) -> None:
        """Fit the model to the user artists matrix."""
        self.implicit_model.fit(user_artists_matrix)

    def recommend(
        self,
        user_id: int,
        user_artists_matrix: scipy.sparse.csr_matrix,
        n: int = 10,
    ) -> Tuple[List[str], List[float]]:
        """Return the top n recommendations for the given user."""
        artist_ids, scores = self.implicit_model.recommend(
            user_id, user_artists_matrix[n], N=n
        )
        artists = [
            self.artist_retriever.get_artist_name_from_id(artist_id)
            for artist_id in artist_ids
        ]
        tracks = [
            self.artist_retriever.get_track_name_from_id(artist_id)
            for artist_id in artist_ids
        ]
        return artists, tracks, scores





# Train Test Recommender Build With Evaluation

In [43]:
from implicit.evaluation import leave_k_out_split, precision_at_k, train_test_split

music_path="../data/music_info.csv"
df_path="../data/normalized_filtered_user_listening.csv"
    
# load user artists matrix
user_artists = load_user_artists(Path(df_path))

# instantiate artist retriever
artist_retriever = ArtistRetriever()
artist_retriever.load_artists(Path(music_path),Path(df_path))

# instantiate ALS using implicit
implict_model = implicit.als.AlternatingLeastSquares(
    factors=50, iterations=30, regularization=0.01,  use_gpu=implicit.gpu.HAS_CUDA
)
train, test= train_test_split(user_artists, train_percentage=0.8)

# instantiate recommender, fit, and recommend
recommender = ImplicitRecommender(artist_retriever, implict_model)
recommender.fit(train)



  0%|          | 0/30 [00:00<?, ?it/s]

In [44]:
# precision = precision_at_k(recommender.implicit_model,train,test)
from implicit.evaluation import ranking_metrics_at_k
evaluation= ranking_metrics_at_k(recommender.implicit_model,train,test)

  0%|          | 0/359331 [00:00<?, ?it/s]

In [45]:
evaluation

{'precision': 0.07091151326745865,
 'map': 0.03791456799176038,
 'ndcg': 0.0518229615598299,
 'auc': 0.5345274134470126}

# BRP Test


In [13]:
from implicit.evaluation import leave_k_out_split, precision_at_k, train_test_split

music_path="../data/music_info.csv"
df_path="../data/normalized_filtered_user_listening.csv"
    
# load user artists matrix
user_artists = load_user_artists(Path(df_path))

# instantiate artist retriever
artist_retriever = ArtistRetriever()
artist_retriever.load_artists(Path(music_path),Path(df_path))

# instantiate ALS using implicit   REMOVE USE_GPU IF HAS PROBLEM NEXT TIME
implict_model = implicit.cpu.bpr.BayesianPersonalizedRanking(
    factors=50, iterations=1500, regularization=0.01
)
train, test= train_test_split(user_artists, train_percentage=0.8)

# instantiate recommender, fit, and recommend
recommender = ImplicitRecommender(artist_retriever, implict_model)
# recommender.fit(train)



In [4]:
# precision = precision_at_k(recommender.implicit_model,train,test)
from implicit.evaluation import ranking_metrics_at_k
evaluation= ranking_metrics_at_k(recommender.implicit_model,train,test)
evaluation

  0%|          | 0/358841 [00:00<?, ?it/s]

{'precision': 0.17437309895894793,
 'map': 0.08949041708871283,
 'ndcg': 0.1210256391935781,
 'auc': 0.5851766632658045}

# LogisticMatrixFactorization

In [65]:
from implicit.evaluation import leave_k_out_split, precision_at_k, train_test_split

music_path="../data/music_info.csv"
df_path="../data/normalized_filtered_user_listening.csv"
    
# load user artists matrix
user_artists = load_user_artists(Path(df_path))

# instantiate artist retriever
artist_retriever = ArtistRetriever()
artist_retriever.load_artists(Path(music_path),Path(df_path))

# instantiate ALS using implicit
implict_model = implicit.cpu.lmf.LogisticMatrixFactorization(
    factors=50, iterations=350, regularization=0.01
)
train, test= train_test_split(user_artists, train_percentage=0.8)

# instantiate recommender, fit, and recommend
recommender = ImplicitRecommender(artist_retriever, implict_model)
recommender.fit(train)



  0%|          | 0/350 [00:00<?, ?it/s]

In [66]:
# precision = precision_at_k(recommender.implicit_model,train,test)
from implicit.evaluation import ranking_metrics_at_k
evaluation= ranking_metrics_at_k(recommender.implicit_model,train,test)
evaluation

  0%|          | 0/358686 [00:00<?, ?it/s]

{'precision': 0.03328953983495499,
 'map': 0.011677252233224775,
 'ndcg': 0.018300015743458783,
 'auc': 0.5138439182556687}

# Save Model

In [5]:
## https://benfred.github.io/implicit/api/models/gpu/als.html
recommender.implicit_model.save('800-BPS')

In [14]:
recommender=recommender.implicit_model.load('800-BPS.npz')

# Full Recommender Build

In [None]:
music_path="../data/music_info.csv"
df_path="../data/normalized_filtered_user_listening.csv"
    
# load user artists matrix
user_artists = load_user_artists(Path(df_path))

# instantiate artist retriever
artist_retriever = ArtistRetriever()
artist_retriever.load_artists(Path(music_path),Path(df_path))

# instantiate ALS using implicit
implict_model = implicit.als.AlternatingLeastSquares(
    factors=50, iterations=25, regularization=0.01,  use_gpu=implicit.gpu.HAS_CUDA
)

# instantiate recommender, fit, and recommend
recommender = ImplicitRecommender(artist_retriever, implict_model)
recommender.fit(user_artists)
# artists, scores = recommender.recommend(2, user_artists, n=5)

# # print results
# for artist, score in zip(artists, scores):
#     print(f"{artist}: {score}")

#  Logging of Data Method

In [None]:
import tqdm
import time
import codecs
import numpy as np
import logging

# Set the output filename
output_filename = 'recommendationlog.txt'

# Load the user listening data
df_logging = pd.read_csv('../data/normalized_filtered_user_listening.csv').drop(['Unnamed: 0'], axis=1)

# Extract unique user IDs
users = df_logging.user_id.unique()

# Extract the track IDs
artist_df = df_logging['track_id']

# Start timing the recommendation process
start = time.time()

# Specify the number of logs to generate
# Max is 692999
log_amount = 100000

# Initialize a progress bar
with tqdm.tqdm(total=log_amount) as progress:
    # Open the output file for writing
    with codecs.open(output_filename, "w", "utf8") as o:
        # Set the batch size
        batch_size = 1000
        to_generate = np.arange(log_amount)
        for startidx in range(0, len(to_generate), batch_size):
            batch = to_generate[startidx : startidx + batch_size]
            # Get recommendations for the batch of users
            ids, scores = recommender.implicit_model.recommend(
                batch, user_artists[batch], filter_already_liked_items=True
            )
            for i, userid in enumerate(batch):
                username = users[userid]
                for other, score in zip(ids[i], scores[i]):
                    # print(other)  # Print the recommended item (for debugging)
                    o.write(f"{username}\t{artist_df[other]}\t{score}\n")
            progress.update(len(batch))

# Log the time taken for recommendation
logging.debug("Generated recommendations in %0.2fs", time.time() - start)

In [34]:
import pandas as pd

# Specify the path to your log file
log_file_path = 'recommendationlog.txt'

# Read the tab-separated data into a DataFrame
df = pd.read_csv(log_file_path, sep='\t', header=None, names=['User', 'Track', 'Score'])

# Now 'df' contains your data in a DataFrame format
print(df.head())  # Display the first few rows

                                       User               Track     Score
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  TRWYSES128F92FA039  0.000397
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  TRQRMNO128F4235E4D  0.000300
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  TRAFUNV128F92CFEB2  0.000187
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  TRWAQOC12903CB84CA  0.000181
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  TRIMYMS128E07840C8  0.000158
