In [None]:
!pip install lightfm

In [None]:
import os
from warnings import filterwarnings

import numpy as np
import pandas as pd
import seaborn as sns
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank
from matplotlib import pyplot as plt
from scipy import sparse as sp
from sklearn.metrics import pairwise as pw

sns.set()
filterwarnings('ignore')

%matplotlib inline

## Classes and UDFs

#### Useful functions for detailed data inspection

In [None]:
# Create Data audit Report for continuous variables
def cont_summary(x):
    """Summarise a numeric Series as a labelled Series of audit statistics.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to describe.

    Returns
    -------
    pandas.Series
        Count, missing count, sum, mean, median, standard deviation,
        variance, minimum, nine percentiles (P1 ... P99) and maximum,
        in that order.
    """
    report = {
        'N': x.count(),
        'NMISS': x.isnull().sum(),
        'SUM': x.sum(),
        'MEAN': x.mean(),
        'MEDIAN': x.median(),
        'STD': x.std(),
        'VAR': x.var(),
        'MIN': x.min(),
    }

    # Percentile label -> quantile level, listed in the order they are reported
    percentile_levels = {
        'P1': 0.01, 'P5': 0.05, 'P10': 0.10, 'P25': 0.25, 'P50': 0.50,
        'P75': 0.75, 'P90': 0.90, 'P95': 0.95, 'P99': 0.99,
    }
    for label, level in percentile_levels.items():
        report[label] = x.quantile(level)

    report['MAX'] = x.max()

    return pd.Series(list(report.values()), index = list(report.keys()))

In [None]:
# Create Data audit Report for categorical variables
def cat_summary(x):
    """Summarise a categorical Series: counts plus its most frequent value.

    Parameters
    ----------
    x : pandas.Series
        Values to describe.

    Returns
    -------
    pandas.Series
        Indexed by ``N`` (non-null count), ``NMISS`` (null count), ``MODE``
        (most frequent value), ``FREQ`` (its count) and ``PERCENT`` (its share
        of the non-null values, rounded to 2 decimals). When ``x`` has no
        non-null values, ``MODE`` and ``PERCENT`` are NaN and ``FREQ`` is 0.
    """
    labels = ['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT']
    n_valid = x.count()
    n_missing = x.isnull().sum()
    counts = x.value_counts().sort_values(ascending = False)

    # Nothing to count: there is no mode, and the percentage would divide by zero
    if counts.empty:
        return pd.Series([n_valid, n_missing, np.nan, 0, np.nan], index = labels)

    mode_value = counts.index[0]
    mode_freq = counts.iloc[0]

    return pd.Series([n_valid, n_missing, mode_value, mode_freq,
                      round(mode_freq * 100/n_valid, 2)],
                     index = labels)

In [None]:
# Function to return key for any value
def get_key(val, dictionary):
    """Reverse lookup: return the first key whose value equals ``val``.

    Keys are examined in the dictionary's iteration order and the search
    stops at the first match.

    Raises
    ------
    Exception
        If no value in ``dictionary`` equals ``val``.
    """
    # Unique sentinel so that a legitimate key of None is not mistaken for "not found"
    not_found = object()
    match = next((k for k, v in dictionary.items() if val == v), not_found)

    if match is not_found:
        raise Exception("Song doesn't exist in the database!")

    return match

#### Recommendation Class for popularity based model

In [None]:
# Class for Popularity based Recommender System model
class popularity_recommender():
    """Non-personalised recommender that ranks items by interaction count.

    ``create`` counts the rows per item in the training data and stores the
    top-ranked items; ``recommend`` returns that same list for any user,
    tagged with the user's id.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    # Create the popularity based recommender system model
    def create(self, train_data, user_id, item_id, top_n=10):
        """Rank items by popularity and keep the ``top_n`` best.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Interaction records, one row per (user, item) observation.
        user_id : str
            Name of the user column; its non-null count per item is the score.
        item_id : str
            Name of the item column to rank.
        top_n : int, default 10
            Number of top-ranked items to keep.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # Count user rows per item as the recommendation score. The rename
        # uses the configured user column rather than a hard-coded name, so
        # any column name works.
        train_data_grouped = (
            train_data.groupby([self.item_id])
            .agg({self.user_id: 'count'})
            .reset_index()
            .rename(columns={self.user_id: 'score'})
        )

        # Highest score first; ties are ordered by item value ascending
        train_data_sort = train_data_grouped.sort_values(
            ['score', self.item_id], ascending=[False, True])

        # method='first' gives tied scores distinct ranks in the sorted order
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        # Store an independent copy so later use never touches the sorted frame
        self.popularity_recommendations = train_data_sort.head(top_n).copy()

    # Use the popularity based recommender system model to make recommendations
    def recommend(self, user_id):
        """Return the stored popularity ranking with a leading ``user_id`` column.

        Parameters
        ----------
        user_id
            Identifier written into every row of the ``user_id`` column.

        Returns
        -------
        pandas.DataFrame
            Columns: ``user_id``, the item column, ``score``, ``Rank``.

        Raises
        ------
        RuntimeError
            If ``create`` has not been called yet.
        """
        if self.popularity_recommendations is None:
            raise RuntimeError("Call create() before recommend().")

        # Work on a copy so the stored ranking is not mutated between calls
        user_recommendations = self.popularity_recommendations.copy()

        # Add user_id column for which the recommendations are being generated
        user_recommendations['user_id'] = user_id

        # Bring user_id column to the front
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]

        return user_recommendations[cols]

#### Useful functions for personalized hybrid recommendation model

In [None]:
# Function to create a user dictionary based on their index and number in interaction dataset
def create_user_dict(interactions):
    """Map each row position of ``interactions`` to the index label found there.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Frame whose index holds the user labels.

    Returns
    -------
    dict
        ``{row_position: index_label}``. If a label occurs more than once,
        only its last position is kept.
    """
    # label -> position; a repeated label keeps the position of its last occurrence
    position_by_label = {label: position
                         for position, label in enumerate(interactions.index)}

    # Flip the mapping so that it is keyed by position
    return {position: label for label, position in position_by_label.items()}

In [None]:
# Function to create an item dictionary based on their item_id and item name  
def create_item_dict(df, id_col, name_col):
    """Build an ``{item_id: item_name}`` lookup from two columns of ``df``.

    Rows are paired by position, so the frame's index labels are irrelevant
    (a filtered or re-indexed frame works). When an id occurs more than once,
    the name from its last row wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    id_col : str
        Column holding the item ids (dictionary keys).
    name_col : str
        Column holding the item names (dictionary values).

    Returns
    -------
    dict
    """
    # A single pass over both columns instead of one label lookup per row
    return dict(zip(df[id_col], df[name_col]))

In [None]:
# Function to produce user recommendations
def sample_recommendation_user(model, interactions, user_id, user_dict, 
                               item_dict, threshold = 0, nrec_items = 10):
    """Print the top-scoring items for a user that the user has not already consumed.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(user_index, item_indices)``.
        Assumed to have been trained on a matrix whose rows and columns
        follow the order of ``interactions``.
    interactions : pandas.DataFrame
        User x item matrix; the index holds user ids, the columns item ids.
    user_id
        A label present in ``interactions.index``.
    user_dict : dict
        Kept for backward compatibility and not consulted: the user's row
        position is read from ``interactions.index``, which is the order the
        scores are aligned to.
    item_dict : dict
        Maps item id to the display name that is printed.
    threshold : number, default 0
        Items whose interaction value for this user is greater than this are
        treated as already known and are excluded.
    nrec_items : int, default 10
        Maximum number of recommendations to print.

    Returns
    -------
    None
        The recommendations are printed, not returned.
    """
    n_items = interactions.shape[1]

    # Row position of this user in the matrix = user index expected by the model
    user_x = interactions.index.get_loc(user_id)

    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=interactions.columns)
    ranked_items = list(scores.sort_values(ascending=False).index)

    # Items the user already interacted with above the threshold
    user_row = interactions.loc[user_id, :]
    known_items = set(user_row[user_row > threshold].index)

    recommended = [item for item in ranked_items if item not in known_items]
    recommended = recommended[0: nrec_items]

    print("Recommended songs for UserID:", user_id)

    for counter, item in enumerate(recommended, start=1):
        # format() also copes with a non-string name (e.g. a missing title)
        print('{0}- {1}'.format(counter, item_dict[item]))

In [None]:
# Function to create item-item distance embedding matrix
def create_item_emdedding_distance_matrix(model, interactions):
    """Return the item-by-item cosine similarity of the model's item embeddings.

    Despite the name, the values are cosine *similarities* (larger means more
    alike), not distances.

    Parameters
    ----------
    model
        Object exposing an ``item_embeddings`` array, one row per item.
    interactions : pandas.DataFrame
        Its columns supply the item labels for both axes of the result;
        assumed to be in the same order as the embedding rows.

    Returns
    -------
    pandas.DataFrame
        Square frame labelled by ``interactions.columns`` on both axes.
    """
    item_labels = interactions.columns
    embeddings = sp.csr_matrix(model.item_embeddings)

    return pd.DataFrame(pw.cosine_similarity(embeddings),
                        index=item_labels, columns=item_labels)

In [None]:
# Function to create item-item recommendation
def item_item_recommendation(item_emdedding_distance_matrix, item_id, item_dict, n_items = 10):
    """Print the ``n_items`` items most similar to ``item_id``.

    Parameters
    ----------
    item_emdedding_distance_matrix : pandas.DataFrame
        Item-by-item similarity matrix (larger value = more similar),
        labelled by item id on both axes.
    item_id
        Item to find neighbours for; must be a row label of the matrix.
    item_dict : dict
        Maps item id to the display name that is printed.
    n_items : int, default 10
        Maximum number of similar items to print.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    similarity_row = item_emdedding_distance_matrix.loc[item_id, :]

    # Exclude the query item by label instead of assuming it sorts first:
    # a tie with another item (or an all-zero embedding) can rank it elsewhere.
    neighbours = similarity_row.drop(item_id, errors='ignore')
    recommended_items = list(neighbours.sort_values(ascending = False).head(n_items).index)

    print("Song of interest: {0}".format(item_dict[item_id]))
    print("Song(s) similar to the above item are as follows:-")

    for counter, item in enumerate(recommended_items, start=1):
        # format() also copes with a non-string name (e.g. a missing title)
        print('{0}. {1}'.format(counter, item_dict[item]))

## Data Import & Inspection

In [None]:
# Relative paths to the listening-triplets file and the song metadata CSV
triplets = 'Data/10000.txt'
songsData = 'Data/song_data.csv'

In [None]:
# Load the triplets (read without a header row, so the column names are
# assigned here) and the song metadata table
rawData1 = pd.read_table(triplets, header=None)
rawData1.columns = ['user_id', 'song_id', 'listen_count']
rawData2 =  pd.read_csv(songsData)

In [None]:
# Create a new copy of the triplets dataset & replace the string user_ids
# with 1-based integer codes for easier computations. factorize assigns one
# code per distinct user (in order of first appearance), so all rows of the
# same user keep sharing one id; using the row number would instead turn
# every row into a separate user.
rawData1_userIndexed = rawData1.copy()
rawData1_userIndexed['user_id'] = pd.factorize(rawData1['user_id'])[0] + 1

In [None]:
# Merge the triplets data (user indexed) with songs data.
# The metadata is reduced to one row per song_id first so the join cannot
# multiply triplet rows; how="left" keeps every triplet even when its song
# has no metadata row.
rawData = pd.merge(rawData1_userIndexed, rawData2.drop_duplicates(['song_id']), on="song_id", how="left")

In [None]:
# Work with the first 50,000 rows only (in file order, not a random sample),
# as the entire dataset is too expensive to compute on.
# .copy() makes the subset independent of rawData.
data = rawData.head(50000).copy()

In [None]:
# Column dtypes and non-null counts of the working subset
data.info()

In [None]:
# First rows of the working subset
data.head()

In [None]:
# Last rows of the working subset
data.tail()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
data.describe(include='all')

In [None]:
# Audit report (cont_summary) for each int64 column
data.select_dtypes(include='int64').apply(cont_summary)

In [None]:
# Audit report (cat_summary) for each object-dtype column
data.select_dtypes(include='object').apply(cat_summary)

## EDA

In [None]:
# Number of triplet rows per song title (computed on the full merged dataset)
grp_title = rawData.groupby(['title']).agg({'listen_count': 'count'}).reset_index()

# Each title's share of all counted rows, in percent. The share is computed
# from the grouped counts themselves so that every value belongs to its title.
grp_title['percentage'] = grp_title['listen_count'].div(grp_title['listen_count'].sum()) * 100
grp_title

In [None]:
# The ten titles with the fewest rows
grp_title.sort_values(by=['listen_count']).head(10)

In [None]:
# The ten titles with the most rows
grp_title.sort_values(by=['listen_count'], ascending=False).head(10)

In [None]:
# Histogram of the per-title row counts; the x-axis is cut at 800 so the bulk
# of the distribution stays readable
fig, ax = plt.subplots(figsize=(7, 4), dpi=110)
ax.hist(grp_title.listen_count, bins=150)
ax.set_xlim(0, 800)
ax.set(title='Distribution of rows per song title',
       xlabel='Rows per title',
       ylabel='Number of titles')
plt.show()

## Recommendations

In [None]:
# Distinct user ids of the working subset, in order of first appearance
users = data['user_id'].unique()

#### Interaction Matrix

In [None]:
# Create a pivot table (interaction matrix) from the original dataset:
# one row per user_id, one column per song_id, listen_count as the value.
# Repeated (user, song) pairs are averaged (pivot_table's default aggregation);
# pairs that never occur are NaN.
x = data.pivot_table(index='user_id', columns='song_id', values='listen_count')

In [None]:
# Replace the NaNs of unobserved (user, song) pairs with 0
xNan = x.fillna(0)

In [None]:
# Sparse (CSR) version of the filled matrix, in the same row/column order as x
interaction = sp.csr_matrix(xNan.values)

#### Popularity Model

In [None]:
# Ask for a user number interactively; a non-integer entry raises ValueError.
# NOTE(review): `id` shadows the Python builtin of the same name and the prompt
# blocks unattended "Run All" execution.
id = int(input('Enter the ID of a user to get their popularity-based song recommendations: '))

In [None]:
# Empty popularity model; it is fitted by the create() calls below
popModel = popularity_recommender()

In [None]:
# Popularity based recommendations by title.
# users[id-1] treats the number entered above as a 1-based position in `users`.
popModel.create(data, 'user_id', 'title')
popModel.recommend(users[id-1])

In [None]:
# Popularity based recommendations by artists (re-fits the same model object
# with artist_name as the item column).
# users[id-1] treats the number entered above as a 1-based position in `users`.
popModel.create(data, 'user_id', 'artist_name')
popModel.recommend(users[id-1])

#### Personalized Hybrid Model

In [None]:
# Fit the LightFM model on the interaction matrix.
# random_state fixes the model's random seed so repeated runs start from the
# same state (results may still vary slightly when several threads are used).
hybridModel = LightFM(loss='warp-kos', n=20, k=20, learning_schedule='adadelta',
                      random_state=42)

# The thread count is tied to the CPUs of the machine instead of a fixed 512;
# LightFM advises not to exceed the number of physical cores.
hybridModel.fit(interaction, epochs=600, num_threads=os.cpu_count() or 1)

In [None]:
# Mean precision@k (library-default k) as a percentage.
# NOTE: evaluated on the same matrix the model was fitted on, not on held-out data.
precision_at_k(hybridModel, interaction).mean().round(4) * 100

In [None]:
# Mean recall@k (library-default k) as a percentage.
# NOTE: evaluated on the same matrix the model was fitted on, not on held-out data.
recall_at_k(hybridModel, interaction).mean().round(4) * 100

In [None]:
# Mean AUC score as a percentage.
# NOTE: evaluated on the same matrix the model was fitted on, not on held-out data.
auc_score(hybridModel, interaction).mean().round(4) * 100

In [None]:
# Mean reciprocal rank as a percentage.
# NOTE: evaluated on the same matrix the model was fitted on, not on held-out data.
reciprocal_rank(hybridModel, interaction).mean().round(4) * 100

#### Personal Recommendations

In [None]:
# Create the user dictionary from the interaction matrix with create_user_dict
# (defined above): it maps each row position of x to the user_id in that row
userDict = create_user_dict(interactions=x)

In [None]:
# Create a song dictionary mapping each song_id to its title
# (built from the full merged dataset, not only the 50,000-row subset)
songDict = create_item_dict(df=rawData, id_col='song_id', name_col='title')

In [None]:
# Recommend songs for one user with the fitted LightFM model.
# threshold=5: songs with a listen_count above 5 for this user count as
# already known and are left out of the recommendations.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = int(input('Enter the ID of a user to get their personalized song recommendations: '))
sample_recommendation_user(model=hybridModel, interactions=x, user_id=id, 
                           user_dict=userDict, item_dict=songDict, threshold=5, nrec_items=10)

In [None]:
# Recommend songs similar to a given song title.
# The title typed in is translated back to its song_id through songDict
# (get_key raises if no song carries that exact title); the similarity matrix
# is then built from the model's item embeddings.
song = input('Enter a song to get similar recommendations: ')
songID = get_key(song, songDict)
songItemDist = create_item_emdedding_distance_matrix(model=hybridModel, interactions=x)
item_item_recommendation(item_emdedding_distance_matrix=songItemDist, item_id=songID,
                                    item_dict=songDict, n_items=10)