<a class="anchor" id="section5"></a>
## Section 5 Content-based Recommender

In [None]:
import pandas as pd
import numpy as np
import pickle
from itertools import islice

In [None]:
# Interim tables: per-user artist weights and the per-artist metadata text.
userID_artistID_weight_percentage = pd.read_csv(
    'data/interim/userID_artistID_weight_percentage.csv', encoding='utf-8')
artist_metadata_final = pd.read_csv(
    'data/interim/artist_metadata_final.csv', encoding='utf-8')

# Raw artist table, reduced to the key (renamed to `artistID`) and the display name.
artists = pd.read_table('data/dataset/artists.dat', encoding='utf-8')
artists = artists.rename(columns={'id': 'artistID'})[['artistID', 'name']]

# Outer merge keeps artists that appear on only one side. `name_y` is the name
# column coming from `artists` (the right-hand frame of the merge).
Final = artist_metadata_final.merge(artists, left_on='id', right_on='artistID', how='outer')
Final = Final.reindex(columns=['artistID', 'name_y', 'metadata'])
# Missing values become empty strings so that every cell is defined downstream.
Final = Final.fillna("")

In [None]:
def create_pivot_table(pd_df):
    """Turn a three-column frame into a dense 2-D array.

    The first column supplies the row keys, the second column the column
    keys and the third column the cell values. Key combinations that do not
    occur are left at zero. The resulting shape is printed before the array
    is returned.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Frame whose first three columns are (row key, column key, value).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_unique_row_keys, n_unique_column_keys) with the
        same dtype as ``pd_df.values``.
    """
    triples = pd_df.values
    # return_inverse gives, for every input record, the position of its key
    # in the sorted array of unique keys -- i.e. the target row/column.
    row_keys, row_idx = np.unique(triples[:, 0], return_inverse=True)
    col_keys, col_idx = np.unique(triples[:, 1], return_inverse=True)

    dense = np.zeros((row_keys.size, col_keys.size), dtype=triples.dtype)
    dense[row_idx, col_idx] = triples[:, 2]

    print(dense.shape)
    return dense


def predict(ratings, similarity, type='user'):
    """Predict ratings from a rating matrix and a similarity matrix.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D rating matrix. The 'user' mode averages over ``axis=1`` and the
        'item' mode over ``axis=0``.
    similarity : numpy.ndarray
        Square similarity matrix. It is multiplied from the left in 'user'
        mode and from the right in 'item' and 'content' mode.
    type : {'user', 'item', 'content'}, default 'user'
        * 'user'    -- mean-centre each row, weight by similarity, add the
                       row mean back.
        * 'item'    -- mean-centre each column, weight by similarity, add
                       the column mean back.
        * 'content' -- similarity-weighted average of the raw ratings.

    Returns
    -------
    numpy.ndarray
        Predictions with the same shape as ``ratings``; negative values are
        clipped to 0.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported modes (previously this fell
        through to an unbound local variable).
    """
    if type == 'user':
        # axis=1 averages across the columns of each row.
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-row mean into a column so it broadcasts over ratings.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array(
            [np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        mean_item_rating = ratings.mean(axis=0)
        ratings_diff = ratings - mean_item_rating[np.newaxis, :]
        # The row sums of |similarity| are applied per column here, which is
        # only the matching normaliser when `similarity` is symmetric.
        pred = mean_item_rating[np.newaxis, :] + ratings_diff.dot(similarity) / np.array(
            [np.abs(similarity).sum(axis=1)])
    elif type == 'content':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError(
            "type must be one of 'user', 'item' or 'content', got {!r}".format(type))
    return pred.clip(min=0)

# Lookup table: artist name -> row label of that artist in `Final`.
# De-duplicate on the *index* (the names). Series.drop_duplicates() compares the
# values (the row labels), so it would leave repeated names in place and a lookup
# such as indices[name] would then return a Series instead of a single label.
indices = pd.Series(Final.index, index=Final['name_y'])
indices = indices[~indices.index.duplicated(keep='first')]
def get_recommendations(artist, cosine_sim, method_name, num_recommend=10):
    """Return the artists whose scores rank highest for ``artist``.

    Parameters
    ----------
    artist : str
        Artist name; resolved to a row of ``cosine_sim`` through the
        module-level ``indices`` lookup.
    cosine_sim : array-like
        Matrix indexed as ``cosine_sim[row]``, giving the score of every
        artist with respect to the artist in that row.
    method_name : str
        Name of the score column in the returned frame.
    num_recommend : int, default 10
        Upper bound of the slice taken from the ranking. Because the slice
        starts at position 1, ``num_recommend - 1`` rows are returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``artistId`` (row position in ``Final``), ``name_y``,
        ``metadata`` and ``method_name``, ordered by descending score.
    """
    row_position = indices[artist]

    # Pair every row position with its score and rank by score, best first.
    ranked = sorted(enumerate(cosine_sim[row_position]),
                    key=lambda pair: pair[1], reverse=True)

    # The top-ranked entry is skipped -- presumably the query artist itself.
    top = ranked[1:num_recommend]

    positions = [position for position, _ in top]
    scores = [score for _, score in top]

    return pd.DataFrame({
        'artistId': positions,
        'name_y': list(Final['name_y'].iloc[positions].values),
        'metadata': list(Final['metadata'].iloc[positions].values),
        method_name: scores,
    })

def pred_norm(similarity, a, b):
    """Min-max rescale every row of ``similarity`` into the interval [a, b].

    Each row is mapped with ``a + (b - a) * (x - row_min) / (row_max - row_min)``.

    Parameters
    ----------
    similarity : array-like
        2-D array of scores; one rescaling is applied per row.
    a, b : float
        Lower and upper bound of the target interval.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``similarity``. Rows whose values are all
        equal have no spread to rescale and are mapped to ``a``.
    """
    similarity = np.asarray(similarity, dtype=float)
    # keepdims keeps the per-row statistics as columns so they broadcast
    # along each row (and also work for non-square input).
    row_min = similarity.min(axis=1, keepdims=True)
    row_range = similarity.max(axis=1, keepdims=True) - row_min
    # Avoid 0/0 for constant rows: their numerator is 0, so any non-zero
    # divisor yields the lower bound `a`.
    safe_range = np.where(row_range == 0, 1.0, row_range)
    return (b - a) * (similarity - row_min) / safe_range + a

In [None]:
def recommend_artists(pred_df, userID, artists, original_ratings, num_recommendations, method_name):
    """Rank the artists a user has no rating for by predicted score.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Prediction matrix; the row at position ``userID - 1`` is used and
        its column labels are interpreted as artist IDs.
    userID : int
        User identifier (1-based; mapped to a 0-based row position).
    artists : pandas.DataFrame
        Artist table with an ``artistID`` column.
    original_ratings : pandas.DataFrame
        Ratings with ``userID``, ``artistID`` and ``weight_percentage``.
    num_recommendations : int
        Maximum number of rows in the returned recommendation frame.
    method_name : str
        Name given to the predicted-score column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``user_full`` -- the user's own ratings joined with artist data,
        sorted by descending ``weight_percentage``; and ``recommendations``
        -- the best-scored artists absent from ``user_full``.
    """
    # IDs start at 1 while positional rows start at 0.
    user_row_number = userID - 1
    sorted_user_predictions = pred_df.iloc[user_row_number].sort_values(ascending=False)
    print(sorted_user_predictions)

    # What the user already listened to, enriched with the artist columns.
    listened = original_ratings[original_ratings.userID == userID]
    user_full = listened.merge(artists, how='left', on='artistID').sort_values(
        'weight_percentage', ascending=False)

    # Predictions as a two-column table: artistID and the score column, which
    # carries the row label of the selected pred_df row as its name.
    prediction_table = sorted_user_predictions.to_frame().rename_axis('artistID').reset_index()

    unseen = artists[~artists['artistID'].isin(user_full['artistID'])]
    scored = unseen.merge(prediction_table, how='left', on='artistID')
    scored = scored.rename(columns={user_row_number: method_name})
    recommendations = scored.sort_values(method_name, ascending=False).iloc[:num_recommendations, :]

    return user_full, recommendations

In [None]:
# Group-size filter: the threshold `>= 0` is true for every group, so no user is
# removed by it at the moment.
ratings_f = userID_artistID_weight_percentage.groupby('userID').filter(lambda group: len(group) >= 0)
# Artists that occur in at least one remaining rating ...
artist_list_rating = ratings_f['artistID'].unique().tolist()
# ... and the artist rows restricted to exactly those IDs.
artists_filter = artists[artists['artistID'].isin(artist_list_rating)]

In [None]:
# Put artistID first so that it becomes the row key of the positional pivot;
# ratings_f1 keeps a separate copy in the same column order.
ratings_f1 = userID_artistID_weight_percentage.reindex(
    ['artistID', 'userID', 'weight_percentage'], axis='columns')
userID_artistID_weight_percentage = userID_artistID_weight_percentage.reindex(
    ['artistID', 'userID', 'weight_percentage'], axis='columns')

In [None]:
# Artist x user table of listening weights; absent combinations become 0.
ratings_f2 = (
    userID_artistID_weight_percentage
    .pivot(index="artistID", columns="userID", values="weight_percentage")
    .fillna(0)
)
ratings_f2.shape

In [None]:
# Rebinds ratings_f2 to the dense NumPy array produced by create_pivot_table,
# which keys rows on the first column and columns on the second column.
ratings_f2 = create_pivot_table(pd_df=userID_artistID_weight_percentage)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation of each artist's metadata text, with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(Final['metadata'])

# Dense copy of the matrix, labelled with the row index of Final.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=list(Final.index))
print(tfidf_df.shape)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Pairwise dot products between all TF-IDF rows; with a single argument
# linear_kernel compares the matrix against itself.
content_correlation = linear_kernel(tfidf_matrix)

In [None]:
def pred_norm(similarity, a, b):
    """Min-max rescale every row of ``similarity`` into the interval [a, b].

    Each row is mapped with ``a + (b - a) * (x - row_min) / (row_max - row_min)``.

    Parameters
    ----------
    similarity : array-like
        2-D array of scores; one rescaling is applied per row.
    a, b : float
        Lower and upper bound of the target interval.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``similarity``. Rows whose values are all
        equal have no spread to rescale and are mapped to ``a``.
    """
    similarity = np.asarray(similarity, dtype=float)
    # keepdims keeps the per-row statistics as columns so they broadcast
    # along each row (and also work for non-square input).
    row_min = similarity.min(axis=1, keepdims=True)
    row_range = similarity.max(axis=1, keepdims=True) - row_min
    # Avoid 0/0 for constant rows: their numerator is 0, so any non-zero
    # divisor yields the lower bound `a`.
    safe_range = np.where(row_range == 0, 1.0, row_range)
    return (b - a) * (similarity - row_min) / safe_range + a
# Rescale the content similarities to the 0-5 range.
content_prediction = pred_norm(content_correlation, 0, 5)

# NOTE(review): the columns are labelled with the artist IDs of `artists_filter`,
# while the matrix was built from the rows of `Final` -- confirm that both have
# the same length and order.
content_pred_df = pd.DataFrame(content_prediction, columns=artists_filter['artistID'].tolist())
print(content_pred_df.shape)

print('Content-based RS without ranking')
# NOTE(review): recommend_artists reads row `userID - 1` of the frame it is given;
# verify that this is the intended row of an artist-by-artist matrix.
user_full, recommendations = recommend_artists(
    pred_df=content_pred_df,
    userID=4,
    artists=Final,
    original_ratings=ratings_f1,
    num_recommendations=5,
    method_name='CT_3c_pred_rating',
)
recommendations

In [None]:
### Create recommendations per userID from the artists that user listens to. Each artist is weighted
### by the relative frequency with which the userID listens to that artist.

In [None]:
# One row per (user, artist) pair; the loop below reads the columns
# 'userID', 'name' and 'weight_percentage' from it.
list_for_recommendation = pd.read_csv(
    filepath_or_buffer='data/interim/list_for_recommendation.csv',
    encoding='utf-8',
)

In [None]:
# Build, for every user, a dictionary {recommended artist name -> accumulated score}.
# Every artist a user listens to contributes its content-based neighbours; each
# neighbour's predicted rating is multiplied by the user's weight_percentage for the
# seed artist, and the contributions for the same neighbour are summed.
dictionary = {}
x = 0  # rows processed
y = 0  # rows for which building recommendations failed
for _, row in list_for_recommendation.iterrows():
    try:
        # setdefault registers the user on first sight, even if the lookup below fails.
        user_scores = dictionary.setdefault(row['userID'], {})
        recommendation = get_recommendations(
            row['name'], content_prediction, 'CT_3c_pred_rating', num_recommend=11)
        for _, row_rec in recommendation.iterrows():
            weighted_score = row_rec['CT_3c_pred_rating'] * row['weight_percentage']
            user_scores[row_rec['name_y']] = user_scores.get(row_rec['name_y'], 0) + weighted_score
    except Exception:
        # Best effort: a row that cannot be processed (e.g. an artist name that is
        # missing from the lookup) is counted and skipped. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt stop the loop.
        y = y + 1
    x = x + 1
# Report the totals once instead of printing a counter for every row.
print(x)
print(y)
        

In [None]:
#save dictionary
# The file handle gets its own name: rebinding `dictionary` to the handle would
# discard the recommendations and make pickle.dump try to serialise the file
# object itself. The with-block also closes the file if dumping fails.
with open("userID_recommendations.pkl", "wb") as pickle_file:
    pickle.dump(dictionary, pickle_file)

In [None]:
# Empty result frame. The columns are given as a list because a set has no
# defined order (and newer pandas versions reject a set here).
df = pd.DataFrame(columns=['userID', 'name', 'rec_value'])
df

In [None]:
#create dataframe with top 10 artist for each user
# Collect plain records first and build the frame once: DataFrame.append no longer
# exists in pandas 2.x, and growing a frame row by row copies it on every step.
# Assigning `df` afresh also makes the cell safe to re-run without duplicating rows.
records = []
for userID, artist_scores in dictionary.items():
    # Highest accumulated score first; ties keep their insertion order.
    ranked = sorted(artist_scores.items(), key=lambda item: item[1], reverse=True)
    for artist, rec_value in ranked[:10]:
        records.append({'userID': userID, 'name': artist, 'rec_value': rec_value})
df = pd.DataFrame(records, columns=['userID', 'name', 'rec_value'])
x = len(df)  # number of recommendation rows written

In [None]:
# Persist the per-user top recommendations next to the notebook.
df.to_csv(path_or_buf='content_based_result.csv')