In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import paired_distances
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from src.recommender import ItemRecommender

import re

In [None]:
# Load the pickled frames. A cell only displays its last expression, so each frame is
# bound to a name (taken from its file name) instead of being read and thrown away.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
num_mine = pd.read_pickle('../num_mine.pkl')
num_2010s = pd.read_pickle('../num_2010s.pkl')
num_rolling = pd.read_pickle('../num_rolling.pkl')
num_mega_main = pd.read_pickle('../num_mega_main.pkl')
num_mega_main

In [None]:
# Split the 2010s frame into four frames (presumably string / reference / numeric /
# array columns, going by the names) and display the first of them.
# NOTE(review): neither `split_df_columns` nor `df_2010s` is defined in the visible
# part of this notebook - confirm both exist before this cell runs on a fresh kernel.
df_2010_str, df_2010_ref, df_2010_num, df_2010_arrs = split_df_columns(df_2010s)
df_2010_str

In [None]:
# Inspect the column labels of the string frame.
df_2010_str.columns

In [None]:
def _row_to_bag_of_words(row):
    '''
    Join every value of one row into a single space-separated string.

    INPUT:
        row - SERIES - one row of the string frame
    OUTPUT:
        STRING - list-like values (e.g. a list of genres) contribute each of their
                 elements, missing values are skipped, anything else is str()-ed
    '''
    pieces = []
    for value in row:
        # Check for list-likes first: pd.isna on a list returns an array, not a bool.
        if isinstance(value, (list, tuple, np.ndarray)):
            pieces.extend(str(element) for element in value)
        elif not pd.isna(value):
            pieces.append(str(value))
    return ' '.join(pieces)


# Only the original text columns feed the bag of words; a 'bag_of_words' column left
# over from an earlier run of this cell must not be folded into itself.
_text_columns = [col for col in df_2010_str.columns if col != 'bag_of_words']

# Guard against re-running the cell after the text columns are already gone, which
# would otherwise overwrite every bag with an empty string.
if _text_columns:
    # The rows yielded by iterrows() are copies, so assigning to them never reaches
    # the frame. Collect the strings and build the one-column result explicitly.
    df_2010_str = pd.DataFrame(
        {'bag_of_words': [_row_to_bag_of_words(row)
                          for _, row in df_2010_str[_text_columns].iterrows()]},
        index=df_2010_str.index,
    )

df_2010_str

In [None]:
# Load ../my_albums.csv; engine='python' selects pandas' Python parser rather than
# the default C parser.
df = pd.read_csv('../my_albums.csv', engine='python')

In [None]:
# Index the rows by (album, name). This mutates df in place, so re-running the cell
# without reloading df raises KeyError: both columns have already moved into the index.
df.set_index(['album','name'], inplace=True)

In [None]:
# Display the re-indexed frame.
df

In [None]:
# Binary flag: 1 when 'featured_artists' holds a value, 0 when it is missing.
# Assigned as a plain array so the values are written positionally.
df['has_featured_artist'] = df['featured_artists'].notna().astype(int).to_numpy()
df.head()

In [None]:
# List every column so they can be grouped in the cells that follow.
df.columns

## Separating columns by dtype, and dropping a few that don't seem important

In [None]:
# Text columns. .copy() makes this an independent frame: it is modified later in the
# notebook, and writing to a bare df[[...]] selection triggers SettingWithCopyWarning
# and may not behave as intended.
df_str = df[['artist', 'featured_artists', 'release_date']].copy()

# Identifier / link columns (ids, URIs and URLs).
df_ref = df[['id', 'uri', 'track_href', 'analysis_url',
          'artist_uri', 'album_uri', 'album_image_url']]

# Numeric columns, including the has_featured_artist flag.
df_num = df[['year', 'has_featured_artist', 'track_number', 'tracks_on_album', 'explicit', 'duration_ms', 'popularity',
          'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
          'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
          'track_length', 'tempo_confidence', 'end_fade_in', 'start_fade_out','end_silence_time']]

# Audio-analysis columns ('sections', 'tatums', 'beats', 'bars').
df_arrs = df[['sections', 'tatums', 'beats', 'bars']]

### Looking at string values

In [None]:
# Peek at the reference columns (ids, URIs and URLs).
df_ref.head()

In [None]:
# Fill only 'featured_artists' with the sentinel. A blanket fillna would also write
# 'No Features' into a missing release_date (making to_datetime raise) and into a
# missing artist. assign() returns a new frame, so the cell can be re-run safely and
# never writes into a slice of df.
df_str = df_str.assign(
    featured_artists=df_str['featured_artists'].fillna('No Features'),
    release_date=pd.to_datetime(df_str['release_date'], format='%Y-%m-%d'),
)

In [None]:
# Check the cleaned text columns.
df_str.head()

### Looking at numerical values

In [None]:
# Summarise the dtypes and non-null counts of the numeric columns.
df_num.info()

In [None]:
# Transpose the first row so each numeric feature is shown on its own line.
df_num.head(1).T

### Looking at the Audio Analysis values

In [None]:
# Take the first row's 'bars' entry (kept as a one-row Series) and look at its raw value.
ex_1 = df_arrs['bars'].head(1)
ex_1.values

In [None]:
# Convert the one-row Series into a {index label: value} dict.
vals = ex_1.to_dict()

In [None]:
# Series.to_dict() returns a plain dict by default.
type(vals)

In [None]:
# Build a frame from the dict: each key becomes a column, and the row is labelled '1'.
pd.DataFrame(vals, index=['1'])

In [None]:
from collections import OrderedDict, defaultdict

# Ask pandas for a defaultdict instead of a plain dict. A defaultdict must be passed
# as an initialised instance, and `into` is given by keyword, which every pandas
# version accepts (recent versions deprecate passing it positionally).
dd = defaultdict(list)
ex1_vals = ex_1.to_dict(into=dd)
ex1_vals

In [None]:
# View the stored values without their index keys.
ex1_vals.values()

In [None]:
# ex1_vals is a dict-like mapping, which has no .tolist(); materialise its values
# into a list instead.
ex1_vals_lst = list(ex1_vals.values())
ex1_vals_lst[0]

In [None]:
# Check which Python type a single stored 'bars' value has.
type(ex1_vals_lst[0])

In [None]:
# Iterate the mapping's values directly (a dict has no .tolist()). The blank line
# after each value comes from `end`, so non-string values print too instead of
# failing on `x + '\n'`.
for x in ex1_vals.values():
    print(x, end='\n\n')

In [None]:
class ItemRecommender():
    '''
    Content based item recommender.

    Builds an item-to-item similarity table from a feature matrix, then ranks the
    items most similar to one item, or to a profile summed from several liked items.

    NOTE: this notebook also imports a class of the same name from src.recommender;
    running this definition rebinds the name to the class below.
    '''
    def __init__(self, similarity_measure=None):
        '''
        INPUT:
            similarity_measure - CALLABLE - f(A, B) returning an array of shape
                (rows of A, rows of B). When omitted, cosine_similarity is used.
        '''
        if similarity_measure is None:
            similarity_measure = cosine_similarity
        self.similarity_measure = similarity_measure
        self.similarity_matrix = None   # raw similarity array, set by fit()
        self.similarity_df = None       # same values, rows labelled by item name
        self.item_counts = None         # feature frame the model was fitted on
        self.item_names = None          # index of item names, in row order


    def fit(self, X, titles=None):
        '''
        Stores the item features and builds the item-to-item similarity table.

        INPUT -
            X: DATAFRAME or NUMPY ARRAY - Rows are items, columns are feature values.
               A DataFrame supplies the item names through its index.
            titles: LIST - Item names in row order. Only used when X is not a
               DataFrame; when omitted the rows are labelled 0..n-1.

        OUTPUT - None
        '''
        if isinstance(X, pd.DataFrame):
            features = X
        else:
            # The similarity measure returns a dense array anyway, so sparse input
            # is densified. Wrapping the array in a DataFrame lets the other methods
            # look items up by name for both kinds of input.
            dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)
            features = pd.DataFrame(dense, index=titles)

        values = features.to_numpy()
        similarities = np.asarray(self.similarity_measure(values, values))

        self.item_counts = features
        self.item_names = features.index
        self.similarity_matrix = similarities
        self.similarity_df = pd.DataFrame(similarities, index=self.item_names)


    def _require_fit(self):
        '''Raises AttributeError when fit() has not been called yet.'''
        if self.similarity_df is None:
            raise AttributeError('ItemRecommender has not been fitted; call fit() first')


    def _positions_of(self, item):
        '''
        Returns the row positions (integer array) carrying the label `item`.
        Raises KeyError when the label is unknown.
        '''
        loc = self.item_names.get_loc(item)
        if isinstance(loc, (int, np.integer)):
            return np.array([loc])
        if isinstance(loc, slice):
            return np.arange(len(self.item_names))[loc]
        # Otherwise a boolean mask, returned for repeated labels.
        return np.flatnonzero(np.asarray(loc))


    def get_recommendations(self, item, n=5):
        '''
        Returns the top n items related to the item passed in
        INPUT:
            item    - STRING - Name of item in the fitted data
            n       - INT    - Number of top related items to return
        OUTPUT:
            items - ARRAY of the top n related item names, most similar first
        '''
        self._require_fit()
        own_rows = self._positions_of(item)
        scores = self.similarity_df.iloc[own_rows[0]].to_numpy()

        best_first = np.argsort(scores, kind='stable')[::-1]
        # Drop the queried item explicitly. Assuming it always sorts last breaks as
        # soon as another item ties with it (e.g. identical feature rows).
        best_first = best_first[~np.isin(best_first, own_rows)]
        return self.item_names[best_first[:max(n, 0)]].values


    def get_user_profile(self, items):
        '''
        Takes a list of items and returns a user profile. A vector representing the likes of the user.
        INPUT:
            items  -   LIST - list of item names the user likes / has seen

        OUTPUT:
            user_profile - NP ARRAY - sum of the feature rows of the liked items;
                    its length matches the number of columns of the fitted matrix
        '''
        self._require_fit()
        user_profile = np.zeros(self.item_counts.shape[1])
        for item in items:
            rows = self.item_counts.iloc[self._positions_of(item)].to_numpy(dtype=float)
            # Summing over rows also covers labels that match more than one row.
            user_profile += rows.sum(axis=0)

        return user_profile


    def get_user_recommendation(self, items, n=5):
        '''
        Takes a list of items the user liked and returns the top n items for that user

        INPUT
            items  -   LIST - list of item names the user likes / has seen
            n -  INT - number of items to return

        OUTPUT
            items - ARRAY - n recommended item names, most similar first; the liked
                    items themselves are never returned. Empty when `items` is empty.
        '''
        self._require_fit()
        items = list(items)
        if not items:
            # No likes means no profile to compare against.
            return self.item_names[:0].values

        user_profile = self.get_user_profile(items)
        user_sim = np.asarray(
            self.similarity_measure(self.item_counts.to_numpy(), user_profile.reshape(1, -1))
        )[:, 0]

        liked_rows = np.concatenate([self._positions_of(item) for item in items])
        best_first = np.argsort(user_sim, kind='stable')[::-1]
        # Remove the liked items by position. They are not guaranteed to be the
        # highest-scoring rows, so slicing off the top len(items) entries could
        # return liked items and drop genuine recommendations.
        best_first = best_first[~np.isin(best_first, liked_rows)]
        return self.item_names[best_first[:max(n, 0)]].values





