****Project Task 3 - Song Ranker****

In [2]:
import os
import numpy as np
import json
import time
from scipy.sparse import csc_matrix
from sklearn.preprocessing import normalize

In [3]:
path = '/Users/yames/Robotics/MMD/project/lastfm_subset/'

In [4]:
class RankingSystem:
    def __init__(self, path='./lastfm_train/', t_threshold=0.0, g_threshold=50, n_songs=5, beta=0.2):
        """
        :param path:        location of the folder containing all the last.fm json files
        :param t_threshold: minimum similarity required for two songs to be considered adjacent
        :param g_threshold: minimum count required for a tag to be included
        :param n_songs:     number of best matches to return to user
        :param n_songs:     teleport probability
        """
        self.path = path
        self.g_threshold = g_threshold
        self.t_threshold = t_threshold
        self.n_songs = n_songs
        self.beta = beta
        self.info_dict = {}
        self.adjacency_matrix, self.matrix_keys, self.tag_matrix, self.tag_names = self.load_json_data()
        self.N = len(self.matrix_keys)
        degrees = self.adjacency_matrix.sum(axis=0).A1
        self.dead_ends = (degrees == 0).astype(np.int)
        self.stochastic_adjacency = normalize(self.adjacency_matrix, norm='l1', axis=0)

    def load_json_data(self):
        t0 = time.time()

        # start with a smallish array, grow dynamically
        edges = np.empty([765, 2], dtype='<U18')
        tags = np.empty([543, 2], dtype='<U18')

        # count how big the array needs to be
        edge_index = 0
        tag_index = 0

        # dirwalk over all json files
        for root, dirs, files in os.walk(self.path):
            for name in files:
                if name.endswith(".json"):
                    # who the ???? is jason?
                    jason = json.load(open(root + '/' + name))
                    self.info_dict[jason['track_id']] = jason['artist'] + ' - ' + jason['title']

                    # read tags
                    this_song_tags = np.asarray(jason['tags'], dtype='<U18')
                    num_tags = len(this_song_tags)
                    if num_tags:
                        this_song_tag_counts = this_song_tags[:, 1].astype(np.int)
                        this_song_tags[:, 1] = jason['track_id']  # second column contains the trackID having the tag
                        tags_above_threshold = this_song_tags[this_song_tag_counts > self.g_threshold]
                        num_tags = len(tags_above_threshold)
                        if tag_index + num_tags > len(tags):  # dynamically grow array
                            tags = np.vstack([tags, np.empty_like(tags)])
                        tags[tag_index: tag_index + num_tags] = tags_above_threshold
                        tag_index += num_tags

                    # read similar songs
                    this_song_edges = np.asarray(jason['similars'])
                    num_edges = len(this_song_edges)
                    if num_edges:  # ignore files that have no similars listed
                        this_song_edge_weights = this_song_edges[:, 1].astype(np.float)
                        this_song_edges[:, 1] = jason['track_id']  # second column contains the trackID the edge leaves
                        edges_above_threshold = this_song_edges[this_song_edge_weights > self.t_threshold]
                        num_edges = len(edges_above_threshold)
                        if edge_index + num_edges > len(edges):  # dynamically grow array
                            edges = np.vstack([edges, np.empty_like(edges)])
                        edges[edge_index: edge_index + num_edges] = edges_above_threshold
                        edge_index += num_edges

        # reduce array size back to the amount necessary
        edges = edges[:edge_index]
        tags = tags[:tag_index]
        print('''All json files read after {} seconds. Found {} edges and {} tags. 
        Populating sparse matrices...'''.format(time.time()-t0, len(edges), len(tags)))

        # we don't want 18 character strings as identifiers, integers are much easier to work with
        array_containing_all_trackids = np.hstack([edges.reshape(-1), tags[:, 1]])
        tag_strings, tag_integers = np.unique(tags[:, 0], return_inverse=True)
        del tags
        del edges
        trackids, array_of_track_integers = np.unique(array_containing_all_trackids, return_inverse=True)
        del array_containing_all_trackids
        in_nodes = array_of_track_integers[:edge_index]
        out_nodes = array_of_track_integers[edge_index:2*edge_index]
        integer_tag_owners = array_of_track_integers[2*edge_index:]

        # create sparse matrices
        adjacency_matrix = csc_matrix((np.ones(len(out_nodes)), (out_nodes, in_nodes)), shape=[len(trackids)] * 2)
        tag_matrix = csc_matrix((np.ones(len(tag_integers)), (integer_tag_owners, tag_integers)),
                                shape=[len(trackids), len(tag_strings)])

        print('Parsed json files and created matrices in {} seconds'.format(time.time()-t0))
        return adjacency_matrix, trackids, tag_matrix, tag_strings

    def get_all_tags(self):
        return self.tag_names

    def compute_page_rank_vector(self, epsilon=0.0000001, beta=None, S=None):
        """
        :param beta:    probability to teleport
        :param S:       vector of songs to teleport to (higher values means higher teleport likelihood)
        :param epsilon: convergence error
        :return:        page rank vector
        """
        if S is None:
            S = np.ones(self.N)
        if beta is None:
            beta = self.beta
        initial_y = np.random.rand(self.N)  # initial_y = np.ones(N)
        initial_y /= np.sum(initial_y)

        # this added factor means that information doesn't get lost in dead ends, but instead teleports out:
        dead_end_escaper = self.dead_ends.dot(initial_y)
        y = self.stochastic_adjacency.dot(initial_y) * (1 - beta) + (S / np.sum(S)) * (beta + dead_end_escaper)

        i = 0   # just so we don't loop too often if it doesn't converge
        while np.linalg.norm(y - initial_y) > epsilon and i < 100:
            i += 1
            initial_y = y
            dead_end_escaper = self.dead_ends.dot(initial_y)
            y = self.stochastic_adjacency.dot(initial_y) * (1 - beta) + (S / np.sum(S)) * (beta + dead_end_escaper)
        return y

    def topic_specific_page_rank(self, tags=(), n_songs=None, beta=None):
        """
        :param beta:    probability of teleport
        :param tags:    tags to include for the topic specifc page rank, as list of strings.
        :param n_songs: override global default for number of songs to return
        :return:        top n ranked tracks from the given page rank
        """
        if n_songs is None:
            n_songs = self.n_songs
        if beta is None:
            beta = self.beta
        feature_strings = []
        S = np.zeros(self.N)
        for tag in tags:
            bla = np.argwhere(self.tag_names == tag).flatten()
            if len(bla):
                feature_strings.append(tag)
                S += np.asarray(self.tag_matrix.getcol(bla[0]).todense()).flatten()
        if len(feature_strings) == 0:
            print('No valid tags provided, computing ranking on global data set instead.')
            S = None
        else:
            print('Using features', feature_strings, 'with teleport subset size:', np.sum(S), '/', len(S))
        rank_vector = self.compute_page_rank_vector(beta=beta, S=S)
        return [self.info_dict.get(tid, tid+' - Unknown Track') for tid in self.matrix_keys[np.argsort(rank_vector)[:-(n_songs + 1):-1]]]


## Initialize Ranking System

This may take a while to parse all the json files, as there are quite a few of them. For us it's approximately 20 minutes for the entire training set (2.5gb of json). However once this is done, one can easily compute many page rank queries.

In [5]:
ranking = RankingSystem(path=path)

All json files read after 9.939356088638306 seconds. Found 612051 edges and 11773 tags. 
        Populating sparse matrices...
Parsed json files and created matrices in 11.102881908416748 seconds


## Get All Tags

there's quite a few junk tags even after sorting out, but one can look up the tags that are possible to pass to the page rank.

In [6]:
tags = ranking.get_all_tags()
print('In total there are', len(tags), 'tags. They are:')
print(tags)
# # Uncomment this if you want a wall of text:
# for tag in tags:
#     print(tag)

In total there are 3749 tags. They are:
['---04fh' '--7' '--8' ..., 'zuidelijk' 'zydeco' 'zz96jd']


## Topic Specific Page Rank

It would perhaps be possible to play around with upper/lower case variants to see how it affects results.

In [7]:
best_songs = ranking.topic_specific_page_rank(beta=0.2, tags=('hip hop', 'rap'),  n_songs=10)
print(best_songs)

Using features ['hip hop', 'rap'] with teleport subset size: 183.0 / 203970
['Dilated Peoples - Hard Hitters', 'Mobb Deep - Hell On Earth (Front Lines)', 'JAY-Z - Change Clothes', 'LL Cool J - Headsprung', 'Busta Rhymes - Genesis', 'The Game - Scream On Em', 'Dendemann - Sachmagehtsnoch', 'Common - The 6th Sense', "The Beatnuts - No Escapin' This", "The Pharcyde - Passin' Me By"]


In [8]:
best_songs = ranking.topic_specific_page_rank(beta=0.4, tags=('rock', 'metal', 'power-metal', 'folk metal'))
print(best_songs)

Using features ['rock', 'metal', 'folk metal'] with teleport subset size: 643.0 / 203970
['System of a Down - Streamline', 'Helmet - Street Crab', 'Becoming the Archetype - Ex Nihilo', 'Y&T - Mean Streak', 'Helmet - Give It']
