In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
import json
import random

# Load the genre map; later cells iterate over its keys as genre names and
# index its values to pick songs.
with open('genremap.json', 'r') as genre_file:
    loaded_genres = json.load(genre_file)

BASE_DIR_GLOVE = 'launchpad/Musecage/other/'
GLOVE_DIR = os.path.join(BASE_DIR_GLOVE, 'glove.6B')

print('Indexing word vectors.')

# Each line of the GloVe text file is "<token> <v1> <v2> ...": split off the
# token once and parse the remainder as a float32 vector.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
with open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        token, values = row.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(values, 'f', sep=' ')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [2]:
# Sanity check: report the dimensionality of one loaded word vector.
print('Embedding vector size: ', embeddings_index['the'].shape[0])

Embedding vector size:  300


In [3]:
# Lower-case the genre names so they can be matched against the (uncased)
# GloVe vocabulary. Rebinding the loop variable alone, as this cell did
# before, leaves loaded_genres untouched, so the dict is rebuilt instead.
# Assumes every value is a list-like collection of song IDs (later cells
# iterate and index them) -- TODO confirm against genremap.json.
_lowercased_genres = {}
for key, songs in loaded_genres.items():
    # Genres that differ only by case collapse into one entry; keep the
    # songs of both rather than letting one silently overwrite the other.
    _lowercased_genres.setdefault(key.lower(), []).extend(songs)
loaded_genres = _lowercased_genres

In [4]:
# One single-element row per genre name, plus the matching positional labels
# (label i refers to genre_array[i]).
genre_array = [[genre_name] for genre_name in loaded_genres]
genre_index = np.arange(len(genre_array))

def keep_genre(genre_array, embeddings=None, dim=None):
    """Embed each genre as the sum of the word vectors of its words.

    Words without a vector are skipped, so a genre made up only of unknown
    words (or an empty string) maps to the all-zero vector.

    Args:
        genre_array: Sequence of rows whose first element is the genre text,
            e.g. [["indie rock"], ["pop"]].
        embeddings: Mapping from word to vector. Defaults to the notebook-level
            embeddings_index.
        dim: Length of the returned vectors. Defaults to the length of the
            first vector in ``embeddings`` (300 if the mapping is empty).

    Returns:
        A list with one float numpy array of length ``dim`` per input row.
    """
    if embeddings is None:
        embeddings = embeddings_index
    if dim is None:
        first_vector = next(iter(embeddings.values()), None)
        dim = 300 if first_vector is None else len(first_vector)

    genre_embeddings = []
    for row in genre_array:  # For each genre
        genre_embedding = np.zeros(dim)
        for word in row[0].split():  # For each word in each genre
            word_embedding = embeddings.get(word)
            if word_embedding is None:
                # The GloVe 6B vocabulary is lower-case, so retry the
                # lower-cased word before giving up on it.
                word_embedding = embeddings.get(word.lower())
            if word_embedding is None:
                continue  # Unknown word: leave it out of the sum
            genre_embedding = genre_embedding + word_embedding  # Sum the word embeddings
        genre_embeddings.append(genre_embedding)
    return genre_embeddings

In [5]:
# Stack the per-genre vectors into one 2-D array (one row per genre) and show
# its shape as the cell output.
genre_embeddings = np.asarray(keep_genre(genre_array))
genre_embeddings.shape

(276, 300)

In [6]:
# Fit a 3-nearest-neighbour model over the genre embeddings. Every genre gets
# its own label (its position in genre_array), so a neighbour index returned by
# knn_genres.kneighbors() can be mapped straight back to a genre name.
from sklearn.neighbors import KNeighborsClassifier
knn_genres = KNeighborsClassifier(n_neighbors=3)
# fit() is the last expression of the cell, so its repr is the cell output.
knn_genres.fit(genre_embeddings, genre_index)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [37]:
def create_playlist(title, num_songs):
    """Build a playlist of songs for a free-text playlist title.

    The title is embedded with keep_genre() and its nearest genres are looked
    up with knn_genres. Songs of the closest genre that are also listed under
    every other neighbouring genre are added first; the remaining slots are
    filled with random songs, each drawn from one of the neighbouring genres
    with a probability that grows as that genre's distance to the title
    shrinks.

    Args:
        title: Playlist title; its words are looked up in the word embeddings.
        num_songs: Desired number of songs.

    Returns:
        A list of entries taken from loaded_genres. It holds num_songs entries
        unless none of the neighbouring genres has any song to fill with.
        Fill songs are drawn with replacement, so they may repeat.
    """
    title_embedding = np.array(keep_genre([[title]]))
    distances, neighbors = knn_genres.kneighbors(title_embedding)
    distances, neighbors = distances[0], neighbors[0]  # single query row
    genres = [genre_array[idx][0] for idx in neighbors]

    # Songs shared by all neighbouring genres come first. The membership test
    # is applied to every other genre (previously the last genre's list was
    # only checked for being non-empty). Sets assume hashable song IDs.
    closest_genre, other_genres = genres[0], genres[1:]
    other_song_sets = [set(loaded_genres[genre]) for genre in other_genres]
    playlist = []
    for song in loaded_genres[closest_genre]:
        if len(playlist) >= num_songs:
            break
        if all(song in song_set for song_set in other_song_sets):
            playlist.append(song)

    missing = num_songs - len(playlist)
    if missing <= 0:
        return playlist

    # Weight each genre by how much of the total distance it does NOT account
    # for, so that closer genres are picked more often.
    total_distance = float(sum(distances))
    weights = [total_distance - float(dist) for dist in distances]

    # Genres without songs cannot contribute a fill song.
    pool = [(genre, weight) for genre, weight in zip(genres, weights)
            if len(loaded_genres[genre]) > 0]
    if not pool:
        return playlist
    pool_genres = [genre for genre, _ in pool]
    pool_weights = [weight for _, weight in pool]
    if sum(pool_weights) <= 0:
        # All usable distances are zero (e.g. a title with no known words):
        # fall back to a uniform choice instead of dividing by zero.
        pool_weights = None

    for genre in random.choices(pool_genres, weights=pool_weights, k=missing):
        playlist.append(random.choice(loaded_genres[genre]))
    return playlist
# Demo: request a 20-song playlist for the title 'dance'. The fill step draws
# songs at random, so the printed list can differ between runs.
print(create_playlist(title='dance', num_songs=20))

['USAT21702748', 'GBCTA0700277', 'GBAHS1400160', 'USAT21702748', 'USS9T1900010', 'NOG841617010', 'USS9T1900010', 'USVT10300001', 'USSR39300201', 'GBAHS1400160', 'USAT21702748', 'USUM71024160', 'USRC10800301', 'CA5KR1821202', 'USAT21502771', 'USVT10300001', 'USSR39320004', 'USS9T1900010', 'USS9T1900010', 'USDM31679801']
