In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
import json

# Read the genre map from disk; it is iterated later to build the list of genre names.
with open('genremap.json', 'r') as genre_file:
    loaded_genres = json.load(genre_file)

# Location of the pre-trained GloVe vectors, relative to the working directory.
BASE_DIR_GLOVE = 'launchpad/Musecage/other/'
GLOVE_DIR = os.path.join(BASE_DIR_GLOVE, 'glove.6B')

print('Indexing word vectors.')

# Build a token -> float32 vector lookup. Each line is split once on
# whitespace: the first field is the token, the remainder is parsed as
# space-separated floats.
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
embeddings_index = {}
with open(glove_path, encoding='utf-8') as glove_file:
    for row in glove_file:
        token, values = row.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(values, 'f', sep=' ')

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [2]:
# Sanity check: report the length of one embedding vector, using a very common token.
print('Embedding vector size: ', embeddings_index['the'].shape[0])

Embedding vector size:  300


In [3]:
# Each genre name is lower-cased and wrapped in a one-element list, which is
# the layout keep_genre reads (it takes entry[0] of every item).
genre_array = [[key.lower()] for key in loaded_genres]
# One integer label per genre: label i corresponds to genre_array[i].
# np.arange always yields an integer array, including when there are no genres.
genre_index = np.arange(len(genre_array))

def keep_genre(genre_array, embeddings=None, dim=300):
    """Embed each genre as the sum of the embedding vectors of its words.

    Words that have no entry in the embedding table are skipped, so a genre
    in which no word is known maps to the all-zero vector.

    Args:
        genre_array: sequence of one-element sequences, each holding a genre
            name as a whitespace-separated string (e.g. ``[['dance pop']]``).
        embeddings: optional mapping from word to a 1-D vector of length
            ``dim``. Defaults to the module-level ``embeddings_index``.
        dim: length of every embedding vector (300 by default).

    Returns:
        A list with one float64 numpy array of length ``dim`` per genre, in
        the same order as ``genre_array``.

    Raises:
        ValueError: if a looked-up vector does not have length ``dim``.
    """
    if embeddings is None:
        # Resolved at call time so the notebook-level table is used by default.
        embeddings = embeddings_index

    genre_embeddings = []
    for entry in genre_array:  # For each genre
        genre_embedding = np.zeros(dim)
        for word in entry[0].split():  # For each word in the genre name
            word_embedding = embeddings.get(word)
            if word_embedding is None:
                # Out-of-vocabulary word: contributes nothing to the sum.
                continue
            # Vectorised sum of the word embeddings (result stays float64).
            genre_embedding = genre_embedding + word_embedding
        genre_embeddings.append(genre_embedding)
    return genre_embeddings

In [4]:
# Stack the per-genre vectors returned by keep_genre into one array (one row per genre).
genre_vectors = keep_genre(genre_array)
genre_embeddings = np.asarray(genre_vectors)
genre_embeddings.shape

(124, 300)

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Every genre carries its own label (its position in genre_array), so the
# fitted model acts as a nearest-neighbour lookup over the genre embeddings.
knn_genres = KNeighborsClassifier(n_neighbors=3)
knn_genres.fit(X=genre_embeddings, y=genre_index)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [25]:
test_genre = "dance"
# Lower-case the query the same way the indexed genre names were lower-cased,
# so a capitalised query is embedded consistently with the index.
test_embedding = np.array(keep_genre([[test_genre.lower()]]))
closest_genres = knn_genres.kneighbors(test_embedding)
# kneighbors returns (distances, indices); the indices are positions in genre_array.
print(closest_genres)

(array([[6.70472543, 6.73302524, 6.86090552]]), array([[ 40,   3, 103]]))


In [26]:
# Walk the neighbour indices returned for the single query (row 0) instead of
# a hard-coded count, so this stays correct if n_neighbors changes.
for genre_idx in closest_genres[1][0]:
    print(genre_array[genre_idx])

['alt dance']
['dance pop']
['disco']
