In [1]:
import json
from pathlib import Path
from operator import itemgetter
import pandas as pd

###For TESTING
import copy 

from gensim.models import Word2Vec

## Read and Load Data

In [2]:
data_folder = Path("../data/genre_playlists/").glob('**/*')
files = [x for x in data_folder if x.is_file()]

## Prepare data for Word2Vec Model


https://machinelearningmastery.com/develop-word-embeddings-python-gensim/  

Documentation  
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

In [3]:
#Get arrays of [song name, playlist genre] from playlist. 
def getTrackGenres(playlist, corpus):
    for track in playlist['tracks']:
        track_info = [track['name'], playlist['genre']]
        corpus.append(track_info)
    

In [4]:
%%time

#Create a list of [song name, playlist genre] arrays
corpus = []
totalTracks = 0
totalPlaylists = 0

#Read files and update corpus with song info
for i, file in enumerate(files):
    with open(file) as f:
        data = json.load(f)
        for playlist in data:
            getTrackGenres(playlist, corpus)
            totalTracks += playlist['num_tracks']
            totalPlaylists += 1
    print("Finished Reading file ", i+1, " | Tracks Added:", totalTracks)


Finished Reading file  1  | Tracks Added: 19193
Finished Reading file  2  | Tracks Added: 38931
Finished Reading file  3  | Tracks Added: 58486
Finished Reading file  4  | Tracks Added: 78091
Finished Reading file  5  | Tracks Added: 97594
Finished Reading file  6  | Tracks Added: 117261
Finished Reading file  7  | Tracks Added: 135716
Finished Reading file  8  | Tracks Added: 155193
Finished Reading file  9  | Tracks Added: 174794
Finished Reading file  10  | Tracks Added: 194417
Finished Reading file  11  | Tracks Added: 214047
Finished Reading file  12  | Tracks Added: 233793
Finished Reading file  13  | Tracks Added: 253272
Finished Reading file  14  | Tracks Added: 272670
Finished Reading file  15  | Tracks Added: 292022
Finished Reading file  16  | Tracks Added: 311444
Finished Reading file  17  | Tracks Added: 330957
Finished Reading file  18  | Tracks Added: 350559
Finished Reading file  19  | Tracks Added: 369807
Finished Reading file  20  | Tracks Added: 389039
Finished Readi

In [5]:
# print(corpus[:20])
print('Total "words" in corpus:', len(corpus))
print('Total playlists:', totalPlaylists)

Total "words" in corpus: 492861
Total playlists: 5035


## Word2Vec Model

In [6]:
%%time

embeddingSizes = [500,750,1000] #To test later when we use all files

#Train Word2Vec model with corpus
model = Word2Vec(corpus, size=750)

#List of output vectors for vocab
X = model[model.wv.vocab]
print(model)

Word2Vec(vocab=13792, size=750, alpha=0.025)
CPU times: user 13.3 s, sys: 344 ms, total: 13.7 s
Wall time: 6.32 s


## K Means Clustering

Separate data into 200 clusters 

[https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/]

In [7]:
from sklearn import cluster
from sklearn import metrics

#Separate output vectors into 200 clusters
kmeans = cluster.KMeans(n_clusters=200)
kmeans.fit(X)

#Each vector is labeled with cluster id 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
print ("Centroids data")
print (centroids)

Cluster id labels for inputted data
[ 64  10  59 ...  56 179 137]
Centroids data
[[-7.05407228e-06  1.04567684e-04  6.37382327e-05 ...  1.51394226e-04
   1.21196601e-04  1.91854415e-04]
 [-1.14272145e-04 -3.54990993e-06 -5.26204894e-05 ... -6.03562876e-05
   1.39689160e-04  1.63036515e-04]
 [ 7.72058775e-05 -5.88560724e-05  8.76561462e-05 ... -6.53993557e-05
   5.10740247e-05  3.69872723e-05]
 ...
 [-3.47670866e-05 -1.17054415e-05 -1.38397954e-04 ...  2.30595833e-05
  -8.65088987e-06  8.53746096e-05]
 [ 1.71098545e-05  9.38329686e-05 -3.49293987e-05 ...  5.05526114e-05
  -8.09424091e-05  4.83081822e-05]
 [-2.36626911e-05  3.39794315e-05  3.70609414e-05 ...  6.13953598e-05
  -4.36554983e-05  1.02046193e-04]]


In [8]:
#Convert vocab to a list
vocab = list(model.wv.vocab)

In [9]:
#Map each vector index to its cluster id
#For example, first row in cluster_map refers to X[0], then data_index would be 0 and cluster would be its cluster id
cluster_map = pd.DataFrame()
cluster_map['data_index'] = range(len(X))
cluster_map['cluster'] = labels

In [10]:
#Copy and convert corpus into a dictionary for genre labeling later
new_corpus = dict()
for word in corpus:
    new_corpus[word[0]] = word[1]

In [11]:
#For each cluster, create a vocabulary of the genres and occurrences, then return the genre that best represents the cluster
def getMaxGenre(cluster):
    cluster_genres = dict()
    for i in range(len(cluster)):
        index = cluster.iloc[i].data_index
        name = vocab[index]
        if name in new_corpus:
            genre = new_corpus[name]
            word_list = genre.split()
            for w in word_list:
                if w in cluster_genres:
                    cluster_genres[w] = cluster_genres[w] + 1
                else:
                    cluster_genres[w] = 1
    #Indie has the majority in half of the 200 clusters, so it's best to leave it out
    cluster_genres = dict(filter(lambda x: x[0] != 'indie', cluster_genres.items()))
    if cluster_genres:
        sorted_genres = list(sorted(cluster_genres.items(), key=itemgetter(1), reverse=True))
        #Pop, rock, and metal would make up the majority after indie, so I tuned down their chance of being qualified to be the majority
        if sorted_genres[0][0] in ['pop', 'rock', 'metal'] and (sorted_genres[0][1] - sorted_genres[1][1]) <= 8:
            max_genre = sorted_genres[1][0]
        else:
            max_genre = sorted_genres[0][0]
        return max_genre
    else:
        return None

In [12]:
#Create a dictionary of the genres that best represent each cluster
final_genres = dict()

#Get the majority genre for each cluster
for i in range(200):
    cluster = cluster_map[cluster_map.cluster == i]
    max_genre = getMaxGenre(cluster)
    if max_genre in final_genres:
        final_genres[max_genre] = final_genres[max_genre] + 1
    else:
        final_genres[max_genre] = 1

final_genres = dict(sorted(final_genres.items(), key=itemgetter(1), reverse=True))
final_genres

#hip should go with hop lol

{'metal': 23,
 'rock': 20,
 'pop': 17,
 'jazz': 15,
 'folk': 9,
 'hip': 7,
 'house': 6,
 'japanese': 6,
 'black': 5,
 'canadian': 5,
 'musica': 5,
 'rap': 4,
 'classic': 3,
 'alternative': 3,
 'punk': 3,
 'swedish': 3,
 'modern': 2,
 'folklore': 2,
 None: 2,
 'deep': 2,
 'hard': 2,
 'german': 2,
 'polish': 2,
 'dutch': 2,
 'austrian': 2,
 'indonesian': 2,
 'hong': 1,
 'beat': 1,
 'russian': 1,
 'hardcore': 1,
 'neo': 1,
 'histoire': 1,
 'belgian': 1,
 'american': 1,
 'electronic': 1,
 'cinematic': 1,
 'north': 1,
 'boston': 1,
 'turkish': 1,
 'rockabilly': 1,
 'karaoke': 1,
 'cristiano': 1,
 'charanga': 1,
 'gaming': 1,
 'chinese': 1,
 'of': 1,
 'heavy': 1,
 'uk': 1,
 "children's": 1,
 'trival': 1,
 'americana': 1,
 'kurdish': 1,
 'australian': 1,
 'narodna': 1,
 'organic': 1,
 'brazilian': 1,
 'guitar': 1,
 'ccm': 1,
 'irish': 1,
 'soundtrack': 1,
 'metalcore': 1,
 'club': 1,
 'rebetiko': 1,
 'afro': 1,
 'gospel': 1,
 'post-post-hardcore': 1,
 'african': 1,
 'piano': 1,
 'hop': 1,
 's