In [None]:
import json
from pathlib import Path
from operator import itemgetter
import pandas as pd

###For TESTING
import copy 

from gensim.models import Word2Vec

## Read and Load Data

In [None]:
# Lazily walk everything under the playlist directory (recursively) ...
data_folder = Path("../data/genre_playlists/").glob('**/*')
# ... and keep only regular files; directories are dropped.
# Note: this consumes the `data_folder` generator.
files = list(filter(Path.is_file, data_folder))

## Prepare data for Word2Vec Model


https://machinelearningmastery.com/develop-word-embeddings-python-gensim/  

Documentation  
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

In [None]:
def getTrackGenres(playlist, corpus):
    """Append one [track name, playlist genre] pair per track to ``corpus``.

    Parameters
    ----------
    playlist : mapping with a 'tracks' iterable (each item has a 'name')
        and a 'genre' entry shared by every track of the playlist.
    corpus : list, extended in place.

    Returns
    -------
    None
    """
    corpus.extend(
        [song['name'], playlist['genre']] for song in playlist['tracks']
    )
    

In [None]:
%%time

#Create a list of [song name, playlist genre] arrays
corpus = []
totalTracks = 0
totalPlaylists = 0

#Read files and update corpus with song info
for i, file in enumerate(files):
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding (which can fail on non-ASCII track names).
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    # The file is fully parsed at this point, so the handle is already closed.
    for playlist in data:
        getTrackGenres(playlist, corpus)
        # Uses the playlist's own 'num_tracks' field rather than counting the
        # appended tracks -- presumably the two agree; verify against the data.
        totalTracks += playlist['num_tracks']
        totalPlaylists += 1
    print("Finished Reading file ", i+1, " | Tracks Added:", totalTracks)


In [None]:
# Summary of what was loaded: number of [track, genre] entries and playlists
print(f'Total "words" in corpus: {len(corpus)}')
print(f'Total playlists: {totalPlaylists}')

## Word2Vec Model

In [None]:
%%time

# Candidate embedding sizes to compare later when all files are used
# (not consumed in this cell; the model below is trained with size 750).
embeddingSizes = [500,750,1000]

#Train Word2Vec model with corpus
# Each "sentence" handed to Word2Vec is one [song name, playlist genre] entry.
model = Word2Vec(corpus, size=750)

#Embedding vectors for the vocabulary, in the iteration order of model.wv.vocab
# Index through `model.wv`: indexing the Word2Vec model object directly is
# deprecated in gensim 3.x and removed in 4.0.
X = model.wv[model.wv.vocab]
print(model)

## K Means Clustering

Separate data into 200 clusters 

Reference: <https://ai.intelligentonlinetools.com/ml/k-means-clustering-example-word2vec/>

In [None]:
from sklearn import cluster, metrics

# Fit k-means with 200 clusters on the embedding matrix X
# (fit() returns the fitted estimator, so it can be bound directly).
kmeans = cluster.KMeans(n_clusters=200).fit(X)

# labels[i] is the cluster id assigned to X[i]; centroids holds the cluster centers
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# One item per output line, same text as four separate print calls
print("Cluster id labels for inputted data", labels, "Centroids data", centroids, sep="\n")

In [None]:
# Materialise the vocabulary keys as a list so they can be looked up by position
vocab = [token for token in model.wv.vocab]

In [None]:
# Table linking every row position of X to the cluster id k-means gave it.
# Row k has data_index == k (i.e. it refers to X[k]) and cluster == labels[k].
cluster_map = pd.DataFrame({
    'data_index': range(len(X)),
    'cluster': labels,
})

In [None]:
# Lookup table: song name -> playlist genre, used for genre labeling later.
# When a song name occurs more than once in corpus, the last entry wins.
new_corpus = {entry[0]: entry[1] for entry in corpus}

In [None]:
def getMaxGenre(cluster, vocab_list=None, genre_by_track=None):
    """Return the genre word that best represents one cluster, or None.

    For every row of ``cluster`` the vocabulary entry at ``data_index`` is
    looked up; if that entry is a known song name, each whitespace-separated
    word of its genre string is counted once. The word 'indie' is then
    discarded and the most frequent remaining word is returned, except that
    a leading 'pop', 'rock' or 'metal' gives way to the runner-up when its
    lead is 8 occurrences or fewer.

    Parameters
    ----------
    cluster : pandas.DataFrame
        Rows of the cluster map belonging to one cluster; must have a
        'data_index' column holding positions into the vocabulary list.
    vocab_list : sequence, optional
        Vocabulary entries by position. Defaults to the notebook-level
        ``vocab``.
    genre_by_track : mapping, optional
        Song name -> genre string. Defaults to the notebook-level
        ``new_corpus``.

    Returns
    -------
    str or None
        The representative genre word, or None when no genre word other
        than 'indie' was found for the cluster.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if vocab_list is None:
        vocab_list = vocab
    if genre_by_track is None:
        genre_by_track = new_corpus

    cluster_genres = dict()
    for index in cluster['data_index']:
        name = vocab_list[index]
        # Vocabulary entries that are not song names have no genre to count.
        if name in genre_by_track:
            for w in genre_by_track[name].split():
                cluster_genres[w] = cluster_genres.get(w, 0) + 1

    #Indie has the majority in half of the 200 clusters, so it's best to leave it out
    cluster_genres.pop('indie', None)
    if not cluster_genres:
        return None

    # Most frequent first; the sort is stable, so ties keep first-seen order.
    sorted_genres = sorted(cluster_genres.items(), key=itemgetter(1), reverse=True)
    top_genre, top_count = sorted_genres[0]

    #Pop, rock, and metal would make up the majority after indie, so I tuned down their chance of being qualified to be the majority
    # Only consult the runner-up when one exists: a cluster whose sole
    # remaining genre is pop/rock/metal previously raised IndexError here.
    if (
        len(sorted_genres) > 1
        and top_genre in ['pop', 'rock', 'metal']
        and (top_count - sorted_genres[1][1]) <= 8
    ):
        return sorted_genres[1][0]
    return top_genre

In [None]:
#Create a dictionary of the genres that best represent each cluster
# Maps a representative genre -> number of clusters it represents.
final_genres = dict()

#Get the majority genre for each cluster
# Take the cluster count from the fitted model instead of repeating the
# literal, and avoid naming the loop variable `cluster`: that would overwrite
# the `sklearn.cluster` module imported earlier and break a re-run of the
# KMeans cell.
for cluster_id in range(kmeans.n_clusters):
    cluster_rows = cluster_map[cluster_map.cluster == cluster_id]
    max_genre = getMaxGenre(cluster_rows)
    # Clusters for which getMaxGenre returns None are tallied under the key None.
    final_genres[max_genre] = final_genres.get(max_genre, 0) + 1

# Order by number of clusters, most common genre first
final_genres = dict(sorted(final_genres.items(), key=itemgetter(1), reverse=True))
final_genres

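# TODO: multi-word genres such as "hip hop" are split on whitespace in getMaxGenre, so "hip" and "hop" are counted as separate genres; they should be kept together.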