In [1]:
from pathlib import Path
from gensim.models import Word2Vec
import json

## Load Data

In [2]:
# Shared state that the loader cells below fill in.
listOfTracks = {}   # (track name, artist name) -> integer track id
numKey = {}         # integer track id -> (track name, artist name)
totalTracks = 0     # running sum of each MPD playlist's 'num_tracks'
allPlaylists = []   # one list of stringified track ids per playlist

In [3]:
# Collect the input files for both data sources.
# Path.glob() does not guarantee any ordering, and the loaders assign track
# ids (and positions in allPlaylists) in load order, so the file lists are
# sorted to make later positional lookups reproducible across machines.

#MPD Data
mpd_folder = Path("spotify_million_playlist_dataset/data/filtered_data/").glob('**/*')
mpd_folder_files = sorted(x for x in mpd_folder if x.is_file())

#EveryNoise genre playlists
EN_folder = Path("genre_playlists/genre_playlists/").glob('**/*')
EN_files = sorted(x for x in EN_folder if x.is_file())

In [4]:
%%time
#Get MPD data
# Give every distinct (track name, artist name) pair an integer id and store
# each playlist as a list of stringified ids.

for file in mpd_folder_files:
    # Pin the encoding instead of relying on the platform's locale default.
    # NOTE(review): assumes the JSON files are UTF-8 (or plain ASCII) - confirm.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    for playlist in data['playlists']:
        totalTracks += playlist['num_tracks']
        newPlaylist = []
        for track in playlist['tracks']:
            key = (track['track_name'], track['artist_name'])
            if key not in listOfTracks:
                # Ids are handed out densely in first-seen order; numKey is
                # the reverse lookup (id -> key).
                numKey[len(listOfTracks)] = key
                listOfTracks[key] = len(listOfTracks)

            newPlaylist.append(str(listOfTracks[key]))

        allPlaylists.append(newPlaylist)
        

Wall time: 206 ms


In [5]:
%%time
#Get EveryNoise Data
# Same id assignment as the MPD loader, but each file is a JSON list of
# playlists and a track is keyed by its name plus its first listed artist.

for file in EN_files:
    # Pin the encoding instead of relying on the platform's locale default.
    # NOTE(review): assumes the JSON files are UTF-8 (or plain ASCII) - confirm.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    for playlist in data:
        newPlaylist = []
        for track in playlist['tracks']:
            key = (track["name"], track['artists'][0]['name'])
            if key not in listOfTracks:
                # Ids are handed out densely in first-seen order; numKey is
                # the reverse lookup (id -> key).
                numKey[len(listOfTracks)] = key
                listOfTracks[key] = len(listOfTracks)

            newPlaylist.append(str(listOfTracks[key]))

        allPlaylists.append(newPlaylist)

Wall time: 11.5 s


## Build Word2Vec Model

In [14]:
%%time

# Candidate embedding sizes for a later sweep over all files
# (not used by the model trained in this cell).
embeddingSizes = [500, 750, 1000]

# Each playlist is one "sentence" whose tokens are stringified track ids.
model = Word2Vec(
    allPlaylists,
    size=250,      # embedding dimensionality
    window=10,     # context window within a playlist
    min_count=1,   # keep every track, even ones that occur once
    iter=20,       # passes over the corpus
)

print(model)

Word2Vec(vocab=401261, size=250, alpha=0.025)
Wall time: 1min 45s


In [7]:
# Every token in the trained vocabulary (the stringified track ids).
tracks = list(model.wv.vocab.keys())

In [None]:
%%time

#Show Graph
# Project the track embeddings to 2-D with PCA and draw a labelled scatter plot.
# NOTE(review): the original comment said this is "directly from the site
# above", but no link is present in the notebook - add the source URL.
# NOTE(review): the model printed above has ~400k vocabulary entries; one
# annotate() call per entry will be very slow - consider plotting a sample.
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# Fix the token order once so that row i of X/result always belongs to words[i].
words = list(model.wv.vocab)
# Look vectors up through model.wv; indexing the model object itself is deprecated.
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
pyplot.scatter(result[:, 0], result[:, 1])

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

In [8]:
def CreatePlaylist(model, playlist, playlist_len):
    """Recommend tracks that are similar to a playlist taken as a whole.

    Args:
        model: trained model exposing ``wv.most_similar``.
        playlist: list of track-id tokens passed as the ``positive`` examples.
        playlist_len: number of recommendations to request (``topn``).

    Returns:
        A list with the ``numKey`` entry of each returned neighbour, in the
        order ``most_similar`` returned them.
    """
    neighbours = model.wv.most_similar(positive=playlist, topn=playlist_len)
    recommended = []
    for track_id, _similarity in neighbours:
        # Tokens are stringified ids; numKey is keyed by the integer id.
        recommended.append(numKey[int(track_id)])
    return recommended

In [9]:
def MatchSongs(model, playlist, topn = 15):
    """Pick, for each song, its nearest neighbour that has not been picked yet.

    Args:
        model: trained model exposing ``wv.most_similar``.
        playlist: iterable of track-id tokens.
        topn: how many neighbours to consider per song (default 15).

    Returns:
        A list of ``numKey`` entries without duplicates. A song adds nothing
        when all of its ``topn`` neighbours were already picked, so the result
        can be shorter than ``playlist``.
    """
    newPlaylist = []
    # Set mirror of newPlaylist: O(1) duplicate checks instead of a list scan.
    seen = set()
    for song in playlist:
        # Each neighbour is a (token, similarity score) pair, best match first.
        for track_id, _similarity in model.wv.most_similar(song, topn = topn):
            match = numKey[int(track_id)]
            if match not in seen:
                seen.add(match)
                newPlaylist.append(match)
                break

    return newPlaylist

In [10]:
def SameSongs(playlist1, playlist2):
    """Find the songs of ``playlist1`` that also appear in ``playlist2``.

    Args:
        playlist1: iterable of songs; its order and duplicates are preserved.
        playlist2: container checked with ``in``.

    Returns:
        A ``(count, songs)`` tuple, where ``songs`` lists the shared entries
        in ``playlist1`` order and ``count`` is its length.
    """
    shared = [song for song in playlist1 if song in playlist2]
    return (len(shared), shared)

In [11]:
def TrainModel(model, playlist, epochs = 5):
    """Extend the model's vocabulary with new data and continue training on it.

    Args:
        model: model exposing ``build_vocab``, ``train`` and ``corpus_count``.
        playlist: corpus handed to ``build_vocab`` and ``train``.
            NOTE(review): presumably an iterable of playlists (token lists)
            rather than a single playlist - confirm against callers.
        epochs: number of training passes (default 5).

    Returns:
        None; ``model`` is updated in place.
    """
    model.build_vocab(playlist, update = True)
    # Honour the caller's epochs value; it used to be hard-coded to 5 here.
    model.train(playlist, total_examples = model.corpus_count, epochs = epochs, compute_loss = True)

In [15]:
# Use one loaded playlist as the seed for both recommenders.
playlist = allPlaylists[7]

# Ask for as many recommendations as the seed playlist has tracks.
crt_playlist = CreatePlaylist(model, playlist, playlist_len=len(playlist))
mtch_songs = MatchSongs(model, playlist)

In [16]:
# Count the tracks that both recommendation lists have in common.
SameSongAmt, SameSongList = SameSongs(crt_playlist, mtch_songs)

print("Amount of Same Songs:", SameSongAmt)
print(len(crt_playlist), len(mtch_songs))

Amount of Same Songs: 14
100 70
