In [None]:
!pip install git+https://github.com/jasonlaska/spherecluster.git

In [2]:
# Data Analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
plt.style.use('seaborn')
sns.set_style("whitegrid")

# Modeling
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from spherecluster import SphericalKMeans
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from scipy import stats

# Additional
import math
import random
import itertools
import multiprocessing
from tqdm import tqdm
from time import time
import logging
import pickle



In [11]:
import csv
playlists = []
with open('/content/drive/MyDrive/Project2/spotify_playlist_integrated_ver1.csv', 'r', encoding='utf-8') as f:
    rdr = csv.reader(f)
    i=0
    for line in rdr:
        playlists.append(line)
playlist_spotify = playlists

In [12]:
playlists = []
with open('/content/drive/MyDrive/Project2/melon_playlist_integrated_ver1.csv', 'r', encoding='utf-8') as f:
    rdr = csv.reader(f)
    i=0
    for line in rdr:
        playlists.append(line)
playlist_melon = playlists

In [14]:
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

class Callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 1
        self.training_loss = []

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 1:
            current_loss = loss
        else:
            current_loss = loss - self.loss_previous_step
        print(f"Loss after epoch {self.epoch}: {current_loss}")
        self.training_loss.append(current_loss)
        self.epoch += 1
        self.loss_previous_step = loss

In [15]:
MODEL_PATH = "/content/drive/MyDrive/Project2/model"
playlist_train, playlist_test = train_test_split(playlist_melon, test_size = 0.25, 
                                                 shuffle = True, random_state = 123)

In [16]:
model = Word2Vec(
    size = 256,      # Dimensionality of the feature vectors 
    window = 10,      # The maximum distance between the current and predicted word within a sentence
    min_count = 1, #  Ignores all words with total absolute frequency lower than this
    sg = 0,         # using CBOW architecture
    negative = 20,   #   If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used
    workers = multiprocessing.cpu_count()-1)  #  Use these many worker threads to train the model
print(model)

Word2Vec(vocab=0, size=256, alpha=0.025)


In [17]:
logging.disable(logging.NOTSET) # enable logging
t = time()

model.build_vocab(playlist_train)

print(f"Time to build vocab: {round((time() - t), 2)} seconds")

2020-11-25 10:07:20,210 : INFO : collecting all words and their counts
2020-11-25 10:07:20,212 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-11-25 10:07:20,379 : INFO : PROGRESS: at sentence #10000, processed 341483 words, keeping 131562 word types
2020-11-25 10:07:20,546 : INFO : PROGRESS: at sentence #20000, processed 683694 words, keeping 201514 word types
2020-11-25 10:07:20,707 : INFO : PROGRESS: at sentence #30000, processed 1025480 words, keeping 253649 word types
2020-11-25 10:07:20,878 : INFO : PROGRESS: at sentence #40000, processed 1367603 words, keeping 298478 word types
2020-11-25 10:07:20,993 : INFO : collected 323498 word types from a corpus of 1597953 raw words and 46854 sentences
2020-11-25 10:07:20,993 : INFO : Loading a fresh vocabulary
2020-11-25 10:07:22,501 : INFO : effective_min_count=1 retains 323498 unique words (100% of original 323498, drops 0)
2020-11-25 10:07:22,502 : INFO : effective_min_count=1 leaves 1597953 word corpus 

Time to build vocab: 70.81 seconds


In [18]:
logging.disable(logging.INFO) # disable logging
callback = Callback() # instead, print out loss for each epoch
t = time()

model.train(playlist_train,
            total_examples = model.corpus_count,
            epochs = 100,
            compute_loss = True,
            callbacks = [callback]) 

print(f"Time to train the model: {round((time() - t), 2)} seconds")

Loss after epoch 1: 14828024.0
Loss after epoch 2: 4609496.0
Loss after epoch 3: 3578350.0
Loss after epoch 4: 3098172.0
Loss after epoch 5: 2771384.0
Loss after epoch 6: 2528216.0
Loss after epoch 7: 2344754.0
Loss after epoch 8: 2202596.0
Loss after epoch 9: 2014788.0
Loss after epoch 10: 1864004.0
Loss after epoch 11: 1731536.0
Loss after epoch 12: 1616864.0
Loss after epoch 13: 1519440.0
Loss after epoch 14: 1430544.0
Loss after epoch 15: 1353560.0
Loss after epoch 16: 1280724.0
Loss after epoch 17: 1217792.0
Loss after epoch 18: 1160464.0
Loss after epoch 19: 1108104.0
Loss after epoch 20: 1059796.0
Loss after epoch 21: 1015888.0
Loss after epoch 22: 974148.0
Loss after epoch 23: 938400.0
Loss after epoch 24: 899728.0
Loss after epoch 25: 864848.0
Loss after epoch 26: 835140.0
Loss after epoch 27: 804088.0
Loss after epoch 28: 776996.0
Loss after epoch 29: 748996.0
Loss after epoch 30: 719900.0
Loss after epoch 31: 697624.0
Loss after epoch 32: 675080.0
Loss after epoch 33: 652496

In [19]:
model.save(MODEL_PATH+"melon_song2vec_ver2.model

SyntaxError: ignored

In [None]:
logging.disable(logging.INFO) # disable logging
model = Word2Vec.load(MODEL_PATH+"melon_song2vec_ver2.model")

In [None]:
plt.plot(range(1, model.epochs+1), model.callbacks[0].training_loss)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss", fontweight = "bold")
plt.show()

In [None]:
fig, axes = plt.subplots(6, 1, figsize = (50, 30))

slug = '2DMgt7RVEBD0Likx7GGHqg'
song_id_list = [(slug, ""), *[t for t in model.wv.most_similar(slug)[:5]]] 

for ax, (song_id, sim) in zip(axes.flat, song_id_list):
    ax.imshow([model.wv[song_id]], cmap = "binary", aspect = "auto")
    idx=music_data[(music_data['id']==song_id)].index[0]
    music_data[(music_data['id']==song_id)]['artist - title'][idx]
    print(music_data[(music_data['id']==song_id)]['artist - title'][idx])
    ax.set_title(music_data[(music_data['id']==song_id)]['artist - title'][idx], fontsize = 50)
    ax.yaxis.set_label_position("right")
    ax.set_ylabel(f"Similarity:\n{sim:.3f}" if sim != song_id_list[0][1] else sim,
                  rotation = "horizontal", ha = "left", va = "center", fontsize = 50)
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()