In [1]:
# prepare workspace
import itertools
import json
import pandas as pd
import numpy as np
import random
from sklearn.manifold import TSNE
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# seed NumPy's global random generator so the random sampling below is repeatable
SEED = 1
np.random.seed(SEED)

In [2]:
# parameters
class Args:
    """Empty attribute container standing in for a C-like struct of notebook settings."""


args = Args()
vars(args).update(
    # input files: JSON song map, embedding vectors (text), song info table (csv)
    input_json='data/song_map.json',
    input_txt='data/song_vectors.txt',
    input_csv='data/mars_tianchi_songs.csv',
    # how many randomly sampled songs to plot
    num_plot=1500,
    # possible values include: artist_id, publish_time, song_init_plays, lang, gender
    label_type='gender',
)

In [3]:
# load the JSON song map and keep its 'idx_to_song' lookup table
# (the table is later indexed with the string form of a song index)
with open(args.input_json) as json_file:
    song_map = json.load(json_file)
idx_to_song = song_map['idx_to_song']

In [4]:
# read the song csv file with explicitly supplied column names,
# then index the table by song_id so rows can be looked up per song
info_songs = pd.read_csv(
    args.input_csv,
    names=['song_id', 'artist_id', 'publish_time', 'song_init_plays', 'lang', 'gender'],
).set_index('song_id')

In [5]:
# load the song embedding vectors from the txt file into one numpy matrix via np.genfromtxt
# skip_header=2 skips both the header and the first song embedding "</s>" which is added by default
song_embeddings = np.genfromtxt(args.input_txt, delimiter=' ', skip_header=2)

# column 0 carries the song index (cast to int32); the remaining columns are the embedding itself
embeddings = song_embeddings[:, 1:]
songs = np.asarray(song_embeddings[:, 0], dtype=np.int32)

In [6]:
# random sample songs to display
num_songs = songs.shape[0]

# An int population makes np.random.choice sample from np.arange(num_songs);
# unlike the Python-2-only xrange this runs on both Python 2 and 3.
# replace=False keeps every sampled row distinct.
plot_only = np.random.choice(num_songs, args.num_plot, replace=False)
# idx_to_song is looked up with the string form of each sampled song index
song_ids = [idx_to_song[str(idx)] for idx in songs[plot_only]]

# Define TSNE (2-D output, PCA initialisation)
# NOTE(review): recent scikit-learn releases appear to rename n_iter to max_iter -
# confirm against the installed version before changing it
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)

In [7]:
# convert song_id to labels like artist, gender, language etc
# .values replaces Series.as_matrix(), which newer pandas releases no longer provide;
# it returns the selected column as a numpy array in the order of song_ids
labels = info_songs.loc[song_ids, args.label_type].values

# transform the sampled embeddings to even lower dimension (2-D, per the TSNE settings)
low_dim_embs = tsne.fit_transform(embeddings[plot_only, :])

In [8]:
# Visualize
def plot_with_labels(low_dim_embs, labels, filename='tsne.pdf'):
    """Scatter each embedding point, annotate it with its label, and save the figure.

    Args:
      low_dim_embs: 2-D array whose rows are (x, y) points.
      labels: one label per point; only the first len(labels) rows are drawn.
      filename: path the figure is saved to.

    Raises:
      AssertionError: if there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    fig = plt.figure()
    # every annotation shares the same placement relative to its point
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    # zip stops at the shorter sequence, i.e. after len(labels) rows
    for point, text in zip(low_dim_embs, labels):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)
    fig.savefig(filename)

def plot_with_labels_using_color(low_dim_embs, labels, filename='tsne_color.pdf'):
    """Scatter-plot the embeddings with one random colour and one marker per label class.

    Args:
      low_dim_embs: 2-D array; column 0 is used as x and column 1 as y.
      labels: array-like of class labels, compared element-wise against each unique value.
      filename: path the figure is saved to.

    Raises:
      AssertionError: if there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    fig = plt.figure()
    unique_class = np.unique(labels)
    # five marker shapes, reused cyclically when there are more classes than markers
    marker_generator = itertools.cycle('o<.^s')
    for class_ in unique_class:
        # boolean mask selecting the points that belong to this class
        idx = np.equal(labels, class_)
        x = low_dim_embs[idx, 0]
        y = low_dim_embs[idx, 1]

        # plot scatter of x,y use random color
        # next() builtin instead of the iterator's .next() method, which only exists on Python 2
        plt.scatter(x, y, color=np.random.rand(3,), marker=next(marker_generator))
    fig.savefig(filename)

# plot_with_labels(low_dim_embs, labels)
# colour-coded plot; the output file name encodes the label type and the sample size
plot_with_labels_using_color(
    low_dim_embs,
    labels,
    filename='tsne_color_{0}_{1}.pdf'.format(args.label_type, args.num_plot),
)