In [2]:
import numpy as np
from scipy import spatial
import array
import sqlite3
from fuzzywuzzy import process
from IPython.display import clear_output

# Directory prefix for the project's data files; the SQLite database path is
# built by appending a file name to it, so the trailing slash is required.
# NOTE(review): hard-coded absolute path — adjust for the machine running this notebook.
locstr = "/mnt/datassd2/spoticry-data/"


In [3]:
# Open the song database that lives under `locstr` and keep one cursor for the
# queries issued by the following cells.
conn = sqlite3.connect(f"{locstr}spoticry.db")
c = conn.cursor()

In [4]:
# Load every distinct (song_id, artist, title, vec) row for songs that have an
# entry in the `transformer` table. `execute` returns the cursor itself, so the
# rows can be fetched in the same expression.
data = c.execute('''
    SELECT DISTINCT songs.song_id, artist, title, vec
    FROM transformer
    JOIN songs ON transformer.song_id = songs.song_id;
''').fetchall()

In [5]:
# `ids` and `vecs` are parallel: position i in both refers to the same fetched row.
# Each stored `vec` value is decoded as a sequence of native single-precision
# floats before being stacked into one array.
ids = [song_id for song_id, _artist, _title, _vec in data]
vecs = np.array([np.array(array.array("f", packed)) for *_, packed in data])

In [6]:
# Two nearest-neighbour indexes over the same rows: one on the raw vectors and
# one on the vectors scaled to unit length (each row divided by its own norm).
tree = spatial.KDTree(vecs)
normed_vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
norm_tree = spatial.KDTree(normed_vecs)

# Lookup tables between a song id and its "artist : title" display name.
id_to_name = {song_id: artist + ' : ' + title for song_id, artist, title, _vec in data}
name_to_id = {artist + ' : ' + title: song_id for song_id, artist, title, _vec in data}

In [7]:
def nearest(vector, num, normalize = True):
    """Look up the `num` nearest stored vectors to `vector`.

    When `normalize` is truthy the query is first scaled to unit length and
    searched in `norm_tree`; otherwise the vector is searched as-is in `tree`.

    Returns whatever `KDTree.query` returns: a `(distances, indices)` pair.
    """
    if not normalize:
        return tree.query(vector, num)
    unit_vector = np.divide(vector, np.linalg.norm(vector))
    return norm_tree.query(unit_vector, num)

def combine_songs(name1, name2, op, num):
    """Combine two songs' vectors with `op` and list the songs nearest the result.

    Parameters:
        name1, name2: display names as stored in `name_to_id` ("artist : title").
        op: binary callable applied to the two song vectors (e.g. `np.add`).
        num: number of neighbours to request.

    Returns:
        A list of display names ordered as returned by `nearest`, or None when
        either name is unknown. The list is shorter than `num` when fewer
        vectors than `num` are indexed.
    """
    try:
        index1 = ids.index(name_to_id[name1])
        index2 = ids.index(name_to_id[name2])
    except (KeyError, ValueError):
        # KeyError: name not in `name_to_id`; ValueError: id not present in `ids`.
        return None
    new = op(vecs[index1], vecs[index2])
    dists, inds = nearest(new, num)
    # KDTree.query squeezes its result to scalars when k == 1; restore 1-d arrays
    # so the code below works for every `num`.
    dists = np.atleast_1d(dists)
    inds = np.atleast_1d(inds)
    # When k exceeds the number of indexed points, the missing neighbours are
    # reported with an infinite distance and an out-of-range index; drop them.
    inds = inds[np.isfinite(dists)]
    return [id_to_name[ids[d]] for d in inds]

def similar_songs(song_name, num, normalize = True):
    """Find the songs whose vectors are closest to the given song's vector.

    Parameters:
        song_name: display name as stored in `name_to_id` ("artist : title").
        num: number of similar songs wanted.
        normalize: forwarded to `nearest`; selects the unit-length index.

    Returns:
        An iterator of `(distance, display_name)` pairs, nearest first and
        excluding the query song itself, or None when the name is unknown.
        Fewer than `num` pairs are produced when not enough vectors are indexed.
    """
    try:
        index = ids.index(name_to_id[song_name])
    except (KeyError, ValueError):
        # KeyError: name not in `name_to_id`; ValueError: id not present in `ids`.
        return None
    # Ask for one extra neighbour because the query song is itself in the index.
    dists, inds = nearest(vecs[index], num + 1, normalize)
    # KDTree.query squeezes its result to scalars when k == 1 (i.e. num == 0).
    dists = np.atleast_1d(dists)
    inds = np.atleast_1d(inds)
    # Remove the query song by index instead of assuming it is the first hit:
    # with duplicate vectors it can appear at any position among the ties.
    # Also drop the infinite-distance padding returned when k exceeds the
    # number of indexed points.
    keep = np.isfinite(dists) & (inds != index)
    dists = dists[keep][:num]
    inds = inds[keep][:num]
    songs = [id_to_name[ids[d]] for d in inds]
    return zip(dists, songs)

def song_search(search_term, num_res = 5):
    """Fuzzy-match `search_term` against every known song display name.

    Returns the matched names only (scores are discarded), at most `num_res`
    of them, in the order produced by `process.extract`.
    """
    candidates = list(name_to_id)
    matches = process.extract(search_term, candidates, limit=num_res)
    names = []
    for match in matches:
        names.append(match[0])
    return names

In [10]:
# Interactive lookup: search for a song by name, let the user pick one of the
# fuzzy matches by number, then print the five songs nearest to it in the
# unnormalized vector space.
query = input("Search for a song: ")
res = song_search(query)
for n, title in enumerate(res):
    print(n, title)

out = None
if res:
    prompt = "Select a title number: "
    while True:
        try:
            query = int(input(prompt))
        except ValueError:
            # Non-numeric input is treated as an invalid choice instead of
            # aborting the cell with a traceback.
            query = -1
        if 0 <= query < len(res):
            break
        prompt = "Select a valid title number: "
    clear_output()
    out = similar_songs(res[query], 5, normalize = False)
else:
    # With no candidates there is no valid number to pick; the selection loop
    # could never terminate, so skip it.
    print("No songs matched the search.")

if out is not None:
    print(f"\nSimilar to {res[query]}\n")
    for song in out:
        # song is a (distance, display_name) pair
        print("{:.2f} | ".format(song[0]), song[1])



Similar to Horse Feathers : Blood on the Snow

5817593902823.52 |  Pretenders : Talk Of The Town
6854282501165.04 |  The Police : Born In The 50's
8658489855113.42 |  Ben Folds Five : Cigarette
8959207293490.88 |  They Might Be Giants : Don't Let's Start
9106526229181.04 |  Gabriella Cilmi : Echo Beach


In [42]:
# Add two songs' vectors and list the five songs nearest to the sum.
# combine_songs returns None when either name is unknown, so check before
# iterating.
out = combine_songs('Paul Simon : Kathy\'s Song', 'Blink-182 : Wasting Time',np.add, 5)
if out is not None:
    for song in out:
        print(song)
else:
    print("One of the song names was not found.")

The Strokes : New York City Cops
Ensiferum : White Storm
HAMMERFALL : Born To Rule
Mercyful Fate : Is That You_ Melissa
Nina Simone : One September Day


In [24]:
# Number of rows loaded from the database (one entry in `ids` per fetched row).
len(ids)

37070