In [17]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk
from sklearn.metrics.pairwise import cosine_similarity

import re

import json

In [2]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [3]:
embed = hub.Module(module_url)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


In [4]:
def get_features(texts):
    if type(texts) is str:
        texts = [texts]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))
    

In [5]:
def remove_stopwords(stop_words, tokens):
    res = []
    for token in tokens:
        if not token in stop_words:
            res.append(token)
    return res

def process_text(text):
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'#+', ' ', text )
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)
    text = re.sub(r"([A-Za-z]+)'s", r"\1 is", text)
    #text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"won't", "will not ", text)
    text = re.sub(r"isn't", "is not ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip()
    return text

def lemmatize(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_list = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token, 'v')
        if lemma == token:
            lemma = lemmatizer.lemmatize(token)
        lemma_list.append(lemma)
    # return [ lemmatizer.lemmatize(token, 'v') for token in tokens ]
    return lemma_list


def process_all(text):
    text = process_text(text)
    return ' '.join(remove_stopwords(stop_words, text.split()))

lemmatize("where is academy located ".split())
#process_all("location of academy")

['where', 'be', 'academy', 'locate']

In [6]:
from lyrics import LyricFinder

In [39]:
songs = [
    "the man who sold the world nirvana",
    "smells like teen spirit",
    "rape me nirvana",
    "come as you are nivrana",
    "about a girl nirvana",
    "love buzz nirvana",
    "where did you sleep last night nirvana",
    "american idiot green day",
    "time of your life green day",
    "boulevard of broken dreams green day",
    "wake me up when september ends green day"
]

In [40]:
lfinder = LyricFinder()
lyrics = []
songs_found = []

In [41]:
for song in songs:
    print(song)
    lyric = lfinder.search(song)
    if lyric:
        lyrics.append(process_text(lyric))
        songs_found.append(song)

the man who sold the world nirvana
Searching lyrics.wikia.com
smells like teen spirit
Searching lyrics.wikia.com
rape me nirvana
Searching lyrics.wikia.com
come as you are nivrana
Searching lyrics.wikia.com
about a girl nirvana
Searching lyrics.wikia.com
love buzz nirvana
Searching lyrics.wikia.com
where did you sleep last night nirvana
Searching lyrics.wikia.com
american idiot green day
Searching lyrics.wikia.com
time of your life green day
Searching lyrics.wikia.com
boulevard of broken dreams green day
Searching lyrics.wikia.com
wake me up when september ends green day
Searching lyrics.wikia.com


In [43]:
len(lyrics) == len(songs_found)

True

In [44]:
lyric2vec = get_features(lyrics)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [45]:
lyric2vec.shape

(10, 512)

In [65]:
sims = cosine_similarity(lyric2vec, lyric2vec)

In [68]:
np.array(songs_found)[np.flip(np.argsort(sims, axis=1), axis=1)[:, 1:2]]

array([['boulevard of broken dreams green day'],
       ['american idiot green day'],
       ['wake me up when september ends green day'],
       ['come as you are nivrana'],
       ['where did you sleep last night nirvana'],
       ['boulevard of broken dreams green day'],
       ['smells like teen spirit'],
       ['the man who sold the world nirvana'],
       ['the man who sold the world nirvana'],
       ['the man who sold the world nirvana']], dtype='<U40')

In [69]:
def find_song(query, songs, lyric2vec):
    query = process_text(query)
    query_vec = get_features(query)
    res = []
    scores = cosine_similarity(query_vec, lyric2vec).ravel()
    sorted_idx = np.argsort(scores)[::-1]
    return list(zip(np.array(songs)[sorted_idx], scores[sorted_idx]))

In [71]:
find_song("she is overboard selfish", songs, lyric2vec)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


[('come as you are nivrana', 0.23251113),
 ('smells like teen spirit', 0.23131013),
 ('love buzz nirvana', 0.18689522),
 ('about a girl nirvana', 0.17386755),
 ('rape me nirvana', 0.14984597),
 ('time of your life green day', 0.1436481),
 ('american idiot green day', 0.12091069),
 ('boulevard of broken dreams green day', 0.118268244),
 ('where did you sleep last night nirvana', 0.10211811),
 ('the man who sold the world nirvana', 0.091701984)]