In [13]:
import gensim.downloader as api
import gensim
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from gensim import utils
from scipy.spatial.distance import cdist
import re

# English stop-word list taken from NLTK's stopwords corpus.
stop_words = stopwords.words('english')

# Location of the pre-trained Google News word2vec binary (machine-specific; adjust as needed).
WORD2VEC_PATH = "C:/Users/Giannis/gensim-data/word2vec-google-news-300/GoogleNews-vectors-negative300.bin"

# Load the pre-trained word vectors once; encode_lyrics looks words up in g_model.
g_model = gensim.models.KeyedVectors.load_word2vec_format(
    WORD2VEC_PATH,
    binary=True,
)


In [37]:
# Preprocess the lyrics column with gensim's simple_preprocess: lowercase the text, strip punctuation, and drop very short/long tokens (gensim defaults: fewer than 2 or more than 15 characters)
def preprocess_lyrics(df):
    """Return a copy of ``df`` whose "lyrics" column has been normalised.

    Each lyric is run through ``utils.simple_preprocess`` and the resulting
    tokens are re-joined with single spaces, so the column still holds one
    string per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "lyrics" column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index; only "lyrics" differs.
        The input frame is left untouched, so the raw lyrics stay available to
        the caller and the function can be re-run safely.
    """
    cleaned = [" ".join(utils.simple_preprocess(text)) for text in df["lyrics"]]
    # assign() builds a new DataFrame instead of overwriting the caller's column.
    return df.assign(lyrics=cleaned)

def encode_lyrics(df, model=None):
    """Encode every entry of ``df["lyrics"]`` as the mean of its word vectors.

    Parameters
    ----------
    df : pandas.DataFrame (or any mapping with a "lyrics" entry)
        Each lyric is split on whitespace and the words are looked up as-is,
        so the text should already be preprocessed.
    model : optional
        Word-vector lookup supporting ``word in model``, ``model[word]`` and a
        ``vector_size`` attribute (e.g. gensim ``KeyedVectors``). Defaults to
        the module-level ``g_model``.

    Returns
    -------
    numpy.ndarray, shape ``(number of lyrics, model.vector_size)``
        Row ``i`` is the average vector of the in-vocabulary words of lyric
        ``i``. A lyric without any in-vocabulary word yields a row of NaN, so
        the result is always a regular 2-D array (previously such a lyric
        produced a scalar NaN and a ragged array).
    """
    if model is None:
        model = g_model
    dim = model.vector_size

    rows = []
    for lyrics in df["lyrics"]:
        # Out-of-vocabulary words are skipped on purpose.
        vectors = [model[word] for word in lyrics.split() if word in model]
        if vectors:
            rows.append(np.mean(vectors, axis=0))
        else:
            # No known word: keep the row, but mark it as "no embedding".
            # float32 NaNs never widen a float32 result and are promoted
            # automatically if the model uses a wider dtype.
            rows.append(np.full(dim, np.nan, dtype=np.float32))

    # reshape keeps the (n, dim) contract even when there are no lyrics at all.
    return np.array(rows).reshape(-1, dim)

def save_encoded_df(df, filename):
    """Preprocess and encode the lyrics of ``df`` and store the result on disk.

    ``df`` is passed through ``preprocess_lyrics`` and then ``encode_lyrics``;
    the resulting array is written to ``filename`` with ``numpy.save``.
    Nothing is returned.
    """
    prepared = preprocess_lyrics(df)
    vectors = encode_lyrics(prepared)
    np.save(filename, vectors)

def load_encoded_df(filename):
    """Read an array back from ``filename`` using ``numpy.load`` and return it."""
    encoded = np.load(filename)
    return encoded

def highlight_words(query, lyric, whole_words=False):
    """Wrap every occurrence of the query's words in ``<b>...</b>`` tags.

    Parameters
    ----------
    query : str
        Whitespace-separated search terms. The article "a" (any case) is
        ignored because it would match almost everywhere.
    lyric : str
        Text to annotate. It is not HTML-escaped by this function.
    whole_words : bool, optional
        ``False`` (default) matches terms anywhere, including inside longer
        words ("apple" also marks the start of "apples"). ``True`` only
        matches terms that are not adjacent to other word characters.

    Returns
    -------
    str
        ``lyric`` with each case-insensitive match wrapped in bold tags; the
        matched text keeps its original casing. If no usable term remains,
        ``lyric`` is returned unchanged.
    """
    terms = {word for word in query.split() if word.lower() != "a"}
    if not terms:
        return lyric

    # All terms are matched in ONE pass. Substituting term by term would let a
    # later term (e.g. "b") match inside the <b> tags inserted for an earlier one.
    # Longest terms go first so "apple" wins over "app" at the same position.
    ordered = sorted(terms, key=lambda term: (-len(term), term))
    pattern_text = "(?:" + "|".join(re.escape(term) for term in ordered) + ")"
    if whole_words:
        # Lookarounds instead of \b so terms with leading/trailing punctuation still work.
        pattern_text = r"(?<!\w)" + pattern_text + r"(?!\w)"

    return re.sub(pattern_text, r"<b>\g<0></b>", lyric, flags=re.IGNORECASE)
   


def find_similar_songs(query, encoded_df, df, top_n=10, metric="euclidean"):
    """Return the rows of ``df`` whose lyric embeddings are closest to ``query``.

    Parameters
    ----------
    query : str
        Free-text query; it goes through ``preprocess_lyrics`` and
        ``encode_lyrics`` exactly like a lyric.
    encoded_df : array-like, 2-D
        One embedding per song. Row ``i`` must correspond to ``df.iloc[i]``,
        because results are looked up by position.
    df : pandas.DataFrame
        Song table the results are taken from.
    top_n : int, optional
        Maximum number of rows to return (default 10).
    metric : str, optional
        Distance metric forwarded to ``scipy.spatial.distance.cdist``. The
        default "euclidean" is cdist's own default; pass "cosine" for cosine
        distance.

    Returns
    -------
    list of pandas.Series
        The matching rows, nearest first. An empty list is returned when the
        query has no usable embedding (e.g. none of its words is known to the
        model), instead of failing inside ``cdist``.
    """
    query_df = preprocess_lyrics(pd.DataFrame({"lyrics": [query]}))
    query_embedding = np.asarray(encode_lyrics(query_df), dtype=float)

    # A query without any known word has no meaningful position in the
    # embedding space (scalar/NaN embedding): there is nothing to rank against.
    if query_embedding.ndim != 2 or not np.isfinite(query_embedding).all():
        return []

    distances = cdist(encoded_df, query_embedding, metric=metric)
    # Only one query row, so column 0 holds the distance of every song to it.
    nearest = np.argsort(distances[:, 0])[:top_n]
    return [df.iloc[position] for position in nearest]

In [38]:

# Load the precomputed lyric embeddings from disk.
encoded_df = load_encoded_df("encoded_data.npy")

# Load the song table; find_similar_songs looks rows up by position, so its
# rows must line up with the rows of encoded_df.
df = pd.read_csv("C:/Users/Giannis/Desktop/SearchEngine/data/clean_songs.tsv", sep="\t")

# Find the closest songs to the query. Ranking uses cdist's default metric,
# i.e. Euclidean distance (not cosine).
query = "apple"
results = find_similar_songs(query, encoded_df, df)

# Collect the fields to display, with the query words highlighted in the lyrics.
results_dict = {
    "artist": [song["artist"] for song in results],
    "title": [song["title"] for song in results],
    "lyrics": [highlight_words(query, song["lyrics"]) for song in results],
}

df_results = pd.DataFrame(results_dict)
df_results.head()

Unnamed: 0,artist,title,lyrics
0,Children,Apples And Oranges,I like apples and oranges. I like apples and ...
1,System Of A Down,Vicinity Of Obscenity,Liar! Liar! Banana banana banana terracott...
2,Tori Amos,Datura,Get out of my garden Passion vine Texas sa...
3,Deep Purple,The Orange Juice Song,"Orange juice, Just thinkin bout that orange o..."
4,The Monkees,Peter Percival Pattersons Pet Pig Porky,Peter Percival Patterson had a pet pig named P...


In [39]:
# Show the full highlighted lyrics of the top-ranked result (row label 0).
df_results.loc[0, "lyrics"]

'I like apples and oranges.  I like apples and oranges.  Apples and oranges are so sweet.  Apples and oranges are good to eat.  I like apples and oranges.    Orange juice is so sweet,  <b>Apple</b> sauce is fun to eat,  <b>Apple</b> pie with ice cream -- what a tasty treat.  I like apples and oranges.  I like apples and oranges.    Apples and oranges are so sweet.  Apples and oranges are good to eat.  I like apples and oranges.    Orange juice is so sweet,  <b>Apple</b> sauce is fun to eat,  And <b>apple</b> pie with ice cream -- what a tasty treat.  I like apples and oranges.  I like apples and oranges.    Apples and oranges are so sweet.  Apples and oranges are good to eat.  I like apples and oranges.    Orange juice is so sweet,  <b>Apple</b> sauce is fun to eat,  <b>Apple</b> pie with ice cream -- oooh, what a tasty treat.  I like apples and oranges.  I like apples and oranges.  Applies and oranges  Apples and oranges (repeat to fade)'