In [8]:
import gensim.downloader as api
import gensim
import pandas as pd
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim import utils
from scipy.spatial.distance import cdist

# English stop-word list from NLTK. It is not referenced by any of the cells
# visible here; kept because other cells may use it.
stop_words = stopwords.words('english')

# Location of the pre-trained word2vec vectors in binary format (the file name
# suggests the 300-dimensional Google News model). Loading it can take a while,
# so this cell is meant to be run once per kernel.
WORD2VEC_PATH = "C:/Users/Giannis/gensim-data/word2vec-google-news-300/GoogleNews-vectors-negative300.bin"
g_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)


In [12]:
def preprocess_lyrics(df):
    """Normalise the ``lyrics`` column of ``df`` in place and return ``df``.

    Each lyric is tokenised with ``gensim.utils.simple_preprocess`` and the
    tokens are joined back with single spaces. Per the gensim documentation
    this lowercases the text, strips punctuation and, with its defaults, also
    drops very short and very long tokens.

    Note that the caller's DataFrame is modified: the original ``lyrics``
    text is overwritten with the normalised version.
    """
    normalised = []
    for text in df["lyrics"]:
        tokens = utils.simple_preprocess(text)
        normalised.append(" ".join(tokens))
    df["lyrics"] = normalised
    return df

def encode_lyrics(df):
    """Encode every lyric in ``df["lyrics"]`` as the mean of its word vectors.

    Words are obtained by splitting each lyric on whitespace and looked up in
    the module-level ``g_model``; words missing from the model's vocabulary
    are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``lyrics`` column of strings.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), g_model.vector_size)``. A lyric with no
        in-vocabulary word (including an empty lyric) is encoded as a row of
        NaN, so the result is always a regular 2-D matrix and such rows are
        easy to detect with ``np.isnan``.
    """
    dim = g_model.vector_size
    embeddings = []
    for lyrics in df["lyrics"]:
        word_vectors = []
        for word in lyrics.split():
            try:
                word_vectors.append(g_model[word])
            except KeyError:
                # Out-of-vocabulary word: deliberately ignored.
                continue
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            # np.mean([]) would yield a scalar NaN (plus a RuntimeWarning) and
            # make the stacked result ragged; emit a full-width NaN row
            # instead. float32 so this row never forces an upcast of a
            # float32 model's vectors.
            embeddings.append(np.full(dim, np.nan, dtype=np.float32))
    # reshape keeps the (n, dim) contract even when df has no rows.
    return np.array(embeddings).reshape(-1, dim)

def save_encoded_df(df, filename):
    """Preprocess and encode the lyrics of ``df`` and save the matrix with ``np.save``.

    ``df`` is passed through ``preprocess_lyrics`` first, which overwrites its
    ``lyrics`` column with the normalised text. The resulting embeddings from
    ``encode_lyrics`` are written to ``filename``.
    """
    cleaned_df = preprocess_lyrics(df)
    lyric_vectors = encode_lyrics(cleaned_df)
    np.save(filename, lyric_vectors)

def load_encoded_df(filename):
    """Load and return the array stored in ``filename`` via ``np.load``."""
    encoded = np.load(filename)
    return encoded

def find_similar_songs(query, encoded_df, df, top_k=10, metric="euclidean"):
    """Return the rows of ``df`` whose lyric embeddings are closest to ``query``.

    The query text goes through the same ``preprocess_lyrics`` /
    ``encode_lyrics`` pipeline as the songs, and is then compared with every
    row of ``encoded_df`` using ``scipy.spatial.distance.cdist``.

    Parameters
    ----------
    query : str
        Free-text search query.
    encoded_df : numpy.ndarray
        2-D matrix of song embeddings; row ``i`` is expected to correspond to
        ``df.iloc[i]``.
    df : pandas.DataFrame
        Song metadata, positionally aligned with ``encoded_df``.
    top_k : int, optional
        Maximum number of results to return (default 10).
    metric : str, optional
        Distance metric passed to ``cdist``. Defaults to ``"euclidean"``,
        which is what ``cdist`` uses when no metric is given; pass
        ``"cosine"`` for cosine distance.

    Returns
    -------
    list of pandas.Series
        Matching rows of ``df``, nearest first. Empty when the query contains
        no word known to the embedding model. Songs whose distance is not
        finite (for example songs without a usable embedding) are never
        returned.
    """
    query_df = preprocess_lyrics(pd.DataFrame({"lyrics": [query]}))
    query_embedding = np.asarray(encode_lyrics(query_df))

    # A query with no in-vocabulary word has no usable embedding: depending on
    # the encoder it shows up as a non-2-D array or as a NaN row. cdist would
    # either raise or produce only NaN distances, so report "no matches".
    if query_embedding.ndim != 2 or not np.isfinite(query_embedding).all():
        return []

    # Single query, so keep only that one column of the distance matrix.
    distances = cdist(encoded_df, query_embedding, metric=metric)[:, 0]

    ranked = np.argsort(distances)
    # argsort places NaN last; drop non-finite distances explicitly so they
    # cannot appear as "matches" when fewer than top_k songs are comparable.
    ranked = ranked[np.isfinite(distances[ranked])][:top_k]
    return [df.iloc[row] for row in ranked]

In [14]:

# Pre-computed lyric embeddings for the songs (written earlier with save_encoded_df).
encoded_df = load_encoded_df("encoded_data.npy")

# Song metadata; find_similar_songs looks rows up by position, so this file is
# expected to be in the same row order as the embeddings.
df = pd.read_csv("C:/Users/Giannis/Desktop/SearchEngine/data/clean_songs.tsv", sep="\t")

# Retrieve the 10 closest songs to the query. The ranking uses Euclidean
# distance (the default metric of scipy's cdist), not cosine distance.
query = "waterloo"
results = find_similar_songs(query, encoded_df, df)

# Collect the fields of interest from each matching row into a small table.
results_dict = {
    column: [row[column] for row in results]
    for column in ("artist", "title", "lyrics")
}

df_results = pd.DataFrame(results_dict)
df_results.head()

Unnamed: 0,artist,title,lyrics
0,ABBA,Waterloo,My my At Waterloo Napoleon did surrender Oh ...
1,Def Leppard,Waterloo Sunset,"Dirty old river, must you keep rolling Flowin..."
2,Celine Dion,Ave Maria,"Ave Maria, Maiden mild Oh, listen to a maiden..."
3,Venom,Kings Of Evil,Saints and sinners - losers winners Crucified...
4,Diana Ross,Ave Maria,Ave maria Gratia plena Maria Gratia plena ...
