## Loading of Data 

In [2]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


# Columns kept for the similarity search: the two identifying fields plus
# the six principal-component columns.
keep_columns = [
    'track_name', "artist_name",
    "PC1", "PC2", "PC3", "PC4", "PC5", "PC6",
]

# Read the full CSV, then narrow it to the columns listed above.
df = pd.read_csv("../../Downloads/songs_feature_eng_pca.csv")[keep_columns]


# Cosine Similarity using Annoy Package

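We use the six principal components (PC1–PC6) of each track to find its five most similar tracks with the Annoy package. Annoy's `'angular'` metric returns the distance $\sqrt{2(1 - \cos\theta)}$ between two tracks, which ranks neighbours in the same order as cosine similarity $\cos\theta$ (a smaller angular distance means a higher cosine similarity).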


In [None]:
from annoy import AnnoyIndex
import pandas as pd
import numpy as np

# --- Search configuration ---------------------------------------------------
PC_COLUMNS = ['PC1', "PC2", "PC3", "PC4", "PC5", "PC6"]
N_NEIGHBOURS = 5   # number of similar tracks stored per song
N_TREES = 10       # more trees -> more precise neighbours, but a slower build
ANNOY_SEED = 42    # fixed seed so the random forest is identical on every run

pcs = df[PC_COLUMNS]

f = pcs.shape[1]

# Annoy's 'angular' metric is the Euclidean distance between the *normalised*
# vectors, i.e. d = sqrt(2 * (1 - cos)). It ranks neighbours exactly like
# cosine similarity, but d itself is not "1 - cosine".
t = AnnoyIndex(f, 'angular')
t.set_seed(ANNOY_SEED)  # only affects tree building, so set it before adding items

# Item ids are row *positions* in `pcs` (0..n-1), not index labels.
for i, vector in enumerate(pcs.to_numpy()):
    t.add_item(i, vector)

t.build(N_TREES)

n_rows = pcs.shape[0]

# Maps row position -> list of (neighbour position, cosine similarity),
# ordered from most to least similar.
top_5_similar = {}

for i in range(n_rows):
    # Ask for one extra neighbour because the query item is normally part of
    # its own result list.
    indices, distances = t.get_nns_by_item(
        i, N_NEIGHBOURS + 1, include_distances=True
    )

    # Drop the query item by id rather than by position: with duplicate
    # vectors or an approximate search it is not guaranteed to come first.
    # Convert angular distance back to cosine similarity: cos = 1 - d^2 / 2.
    neighbours = [
        (j, 1 - (d ** 2) / 2)
        for j, d in zip(indices, distances)
        if j != i
    ]

    top_5_similar[i] = neighbours[:N_NEIGHBOURS]

# Positional lookups, consistent with Annoy's positional item ids.
track_names = df['track_name'].to_numpy()
artist_names = df['artist_name'].to_numpy()

# Build every output column as a complete list and assign it once. This avoids
# writing strings cell-by-cell into float (NaN-initialised) columns, and lets
# pandas infer the right dtype for each column.
for rank in range(1, N_NEIGHBOURS + 1):
    rank_tracks, rank_artists, rank_scores = [], [], []

    for row in range(n_rows):
        neighbours = top_5_similar[row]
        if len(neighbours) >= rank:
            sim_idx, sim_score = neighbours[rank - 1]
            rank_tracks.append(track_names[sim_idx])
            rank_artists.append(artist_names[sim_idx])
            rank_scores.append(sim_score)
        else:
            # Annoy returned fewer neighbours than requested: leave blank.
            rank_tracks.append(np.nan)
            rank_artists.append(np.nan)
            rank_scores.append(np.nan)

    df[f'Track_Name_{rank}'] = rank_tracks
    df[f'Artist_Name_{rank}'] = rank_artists
    df[f'Similarity_{rank}'] = rank_scores

df.to_csv("../../Downloads/BT4222ProjectExcel/songs_with_similarities_final.csv", index=False)

