# Imports

In [128]:
import os
import random
import warnings

import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.sparse import csr_matrix
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import euclidean_distances
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

# 1. Dataset Preparation & Analysis

## 1.2 - Loading the dataset

In [129]:
# All three CSV exports sit in the same folder; keep that location in one place
# so it only has to be changed once when the notebook is moved.
DATA_DIR = "../SOEN-471-Music-Recommender-System/data"

data = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))                  # track-level table (id, name, artists, audio features)
genre_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))  # per-genre table
year_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))     # per-year table

## 1.3 - Data Exploration

In [None]:
# Column-level summary of the track table: dtype and non-null count per column.
data.info()

In [130]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the track table.
data.describe()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
count,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0
mean,0.528587,1976.787241,0.502115,0.537396,230948.3,0.482389,0.084575,0.16701,5.199844,0.205839,-11.46799,0.706902,31.431794,0.098393,116.86159
std,0.263171,25.917853,0.376032,0.176138,126118.4,0.267646,0.278249,0.313475,3.515094,0.174805,5.697943,0.455184,21.826615,0.16274,30.708533
min,0.0,1921.0,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0
25%,0.317,1956.0,0.102,0.415,169827.0,0.255,0.0,0.0,2.0,0.0988,-14.615,0.0,11.0,0.0349,93.421
50%,0.54,1977.0,0.516,0.548,207467.0,0.471,0.0,0.000216,5.0,0.136,-10.58,1.0,33.0,0.045,114.729
75%,0.747,1999.0,0.893,0.668,262400.0,0.703,0.0,0.102,8.0,0.261,-7.183,1.0,48.0,0.0756,135.537
max,1.0,2020.0,0.996,0.988,5403500.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,100.0,0.97,243.507


In [None]:
# Column-level summary of the per-genre table: dtype and non-null count per column.
genre_data.info()

In [None]:
# Column-level summary of the per-year table: dtype and non-null count per column.
year_data.info()

## 1.4 - Data Visualisation

In [None]:
from yellowbrick.target import FeatureCorrelation

# Columns whose relationship with the popularity score is inspected.
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'duration_ms', 'explicit',
    'key', 'mode', 'year',
]

# Predictors and target for the visualizer.
X = data[feature_names]
y = data['popularity']

# The labels are passed as a numpy array so they can be indexed.
features = np.array(feature_names)

# FeatureCorrelation draws one bar per feature showing its correlation with the
# target (Pearson correlation is yellowbrick's default method).
visualizer = FeatureCorrelation(labels=features)

# NOTE: rcParams is global, so this figure size also applies to every later matplotlib figure.
plt.rcParams['figure.figsize'] = (20, 20)

visualizer.fit(X, y)   # compute the correlations and draw the bars
visualizer.show()      # finalize and render the figure

In [None]:
# Line chart of each sound feature from the per-year table, plotted against the year.
sound_features = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'speechiness', 'valence',
]
fig = px.line(
    data_frame=year_data,
    x='year',
    y=sound_features,
    title='Trend of sound features over decades',
)
fig.show()

In [None]:
# The ten rows of the per-genre table with the highest popularity value.
top10_genres = genre_data.nlargest(10, 'popularity')

# Grouped bars: one group per genre, one bar per sound feature. The feature list is the
# `sound_features` list defined for the yearly trend plot, so both charts show the same columns.
fig = px.bar(
    top10_genres,
    x='genres',
    y=sound_features,
    barmode='group',
    title='Characteristics of top 10 genres',
)
fig.show()

## 1.7 - Content Based Recommendation

#### k-NN

###### The KNN algorithm is used to find the songs most similar to a given query song, based on their feature values (such as loudness, tempo, and danceability). The data is preprocessed by rescaling values and dropping unnecessary features. An earlier version of the model used a kd-tree implementation to make the nearest neighbor search more efficient; the model fitted below uses cosine distance between the feature vectors instead.
###### The recommendation function takes as input a song ID and returns the top n (default is 5) most similar songs to that query song. The function uses the trained KNN model to find the nearest neighbors based on feature values and prints out the names and artists of the recommended songs.
###### This is a content-based recommendation system, as it only takes into account the features of the songs and not user preferences or behavior. The algorithm uses similarity in features to make recommendations.

In [None]:
# Peek at the first rows of the track table before building the recommender features.
data.head()

In [None]:
# For every song title, count how many distinct values of the `artists` column it appears with.
grouped_by_song = data.groupby('name')['artists'].nunique()

# Keep only the titles that occur with more than one distinct `artists` value.
song_names_with_diff_artists = grouped_by_song[grouped_by_song > 1]

# Count the titles themselves. Summing the Series would add up the per-title artist
# counts, which is not what the message below reports.
count = len(song_names_with_diff_artists)

# Print the result
print(f'There are {count} unique song names with different artists.')

if count:
    # Show one such title as an example. Position 64 is an arbitrary pick; it is clamped
    # so the cell still works when fewer than 65 titles qualify.
    song_name = song_names_with_diff_artists.index[min(64, count - 1)]

    # Print all the rows with that song name
    print(f'All the rows with the song name "{song_name}":')
    print(data.loc[data['name'] == song_name])

In [131]:
# Build the kNN feature frame: a copy of `data` without the columns listed below.
# The Spotify `id` column is kept alongside the remaining audio features.
df_for_kNN = data.drop(
    columns=[
        'artists',
        'duration_ms',
        'explicit',
        'key',
        'mode',
        'name',
        'popularity',
        'release_date',
        'year',
    ]
)
df_for_kNN.head()  # preview the reduced frame

Unnamed: 0,valence,acousticness,danceability,energy,id,instrumentalness,liveness,loudness,speechiness,tempo
0,0.0594,0.982,0.279,0.211,4BJqT0PrAfrxzMOxytFOIz,0.878,0.665,-20.096,0.0366,80.954
1,0.963,0.732,0.819,0.341,7xPhfUan2yNtyFG0cUWkt8,0.0,0.16,-12.441,0.415,60.936
2,0.0394,0.961,0.328,0.166,1o6I8BglA6ylDMrIELygv1,0.913,0.101,-14.85,0.0339,110.339
3,0.165,0.967,0.275,0.309,3ftBPsC5vPBKxYSee08FDH,2.8e-05,0.381,-9.316,0.0354,100.109
4,0.253,0.957,0.418,0.193,4d6HGyGT8e121BsdKmw9v6,2e-06,0.229,-10.096,0.038,101.665


In [132]:
# Report the value range of every column, to see which ones need rescaling.
for col, column_values in df_for_kNN.items():
    min_val, max_val = column_values.min(), column_values.max()
    print(f"{col}: min={min_val}, max={max_val}")

valence: min=0.0, max=1.0
acousticness: min=0.0, max=0.996
danceability: min=0.0, max=0.988
energy: min=0.0, max=1.0
id: min=000G1xMMuwxNHmwVsBdtj1, max=7zzuPsjj9L3M7ikqGmjN0D
instrumentalness: min=0.0, max=1.0
liveness: min=0.0, max=1.0
loudness: min=-60.0, max=3.855
speechiness: min=0.0, max=0.97
tempo: min=0.0, max=243.507


In [133]:
# Bring loudness and tempo onto roughly the same 0-1 scale as the other features,
# and index the frame by Spotify track id.
LOUDNESS_OFFSET = 60      # observed loudness minimum is -60, so adding 60 moves it to 0
LOUDNESS_RANGE = 63.855   # observed loudness maximum 3.855 + 60, so the maximum maps to 1
TEMPO_SCALE = 244.091     # slightly above the observed tempo maximum (243.507): tempo tops out just below 1

# The `id` column only exists until this cell has run once. The guard turns a re-run
# into a no-op instead of rescaling the already-scaled columns a second time.
if 'id' in df_for_kNN.columns:
    df_for_kNN = df_for_kNN.assign(
        loudness=(df_for_kNN['loudness'] + LOUDNESS_OFFSET) / LOUDNESS_RANGE,
        tempo=df_for_kNN['tempo'] / TEMPO_SCALE,
    ).set_index('id')  # the id becomes the index and is removed from the feature columns

df_for_kNN.head()  # check if everything is ok


Unnamed: 0_level_0,valence,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4BJqT0PrAfrxzMOxytFOIz,0.0594,0.982,0.279,0.211,0.878,0.665,0.624916,0.0366,0.331655
7xPhfUan2yNtyFG0cUWkt8,0.963,0.732,0.819,0.341,0.0,0.16,0.744797,0.415,0.249645
1o6I8BglA6ylDMrIELygv1,0.0394,0.961,0.328,0.166,0.913,0.101,0.707071,0.0339,0.45204
3ftBPsC5vPBKxYSee08FDH,0.165,0.967,0.275,0.309,2.8e-05,0.381,0.793736,0.0354,0.41013
4d6HGyGT8e121BsdKmw9v6,0.253,0.957,0.418,0.193,2e-06,0.229,0.781521,0.038,0.416505


In [135]:
# Hold out 20% of the songs as a test split; random_state pins the shuffle so the split is repeatable.
# train_test_split comes from sklearn.model_selection and has to be imported before this cell runs.
train, test = train_test_split(df_for_kNN, test_size=0.2, random_state=42) # split the data into train and test

In [139]:
# Spotify ids of both splits as plain lists, in the same row order as `train` / `test`.
train_ids, test_ids = (split.index.tolist() for split in (train, test))

In [None]:
# Pairwise correlation between the song features, drawn as an annotated heatmap.
corr_matrix = df_for_kNN.corr()
heatmap_ax = sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix of Song Features')
plt.show()

In [144]:
# Feature matrices for the two splits, stored as scipy CSR matrices.
# Rows keep the order of `train` / `test`, so row i of X_train belongs to train.index[i].
X_train = csr_matrix(train.values)
X_test = csr_matrix(test.values)

# Nearest-neighbour index over the training songs, compared by cosine distance.
# The kd-tree algorithm does not offer the cosine metric, so scikit-learn runs a
# brute-force search here.
model = NearestNeighbors(n_neighbors=10, metric='cosine')
model.fit(X_train)


In [145]:
# Pick a query song from the training split. A dedicated, seeded generator keeps the
# pick (and therefore the printed recommendations) the same on every run.
rng = random.Random(42)
song_index = rng.randrange(len(train))
song_id = train_ids[song_index]
song_features = train.iloc[[song_index]].to_numpy()  # 2-D array holding the single query row

# Find the k nearest neighbors of the chosen song
k = 10
distances, indices = model.kneighbors(song_features, n_neighbors=k)

# The query song is part of the fitted data, so it normally comes back as its own
# neighbour. Remove it by position in the training split rather than assuming it is
# always the first hit (songs with identical features can tie at distance 0).
neighbor_positions = [idx for idx in indices[0] if idx != song_index][: k - 1]

# Print the recommended songs
print(f"Because you listened to '{song_id}':")
for rank, idx in enumerate(neighbor_positions, start=1):
    neighbor_song_id = train_ids[idx]
    print(f"{rank}: '{neighbor_song_id}'")


Because you listened to '54XuXEJEfZbaCW5etHimul':
1: '46Jrnnms3QlYrUOZQTQmwE'
2: '01OUk7upOV5oWavszGavr3'
3: '4GV1rXaP8aMyEO5o6KdJHo'
4: '5hFkKLE06GhSuqoLJR2oP7'
5: '2dsLUhp3LMNFjsk3iESHq6'
6: '6kJjpYfomJYBfats8mDcBP'
7: '2OetRiA7svb9KwiXkRjhLw'
8: '5wwLfjdiSHsgKyEO6V5rDe'
9: '1g8BTj84UX9vEiWiZtLoGf'


In [146]:
def evaluate_knn_model(model, X_train, y_train, X_test, y_test, k):
    """Score a nearest-neighbour model by labelling each test row with its closest training row.

    Parameters
    ----------
    model : object exposing ``fit(X)`` and ``kneighbors(X, n_neighbors=...)``.
        It is (re)fitted on ``X_train`` as a side effect of this call.
    X_train, X_test : feature matrices accepted by ``model`` (dense or scipy sparse).
    y_train : labels aligned by position with the rows of ``X_train``.
    y_test : true labels for the rows of ``X_test``.
    k : number of neighbours requested from the model; only the closest one is
        used for the prediction.

    Returns
    -------
    The weighted F1 score of the 1-nearest-neighbour predictions against ``y_test``.
    """
    model.fit(X_train)
    distances, indices = model.kneighbors(X_test, n_neighbors=k)

    # `indices` holds row positions within X_train, so the labels are looked up by
    # position; a pandas Series with a non-default index would otherwise be read by label.
    train_labels = np.asarray(y_train)
    # Column 0 is the closest neighbour of every test row. Slicing `indices` avoids
    # calling len() on X_test, which scipy sparse matrices do not support.
    nearest = np.asarray(indices)[:, 0]
    y_pred = train_labels[nearest]

    return f1_score(y_test, y_pred, average='weighted')


In [None]:
# def recommend(id_song, model, number_of_recommendations=5): # Create a function to recommend songs
#     query = df_for_kNN.loc[id_song].to_numpy().reshape(1,-1) # Get the song features
#     print('Searching for recommendations...')
#     distances, indices = model.kneighbors(query,n_neighbors = number_of_recommendations) # Get the distances and indices of the nearest songs

#     # Loop through the indices and print the song names and artists of the nearest songs
#     for i, index in enumerate(indices[0]):
#      if index != id_song:
#         song_info = data[['name','artists']].loc[index]
#         print(f"{i+1}. {song_info['name']} by {song_info['artists']}")

In [None]:
# name = input('Enter song title: ') # Get the song title from the user
# print('Song title search results: ') 
# print(data[['artists','name']].where(data['name'] == name).dropna()) # Print the search results

# song_ind = int(input('Enter the index value of the requested song: ')) # Get the index value of the song from the user since more than one song can have the same name
# id_song = data['id'].loc[song_ind] # Get the song id

# song = data['name'].loc[song_ind] # Get the song name
# artists = data['artists'].loc[song_ind] # Get the song artists

# print('Song selected is ', song, 'by', artists) 

# nor = int(input('Enter number of recommendations: ')) # Get the number of recommendations (nor) from the user

# recommend(id_song, model_knn, nor) # Call the recommend function