# Recommender

## Imports

In [117]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances, cosine_distances
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import time
from ast import literal_eval

## Preprocessing

In [118]:
# Load the per-track table from the checkpoint CSV.
by_isrc = pd.read_csv('data/checkpoint/by_isrc_oldest.csv')
# 'isrc' stays a regular column at this point: the cells below drop, explode
# and cross-tabulate on it. It only becomes the index in a later cell.

Create a DataFrame that contains only the features relevant to the recommender system.

In [119]:
# Drop the descriptive columns and use the ISRC as the row label, so that only
# the feature columns remain.
non_feature_columns = [
    'artists', 'genres', 'album', 'release_date', 'release_date_precision',
    'chart_power', 'uri', 'popularity', 'name', 'spotify_id',
]
df_recommender = by_isrc.drop(columns=non_feature_columns).set_index('isrc')

Scale the data, so every feature has the same influence.

In [120]:
# Min-max scale every feature column and wrap the resulting array back into a
# DataFrame that carries the original row and column labels.
scaler = MinMaxScaler()
df_recommender_scaled = pd.DataFrame(
    scaler.fit_transform(df_recommender),
    index=df_recommender.index,
    columns=df_recommender.columns,
)

In [121]:
# Move 'isrc' from the index back into a regular column on both frames.
df_recommender_scaled = df_recommender_scaled.reset_index()
df_recommender = df_recommender.reset_index()

In [122]:
# Preview the first five rows, transposed so that every feature is shown as a row.
df_recommender_scaled.head().T

Unnamed: 0,0,1,2,3,4
isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
danceability,0.295132,0.595335,0.503043,0.685598,0.48073
energy,0.231,0.184,0.264,0.43,0.428
key,0.818182,0.636364,0.272727,0.454545,0.454545
loudness,0.620601,0.618011,0.578544,0.626845,0.631776
mode,1.0,0.0,1.0,1.0,0.0
speechiness,0.043182,0.031818,0.046384,0.035227,0.045041
acousticness,0.955823,0.817269,0.957831,0.431727,0.752008
instrumentalness,0.382,0.951,0.887,0.838,0.941
liveness,0.237,0.0827,0.271,0.124,0.115


In [123]:
# Show the minimum and maximum of every numeric column to verify the scaling.
df_recommender_scaled.describe().T[['min', 'max']]

Unnamed: 0,min,max
danceability,0.0,1.0
energy,0.0,1.0
key,0.0,1.0
loudness,0.0,1.0
mode,0.0,1.0
speechiness,0.0,1.0
acousticness,0.0,1.0
instrumentalness,0.0,1.0
liveness,0.0,1.0
valence,0.0,1.0


Convert every genre into a feature: if a song belongs to a genre, the corresponding column contains 1, otherwise 0.

In [124]:
# Parse the stringified genre lists and expand them to one row per
# (track, genre) pair.
# NOTE(review): the name `help` shadows Python's builtin help(); it is kept
# here because the next cell reads this variable.
help = (
    by_isrc
    .assign(genres=by_isrc['genres'].apply(literal_eval))
    .explode('genres')
)

In [125]:
# Count how often every (isrc, genre) pair occurs and cap the counts at 1, so
# each genre column is a 0/1 membership flag. `clip` is vectorised and replaces
# the element-wise `applymap`, which is deprecated in recent pandas releases.
# Crosstab counts are never negative, so only the upper bound is needed.
ct = pd.crosstab(help['isrc'], help['genres']).clip(upper=1)
# Expose 'isrc' as a regular column so it can be used as the merge key below.
ct = ct.reset_index()

In [126]:
# Show the first rows of the genre flag table (transposed) and its dimensions.
display(ct.head().T)
ct.shape

Unnamed: 0_level_0,0,1,2,3,4
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
acoustic,0,0,0,0,0
afrobeat,0,0,0,0,0
alt-rock,0,0,0,0,0
alternative,0,0,0,0,0
...,...,...,...,...,...
techno,0,0,0,0,0
trance,0,0,0,0,0
trip-hop,0,0,0,0,0
turkish,0,0,0,0,0


(415124, 111)

In [127]:
# Sanity check: every genre column must have a minimum of 0 and a maximum of 1.
ctMinMax = ct.describe().T
column_minima = ctMinMax['min']
column_maxima = ctMinMax['max']
flags_are_binary = (
    column_minima.min() == 0
    and column_minima.max() == 0
    and column_maxima.min() == 1
    and column_maxima.max() == 1
)
if flags_are_binary:
    print('Values are all scaled between 0 and 1')
else:
    print('Values are not scaled correctly')

Values are all scaled between 0 and 1


Merge both DataFrames together to create the Recommender System.

In [128]:
# Right join on 'isrc': every row of df_recommender_scaled is kept and gets its
# genre flags attached.
ct_merged = ct.merge(df_recommender_scaled, on=['isrc'], how='right')

In [129]:
# Number of columns that contain at least one missing value.
ct_merged.isna().any().sum()

0

In [130]:
# Use the ISRC as the row label of the merged feature table.
ct_merged = ct_merged.set_index('isrc')

In [131]:
# Show the first rows of the merged feature table and its dimensions.
display(ct_merged.head())
ct_merged.shape

Unnamed: 0_level_0,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,brazil,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AEA040700577,0,0,0,0,0,0,0,0,0,0,...,1.0,0.043182,0.955823,0.382,0.237,0.371,0.638375,0.043657,0.6,0.555556
AEA040700578,0,0,0,0,0,0,0,0,0,0,...,0.0,0.031818,0.817269,0.951,0.0827,0.495,0.52147,0.057345,0.6,0.555556
AEA040700579,0,0,0,0,0,0,0,0,0,0,...,1.0,0.046384,0.957831,0.887,0.271,0.304,0.335621,0.033613,0.8,0.555556
AEA040700580,0,0,0,0,0,0,0,0,0,0,...,1.0,0.035227,0.431727,0.838,0.124,0.656,0.416267,0.032676,0.8,0.555556
AEA040700581,0,0,0,0,0,0,0,0,0,0,...,0.0,0.045041,0.752008,0.941,0.115,0.752,0.611925,0.055849,0.8,0.555556


(415124, 124)

In [132]:
# Commented-out alternative that reads the frames from checkpoint files:
#by_isrc = pd.read_csv('data/checkpoint/by_isrc_oldest.csv')
#ct_merged = pd.read_csv('data/checkpoint/ct_merged.csv')
#ct_merged.set_index('isrc', inplace=True)

# Index the track table by ISRC. The guard keeps the cell idempotent: without
# it, running the cell a second time raises a KeyError because 'isrc' is no
# longer a column.
if by_isrc.index.name != 'isrc':
    by_isrc = by_isrc.set_index('isrc')

Try different distance measures / similarity functions.

In [133]:
def rec_question(by_isrc, ids, df_help):
    '''
    Ask the user which of several equally named songs is meant.

    The prompt is repeated until a valid number is entered. Input that is not
    an integer is treated like an out-of-range number instead of raising.

    Parameter
    ---------
    by_isrc: pd.DataFrame
        Not used by this function; kept so that existing callers keep working.

    ids: list
        Candidate songs; only the length is used, to bound the accepted input
        to 0 .. len(ids) - 1.

    df_help: pd.DataFrame
        Candidate table. The entered number is looked up with .loc, and the
        columns 'name', 'artists' and 'isrc' are read.

    Returns
    -------
    The value of the 'isrc' column of the selected row.
    '''
    while True:
        answer = input('Which song did you mean? (Enter the index of the song)')
        try:
            choice = int(answer)
        except ValueError:
            # Not a number at all: handle it like any other invalid choice.
            choice = -1
        if 0 <= choice < len(ids):
            break
        print('False input, try again!')

    name = df_help.loc[choice, 'name']
    artists = df_help.loc[choice, 'artists']
    print(f'You selected {name} by {artists}.')
    return df_help.loc[choice, 'isrc']

In [134]:
def get_isrc(track: str):
    '''
    Look up the ISRC of a track by its exact name in the global by_isrc.

    Parameter
    ---------
    track: str
        Track name; compared for equality with the 'name' column.

    Returns
    -------
    The index label (ISRC) of the matching track. If several tracks share the
    name, the candidates are displayed and the user is asked to pick one.
    Returns -1 if no track has this name.
    '''
    ids = list(by_isrc[by_isrc['name'] == track].index)

    # The empty case has to be handled before ids[0] is touched; otherwise an
    # unknown name raises an IndexError instead of reporting the problem.
    if len(ids) == 0:
        print('No song with this name available!')
        return -1
    if len(ids) == 1:
        return ids[0]

    print('There are multiple songs with this name:')
    df_help = by_isrc.loc[ids].reset_index()
    display(df_help)
    return rec_question(by_isrc, ids, df_help)


In [135]:
# Inspect the merged feature table that the recommender functions below read.
ct_merged

Unnamed: 0_level_0,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,brazil,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AEA040700577,0,0,0,0,0,0,0,0,0,0,...,1.0,0.043182,0.955823,0.382000,0.2370,0.371,0.638375,0.043657,0.6,0.555556
AEA040700578,0,0,0,0,0,0,0,0,0,0,...,0.0,0.031818,0.817269,0.951000,0.0827,0.495,0.521470,0.057345,0.6,0.555556
AEA040700579,0,0,0,0,0,0,0,0,0,0,...,1.0,0.046384,0.957831,0.887000,0.2710,0.304,0.335621,0.033613,0.8,0.555556
AEA040700580,0,0,0,0,0,0,0,0,0,0,...,1.0,0.035227,0.431727,0.838000,0.1240,0.656,0.416267,0.032676,0.8,0.555556
AEA040700581,0,0,0,0,0,0,0,0,0,0,...,0.0,0.045041,0.752008,0.941000,0.1150,0.752,0.611925,0.055849,0.8,0.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ved049201554,0,0,0,0,0,0,0,0,0,0,...,1.0,0.035537,0.750000,0.000000,0.6600,0.430,0.326515,0.051879,0.8,0.777778
ved049201555,0,0,0,0,0,0,0,0,0,0,...,1.0,0.042562,0.781124,0.000027,0.3590,0.898,0.605816,0.045353,0.8,0.777778
ved049201619,0,0,0,0,0,0,0,0,0,0,...,1.0,0.069215,0.755020,0.000026,0.7730,0.467,0.759033,0.046749,0.8,0.777778
ved049201677,0,0,0,0,0,0,0,0,0,0,...,0.0,0.144628,0.934739,0.000000,0.1040,0.961,0.461657,0.028419,0.8,1.000000


In [136]:
def recommend_tracks_kernel(isrc: str, recommender_function, distance: bool = False):
    '''
    Ranks all tracks by their similarity to the provided track.

    Reads the global DataFrames ct_merged (feature matrix indexed by ISRC)
    and by_isrc (track metadata indexed by ISRC).

    Parameter
    ---------
    isrc: str
        ISRC of the query track; it has to be part of the index of ct_merged.

    recommender_function: callable
        Pairwise function that is called as
        recommender_function(ct_merged, query_row), e.g. cosine_similarity or
        euclidean_distances. Column 0 of its result becomes the 'Score'.

    distance: bool
        Passed on as `ascending` when sorting by 'Score'. Use True for
        distance functions (smaller is closer) and False for similarity
        functions (larger is closer).

    Returns
    -------
    pd.DataFrame
        The 'Score' column joined with the columns of by_isrc, sorted by
        'Score'.

    Raises
    ------
    ValueError
        If the ISRC is not contained in ct_merged.
    '''
    query_row = ct_merged[ct_merged.index == str(isrc)]
    if query_row.empty:
        # Fail early with a clear message instead of an opaque error from the
        # pairwise function about an empty input.
        raise ValueError(f'ISRC {isrc!r} is not part of the recommender data.')

    kernel_array = recommender_function(ct_merged, query_row)
    kernel_df = pd.DataFrame(kernel_array, index=ct_merged.index)
    kernel_df = kernel_df.rename(columns={0: 'Score'})

    # 'isrc' is the index name on both sides, so the join keeps it as index.
    kernel_df = kernel_df.merge(by_isrc, how='left', on='isrc')
    return kernel_df.sort_values(by='Score', ascending=distance)


In [138]:
def create_visualizations(data, scale=False):
    '''
    Draws a radar chart of the audio features of a few rows of a ranked result.

    The rows at the (1-based) positions 1 to 4 and 50 of data are plotted,
    one trace per row, labelled '<position>. <name>'.

    Parameter
    ---------
    data: pd.DataFrame
        Ranked tracks. Has to contain the radar feature columns and 'name'.
        The frame is not modified.

    scale: bool
        If True, the plotted features are min-max scaled with a scaler that
        is fitted on all rows of data.
    '''
    radar_features = ['danceability', 'energy',
                      'key', 'loudness', 'mode', 'speechiness',
                      'acousticness', 'instrumentalness', 'liveness',
                      'valence', 'time_signature', 'tempo']

    # 0-based positions of the plotted rows: the first four and the 50th.
    # Positions beyond the end of data are skipped.
    positions = [pos for pos in (0, 1, 2, 3, 49) if pos < len(data)]
    selected = data.iloc[positions]
    values = selected[radar_features]

    if scale and positions:
        # Fit on the complete frame so the scale matches the whole result,
        # but only transform the few rows that are plotted. This avoids
        # copying or mutating the caller's DataFrame.
        scaler = MinMaxScaler().fit(data[radar_features])
        values = pd.DataFrame(scaler.transform(values),
                              columns=radar_features, index=selected.index)

    # Repeat the first feature at the end so that every outline is closed.
    closed_features = radar_features + radar_features[:1]

    fig = go.Figure()
    for row, pos in enumerate(positions):
        fig.add_trace(go.Scatterpolar(
            r = values.iloc[row][closed_features].to_list(),
            theta = closed_features,
            mode = 'lines',
            fill = 'none',
            name = str(pos + 1) + '. ' + selected.iloc[row]['name']
        ))

    fig.update_layout(
        height = 1000,
        template='plotly_dark'
    )
    fig.show()
    

    
def recommend(track, is_isrc=False):
    '''
    Shows the five closest tracks for a track and plots their audio features.

    Parameter
    ---------
    track: str
        Track name, or an ISRC if is_isrc is set.

    is_isrc: bool
        If falsy, track is lower-cased and resolved to an ISRC with get_isrc
        first; nothing is shown if that lookup reports -1.
    '''
    if not is_isrc:
        track = get_isrc(track.lower())
        # get_isrc signals "no such song" with -1
        if track == -1:
            return
    # Euclidean distance: smaller is closer, therefore sorted ascending.
    data = recommend_tracks_kernel(track, euclidean_distances, True)
    display(data[:5])
    create_visualizations(data, True)

## Recommender

In [147]:
# Example query by title. If several tracks share this name, an input prompt
# asks which one is meant.
recommend('sun of jamaica')

There are multiple songs with this name:


Unnamed: 0,isrc,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
0,DEBL61211529,['new-age'],sun of jamaica,cusco,Virgin Islands,1983,year,spotify:track:3sBcUmtLRiMJs4Kz95E4lj,3sBcUmtLRiMJs4Kz95E4lj,,...,0,0.0294,0.37,0.736,0.152,0.942,129.062,245787,4,1983
1,DEK897900010,['disco'],sun of jamaica,goombay dance band,Sun of Jamaica,1980-01-01,day,spotify:track:5apwlmvQphkCJbHnMGUP2t,5apwlmvQphkCJbHnMGUP2t,3255.0,...,0,0.0324,0.0984,0.000252,0.745,0.862,129.241,264000,4,1980
2,SEPQM0610326,"['pop', 'swedish']",sun of jamaica,vikingarna,Kramgoa låtar 8,1980-01-01,day,spotify:track:5XnBT7Vl6czwTi30cmt6nW,5XnBT7Vl6czwTi30cmt6nW,,...,1,0.0302,0.167,0.326,0.226,0.935,124.847,247933,4,1980
3,TWA538690806,['mandopop'],sun of jamaica,tracy huang,PARADISE IN MY HEART,1986-08-15,day,spotify:track:2EVGkSVkocXiCQBOpRcigW,2EVGkSVkocXiCQBOpRcigW,,...,1,0.0714,0.529,0.0,0.297,0.672,140.93,202333,4,1986


Which song did you mean? (Enter the index of the song) 1


You selected sun of jamaica by goombay dance band.


Unnamed: 0_level_0,Score,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEK897900010,0.0,['disco'],sun of jamaica,goombay dance band,Sun of Jamaica,1980-01-01,day,spotify:track:5apwlmvQphkCJbHnMGUP2t,5apwlmvQphkCJbHnMGUP2t,3255.0,...,0,0.0324,0.0984,0.000252,0.745,0.862,129.241,264000,4,1980
QM4TX2159959,0.132689,['disco'],midnight love affair,carol douglas,The Best of Carol Douglas,1980-03-05,day,spotify:track:1DL9ufyIicBcK5cSD3lVgR,1DL9ufyIicBcK5cSD3lVgR,,...,0,0.0382,0.144,0.004,0.655,0.826,118.939,230527,4,1980
USSM18100885,0.256576,['disco'],this place hotel - live from the 1981 u.s. tour,the jacksons,Live,1981-11-11,day,spotify:track:57CeIB4zE8utvUAI4FcBNZ,57CeIB4zE8utvUAI4FcBNZ,,...,0,0.0621,0.141,7.1e-05,0.682,0.681,106.534,279453,4,1981
USSM10022564,0.313017,['disco'],wondering who,the jacksons,Triumph,1980-09-26,day,spotify:track:3Pi7JOJKdrc9dedGbQcAwa,3Pi7JOJKdrc9dedGbQcAwa,,...,0,0.0867,0.228,0.00407,0.697,0.781,130.501,257893,4,1980
USSM10026971,0.329375,['disco'],rock with you - live from the 1981 u.s. tour,the jacksons,Live,1981-11-11,day,spotify:track:2NaUCbP0AWBrHdFFJUUqtm,2NaUCbP0AWBrHdFFJUUqtm,,...,0,0.0526,0.288,0.00107,0.919,0.796,121.506,238253,4,1981


## Evaluation

In [140]:
def count_similar_elements(*lists):
    '''
    Compares several lists with the first one.

    Parameter
    ---------
    *lists: sequences
        The lists to compare. Called without any list, (0, True) is returned.

    Returns
    -------
    (count, same_order): tuple
        count is the number of elements of the first list that occur in
        every list. same_order is True only if every list has the same
        length as the first one and the same elements at the same positions.
    '''
    if not lists:
        return 0, True

    first_list = lists[0]

    # Elements of the first list that are present in all lists
    count = sum(1 for elem in first_list if all(elem in lst for lst in lists))

    # The length check avoids an IndexError for shorter lists and prevents a
    # longer list from being reported as identical just because its prefix
    # matches.
    same_order = all(
        len(lst) == len(first_list)
        and all(own == ref for own, ref in zip(lst, first_list))
        for lst in lists
    )

    return count, same_order

In [141]:
def try_functions_2(isrc):
    '''
    Compares the top ten recommendations of three pairwise functions.

    The track is ranked with cosine_similarity (sorted descending) and with
    euclidean_distances and manhattan_distances (sorted ascending). The three
    top-ten index lists are handed to count_similar_elements, whose result is
    returned.
    '''
    # (pairwise function, value passed as `distance`)
    candidates = (
        (cosine_similarity, False),
        (euclidean_distances, True),
        (manhattan_distances, True),
    )
    top_ten = {
        function.__name__: recommend_tracks_kernel(isrc, function, ascending)[:10].index.to_list()
        for function, ascending in candidates
    }
    return count_similar_elements(
        top_ten['cosine_similarity'],
        top_ten['euclidean_distances'],
        top_ten['manhattan_distances'],
    )

In [142]:
def try_n_songs(n, random_state=37):
    '''
    Evaluates how much the three pairwise functions agree on random songs.

    Parameter
    ---------
    n: int
        Number of songs sampled from the global by_isrc.

    random_state: int
        Seed for the sampling; the default reproduces the earlier results.

    Returns
    -------
    list
        Twelve counters. Entry k (0 to 10) counts the songs for which k of
        the top ten recommendations are shared by all functions. The last
        entry counts the songs for which all functions returned identical
        lists. Songs that are missing from ct_merged are skipped.
    '''
    same_order_slot = 11
    result = [0] * 12
    songs = by_isrc.sample(n=n, random_state=random_state)
    for isrc in tqdm(songs.index):
        # Index membership test instead of building a boolean mask over the
        # whole feature table for every song.
        if str(isrc) not in ct_merged.index:
            continue
        count, same_order = try_functions_2(isrc)
        if same_order:
            print('same order')
            result[same_order_slot] += 1
        else:
            result[count] += 1
    return result

In [108]:
# Evaluate 200 sampled songs and store the counters as a one-row DataFrame:
# one column per number of shared songs (0 to 10) plus 'Same order'.
result = try_n_songs(200)
bucket_labels = [str(shared) for shared in range(11)] + ['Same order']
result_df = pd.DataFrame([result], columns=bucket_labels)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [06:32<00:00,  1.96s/it]


In [109]:
# One row per bucket instead of one column per bucket.
rdf = result_df.transpose()

In [110]:
# Turn the bucket labels from the index into a regular 'index' column.
rdf = rdf.reset_index()

In [111]:
# Give both columns descriptive names for the chart below.
column_names = {'index': 'num_same_songs', 0: 'occurences'}
rdf = rdf.rename(columns=column_names)

In [112]:
# Show the table of counters per number of shared songs.
rdf

Unnamed: 0,num_same_songs,occurences
0,0,0
1,1,0
2,2,0
3,3,3
4,4,3
5,5,16
6,6,38
7,7,61
8,8,60
9,9,18


In [114]:
# Bar chart: how often the three functions shared a given number of songs.
layout_options = dict(
    height=1000,
    yaxis_title='Occurences',
    xaxis_title='Number of same songs',
    title='Number of same songs for different similarity functions',
)
fig = px.bar(rdf, x='num_same_songs', y='occurences', template='plotly_dark')
fig.update_layout(**layout_options)
fig.show()