In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances

In [79]:
df = pd.read_csv('data/DE/data-neu.csv')

In [3]:
df.head()

Unnamed: 0,name,artists,album,release_date,release_date_precision,chart_power,spotify_id,uri,popularity,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,genres
0,That's No Way To Get Along,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,5JKdsNzhJGpoaKBFZpOKQ3,spotify:track:5JKdsNzhJGpoaKBFZpOKQ3,22,0.427,...,0.0407,0.98,0.0,0.192,0.381,97.078,173827,4,USA370640304,acoustic
1,I Got A Problem,Albert Collins,Frostbite,1980-02-05,day,,6e2PhVePvqe7w4VNBTslef,spotify:track:6e2PhVePvqe7w4VNBTslef,15,0.76,...,0.0573,0.328,0.0357,0.0447,0.849,102.183,274707,4,USARL8071903,acoustic
2,The Highway Is Like A Woman,Albert Collins,Frostbite,1980-02-05,day,,4Qzcy6jU0yeck7IuCqGWSi,spotify:track:4Qzcy6jU0yeck7IuCqGWSi,16,0.751,...,0.0361,0.419,0.11,0.105,0.771,98.545,302600,4,USARL8071904,acoustic
3,Alabama Blues,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,42zUiYaKltE7jFWb57fXAW,spotify:track:42zUiYaKltE7jFWb57fXAW,11,0.49,...,0.0716,0.972,0.0,0.159,0.905,205.718,157933,3,USA370640301,acoustic
4,Get Away Blues,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,0hCrfp9WImVO6KWI30O1sM,spotify:track:0hCrfp9WImVO6KWI30O1sM,4,0.629,...,0.0431,0.968,0.00238,0.0941,0.322,119.752,213107,4,USA370640300,acoustic


In [4]:
df['genres'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'bluegrass', 'blues', 'brazil', 'british', 'cantopop',
       'chicago-house', 'children', 'chill', 'classical', 'club',
       'comedy', 'country', 'dance', 'dancehall', 'disco', 'dub',
       'dubstep', 'edm', 'electro', 'electronic', 'folk', 'forro',
       'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'groove',
       'grunge', 'guitar', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'indian', 'indie',
       'indie-pop', 'industrial', 'j-dance', 'j-idol', 'j-pop', 'j-rock',
       'jazz', 'k-pop', 'latin', 'latino', 'malay', 'mandopop', 'metal',
       'minimal-techno', 'mpb', 'new-age', 'opera', 'pagode', 'party',
       'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house',
       'psych-rock', 'punk', 'punk-rock', 'r-n-b', 'reggae', 'reggaeton',
       'rock', 'rock-n-roll', 'rockabilly', 'salsa', 'samba', 'sertanejo',
       'sing

Check for duplicate values

In [5]:
df.shape

(3376797, 24)

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.shape

(863828, 24)

There were a lot of duplicate rows: dropping them reduced the data from 3,376,797 to 863,828 rows.

Check for null values

In [8]:
df.isna().any()

name                      False
artists                   False
album                     False
release_date              False
release_date_precision    False
chart_power                True
spotify_id                False
uri                       False
popularity                False
danceability              False
energy                    False
key                       False
loudness                  False
mode                      False
speechiness               False
acousticness              False
instrumentalness          False
liveness                  False
valence                   False
tempo                     False
duration_ms               False
time_signature            False
isrc                      False
genres                    False
dtype: bool

The null values for the Chart Power are valid, because not every song was in the Charts.

In [10]:
df.columns

Index(['name', 'artists', 'album', 'release_date', 'release_date_precision',
       'chart_power', 'spotify_id', 'uri', 'popularity', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'isrc', 'genres'],
      dtype='object')

First, group the songs by their Spotify ID. This brings together all rows that share a Spotify ID but are listed under different genres.

In [11]:
# Collect every value of each column into a list per Spotify id.
# The order of the names below is the column order of the grouped frame.
by_spotify_id = df.groupby('spotify_id').agg(dict.fromkeys([
    'genres', 'name', 'artists', 'album', 'release_date',
    'release_date_precision', 'uri', 'isrc', 'chart_power', 'popularity',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
], list))

In [12]:
by_spotify_id.head()

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,isrc,chart_power,popularity,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
spotify_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000HmzbYBg0Uxe6cE47Tws,"[alt-rock, alternative, indie, rock]","[Exhuming McCarthy - Live, Exhuming McCarthy -...","[R.E.M., R.E.M., R.E.M., R.E.M.]","[Green (25th Anniversary Deluxe Edition), Gree...","[1988-11-07, 1988-11-07, 1988-11-07, 1988-11-07]","[day, day, day, day]","[spotify:track:000HmzbYBg0Uxe6cE47Tws, spotify...","[USWB11301009, USWB11301009, USWB11301009, USW...","[nan, nan, nan, nan]","[15, 15, 15, 15]",...,"[-3.698, -3.698, -3.698, -3.698]","[1, 1, 1, 1]","[0.121, 0.121, 0.121, 0.121]","[0.0113, 0.0113, 0.0113, 0.0113]","[6.11e-05, 6.11e-05, 6.11e-05, 6.11e-05]","[0.981, 0.981, 0.981, 0.981]","[0.8, 0.8, 0.8, 0.8]","[129.346, 129.346, 129.346, 129.346]","[193547, 193547, 193547, 193547]","[4, 4, 4, 4]"
000KGSi8GylA9JmrcU6xtB,[hardcore],[Wake Up And Live],[Youth Of Today],[Connecticut Fun],[1985],[year],[spotify:track:000KGSi8GylA9JmrcU6xtB],[USA560674294],[nan],[0],...,[-8.624],[0],[0.176],[0.338],[0.162],[0.301],[0.38],[107.281],[72387],[4]
000OJYYsNv9b3nSuVgnPfZ,"[pop, rock, rockabilly]","[Josephine, Josephine, Josephine]","[Shakin' Stevens, Shakin' Stevens, Shakin' Ste...","[Give Me Your Heart Tonight, Give Me Your Hear...","[1982-10-04, 1982-10-04, 1982-10-04]","[day, day, day]","[spotify:track:000OJYYsNv9b3nSuVgnPfZ, spotify...","[GBARL0801581, GBARL0801581, GBARL0801581]","[nan, nan, nan]","[18, 18, 18]",...,"[-2.251, -2.251, -2.251]","[1, 1, 1]","[0.0719, 0.0719, 0.0719]","[0.00745, 0.00745, 0.00745]","[0.000159, 0.000159, 0.000159]","[0.234, 0.234, 0.234]","[0.859, 0.859, 0.859]","[174.11, 174.11, 174.11]","[184360, 184360, 184360]","[4, 4, 4]"
000P83HDtOHcNVFZy7Q2Yu,[salsa],[El Licor de Tu Boquita],[El Gran Combo De Puerto Rico],[Unity],[1980],[year],[spotify:track:000P83HDtOHcNVFZy7Q2Yu],[USMRE0801806],[nan],[21],...,[-8.377],[0],[0.0736],[0.438],[4.17e-05],[0.126],[0.696],[88.94],[242493],[4]
000QVWRKGiK8oOGfCjrHuY,[folk],[Dil Wich Kide Chhupa Ke Rakh Laan],"[Sardool Sikander,Amar Noori]",[Gora Rang Deyin Na Rabba],[1989-02-02],[day],[spotify:track:000QVWRKGiK8oOGfCjrHuY],[INS188910383],[nan],[6],...,[-7.435],[1],[0.0725],[0.705],[0.0],[0.199],[0.833],[86.47],[364000],[4]


Then reduce all categorical features such that there are no duplicated values in an instance for one feature.

In [None]:
def reduce_list(elements, string_return = True):
    '''
    Collapses repeated entries of a list while keeping first-seen order.

    Parameter
    ---------
    elements: list
        Values to deduplicate. Membership is tested with ``in``, so
        unhashable entries (e.g. nested lists) are supported.

    string_return: boolean; default=True
        If True and exactly one distinct value remains, that value itself is
        returned instead of a one-element list.

    Return
    ------
    deduplicated: list or single element
        The distinct values in order of first appearance, or the lone value
        when ``string_return`` applies.

    '''
    deduplicated = []
    for candidate in elements:
        if candidate in deduplicated:
            continue
        deduplicated.append(candidate)
    return deduplicated[0] if string_return and len(deduplicated) == 1 else deduplicated

In [None]:
# by_spotify_id['name'] = by_spotify_id['name'].agg(reduce_list)
# by_spotify_id['isrc'] = by_spotify_id['isrc'].agg(reduce_list)

In [13]:
by_spotify_id.head().T

spotify_id,000HmzbYBg0Uxe6cE47Tws,000KGSi8GylA9JmrcU6xtB,000OJYYsNv9b3nSuVgnPfZ,000P83HDtOHcNVFZy7Q2Yu,000QVWRKGiK8oOGfCjrHuY
genres,"[alt-rock, alternative, indie, rock]",[hardcore],"[pop, rock, rockabilly]",[salsa],[folk]
name,"[Exhuming McCarthy - Live, Exhuming McCarthy -...",[Wake Up And Live],"[Josephine, Josephine, Josephine]",[El Licor de Tu Boquita],[Dil Wich Kide Chhupa Ke Rakh Laan]
artists,"[R.E.M., R.E.M., R.E.M., R.E.M.]",[Youth Of Today],"[Shakin' Stevens, Shakin' Stevens, Shakin' Ste...",[El Gran Combo De Puerto Rico],"[Sardool Sikander,Amar Noori]"
album,"[Green (25th Anniversary Deluxe Edition), Gree...",[Connecticut Fun],"[Give Me Your Heart Tonight, Give Me Your Hear...",[Unity],[Gora Rang Deyin Na Rabba]
release_date,"[1988-11-07, 1988-11-07, 1988-11-07, 1988-11-07]",[1985],"[1982-10-04, 1982-10-04, 1982-10-04]",[1980],[1989-02-02]
release_date_precision,"[day, day, day, day]",[year],"[day, day, day]",[year],[day]
uri,"[spotify:track:000HmzbYBg0Uxe6cE47Tws, spotify...",[spotify:track:000KGSi8GylA9JmrcU6xtB],"[spotify:track:000OJYYsNv9b3nSuVgnPfZ, spotify...",[spotify:track:000P83HDtOHcNVFZy7Q2Yu],[spotify:track:000QVWRKGiK8oOGfCjrHuY]
isrc,"[USWB11301009, USWB11301009, USWB11301009, USW...",[USA560674294],"[GBARL0801581, GBARL0801581, GBARL0801581]",[USMRE0801806],[INS188910383]
chart_power,"[nan, nan, nan, nan]",[nan],"[nan, nan, nan]",[nan],[nan]
popularity,"[15, 15, 15, 15]",[0],"[18, 18, 18]",[21],[6]


In [17]:
def get_release_year_index(release_dates):
    '''
    Locates the entry with the earliest release year.

    Parameter
    ---------
    release_dates: List
        Release dates as strings whose first four characters are the year
        (only those four characters are parsed).

    Return
    ------
    min_index: number
        Position of the smallest year within ``release_dates``.
    '''
    years = [int(release_date[:4]) for release_date in release_dates]
    return np.argmin(years)

In [41]:
def select_oldest_song(df, isrc_flag=False):
    '''
    Collapses list-valued rows to the version with the oldest release year.

    For every row whose ``release_date`` is a list, the position of the
    oldest release year is determined and each list-valued feature of that
    row is replaced by its element at that position. Rows whose
    ``release_date`` is not a list, and individual features that are not
    lists, are left untouched.

    Parameter
    ---------
    df: pd.DataFrame
        Grouped DataFrame whose cells hold lists of per-version values.
        It is not modified; a copy is returned.

    isrc_flag: boolean; default=False
        False: the frame is grouped by spotify id, so ``isrc`` is reduced too.
        True: the frame is grouped by isrc, so ``genres`` and ``spotify_id``
        are reduced instead.

    Return
    ------
    df_copy: pd.DataFrame
        Copy of ``df`` with the selected values in place of the lists.
    '''
    features = ['artists', 'album', 'release_date_precision', 'uri', 'release_date', 'chart_power', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'name']

    if isrc_flag:
        features.append('genres')
        features.append('spotify_id')
    else:
        features.append('isrc')

    df_copy = df.copy()
    for index, instance in df_copy.iterrows():
        release_dates = instance['release_date']
        if not isinstance(release_dates, list):
            continue
        min_index = get_release_year_index(release_dates)
        for feature in features:
            value = instance[feature]
            # Only index into real lists: the former check
            # `type(value == list)` was always truthy, so a plain string
            # would have been cut down to a single character.
            if isinstance(value, list):
                df_copy.loc[index, feature] = value[min_index]
    return df_copy

In [19]:
by_spotify_id = select_oldest_song(by_spotify_id)

genres                                                        [classical]
name                                       Kyllikki, Op. 41: III. Commodo
artists                               Jean Sibelius,Erik T. Tawaststjerna
album                     Sibelius: Complete Original Piano Music, Vol. 1
release_date                                                   1987-01-31
release_date_precision                                                day
uri                                  spotify:track:7zzySLCXBTSdWnu406T9R4
isrc                                                         SEAEB8753030
chart_power                                                           NaN
popularity                                                              0
danceability                                                        0.431
energy                                                               0.13
key                                                                    10
loudness                              

In [22]:
by_spotify_id.head().T

spotify_id,000HmzbYBg0Uxe6cE47Tws,000KGSi8GylA9JmrcU6xtB,000OJYYsNv9b3nSuVgnPfZ,000P83HDtOHcNVFZy7Q2Yu,000QVWRKGiK8oOGfCjrHuY
genres,"[alt-rock, alternative, indie, rock]",[hardcore],"[pop, rock, rockabilly]",[salsa],[folk]
name,Exhuming McCarthy - Live,Wake Up And Live,Josephine,El Licor de Tu Boquita,Dil Wich Kide Chhupa Ke Rakh Laan
artists,R.E.M.,Youth Of Today,Shakin' Stevens,El Gran Combo De Puerto Rico,"Sardool Sikander,Amar Noori"
album,Green (25th Anniversary Deluxe Edition),Connecticut Fun,Give Me Your Heart Tonight,Unity,Gora Rang Deyin Na Rabba
release_date,1988-11-07,1985,1982-10-04,1980,1989-02-02
release_date_precision,day,year,day,year,day
uri,spotify:track:000HmzbYBg0Uxe6cE47Tws,spotify:track:000KGSi8GylA9JmrcU6xtB,spotify:track:000OJYYsNv9b3nSuVgnPfZ,spotify:track:000P83HDtOHcNVFZy7Q2Yu,spotify:track:000QVWRKGiK8oOGfCjrHuY
isrc,USWB11301009,USA560674294,GBARL0801581,USMRE0801806,INS188910383
chart_power,,,,,
popularity,15,0,18,21,6


In [None]:
# by_spotify_id.to_csv('data/checkpoint/by_spotify_id_oldest.csv')

In [None]:
# by_spotify_id = pd.read_csv('data/checkpoint/by_spotify_id_oldest.csv')

The next step is to group all songs by their ISRC, since this should be the unique identifier of a recording.

In [27]:
# Turn the spotify_id index back into a column, then collect every value of
# each column into a list per ISRC. The order of the names below is the
# column order of the grouped frame.
by_isrc = by_spotify_id.reset_index().groupby('isrc').agg(dict.fromkeys([
    'genres', 'name', 'artists', 'album', 'release_date',
    'release_date_precision', 'uri', 'spotify_id', 'chart_power', 'popularity',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
], list))

In [28]:
by_isrc.head().T

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
genres,"[[j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop, pop]]"
name,[Bala Wala Chi],[Houdou Nisbi],[Nafs Al Sheghlat],[Yalla Kichou Barra],[Ma Tfel]
artists,[Ziad Rahbani],[Ziad Rahbani],[Ziad Rahbani],[Ziad Rahbani],[Ziad Rahbani]
album,[Houdou Nisbi],[Houdou Nisbi],[Houdou Nisbi],[Houdou Nisbi],[Houdou Nisbi]
release_date,[1985-01-01],[1985-01-01],[1985-01-01],[1985-01-01],[1985-01-01]
release_date_precision,[day],[day],[day],[day],[day]
uri,[spotify:track:0fylgLeNObjVvwhd8caHqX],[spotify:track:0yMFpBNCYXqwwOAg23bC8a],[spotify:track:6G8l1kI8QlTD0UDIak5F8H],[spotify:track:21g76Lq5Jg4QvfTDvi4PlH],[spotify:track:0pKxrkFh8fxPKpkO29MYmi]
spotify_id,[0fylgLeNObjVvwhd8caHqX],[0yMFpBNCYXqwwOAg23bC8a],[6G8l1kI8QlTD0UDIak5F8H],[21g76Lq5Jg4QvfTDvi4PlH],[0pKxrkFh8fxPKpkO29MYmi]
chart_power,[nan],[nan],[nan],[nan],[nan]
popularity,[41],[31],[21],[17],[31]


In [42]:
by_isrc = select_oldest_song(by_isrc, True)

In [43]:
by_isrc.head().T

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
genres,"[j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop, pop]"
name,Bala Wala Chi,Houdou Nisbi,Nafs Al Sheghlat,Yalla Kichou Barra,Ma Tfel
artists,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani
album,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi
release_date,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01
release_date_precision,day,day,day,day,day
uri,spotify:track:0fylgLeNObjVvwhd8caHqX,spotify:track:0yMFpBNCYXqwwOAg23bC8a,spotify:track:6G8l1kI8QlTD0UDIak5F8H,spotify:track:21g76Lq5Jg4QvfTDvi4PlH,spotify:track:0pKxrkFh8fxPKpkO29MYmi
spotify_id,0fylgLeNObjVvwhd8caHqX,0yMFpBNCYXqwwOAg23bC8a,6G8l1kI8QlTD0UDIak5F8H,21g76Lq5Jg4QvfTDvi4PlH,0pKxrkFh8fxPKpkO29MYmi
chart_power,,,,,
popularity,41,31,21,17,31


In [45]:
# by_isrc.to_csv('data/checkpoint/by_isrc_oldest.csv')

In [62]:
# by_isrc = pd.read_csv('data/checkpoint/by_isrc_oldest.csv')
# by_isrc.set_index('isrc', inplace=True)

In [None]:
# special_cases = by_isrc.copy()
# special_cases['name'] = special_cases['name'].agg(reduce_list, string_return=False)
# special_cases = special_cases[special_cases['name'].apply(len) > 1]

In [None]:
# special_cases.shape

Example of a special case

In [None]:
# df[df.isrc == 'AUCI10753909']

Filtering the special cases: only songs from Germany, the USA, the UK, Italy and Sweden are really relevant. Therefore, all other songs are filtered out first.

In [None]:
# country_codes = ['DE', 'IT', 'GB', 'US', 'SE']

In [None]:
# relevant_special_cases = pd.DataFrame()
# sum_entries = 0
# for code in country_codes:
#     rsc_country = special_cases[special_cases.index.str.startswith(code)]
#     sum_entries += rsc_country.shape[0]
#     relevant_special_cases = pd.concat([relevant_special_cases, rsc_country])
# sum_entries == relevant_special_cases.shape[0]

In [None]:
# print(f'Old shape: {special_cases.shape}')
# special_cases.drop(index=list(relevant_special_cases.index.values), inplace=True)
# print(f'New shape: {special_cases.shape}')

In [None]:
# relevant_special_cases.shape[0] + special_cases.shape[0]

In [None]:
# relevant_special_cases.head()

In [None]:
# relevant_special_cases

In [None]:
# indices = special_cases.index

In [None]:
# by_isrc_copyy = by_isrc_copyy.drop(index=indices.values)

In [None]:
# s = by_isrc_copyy['name'].apply(lambda x: type(x) != str)

In [None]:
# by_isrc.loc[s.values]

In [63]:
# Derive the release year from the first four characters of the release-date string.
by_isrc['year'] = by_isrc['release_date'].apply(lambda x: int(x[:4]))

In [64]:
by_isrc['year'].unique()

array([1985, 1989, 1984, 1987, 1986, 1988, 1982, 1983, 1980, 1981, 2023])

In [65]:
# Collect the ISRCs of all songs dated 2023 and remove those rows in place.
index_songs_from_2023 = by_isrc.index[(by_isrc['year'] == 2023).to_numpy()].tolist()
by_isrc.drop(index=index_songs_from_2023, inplace=True)

In [66]:
by_isrc['year'].unique()

array([1985, 1989, 1984, 1987, 1986, 1988, 1982, 1983, 1980, 1981])

In [67]:
by_isrc.release_date_precision = by_isrc.release_date_precision.astype(str)

In [90]:
by_isrc.to_csv('data/checkpoint/by_isrc_oldest.csv')

Create a DataFrame which contains only the relevant features for the recommender system

In [69]:
df_recommender = by_isrc.drop(columns=['artists', 'genres', 'album', 'release_date', 'release_date_precision', 'chart_power', 'uri', 'popularity', 'name', 'spotify_id'])

Drop duplicate rows.

In [70]:
df_recommender.shape

(427258, 14)

In [71]:
df_recommender.drop_duplicates(inplace=True)

In [72]:
df_recommender.shape

(426322, 14)

There were some duplicates: 936 rows were removed (427,258 → 426,322).

In [73]:
df_recommender.head().T

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
danceability,0.291,0.587,0.496,0.676,0.474
energy,0.231,0.184,0.264,0.43,0.428
key,9.0,7.0,3.0,5.0,5.0
loudness,-18.808,-18.954,-21.179,-18.456,-18.178
mode,1.0,0.0,1.0,1.0,0.0
speechiness,0.0418,0.0308,0.0449,0.0341,0.0436
acousticness,0.952,0.814,0.954,0.43,0.749
instrumentalness,0.382,0.951,0.887,0.838,0.941
liveness,0.237,0.0827,0.271,0.124,0.115
valence,0.371,0.495,0.304,0.656,0.752


Scale the data so that every feature has the same influence.

In [74]:
# Min-max scale every feature column; the ISRC index and the column names
# are carried over to the scaled frame.
scaler = MinMaxScaler()
df_recommender_scaled = pd.DataFrame(
    scaler.fit_transform(df_recommender),
    columns=df_recommender.columns,
    index=df_recommender.index,
)

In [75]:
df_recommender_scaled.reset_index(inplace=True)
df_recommender.reset_index(inplace=True)

In [76]:
df_recommender_scaled.head().T

Unnamed: 0,0,1,2,3,4
isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
danceability,0.295132,0.595335,0.503043,0.685598,0.48073
energy,0.231,0.184,0.264,0.43,0.428
key,0.818182,0.636364,0.272727,0.454545,0.454545
loudness,0.620601,0.618011,0.578544,0.626845,0.631776
mode,1.0,0.0,1.0,1.0,0.0
speechiness,0.043182,0.031818,0.046384,0.035227,0.045041
acousticness,0.955823,0.817269,0.957831,0.431727,0.752008
instrumentalness,0.382,0.951,0.887,0.838,0.941
liveness,0.237,0.0827,0.271,0.124,0.115


In [77]:
df_recommender_scaled.describe().T[['min', 'max']]

Unnamed: 0,min,max
danceability,0.0,1.0
energy,0.0,1.0
key,0.0,1.0
loudness,0.0,1.0
mode,0.0,1.0
speechiness,0.0,1.0
acousticness,0.0,1.0
instrumentalness,0.0,1.0
liveness,0.0,1.0
valence,0.0,1.0


Convert every genre to a feature. If a song belongs to a genre, the feature has the value 1, otherwise 0.

In [80]:
# Count how often each (isrc, genre) pair occurs, then cap the counts at 1 so
# every genre column becomes a 0/1 membership indicator. `clip` is vectorised
# and replaces the per-cell lambda that went through the deprecated
# `DataFrame.applymap`.
ct = pd.crosstab(df['isrc'], df['genres']).clip(upper=1)
# Expose the ISRC as a regular column so it can be used as merge key.
ct.reset_index(inplace=True)

In [81]:
display(ct.head().T)
ct.shape

Unnamed: 0_level_0,0,1,2,3,4
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
acoustic,0,0,0,0,0
afrobeat,0,0,0,0,0
alt-rock,0,0,0,0,0
alternative,0,0,0,0,0
...,...,...,...,...,...
techno,0,0,0,0,0
trance,0,0,0,0,0
trip-hop,0,0,0,0,0
turkish,0,0,0,0,0


(427265, 111)

In [82]:
ct.describe()

genres,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,brazil,...,soul,spanish,swedish,synth-pop,tango,techno,trance,trip-hop,turkish,world-music
count,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,...,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0,427265.0
mean,0.002249,0.000506,0.019016,0.020184,0.006385,0.0025,0.001353,0.004108,0.028579,0.011335,...,0.025977,0.015737,0.0242,0.016741,0.003656,0.005196,5.6e-05,0.000363,0.014244,1.9e-05
std,0.047372,0.022479,0.136582,0.14063,0.07965,0.049934,0.036755,0.063958,0.166622,0.10586,...,0.159066,0.124458,0.153671,0.128301,0.060353,0.071895,0.007495,0.019043,0.118496,0.004327
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [83]:
# One row of summary statistics per numeric column of `ct`.
ctMinMax = ct.describe().T
# Correctly scaled means: every column has a minimum of 0 and a maximum of 1.
if ctMinMax['min'].min() == 0 and ctMinMax['min'].max() == 0 and ctMinMax['max'].min() == 1 and ctMinMax['max'].max() == 1:
    print('Values are all scaled between 0 and 1')
else:
    print('Values are not scaled correctly')

Values are all scaled between 0 and 1


Merge both DataFrames together to create the Recommender System.

In [84]:
ct_merged = ct.merge(df_recommender_scaled, on=['isrc'], how='right')

In [85]:
ct_merged.isna().any().sum()

0

In [86]:
ct_merged.set_index(['isrc'], inplace=True)

In [87]:
display(ct_merged.head().T)
ct_merged.shape

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
acoustic,0.000000,0.000000,0.000000,0.000000,0.000000
afrobeat,0.000000,0.000000,0.000000,0.000000,0.000000
alt-rock,0.000000,0.000000,0.000000,0.000000,0.000000
alternative,0.000000,0.000000,0.000000,0.000000,0.000000
ambient,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...
valence,0.371000,0.495000,0.304000,0.656000,0.752000
tempo,0.638375,0.521470,0.335621,0.416267,0.611925
duration_ms,0.043657,0.057345,0.033613,0.032676,0.055849
time_signature,0.600000,0.600000,0.800000,0.800000,0.800000


(426322, 124)

In [88]:
ct_merged.to_csv('data/checkpoint/ct_merged.csv')

Overall, 124 features are used for the recommender system.

Try different distance measures / similarity functions.

In [103]:
def recommend_tracks_kernel(track: str, recommender_function, distance: bool = False, top_n: int = 6):
    '''
    Scores every track against the provided track and shows the best matches.

    Reads the module-level frames ``by_isrc`` (song metadata, indexed by isrc)
    and ``ct_merged`` (feature matrix, indexed by isrc).

    Parameter
    ---------
    track: str
        Track name; compared for exact equality with the ``name`` column of
        ``by_isrc``. If several songs share the name, the first one that has
        a row in ``ct_merged`` is used as the query.

    recommender_function: callable
        Called as ``recommender_function(ct_merged, query_row)``. The first
        column of its result is used as the score of each row of
        ``ct_merged`` (e.g. ``cosine_similarity`` or ``euclidean_distances``).

    distance: bool; default=False
        Sort direction for the displayed ranking: True sorts ascending
        (smaller score = better, for distances), False sorts descending
        (larger score = better, for similarities).

    top_n: int; default=6
        Number of best-ranked rows that are displayed.

    Return
    ------
    kernel_df: pd.DataFrame or None
        Unsorted scores (column ``Score``) merged with the columns of
        ``by_isrc``; None if no usable track was found.
    '''
    matching_ids = by_isrc[by_isrc['name'] == track].index
    # A song can exist in by_isrc without a row in the feature matrix (rows
    # were dropped as feature duplicates); querying with such an id would
    # hand an empty frame to the recommender function.
    usable_ids = matching_ids[matching_ids.isin(ct_merged.index)]

    if len(usable_ids) == 0:
        print(f'Error: no track named {track!r} with features available')
        return None

    query_row = ct_merged[ct_merged.index == usable_ids[0]]
    kernel_array = recommender_function(ct_merged, query_row)
    kernel_df = pd.DataFrame(kernel_array, index=ct_merged.index)

    kernel_df = kernel_df.rename(columns={0: 'Score'})
    kernel_df = kernel_df.merge(by_isrc, how='left', on='isrc')
    display(kernel_df.sort_values(by='Score', ascending=distance).head(top_n))
    return kernel_df


In [104]:
def try_functions(track):
    '''
    Runs the recommender for one track with every configured measure.

    Parameter
    ---------
    track: str
        Track name that is passed on to ``recommend_tracks_kernel``.

    Return
    ------
    outcome: dict
        Maps the name of each measure function to the value returned by
        ``recommend_tracks_kernel`` for that measure.
    '''
    # (function, is_distance): similarities first, then the distance measures.
    measures = [
        (cosine_similarity, False),
        (euclidean_distances, True),
        (manhattan_distances, True),
    ]
    outcome = {}
    for measure, is_distance in measures:
        label = measure.__name__
        display(label)
        outcome[label] = recommend_tracks_kernel(track, measure, is_distance)
    return outcome

In [106]:
by_isrc.head().T

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
genres,"[j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop, pop]"
name,Bala Wala Chi,Houdou Nisbi,Nafs Al Sheghlat,Yalla Kichou Barra,Ma Tfel
artists,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani
album,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi
release_date,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01
release_date_precision,day,day,day,day,day
uri,spotify:track:0fylgLeNObjVvwhd8caHqX,spotify:track:0yMFpBNCYXqwwOAg23bC8a,spotify:track:6G8l1kI8QlTD0UDIak5F8H,spotify:track:21g76Lq5Jg4QvfTDvi4PlH,spotify:track:0pKxrkFh8fxPKpkO29MYmi
spotify_id,0fylgLeNObjVvwhd8caHqX,0yMFpBNCYXqwwOAg23bC8a,6G8l1kI8QlTD0UDIak5F8H,21g76Lq5Jg4QvfTDvi4PlH,0pKxrkFh8fxPKpkO29MYmi
chart_power,,,,,
popularity,41,31,21,17,31


In [113]:
result = try_functions("I'm Still Standing")

'cosine_similarity'

Unnamed: 0_level_0,Score,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBALX8300190,1.0,"[piano, rock]",I'm Still Standing,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:1jDJFeK9x3OZboIAHsY9k2,1jDJFeK9x3OZboIAHsY9k2,1185.0,...,1,0.179,0.356,0.121,0.14,0.772,176.808,183440,4,1983
GBALX8300192,0.992293,"[piano, rock]",Religion,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:2UI4siFbFGTMXUaFBnBfGC,2UI4siFbFGTMXUaFBnBfGC,,...,1,0.0464,0.184,0.000624,0.117,0.809,138.488,246800,4,1983
GBALX8300191,0.988456,"[piano, rock]",Too Low For Zero,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:45pcawfb1hdBjq69krZpMZ,45pcawfb1hdBjq69krZpMZ,,...,1,0.0393,0.435,0.0102,0.204,0.719,137.061,346267,4,1983
GBALX0080002,0.985744,"[piano, rock]",Slow Down Georgie (She's Poison),Elton John,Breaking Hearts,1984-07-09,day,spotify:track:5KwePD7DfNdGs9EpFTiuCW,5KwePD7DfNdGs9EpFTiuCW,,...,1,0.0332,0.213,7e-06,0.0878,0.654,137.423,249547,4,1984
GBA098200010,0.984981,"[piano, rock]",Dear John - Remastered 2003,Elton John,Jump Up!,1982-01-01,day,spotify:track:2KkATTWT3nJalkA5588fQ9,2KkATTWT3nJalkA5588fQ9,,...,1,0.0298,0.0245,0.000782,0.155,0.964,169.721,213360,4,1982
GBA098100030,0.984944,"[piano, rock]",Just Like Belgium - Remastered 2003,Elton John,The Fox,1981-01-01,day,spotify:track:0g04hQLDBQGedUIRDp1mmn,0g04hQLDBQGedUIRDp1mmn,,...,1,0.0338,0.0836,0.00372,0.131,0.86,143.529,249253,4,1981


'euclidean_distances'

Unnamed: 0_level_0,Score,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBALX8300190,2.980232e-08,"[piano, rock]",I'm Still Standing,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:1jDJFeK9x3OZboIAHsY9k2,1jDJFeK9x3OZboIAHsY9k2,1185.0,...,1,0.179,0.356,0.121,0.14,0.772,176.808,183440,4,1983
GBALX8300192,0.3312513,"[piano, rock]",Religion,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:2UI4siFbFGTMXUaFBnBfGC,2UI4siFbFGTMXUaFBnBfGC,,...,1,0.0464,0.184,0.000624,0.117,0.809,138.488,246800,4,1983
GBALX8300191,0.41036,"[piano, rock]",Too Low For Zero,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:45pcawfb1hdBjq69krZpMZ,45pcawfb1hdBjq69krZpMZ,,...,1,0.0393,0.435,0.0102,0.204,0.719,137.061,346267,4,1983
GBALX0080002,0.4559662,"[piano, rock]",Slow Down Georgie (She's Poison),Elton John,Breaking Hearts,1984-07-09,day,spotify:track:5KwePD7DfNdGs9EpFTiuCW,5KwePD7DfNdGs9EpFTiuCW,,...,1,0.0332,0.213,7e-06,0.0878,0.654,137.423,249547,4,1984
GBA098100030,0.4622599,"[piano, rock]",Just Like Belgium - Remastered 2003,Elton John,The Fox,1981-01-01,day,spotify:track:0g04hQLDBQGedUIRDp1mmn,0g04hQLDBQGedUIRDp1mmn,,...,1,0.0338,0.0836,0.00372,0.131,0.86,143.529,249253,4,1981
GBA098200010,0.4628681,"[piano, rock]",Dear John - Remastered 2003,Elton John,Jump Up!,1982-01-01,day,spotify:track:2KkATTWT3nJalkA5588fQ9,2KkATTWT3nJalkA5588fQ9,,...,1,0.0298,0.0245,0.000782,0.155,0.964,169.721,213360,4,1982


'manhattan_distances'

Unnamed: 0_level_0,Score,genres,name,artists,album,release_date,release_date_precision,uri,spotify_id,chart_power,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBALX8300190,0.0,"[piano, rock]",I'm Still Standing,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:1jDJFeK9x3OZboIAHsY9k2,1jDJFeK9x3OZboIAHsY9k2,1185.0,...,1,0.179,0.356,0.121,0.14,0.772,176.808,183440,4,1983
GBALX8300192,0.890646,"[piano, rock]",Religion,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:2UI4siFbFGTMXUaFBnBfGC,2UI4siFbFGTMXUaFBnBfGC,,...,1,0.0464,0.184,0.000624,0.117,0.809,138.488,246800,4,1983
GBALX8300191,1.176658,"[piano, rock]",Too Low For Zero,Elton John,Too Low For Zero,1983-05-30,day,spotify:track:45pcawfb1hdBjq69krZpMZ,45pcawfb1hdBjq69krZpMZ,,...,1,0.0393,0.435,0.0102,0.204,0.719,137.061,346267,4,1983
GBA098200010,1.185498,"[piano, rock]",Dear John - Remastered 2003,Elton John,Jump Up!,1982-01-01,day,spotify:track:2KkATTWT3nJalkA5588fQ9,2KkATTWT3nJalkA5588fQ9,,...,1,0.0298,0.0245,0.000782,0.155,0.964,169.721,213360,4,1982
GBALX0080004,1.311476,"[piano, rock]",Li'l 'Frigerator,Elton John,Breaking Hearts,1984-07-09,day,spotify:track:4qCzeQdIDQbGOhc6nUsdKC,4qCzeQdIDQbGOhc6nUsdKC,,...,1,0.0399,0.0466,0.287,0.134,0.832,165.451,217053,4,1984
GBA098100030,1.332439,"[piano, rock]",Just Like Belgium - Remastered 2003,Elton John,The Fox,1981-01-01,day,spotify:track:0g04hQLDBQGedUIRDp1mmn,0g04hQLDBQGedUIRDp1mmn,,...,1,0.0338,0.0836,0.00372,0.131,0.86,143.529,249253,4,1981
