In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances

In [4]:
df = pd.read_csv('data/DE/data-neu.csv')

In [5]:
df.head()

Unnamed: 0,name,artists,album,release_date,release_date_precision,chart_power,spotify_id,uri,popularity,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,genres
0,That's No Way To Get Along,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,5JKdsNzhJGpoaKBFZpOKQ3,spotify:track:5JKdsNzhJGpoaKBFZpOKQ3,22,0.427,...,0.0407,0.98,0.0,0.192,0.381,97.078,173827,4,USA370640304,acoustic
1,I Got A Problem,Albert Collins,Frostbite,1980-02-05,day,,6e2PhVePvqe7w4VNBTslef,spotify:track:6e2PhVePvqe7w4VNBTslef,15,0.76,...,0.0573,0.328,0.0357,0.0447,0.849,102.183,274707,4,USARL8071903,acoustic
2,The Highway Is Like A Woman,Albert Collins,Frostbite,1980-02-05,day,,4Qzcy6jU0yeck7IuCqGWSi,spotify:track:4Qzcy6jU0yeck7IuCqGWSi,16,0.751,...,0.0361,0.419,0.11,0.105,0.771,98.545,302600,4,USARL8071904,acoustic
3,Alabama Blues,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,42zUiYaKltE7jFWb57fXAW,spotify:track:42zUiYaKltE7jFWb57fXAW,11,0.49,...,0.0716,0.972,0.0,0.159,0.905,205.718,157933,3,USA370640301,acoustic
4,Get Away Blues,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,0hCrfp9WImVO6KWI30O1sM,spotify:track:0hCrfp9WImVO6KWI30O1sM,4,0.629,...,0.0431,0.968,0.00238,0.0941,0.322,119.752,213107,4,USA370640300,acoustic


In [6]:
df['genres'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'bluegrass', 'blues', 'brazil', 'british', 'cantopop',
       'chicago-house', 'children', 'chill', 'classical', 'club',
       'comedy', 'country', 'dance', 'dancehall', 'disco', 'dub',
       'dubstep', 'edm', 'electro', 'electronic', 'folk', 'forro',
       'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'groove',
       'grunge', 'guitar', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'indian', 'indie',
       'indie-pop', 'industrial', 'j-dance', 'j-idol', 'j-pop', 'j-rock',
       'jazz', 'k-pop', 'latin', 'latino', 'malay', 'mandopop', 'metal',
       'minimal-techno', 'mpb', 'new-age', 'opera', 'pagode', 'party',
       'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house',
       'psych-rock', 'punk', 'punk-rock', 'r-n-b', 'reggae', 'reggaeton',
       'rock', 'rock-n-roll', 'rockabilly', 'salsa', 'samba', 'sertanejo',
       'sing

Check for duplicate values

In [7]:
df.shape

(3376797, 24)

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.shape

(863828, 24)

There were a lot of duplicate values..

Check for null values

In [10]:
df.isna().any()

name                      False
artists                   False
album                     False
release_date              False
release_date_precision    False
chart_power                True
spotify_id                False
uri                       False
popularity                False
danceability              False
energy                    False
key                       False
loudness                  False
mode                      False
speechiness               False
acousticness              False
instrumentalness          False
liveness                  False
valence                   False
tempo                     False
duration_ms               False
time_signature            False
isrc                      False
genres                    False
dtype: bool

There exist Null values for the Chart Power and for the isrc. The null values for the Chart Power are valid, because not every song was in the Charts. Lets have a look at the null values for isrc:

In [11]:
df[df['isrc'].isna()]

Unnamed: 0,name,artists,album,release_date,release_date_precision,chart_power,spotify_id,uri,popularity,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,genres


Check for duplicate isrc numbers

In [12]:
df.columns

Index(['name', 'artists', 'album', 'release_date', 'release_date_precision',
       'chart_power', 'spotify_id', 'uri', 'popularity', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'isrc', 'genres'],
      dtype='object')

First group the songs by the spotify id. This brings together all instances with different genres but same spotify id.

In [13]:
by_spotify_id = df.copy()
by_spotify_id = by_spotify_id.groupby('spotify_id').agg({
    'genres': list,
    'name': list,
    'artists': list,
    'album': list,
    'release_date': list,
    'release_date_precision': list,
    'uri': list,
    'isrc':list,
    'chart_power': 'mean', 
    'popularity': 'mean', 
    'danceability': 'mean', 
    'energy': 'mean',
    'key': 'mean', 
    'loudness': 'mean', 
    'mode': 'mean', 
    'speechiness': 'mean', 
    'acousticness': 'mean', 
    'instrumentalness': 'mean', 
    'liveness': 'mean', 
    'valence': 'mean',
    'tempo': 'mean', 
    'duration_ms': 'mean', 
    'time_signature': 'mean'
})

In [14]:
by_spotify_id.head()

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,isrc,chart_power,popularity,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
spotify_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000HmzbYBg0Uxe6cE47Tws,"[alt-rock, alternative, indie, rock]","[Exhuming McCarthy - Live, Exhuming McCarthy -...","[R.E.M., R.E.M., R.E.M., R.E.M.]","[Green (25th Anniversary Deluxe Edition), Gree...","[1988-11-07, 1988-11-07, 1988-11-07, 1988-11-07]","[day, day, day, day]","[spotify:track:000HmzbYBg0Uxe6cE47Tws, spotify...","[USWB11301009, USWB11301009, USWB11301009, USW...",,15.0,...,-3.698,1.0,0.121,0.0113,6.1e-05,0.981,0.8,129.346,193547.0,4.0
000KGSi8GylA9JmrcU6xtB,[hardcore],[Wake Up And Live],[Youth Of Today],[Connecticut Fun],[1985],[year],[spotify:track:000KGSi8GylA9JmrcU6xtB],[USA560674294],,0.0,...,-8.624,0.0,0.176,0.338,0.162,0.301,0.38,107.281,72387.0,4.0
000OJYYsNv9b3nSuVgnPfZ,"[pop, rock, rockabilly]","[Josephine, Josephine, Josephine]","[Shakin' Stevens, Shakin' Stevens, Shakin' Ste...","[Give Me Your Heart Tonight, Give Me Your Hear...","[1982-10-04, 1982-10-04, 1982-10-04]","[day, day, day]","[spotify:track:000OJYYsNv9b3nSuVgnPfZ, spotify...","[GBARL0801581, GBARL0801581, GBARL0801581]",,18.0,...,-2.251,1.0,0.0719,0.00745,0.000159,0.234,0.859,174.11,184360.0,4.0
000P83HDtOHcNVFZy7Q2Yu,[salsa],[El Licor de Tu Boquita],[El Gran Combo De Puerto Rico],[Unity],[1980],[year],[spotify:track:000P83HDtOHcNVFZy7Q2Yu],[USMRE0801806],,21.0,...,-8.377,0.0,0.0736,0.438,4.2e-05,0.126,0.696,88.94,242493.0,4.0
000QVWRKGiK8oOGfCjrHuY,[folk],[Dil Wich Kide Chhupa Ke Rakh Laan],"[Sardool Sikander,Amar Noori]",[Gora Rang Deyin Na Rabba],[1989-02-02],[day],[spotify:track:000QVWRKGiK8oOGfCjrHuY],[INS188910383],,6.0,...,-7.435,1.0,0.0725,0.705,0.0,0.199,0.833,86.47,364000.0,4.0


Then reduce all categorical features such that there are no duplicated values in an instance for one feature.

In [15]:
def reduce_list(elements, string_return = True):
    '''
    Removes duplicate elements in a list

    Parameter
    ---------
    elements: list
        List that should be reduced

    string_return: boolean; default=True
        Whether a list with just one element should be returned as string or list

    Return
    ------
    unique_elements: list or str
    
    '''
    unique_elements = []
    for element in elements:
        if element not in unique_elements:
            unique_elements.append(element)
    if (len(unique_elements) == 1 and string_return):
        return unique_elements[0]
    return unique_elements

In [16]:
by_spotify_id['isrc'] = by_spotify_id['isrc'].agg(reduce_list)
by_spotify_id['name'] = by_spotify_id['name'].agg(reduce_list)
by_spotify_id['artists'] = by_spotify_id['artists'].agg(reduce_list)
by_spotify_id['album'] = by_spotify_id['album'].agg(reduce_list)
by_spotify_id['release_date'] = by_spotify_id['release_date'].agg(reduce_list)
by_spotify_id['release_date_precision'] = by_spotify_id['release_date_precision'].agg(reduce_list)
by_spotify_id['uri'] = by_spotify_id['uri'].agg(reduce_list)
by_spotify_id['genres'] = by_spotify_id['genres'].agg(reduce_list)

In [17]:
by_spotify_id.head()

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,isrc,chart_power,popularity,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
spotify_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000HmzbYBg0Uxe6cE47Tws,"[alt-rock, alternative, indie, rock]",Exhuming McCarthy - Live,R.E.M.,Green (25th Anniversary Deluxe Edition),1988-11-07,day,spotify:track:000HmzbYBg0Uxe6cE47Tws,USWB11301009,,15.0,...,-3.698,1.0,0.121,0.0113,6.1e-05,0.981,0.8,129.346,193547.0,4.0
000KGSi8GylA9JmrcU6xtB,hardcore,Wake Up And Live,Youth Of Today,Connecticut Fun,1985,year,spotify:track:000KGSi8GylA9JmrcU6xtB,USA560674294,,0.0,...,-8.624,0.0,0.176,0.338,0.162,0.301,0.38,107.281,72387.0,4.0
000OJYYsNv9b3nSuVgnPfZ,"[pop, rock, rockabilly]",Josephine,Shakin' Stevens,Give Me Your Heart Tonight,1982-10-04,day,spotify:track:000OJYYsNv9b3nSuVgnPfZ,GBARL0801581,,18.0,...,-2.251,1.0,0.0719,0.00745,0.000159,0.234,0.859,174.11,184360.0,4.0
000P83HDtOHcNVFZy7Q2Yu,salsa,El Licor de Tu Boquita,El Gran Combo De Puerto Rico,Unity,1980,year,spotify:track:000P83HDtOHcNVFZy7Q2Yu,USMRE0801806,,21.0,...,-8.377,0.0,0.0736,0.438,4.2e-05,0.126,0.696,88.94,242493.0,4.0
000QVWRKGiK8oOGfCjrHuY,folk,Dil Wich Kide Chhupa Ke Rakh Laan,"Sardool Sikander,Amar Noori",Gora Rang Deyin Na Rabba,1989-02-02,day,spotify:track:000QVWRKGiK8oOGfCjrHuY,INS188910383,,6.0,...,-7.435,1.0,0.0725,0.705,0.0,0.199,0.833,86.47,364000.0,4.0


The next step is to group all the songs by the isrc number since this should be the unique identifier. 

In [18]:
by_isrc = by_spotify_id.copy()
# by_isrc = by_isrc[['isrc', 'name']]
by_isrc['count'] = 0
by_isrc = by_isrc.groupby('isrc').agg({
    'genres': list,
    'name': list,
    'artists': list,
    'album': list,
    'release_date': list,
    'release_date_precision': list,
    'uri': list,
    'chart_power': 'mean', 
    'popularity': 'mean', 
    'danceability': 'mean', 
    'energy': 'mean',
    'key': 'mean', 
    'loudness': 'mean', 
    'mode': 'mean', 
    'speechiness': 'mean', 
    'acousticness': 'mean', 
    'instrumentalness': 'mean', 
    'liveness': 'mean', 
    'valence': 'mean',
    'tempo': 'mean', 
    'duration_ms': 'mean', 
    'time_signature': 'mean',
    'count': 'count'
})

by_isrc.sort_values('count', ascending=False)

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,chart_power,popularity,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,count
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ITB450800071,"[funk, [disco, funk], funk, [disco, funk], [di...",[THE CHANGE MEGAMIX - Limited Edition - Unoffi...,"[Change, Change, Change, Change, Change, Chang...","[Sharing Your Love, The Glow of Love, Sharing ...","[1982, 1980, 1982, 1981, 1981, 1980, 1980, 198...","[year, year, year, year, year, year, year, yea...","[spotify:track:0ASuSMQgF8Qs7x1q5w1wv7, spotify...",,2.900,0.63500,...,1.0,0.083900,0.11700,0.04960,0.511000,0.774000,117.941000,1330293.000,4.0,10
FIFMF6500009,"[pop, pop, pop, pop, pop, pop, pop, pop]","[Häävalssi, Häävalssi, Häävalssi, Häävalssi, H...","[Tapio Rautavaara, Tapio Rautavaara, Tapio Rau...","[Superhittiparaati 1966, Reissumiehen taival, ...","[1987, 1981, 1981, 1987, 1988, 1982, 1989, 1988]","[year, year, year, year, year, year, year, year]","[spotify:track:05JC7wtQQoSQe2JbIcE3UV, spotify...",,0.000,0.33425,...,0.0,0.034300,0.87475,0.00000,0.150500,0.420750,101.150000,160870.000,4.0,8
FIFMF6700028,"[[pop, rock], [pop, rock], [pop, rock], [pop, ...","[Ryysyranta, Ryysyranta, Ryysyranta, Ryysyrant...","[Irwin Goodman, Irwin Goodman, Irwin Goodman, ...","[Irwin Goodman, Irwin Goodman, Reteesti, Super...","[1988, 1987, 1983, 1987, 1989, 1987, 1988, 1989]","[year, year, year, year, year, year, year, year]","[spotify:track:04naUKsk841zGfSaySJA49, spotify...",,0.375,0.62900,...,0.0,0.041800,0.23000,0.00000,0.150000,0.758000,114.161000,204000.000,4.0,8
FIFMF6301000,"[[pop, tango], [pop, tango], [pop, tango], [po...","[Tango merellä, Tango merellä, Tango merellä, ...","[Taisto Tammi, Taisto Tammi, Taisto Tammi, Tai...","[Repe, Kultainen 60-luku, 100 Suosituinta iske...","[1984, 1988, 1989, 1987, 1988, 1987, 1989, 1983]","[year, year, year, year, year, year, year, year]","[spotify:track:1vXMGFGDFmX8o2JoYjOfsC, spotify...",,0.000,0.59775,...,0.0,0.040988,0.73175,0.02045,0.327875,0.487875,116.593375,183793.125,4.0,8
FIFMF6300100,"[pop, pop, pop, pop, pop, pop, pop, pop]","[Rakastan sinua, elämä, Rakastan sinua, elämä,...","[Kauko Käyhkö, Kauko Käyhkö, Kauko Käyhkö, Kau...","[Kultainen 60-luku, Superhittiparaati 1964, Ku...","[1988, 1987, 1988, 1989, 1985, 1982, 1983, 1983]","[year, year, year, year, year, year, year, year]","[spotify:track:1YYxa0qi3SQ9Cu4nLnKowC, spotify...",,0.000,0.39300,...,0.0,0.032800,0.98800,0.02020,0.107000,0.135000,81.536000,223267.000,4.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GBARL0300049,"[[pop, rock, synth-pop]]",[Lucky Day],[Thompson Twins],[Quick Step and Side Kick (Deluxe Edition)],[1983-02-01],[day],[spotify:track:2mtUlAWos1HqvpnbwEQC1s],,7.000,0.73000,...,1.0,0.064800,0.00501,0.00315,0.058800,0.917000,135.081000,232467.000,4.0,1
GBARK9600136,"[[british, pop, r-n-b, soul, synth-pop]]",[People Hold On - Monjack Dub],"[Lisa Stansfield,Dirty Rotten Scoundrels,Dan B...",[Dance Vault Mixes - People Hold On (The Bootl...,[1989-10-24],[day],[spotify:track:3qe5quoROGxWlYtYpDaZeC],,6.000,0.72600,...,0.0,0.049600,0.02230,0.00323,0.090100,0.406000,129.500000,488827.000,4.0,1
GBARK9600135,"[[british, pop, r-n-b, soul, synth-pop]]",[People Hold On - Jon Is The Don Mix],"[Lisa Stansfield,Dirty Rotten Scoundrels,Dan B...",[Dance Vault Mixes - People Hold On (The Bootl...,[1989-10-24],[day],[spotify:track:4eQLC54C9zYr7EeyTDMKCv],,6.000,0.72500,...,0.0,0.036600,0.03630,0.00158,0.107000,0.463000,129.500000,221400.000,4.0,1
GBARK9600134,"[[british, pop, r-n-b, soul, synth-pop]]",[People Hold On - Dirty Radio Mix],"[Lisa Stansfield,Dirty Rotten Scoundrels,Dan B...",[Dance Vault Mixes - People Hold On (The Bootl...,[1989-10-24],[day],[spotify:track:5im9CDEz9wHVVUuGBHgSMx],,23.000,0.62400,...,1.0,0.041300,0.00337,0.00233,0.419000,0.826000,122.269000,561213.000,4.0,1


In [19]:
by_isrc['name'] = by_isrc['name'].agg(reduce_list, string_return=False)
by_isrc['artists'] = by_isrc['artists'].agg(reduce_list)
by_isrc['album'] = by_isrc['album'].agg(reduce_list)
by_isrc['release_date'] = by_isrc['release_date'].agg(reduce_list)
by_isrc['release_date_precision'] = by_isrc['release_date_precision'].agg(reduce_list)
by_isrc['uri'] = by_isrc['uri'].agg(reduce_list)
by_isrc['genres'] = by_isrc['genres'].agg(reduce_list)

In [70]:
special_cases = by_isrc.copy()
special_cases = special_cases[special_cases['name'].apply(len) > 1]
special_cases['name'] = special_cases['name'].apply(reduce_list)

In [71]:
special_cases

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,chart_power,popularity,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,count
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEA310602454,disco,"[Dance Tonight (Vocal 12"" Version), Do You (Vo...",Duke Lake,"[Dance Tonight, Do You]","[1985, 1983]",year,"[spotify:track:5hq4P45NLrzaefTujZBYyB, spotify...",,13.5,0.6445,...,0.5,0.03640,0.067145,0.208620,0.07995,0.797500,113.8340,390273.5,4.0,2
DEA311501662,disco,"[Dance Tonight (Instrumental), Do You (Instrum...",Duke Lake,"[Dance Tonight, Do You]","[1985, 1983]",year,"[spotify:track:10rKbOLBZ0kT1wMuLf6dBD, spotify...",,3.0,0.6865,...,0.5,0.03175,0.069100,0.770000,0.05640,0.741500,113.2915,313753.5,4.0,2
DEA540000779,"[classical, german]","[Prelude and Fugue in E-Flat Major, BWV 552, ""...","Johann Sebastian Bach,Gabor Lehotka","Bach, J.S.: Organ Music - Preludes and Fugues ...",1988-01-01,day,"[spotify:track:1XPjukamQe2o5GMDgi9ZCy, spotify...",,3.5,0.1385,...,1.0,0.03480,0.883000,0.795500,0.08190,0.201500,96.2980,903333.0,4.0,2
DEA540000781,"[classical, german]","[Herzlich tut mich verlangen, BWV 727, Chorale...","Johann Sebastian Bach,Gabor Lehotka","Bach, J.S.: Organ Music - Preludes and Fugues ...",1988-01-01,day,"[spotify:track:3ZE9uTX2ntxJyePxxuJGT5, spotify...",,3.0,0.1895,...,0.0,0.04385,0.960000,0.981000,0.10900,0.072900,86.7160,159133.0,4.0,2
DEA559609581,"[classical, german]","[Concerto grosso in B-Flat Major, Op. 3 No. 1,...","[George Frideric Handel,Pamela Thorby,Rebecca ...","[Händel: Concerti grossi, Op. 3, Handel: Conce...",1988-01-01,day,"[spotify:track:0ykFWMZOvEKFnuEuDYPRjb, spotify...",,13.0,0.4735,...,1.0,0.04270,0.917500,0.887000,0.08180,0.764500,121.0520,146267.0,4.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
USWR31600887,hip-hop,"[Changes (In Love) - Live, Changes (In Love)]",Sparky D,"[Live & Kicking, I Can't Stop / Changes (In Lo...","[1988, 1988-05-20]","['year', 'day']","[spotify:track:0IoDfvcrYIX1cxWRBrYNYO, spotify...",,0.0,0.7705,...,0.0,0.12150,0.069100,0.000024,0.22250,0.369000,140.2080,275715.0,4.0,2
USWR39100011,r-n-b,"[Is This the End, Is This The End]",New Edition,"[Streetwise Records: The Complete Collection, ...","[1982, 1983]",year,"[spotify:track:3YX0IeJLITvD5dY36UPhvh, spotify...",,22.5,0.6735,...,1.0,0.02815,0.481500,0.000003,0.07510,0.390500,69.0425,251400.0,4.0,2
USWR39800034,"[hip-hop, house, jazz]","[Straight out the Jungle, Straight Out The Jun...",Jungle Brothers,"[Straight out the Jungle, Classic Hip Hop Jams...","[1989-02-03, 1988]","['day', 'year']","[spotify:track:1bJwxecEz5Y39EDelM33uc, spotify...",,14.0,0.8990,...,1.0,0.10700,0.096100,0.001700,0.05310,0.629333,99.8420,239041.0,4.0,3
USZGD8800009,classical,"[Swan Lake, Op. 20a, TH 219 (Excerpts): Scene,...","Pyotr Ilyich Tchaikovsky,Wiener Symphoniker,Ed...","Tchaikovsky: Swan Lake, Op. 20a & The Sleeping...",1988-01-01,day,"[spotify:track:3mngMmVsci4VvyVuwg5nWJ, spotify...",,0.5,0.1745,...,0.0,0.03720,0.760500,0.121000,0.08470,0.121000,107.3240,162667.0,3.5,2


Beispiel eines special cases

In [72]:
df[df.isrc == 'AUCI10753909']

Unnamed: 0,name,artists,album,release_date,release_date_precision,chart_power,spotify_id,uri,popularity,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,genres
2395845,I Swear,The New Christs,Divine Rites,1988-07-14,day,,1CZGo3n5qfVJQaYONhw3gR,spotify:track:1CZGo3n5qfVJQaYONhw3gR,15,0.238,...,0.0758,4.5e-05,0.51,0.223,0.342,103.152,187266,4,AUCI10753909,alt-rock
2399798,I Swear,The New Christs,Divine Rites,1988-07-14,day,,1CZGo3n5qfVJQaYONhw3gR,spotify:track:1CZGo3n5qfVJQaYONhw3gR,15,0.238,...,0.0758,4.5e-05,0.51,0.223,0.342,103.152,187266,4,AUCI10753909,alternative
2811439,I Swear,The New Christs,Divine Rites,1988-07-14,day,,1CZGo3n5qfVJQaYONhw3gR,spotify:track:1CZGo3n5qfVJQaYONhw3gR,15,0.238,...,0.0758,4.5e-05,0.51,0.223,0.342,103.152,187266,4,AUCI10753909,rock
2858597,Bed of Nails,The New Christs,Distemper,1989-08-11,day,,2b6JwxCVGfgi4MTVgez0ar,spotify:track:2b6JwxCVGfgi4MTVgez0ar,5,0.246,...,0.0673,0.000182,3e-05,0.215,0.266,136.746,391359,4,AUCI10753909,alt-rock
2862765,Bed of Nails,The New Christs,Distemper,1989-08-11,day,,2b6JwxCVGfgi4MTVgez0ar,spotify:track:2b6JwxCVGfgi4MTVgez0ar,5,0.246,...,0.0673,0.000182,3e-05,0.215,0.266,136.746,391359,4,AUCI10753909,alternative
3295973,Bed of Nails,The New Christs,Distemper,1989-08-11,day,,2b6JwxCVGfgi4MTVgez0ar,spotify:track:2b6JwxCVGfgi4MTVgez0ar,5,0.246,...,0.0673,0.000182,3e-05,0.215,0.266,136.746,391359,4,AUCI10753909,rock


Filtern der Special cases: Wirklich relevant sind lediglich Lieder aus Deutschland, USA, UK, Italien und Schweden. Daher werden zunächst alle anderen Lieder herausgefiltert.

In [54]:
country_codes = ['DE', 'IT', 'GB', 'US', 'SE']

In [55]:
relevant_special_cases = pd.DataFrame()
sum_entries = 0
for code in country_codes:
    rsc_country = special_cases[special_cases.index.str.startswith(code)]
    sum_entries += rsc_country.shape[0]
    relevant_special_cases = pd.concat([relevant_special_cases, rsc_country])
sum_entries == relevant_special_cases.shape[0]

True

In [56]:
print(f'Old shape: {special_cases.shape}')
special_cases.drop(index=list(relevant_special_cases.index.values), inplace=True)
print(f'New shape: {special_cases.shape}')

Old shape: (7769, 23)
New shape: (2542, 23)


In [59]:
relevant_special_cases.shape[0] + special_cases.shape[0]

7769

In [62]:
relevant_special_cases.head()

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,chart_power,popularity,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,count
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEA310602454,disco,"[Dance Tonight (Vocal 12"" Version), Do You (Vo...",Duke Lake,"[Dance Tonight, Do You]","[1985, 1983]",year,"[spotify:track:5hq4P45NLrzaefTujZBYyB, spotify...",,13.5,0.6445,...,0.5,0.0364,0.067145,0.20862,0.07995,0.7975,113.834,390273.5,4.0,2
DEA311501662,disco,"[Dance Tonight (Instrumental), Do You (Instrum...",Duke Lake,"[Dance Tonight, Do You]","[1985, 1983]",year,"[spotify:track:10rKbOLBZ0kT1wMuLf6dBD, spotify...",,3.0,0.6865,...,0.5,0.03175,0.0691,0.77,0.0564,0.7415,113.2915,313753.5,4.0,2
DEA540000779,"[classical, german]","[Prelude and Fugue in E-Flat Major, BWV 552, ""...","Johann Sebastian Bach,Gabor Lehotka","Bach, J.S.: Organ Music - Preludes and Fugues ...",1988-01-01,day,"[spotify:track:1XPjukamQe2o5GMDgi9ZCy, spotify...",,3.5,0.1385,...,1.0,0.0348,0.883,0.7955,0.0819,0.2015,96.298,903333.0,4.0,2
DEA540000781,"[classical, german]","[Herzlich tut mich verlangen, BWV 727, Chorale...","Johann Sebastian Bach,Gabor Lehotka","Bach, J.S.: Organ Music - Preludes and Fugues ...",1988-01-01,day,"[spotify:track:3ZE9uTX2ntxJyePxxuJGT5, spotify...",,3.0,0.1895,...,0.0,0.04385,0.96,0.981,0.109,0.0729,86.716,159133.0,4.0,2
DEA559609581,"[classical, german]","[Concerto grosso in B-Flat Major, Op. 3 No. 1,...","[George Frideric Handel,Pamela Thorby,Rebecca ...","[Händel: Concerti grossi, Op. 3, Handel: Conce...",1988-01-01,day,"[spotify:track:0ykFWMZOvEKFnuEuDYPRjb, spotify...",,13.0,0.4735,...,1.0,0.0427,0.9175,0.887,0.0818,0.7645,121.052,146267.0,4.0,2


In [73]:
indices = special_cases.index

In [74]:
by_isrc = by_isrc.drop(index=indices.values)

In [75]:
by_isrc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 419496 entries, AEA040700577 to ved049201680
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   genres                  419496 non-null  object 
 1   name                    419496 non-null  object 
 2   artists                 419496 non-null  object 
 3   album                   419496 non-null  object 
 4   release_date            419496 non-null  object 
 5   release_date_precision  419496 non-null  object 
 6   uri                     419496 non-null  object 
 7   chart_power             793 non-null     float64
 8   popularity              419496 non-null  float64
 9   danceability            419496 non-null  float64
 10  energy                  419496 non-null  float64
 11  key                     419496 non-null  float64
 12  loudness                419496 non-null  float64
 13  mode                    419496 non-null  float64
 14  speechin

In [76]:
by_isrc.release_date_precision = by_isrc.release_date_precision.astype(str)

In [77]:
by_isrc.release_date_precision.unique()

array(['day', 'year', "['year', 'day']", "['day', 'year']", 'month',
       "['day', 'year', 'month']", "['year', 'day', 'month']",
       "['day', 'month', 'year']", "['month', 'day', 'year']",
       "['year', 'month']", "['month', 'year']", "['month', 'day']",
       "['day', 'month']"], dtype=object)

In [78]:
def search_release_date_precision(val: str):
    return by_isrc[by_isrc['release_date_precision'] == val]

In [79]:
search_release_date_precision("['month', 'day']")

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,chart_power,popularity,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,count
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBBTK0900023,disco,[If It Makes You Feel Good],Mandy Smith,"[Mandy (Special Edition), Don't You Want Me Ba...","[1988-04, 1989-05-08]","['month', 'day']","[spotify:track:1qeVsGggevl9O0AC1jYrEb, spotify...",,2.5,0.773,...,0.0,0.0579,0.113,0.0112,0.0523,0.868,115.249,270173.0,4.0,2
GBBTK0900028,disco,[Don't You Want Me Baby? - Single Version],Mandy Smith,"[Mandy (Special Edition), Don't You Want Me Ba...","[1988-04, 1989-05-08]","['month', 'day']","[spotify:track:5LNLNw7T4a3IaJjjcG7iqa, spotify...",,14.0,0.669,...,0.0,0.0385,0.0821,5e-06,0.218,0.634,119.923,208733.0,4.0,2
GBBTK0900034,disco,[Don't You Want Me Baby?],Mandy Smith,"[Mandy (Special Edition), Don't You Want Me Ba...","[1988-04, 1989-05-08]","['month', 'day']","[spotify:track:3yML3u0F1UnWPht9M3S3im, spotify...",,3.5,0.72,...,0.0,0.0416,0.125,0.00402,0.0685,0.944,119.975,222293.0,4.0,2
GBF088100896,"[pop, synth-pop]",[Persuasion],Soft Cell,"[Non Stop Erotic Cabaret (Deluxe Edition), Mem...","[1981-12, 1980-05-31]","['month', 'day']","[spotify:track:450PlLeR9YaXpjmP8jqjzu, spotify...",,6.5,0.7005,...,0.5,0.12345,0.2665,6e-06,0.135,0.2585,124.328,461306.5,4.0,2
GBF088100897,"[pop, synth-pop]",[Tainted Love / Where Did Our Love Go - Extend...,Soft Cell,"[Non Stop Erotic Cabaret (Deluxe Edition), Non...","[1981-12, 1981-06-21]","['month', 'day']","[spotify:track:5fkfdFPtgeYGvpGYblD7l1, spotify...",,46.0,0.678,...,1.0,0.0601,0.452,0.001345,0.265,0.452,144.215,540093.5,4.0,2
GBF088100898,"[pop, synth-pop]",[Memorabilia - Extended Version],Soft Cell,"[Non Stop Erotic Cabaret (Deluxe Edition), Non...","[1981-12, 1981-06-21]","['month', 'day']","[spotify:track:0ko4ceSFYTNfDsGwzZ8CDl, spotify...",,11.5,0.7605,...,1.0,0.07195,0.07855,0.5105,0.04435,0.869,133.3485,465520.0,4.0,2
GBF088200768,"[pop, synth-pop]",[What?],Soft Cell,"[Non Stop Erotic Cabaret (Deluxe Edition), Non...","[1981-12, 1981-06-21]","['month', 'day']","[spotify:track:6eqpEFORpUhdFGBg3DY3Az, spotify...",,20.0,0.516,...,0.0,0.1535,0.00926,1e-06,0.10135,0.5825,151.4975,271193.5,4.0,2
GBF088200884,"[pop, synth-pop]",[Torch - Extended Version],Soft Cell,"[Non Stop Erotic Cabaret (Deluxe Edition), Non...","[1981-12, 1981-06-21]","['month', 'day']","[spotify:track:0bp9FdBa7NBoMazKFyMIa5, spotify...",,23.5,0.5625,...,0.0,0.04765,0.259,0.000291,0.1325,0.6395,122.973,507200.0,4.0,2
USIR28600019,"[alt-rock, alternative, groove, hard-rock, met...",[I Am The Law],Anthrax,"[Among The Living (Deluxe Edition), I'm The Ma...","[1987-03, 1987-01-01]","['month', 'day']","[spotify:track:1dWs9fKUnDbW1vkkM5Votu, spotify...",,33.666667,0.285,...,0.666667,0.0796,0.00157,0.001989,0.322433,0.238333,129.737333,352324.333333,4.0,3
USSM10202367,funk,[Don't Hold It In],Herbie Hancock,Monster,"[1980-03, 1980-03-01]","['month', 'day']","[spotify:track:0Se16B9i3foU0Y4kIU0l68, spotify...",,6.0,0.7825,...,0.5,0.0537,0.007055,0.10165,0.07605,0.8405,108.002,482013.5,4.0,2


In [49]:
by_isrc[by_isrc['release_date_precision'] == "['year', 'day', 'month']"]

Unnamed: 0_level_0,genres,name,artists,album,release_date,release_date_precision,uri,chart_power,popularity,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,count
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GBAHK9500303,"[[pop, rock, synth-pop], rock]",[(It's Not Me) Talking - Single Edit],A Flock Of Seagulls,"[We Are The '80s, The Best Of, Listen]","[1981, 1986-07-01, 1983-05]","['year', 'day', 'month']","[spotify:track:1jpMTnFiCUsZIqkDGEjE8u, spotify...",,13.0,0.41,...,1.0,0.0778,0.055532,0.671,0.26675,0.286,143.25125,301170.0,4.0,4


In [88]:
d = ["1981", "1986-07-01", "1983-05"]

for index, item in enumerate(d):
    d[index] = int(item[:4])

np.array(d).min()

1981

In [57]:
count = 0
for index, row in by_isrc.iterrows():
    if (count < 10):
        print(index, row['name'], row['artists'])
        row['name'] = 'Moin Meister'
        count += 1
    else:
        break

AEA040700577 ['Bala Wala Chi'] Ziad Rahbani
AEA040700578 ['Houdou Nisbi'] Ziad Rahbani
AEA040700579 ['Nafs Al Sheghlat'] Ziad Rahbani
AEA040700580 ['Yalla Kichou Barra'] Ziad Rahbani
AEA040700581 ['Ma Tfel'] Ziad Rahbani
AEA040700582 ['Bil Nisbi La Boukra'] Ziad Rahbani
AEA040700583 ['Bisaraha'] Ziad Rahbani
AEA040700584 ['5 To 7 PM'] Ziad Rahbani
AEA040700585 ['Rouh Khabbir'] Ziad Rahbani
AEA040700586 ['For Sure'] Ziad Rahbani


In [66]:
import pandas as pd

# create a sample DataFrame
df = pd.DataFrame({'Name': ['John', 'Sara', 'Peter'], 'Age': [25, 30, 35], 'City': ['New York', 'Paris', 'London']})

# iterate over each row using iterrows()
for index, row in df.iterrows():
    print(index, row['Name'], row['Age'], row['City'])
    # row['Name'] = 'Jannis'
    # df.loc[[index]['Name']] = 'Jannis'
    # display(df.iloc[index])
    df.loc[index, 'Name'] ='Jannis'


0 John 25 New York
1 Sara 30 Paris
2 Peter 35 London


In [67]:
df

Unnamed: 0,Name,Age,City
0,Jannis,25,New York
1,Jannis,30,Paris
2,Jannis,35,London


Create a DataFrame which contains only the relevant features for the recommender system

In [40]:
df_recommender = by_isrc.drop(columns=['artists', 'genres','album', 'release_date', 'release_date_precision', 'chart_power', 'uri', 'popularity', 'name', 'count'])

Drop duplicate values..

In [41]:
df_recommender.shape

(427265, 13)

In [42]:
df_recommender.drop_duplicates(inplace=True)

In [43]:
df_recommender.shape

(425997, 13)

There were a lot of duplicates..

In [44]:
df_recommender.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AEA040700577,0.291,0.231,9.0,-18.808,1.0,0.0418,0.952,0.382,0.237,0.371,157.168,207560.0,3.0
AEA040700578,0.587,0.184,7.0,-18.954,0.0,0.0308,0.814,0.951,0.0827,0.495,128.386,272240.0,3.0
AEA040700579,0.496,0.264,3.0,-21.179,1.0,0.0449,0.954,0.887,0.271,0.304,82.63,160093.0,4.0
AEA040700580,0.676,0.43,5.0,-18.456,1.0,0.0341,0.43,0.838,0.124,0.656,102.485,155667.0,4.0
AEA040700581,0.474,0.428,5.0,-18.178,0.0,0.0436,0.749,0.941,0.115,0.752,150.656,265173.0,4.0


Since all features except the spotify_id are numerical features and the spotify_id should be an unique identifier, try to group by the id to reduce the DataFrame even more.

In [45]:
df_recommender = df_recommender.groupby(['isrc']).mean()

In [46]:
display(df_recommender.head())
df_recommender.shape

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AEA040700577,0.291,0.231,9.0,-18.808,1.0,0.0418,0.952,0.382,0.237,0.371,157.168,207560.0,3.0
AEA040700578,0.587,0.184,7.0,-18.954,0.0,0.0308,0.814,0.951,0.0827,0.495,128.386,272240.0,3.0
AEA040700579,0.496,0.264,3.0,-21.179,1.0,0.0449,0.954,0.887,0.271,0.304,82.63,160093.0,4.0
AEA040700580,0.676,0.43,5.0,-18.456,1.0,0.0341,0.43,0.838,0.124,0.656,102.485,155667.0,4.0
AEA040700581,0.474,0.428,5.0,-18.178,0.0,0.0436,0.749,0.941,0.115,0.752,150.656,265173.0,4.0


(425997, 13)

The shape stayed the same, so there were no duplicate ids.

Scale the data, so every feature has the same influence.

In [47]:
scaler = MinMaxScaler()
df_recommender_scaled = scaler.fit_transform(df_recommender)
df_recommender_scaled = pd.DataFrame(df_recommender_scaled, columns=df_recommender.columns, index = df_recommender.index)

In [48]:
df_recommender_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AEA040700577,0.295431,0.231,0.818182,0.620601,1.0,0.043182,0.955823,0.382,0.237,0.371,0.638375,0.043657,0.6
AEA040700578,0.595939,0.184,0.636364,0.618011,0.0,0.031818,0.817269,0.951,0.0827,0.495,0.52147,0.057345,0.6
AEA040700579,0.503553,0.264,0.272727,0.578544,1.0,0.046384,0.957831,0.887,0.271,0.304,0.335621,0.033613,0.8
AEA040700580,0.686294,0.43,0.454545,0.626845,1.0,0.035227,0.431727,0.838,0.124,0.656,0.416267,0.032676,0.8
AEA040700581,0.481218,0.428,0.454545,0.631776,0.0,0.045041,0.752008,0.941,0.115,0.752,0.611925,0.055849,0.8


In [49]:
df_recommender_scaled.reset_index(inplace=True)
df_recommender.reset_index(inplace=True)

Convert every genre to a feature. If a song is part of a genre it should contain the value 1 otherwise 0.

In [50]:
ct = pd.crosstab(df['isrc'], df['genres'])
ct.reset_index(inplace=True)

In [51]:
display(ct.head())
ct.shape

genres,isrc,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,...,soul,spanish,swedish,synth-pop,tango,techno,trance,trip-hop,turkish,world-music
0,AEA040700577,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AEA040700578,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AEA040700579,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AEA040700580,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AEA040700581,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(427265, 111)

Merge both DataFrames together to create the Recommender System.

In [52]:
ct_merged = ct.merge(df_recommender_scaled, on=['isrc'], how='left')

In [54]:
ct_merged.set_index(['isrc'], inplace=True)

In [55]:
display(ct_merged.head())
ct_merged.shape

Unnamed: 0_level_0,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,brazil,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AEA040700577,0,0,0,0,0,0,0,0,0,0,...,0.620601,1.0,0.043182,0.955823,0.382,0.237,0.371,0.638375,0.043657,0.6
AEA040700578,0,0,0,0,0,0,0,0,0,0,...,0.618011,0.0,0.031818,0.817269,0.951,0.0827,0.495,0.52147,0.057345,0.6
AEA040700579,0,0,0,0,0,0,0,0,0,0,...,0.578544,1.0,0.046384,0.957831,0.887,0.271,0.304,0.335621,0.033613,0.8
AEA040700580,0,0,0,0,0,0,0,0,0,0,...,0.626845,1.0,0.035227,0.431727,0.838,0.124,0.656,0.416267,0.032676,0.8
AEA040700581,0,0,0,0,0,0,0,0,0,0,...,0.631776,0.0,0.045041,0.752008,0.941,0.115,0.752,0.611925,0.055849,0.8


(427265, 123)

Overall there are 122 features used for the Recommendation system.

Try different distance measures / similarity functions.

In [None]:
df.sort_values('popularity', ascending=False).head(10)

In [64]:
def recommend_tracks_kernel(track: str, recommender_function, distance: bool = False):
    '''
    Recommends tracks that are similar to the provided track.

    Parameter
    ---------
    track: str
        Provided track

    df: pd.DataFrame
        DataFrame used for the Recommendation
    
    '''
    global ct_merged
    global df

    ids = df[df.name == track]['isrc'].values

    if (len(ids)):
        kernel_array = recommender_function(ct_merged, ct_merged[ct_merged.index == ids[0]])
        kernel_df = pd.DataFrame(kernel_array, index=ct_merged.index)

        kernel_df.reset_index(inplace=True)
        kernel_df = kernel_df.rename(columns={0: 'Score'})
        kernel_df = kernel_df.merge(by_isrc, how='left', on='isrc')
        display(kernel_df.sort_values(by='Score', ascending=distance).head(6))
        return kernel_df
    else:
        print('Error')


In [58]:
def try_functions(track):
    kernel_functions = [cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel]
    distance_functions = [euclidean_distances, manhattan_distances]
    result = {}
    for kernel_function in kernel_functions:
        display(kernel_function.__name__)
        result[kernel_function.__name__] = recommend_tracks_kernel(track, kernel_function, False)

    for distance_function in distance_functions:
        display(distance_function.__name__)
        result[distance_function.__name__] = recommend_tracks_kernel(track, distance_function, True)

    return result

In [63]:
ct_merged[ct_merged.index == 'DEE868300011']

Unnamed: 0_level_0,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,brazil,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEE868300011,0,0,0,0,0,0,0,0,0,0,...,0.821266,1.0,0.047039,0.103748,1e-06,0.111,0.506333,0.389844,0.048928,0.8


In [62]:
ct_merged.head()

Unnamed: 0_level_0,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,brazil,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
isrc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AEA040700577,0,0,0,0,0,0,0,0,0,0,...,0.620601,1.0,0.043182,0.955823,0.382,0.237,0.371,0.638375,0.043657,0.6
AEA040700578,0,0,0,0,0,0,0,0,0,0,...,0.618011,0.0,0.031818,0.817269,0.951,0.0827,0.495,0.52147,0.057345,0.6
AEA040700579,0,0,0,0,0,0,0,0,0,0,...,0.578544,1.0,0.046384,0.957831,0.887,0.271,0.304,0.335621,0.033613,0.8
AEA040700580,0,0,0,0,0,0,0,0,0,0,...,0.626845,1.0,0.035227,0.431727,0.838,0.124,0.656,0.416267,0.032676,0.8
AEA040700581,0,0,0,0,0,0,0,0,0,0,...,0.631776,0.0,0.045041,0.752008,0.941,0.115,0.752,0.611925,0.055849,0.8


In [65]:
result = try_functions('99 Luftballons')

'cosine_similarity'

ValueError: Input contains NaN.

In [None]:
help = result['euclidean_distances'].sort_values(by='Score', ascending=True)

In [None]:
help.head(10)

In [None]:
help.sort_values(by='Score', ascending=False)