In [29]:
import pandas as pd

# Load the raw Spotify table (first CSV column is the row index), discard rows
# with missing values and store the `explicit` flag as an integer.
# Full-row de-duplication is not applied here; repeats are handled per track_id below.
songs = (
    pd.read_csv('data/spotify_data.csv', index_col=[0])
    .dropna()
    .assign(explicit=lambda frame: frame['explicit'].astype(int))
)

# One row per track_id (first occurrence wins), indexed by track_id.
unique_trackids = (
    songs
    .drop_duplicates(subset='track_id', keep='first')
    .set_index(['track_id'])
)

# Every row whose track_id occurs more than once, with repeats placed side by side.
has_repeated_id = songs.duplicated(subset=['track_id'], keep=False)
same_trackid = songs[has_repeated_id].sort_values(by='track_id')
same_trackid.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
15028,001APMDOl3qtx1526T11n1,Pink Sweat$;Kirby,New RnB,Better,0,176320,0,0.613,0.471,1,-6.644,0,0.107,0.316,1e-06,0.117,0.406,143.064,4,chill
103211,001APMDOl3qtx1526T11n1,Pink Sweat$;Kirby,New RnB,Better,0,176320,0,0.613,0.471,1,-6.644,0,0.107,0.316,1e-06,0.117,0.406,143.064,4,soul
85578,001YQlnDSduXd5LgBd66gT,Soda Stereo,Soda Stereo (Remastered),El Tiempo Es Dinero - Remasterizado 2007,38,177266,0,0.554,0.921,2,-4.589,1,0.0758,0.0194,0.0881,0.329,0.7,183.571,1,punk-rock
100420,001YQlnDSduXd5LgBd66gT,Soda Stereo,Soda Stereo (Remastered),El Tiempo Es Dinero - Remasterizado 2007,38,177266,0,0.554,0.921,2,-4.589,1,0.0758,0.0194,0.0881,0.329,0.7,183.571,1,ska
91801,003vvx7Niy0yvhvHt4a68B,The Killers,Hot Fuss,Mr. Brightside,86,222973,0,0.352,0.911,1,-5.23,1,0.0747,0.00121,0.0,0.0995,0.236,148.033,4,rock


In [30]:
# Collapse the repeated rows to one row per track_id whose `track_genre` is the
# space-separated list of all genres that track was filed under.
by_track_id = (
    same_trackid
    .groupby('track_id', as_index=False)
    .agg(track_genre=('track_genre', ' '.join))
)
print(by_track_id)

                     track_id                    track_genre
0      001APMDOl3qtx1526T11n1                     chill soul
1      001YQlnDSduXd5LgBd66gT                  punk-rock ska
2      003vvx7Niy0yvhvHt4a68B      rock alternative alt-rock
3      004h8smbIoAkUNDJvVKwkG                        emo sad
4      006rHBBNLJMpQs8fRC2GDe         sertanejo pagode forro
...                       ...                            ...
16636  7ztSVy67w9rXpKg5L2zN5l                   indie indian
16637  7zubR9uYAWjb5KPZTMm85e  latino latin reggaeton reggae
16638  7zumacGldlmxpoP8bpaeLe            power-pop synth-pop
16639  7zv2vmZq8OjS54BxFzI2wM          metalcore death-metal
16640  7zwn1eykZtZ5LODrf7c0tS      alt-rock alternative rock

[16641 rows x 2 columns]


In [31]:
from sklearn.preprocessing import StandardScaler
# Build one centroid per genre (mean of the numeric features) and a scaler
# fitted on the per-song numeric features, then express the centroids in the
# scaled space.
string_columns = ['track_name', 'track_id', 'album_name', 'artists']
# `numerical_data` still carries the `track_genre` label: it is needed as the
# group key below, but it is a string and must not be fed to the scaler.
numerical_data = songs.drop(columns=string_columns)
by_genre = numerical_data.groupby('track_genre').mean()

# The group key is not part of the aggregated columns, so these are exactly
# the numeric feature columns, in their original order.
feature_columns = by_genre.columns

scaler = StandardScaler()
# Fit on the numeric features only. Fitting on `numerical_data.values` would
# include the string genre column (StandardScaler cannot convert it to float)
# and would leave the scaler expecting one more input than the centroids have.
scaler.fit(numerical_data[feature_columns].values)
by_genre[feature_columns] = scaler.transform(by_genre.values)

In [32]:
import numpy as np

# For every track listed under several genres, pick the candidate genre whose
# scaled centroid (a row of `by_genre`) has the smallest L1 distance to the
# track's own scaled feature vector.
id_to_selected_genre = {}
non_feature_fields = ['artists', 'album_name', 'track_name', 'track_genre']
for track_id, genres in by_track_id.to_numpy():
    # Feature values of this track, reshaped to one row for the scaler.
    features = unique_trackids.loc[track_id].drop(index=non_feature_fields)
    song_vector = scaler.transform(features.to_numpy().reshape(1, -1)).flatten()

    # (genre, distance) for each candidate genre of this track.
    scored = [
        (name, np.linalg.norm(song_vector - by_genre.loc[name], ord=1))
        for name in genres.split(" ")
    ]

    # Smallest distance wins; on ties the first listed genre is kept.
    best_genre, _ = min(scored, key=lambda pair: pair[1])
    id_to_selected_genre[track_id] = best_genre

In [33]:
# For tracks that had multiple genres, overwrite `track_genre` with the genre
# that had the smallest distance, using a single label-based assignment.
resolved_ids = list(id_to_selected_genre.keys())
resolved_genres = list(id_to_selected_genre.values())
unique_trackids.loc[resolved_ids, 'track_genre'] = resolved_genres

In [35]:
# Show the tracks whose genre is 'disney'.
unique_trackids.loc[unique_trackids['track_genre'].eq('disney')]

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
14041,5d22scOOfbrK7GHy3ZTT3s,Jim Cummings;Terence Blanchard,The Princess and the Frog (Original Motion Pic...,Ma Belle Evangeline,65,115693,0,0.378,0.3520,3,-8.745,1,0.0354,0.812,0.000005,0.1060,0.3060,139.491,5,disney
24549,1ymvyDsiAUbzEhmSkfCSYh,Billy Eichner;Seth Rogen;JD McCrary;Childish G...,The Lion King (Original Motion Picture Soundtr...,Hakuna Matata,58,251520,0,0.530,0.5840,0,-11.891,1,0.2180,0.274,0.000000,0.1580,0.5750,81.854,4,disney
24552,3dfpZwSuosIuFqksY0F6gT,Cast - Sofia the First;Sofia,Sofia the First,"Sofia the First Main Title Theme - From ""Sofia...",55,52866,0,0.655,0.7890,4,-3.995,1,0.0489,0.423,0.000000,0.1300,0.6600,92.968,4,disney
24555,5fBcRH4CnYTaFMQtM0JtSl,Alanna Ubach;Antonio Sol,Coco (Original Motion Picture Soundtrack),La Llorona,56,165533,0,0.460,0.5740,4,-7.414,0,0.1120,0.318,0.000000,0.1000,0.3700,67.697,4,disney
24556,0juMU08O9byWiRBtKM1j5E,Lindiwe Mkhize;Lebo M.,The Lion King (Original Motion Picture Soundtr...,Circle of Life/Nants' Ingonyama,56,241800,0,0.297,0.5180,10,-8.152,1,0.0378,0.412,0.000011,0.2980,0.4960,81.425,4,disney
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25542,7buYPUijQW1xk8JsB6BoVk,André Leono,Coração Sem Abrigo,Olha o Que o Amor Me Faz,18,213680,0,0.480,0.3010,3,-10.960,1,0.0357,0.339,0.000000,0.1550,0.1650,127.573,4,disney
25543,4aSt3lGu5Rd9BTUjkIK9MR,Alan Menken,"Pocahontas, Une Légende Indienne (Bande Origin...","I'll Never See Him Again - From ""Pocahontas""/S...",19,114240,0,0.156,0.0788,7,-18.245,1,0.0361,0.969,0.951000,0.3670,0.0398,87.453,3,disney
25544,34rDgdMzrvGPC282kF4Eu4,George Bruns,101 Dalmatians,"All Dog Alert - From ""101 Dalmatians""/Score Ve...",18,57906,0,0.629,0.6190,0,-17.621,1,0.0606,0.835,0.032700,0.1790,0.4000,118.186,1,disney
25545,0son51AkdDBGl5v1Bx12Cu,Kristen Bell,Frost 2 (Svenskt Original Soundtrack/Deluxe Ed...,Home - Outtake,18,176865,0,0.708,0.4240,2,-7.048,1,0.0353,0.724,0.000000,0.0722,0.6370,101.703,4,disney


In [34]:
# Save to csv: move the index back into a regular column first.
# to_csv keeps its default index=True, so the new integer index is written too.
unique_trackids = unique_trackids.reset_index()
unique_trackids.to_csv('data/spotify_clean.csv', header=True)