# Processing

In this notebook, the data queried from the Spotify API is processed into a suitable format.

## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, rbf_kernel, laplacian_kernel, chi2_kernel, euclidean_distances, manhattan_distances

## General overview

In [3]:
#df = pd.read_csv('data/DE/data-neu.csv')
df = pd.read_csv('data/data_with_country.csv')

In [4]:
df.head()

Unnamed: 0,name,artists,album,release_date,release_date_precision,chart_power,spotify_id,uri,popularity,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,isrc,genres
0,That's No Way To Get Along,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,5JKdsNzhJGpoaKBFZpOKQ3,spotify:track:5JKdsNzhJGpoaKBFZpOKQ3,22,0.427,...,0.0407,0.98,0.0,0.192,0.381,97.078,173827,4,USA370640304,acoustic
1,I Got A Problem,Albert Collins,Frostbite,1980-02-05,day,,6e2PhVePvqe7w4VNBTslef,spotify:track:6e2PhVePvqe7w4VNBTslef,15,0.76,...,0.0573,0.328,0.0357,0.0447,0.849,102.183,274707,4,USARL8071903,acoustic
2,The Highway Is Like A Woman,Albert Collins,Frostbite,1980-02-05,day,,4Qzcy6jU0yeck7IuCqGWSi,spotify:track:4Qzcy6jU0yeck7IuCqGWSi,16,0.751,...,0.0361,0.419,0.11,0.105,0.771,98.545,302600,4,USARL8071904,acoustic
3,Alabama Blues,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,42zUiYaKltE7jFWb57fXAW,spotify:track:42zUiYaKltE7jFWb57fXAW,11,0.49,...,0.0716,0.972,0.0,0.159,0.905,205.718,157933,3,USA370640301,acoustic
4,Get Away Blues,Robert Wilkins,The Original Rolling Stone,1980-01-01,day,,0hCrfp9WImVO6KWI30O1sM,spotify:track:0hCrfp9WImVO6KWI30O1sM,4,0.629,...,0.0431,0.968,0.00238,0.0941,0.322,119.752,213107,4,USA370640300,acoustic


In [5]:
df['genres'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'bluegrass', 'blues', 'brazil', 'british', 'cantopop',
       'chicago-house', 'children', 'chill', 'classical', 'club',
       'comedy', 'country', 'dance', 'dancehall', 'disco', 'dub',
       'dubstep', 'edm', 'electro', 'electronic', 'folk', 'forro',
       'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'groove',
       'grunge', 'guitar', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'indian', 'indie',
       'indie-pop', 'industrial', 'j-dance', 'j-idol', 'j-pop', 'j-rock',
       'jazz', 'k-pop', 'latin', 'latino', 'malay', 'mandopop', 'metal',
       'minimal-techno', 'mpb', 'new-age', 'opera', 'pagode', 'party',
       'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house',
       'psych-rock', 'punk', 'punk-rock', 'r-n-b', 'reggae', 'reggaeton',
       'rock', 'rock-n-roll', 'rockabilly', 'salsa', 'samba', 'sertanejo',
       'sing

Check for duplicate values

In [6]:
df.shape

(3394589, 24)

In [7]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

In [8]:
df.shape

(868789, 24)

There were a lot of duplicate rows: about 2.5 million of the 3.4 million rows were exact duplicates.

Check for null values

In [9]:
df.isna().any()

name                      False
artists                   False
album                     False
release_date              False
release_date_precision    False
chart_power                True
spotify_id                False
uri                       False
popularity                False
danceability              False
energy                    False
key                       False
loudness                  False
mode                      False
speechiness               False
acousticness              False
instrumentalness          False
liveness                  False
valence                   False
tempo                     False
duration_ms               False
time_signature            False
isrc                      False
genres                    False
dtype: bool

The null values in `chart_power` are valid because not every song was in the charts.

In [10]:
df.columns

Index(['name', 'artists', 'album', 'release_date', 'release_date_precision',
       'chart_power', 'spotify_id', 'uri', 'popularity', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'isrc', 'genres'],
      dtype='object')

## Group by spotify id

First, group the songs by their Spotify ID. This brings together all rows that share the same Spotify ID but were queried under different genres.

In [11]:
# Collect every remaining column into one list per Spotify track. The order of
# the names below is the column order of the aggregated frame.
spotify_id_columns = [
    'genres', 'name', 'artists', 'album', 'release_date',
    'release_date_precision', 'uri', 'isrc', 'chart_power', 'popularity',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
by_spotify_id = df.groupby('spotify_id').agg(
    {column: list for column in spotify_id_columns}
)

In [12]:
by_spotify_id.shape

(474676, 23)

In [13]:
by_spotify_id.columns

Index(['genres', 'name', 'artists', 'album', 'release_date',
       'release_date_precision', 'uri', 'isrc', 'chart_power', 'popularity',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'time_signature'],
      dtype='object')

Then reduce the aggregated lists so that each feature holds a single value per song instead of repeated values; only `genres` stays a list.

In [14]:
by_spotify_id.head().T

spotify_id,000HmzbYBg0Uxe6cE47Tws,000KGSi8GylA9JmrcU6xtB,000OJYYsNv9b3nSuVgnPfZ,000P83HDtOHcNVFZy7Q2Yu,000QVWRKGiK8oOGfCjrHuY
genres,"[alt-rock, alternative, indie, rock]",[hardcore],"[pop, rock, rockabilly]",[salsa],[folk]
name,"[Exhuming McCarthy - Live, Exhuming McCarthy -...",[Wake Up And Live],"[Josephine, Josephine, Josephine]",[El Licor de Tu Boquita],[Dil Wich Kide Chhupa Ke Rakh Laan]
artists,"[R.E.M., R.E.M., R.E.M., R.E.M.]",[Youth Of Today],"[Shakin' Stevens, Shakin' Stevens, Shakin' Ste...",[El Gran Combo De Puerto Rico],"[Sardool Sikander,Amar Noori]"
album,"[Green (25th Anniversary Deluxe Edition), Gree...",[Connecticut Fun],"[Give Me Your Heart Tonight, Give Me Your Hear...",[Unity],[Gora Rang Deyin Na Rabba]
release_date,"[1988-11-07, 1988-11-07, 1988-11-07, 1988-11-07]",[1985],"[1982-10-04, 1982-10-04, 1982-10-04]",[1980],[1989-02-02]
release_date_precision,"[day, day, day, day]",[year],"[day, day, day]",[year],[day]
uri,"[spotify:track:000HmzbYBg0Uxe6cE47Tws, spotify...",[spotify:track:000KGSi8GylA9JmrcU6xtB],"[spotify:track:000OJYYsNv9b3nSuVgnPfZ, spotify...",[spotify:track:000P83HDtOHcNVFZy7Q2Yu],[spotify:track:000QVWRKGiK8oOGfCjrHuY]
isrc,"[USWB11301009, USWB11301009, USWB11301009, USW...",[USA560674294],"[GBARL0801581, GBARL0801581, GBARL0801581]",[USMRE0801806],[INS188910383]
chart_power,"[nan, nan, nan, nan]",[nan],"[nan, nan, nan]",[nan],[nan]
popularity,"[15, 15, 15, 15]",[0],"[18, 18, 18]",[21],[6]


In [15]:
def get_release_year_index(release_dates):
    '''
    Locate the earliest release among several release dates.

    Only the first four characters of each date (the year) are compared, so
    month and day are ignored. On ties the first occurrence wins.

    Parameter
    ---------
    release_dates: List
        Release dates as strings that start with a four-digit year.

    Return
    ------
    min_index: number
        Position in release_dates of the entry with the smallest year.
    '''
    years = [int(release_date[:4]) for release_date in release_dates]
    return np.argmin(years)

In [16]:
def select_oldest_song_old(df, isrc_flag=False):
    '''
    Legacy row-by-row variant of select_oldest_song.

    For every row whose release_date cell is a list, the position of the
    smallest release year is determined and every list-valued feature cell of
    that row is replaced by its element at that position.

    Parameter
    ---------
    df: DataFrame
        Frame whose cells hold lists of aggregated values.
    isrc_flag: bool
        If True, 'genres' and 'spotify_id' are reduced as well (frame grouped
        by isrc); otherwise 'isrc' is reduced (frame grouped by spotify id).

    Return
    ------
    df_copy: DataFrame
        Copy of df with the reduced cells; df itself is not modified.
    '''
    features = ['artists', 'album', 'release_date_precision', 'uri', 'release_date', 'chart_power', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature', 'name']

    if isrc_flag:
        features.append('genres')
        features.append('spotify_id')
    else:
        features.append('isrc')
    df_copy = df.copy()
    for index, instance in df_copy.iterrows():
        if isinstance(instance['release_date'], list):
            release_year = [int(date[:4]) for date in instance['release_date']]
            min_index = int(np.argmin(release_year))
            for feature in features:
                # Only list cells are reduced. The former check
                # `type(instance[feature] == list)` was always truthy, so
                # scalar cells were indexed too (strings were cut down to one
                # character, floats raised a TypeError).
                if isinstance(instance[feature], list):
                    df_copy.loc[index, feature] = instance[feature][min_index]
    return df_copy

In [17]:
def select_oldest_song(df, isrc_flag=False):
    '''
    Reduce list-valued cells to the value belonging to the oldest release.

    For every row whose release_date cell is a list, the position of the
    smallest release year (first four characters of each date) is determined.
    Every list-valued feature cell of that row is then replaced by its element
    at that position. Rows whose release_date is not a list, and cells that are
    not lists, are left unchanged.

    The columns are rebuilt once each instead of writing single cells through
    `.loc` inside an `iterrows` loop. This is considerably faster on large
    frames and also works when the selected element is itself a list (e.g.
    'genres' with isrc_flag=True).

    Parameter
    ---------
    df: DataFrame
        Frame whose cells hold lists of aggregated values.
    isrc_flag: bool
        If True, 'genres' and 'spotify_id' are reduced as well (frame grouped
        by isrc); otherwise 'isrc' is reduced (frame grouped by spotify id).

    Return
    ------
    df_copy: DataFrame
        Copy of df with the reduced cells; df itself is not modified.
    '''
    features = [
        'artists', 'album', 'release_date_precision', 'uri', 'release_date',
        'chart_power', 'popularity', 'danceability', 'energy', 'key', 'loudness',
        'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'duration_ms', 'time_signature', 'name'
    ]

    if isrc_flag:
        features.extend(['genres', 'spotify_id'])
    else:
        features.append('isrc')

    df_copy = df.copy()
    if len(df_copy) == 0:
        return df_copy

    # Position of the oldest release per row; None marks rows that are skipped
    # because their release_date is not a list.
    oldest_positions = []
    for dates in df_copy['release_date']:
        if isinstance(dates, list):
            years = [int(date[:4]) for date in dates]
            oldest_positions.append(int(np.argmin(years)))
        else:
            oldest_positions.append(None)

    # Nothing to reduce: return before touching any feature column.
    if all(position is None for position in oldest_positions):
        return df_copy

    for feature in features:
        values = list(df_copy[feature])
        changed = False
        for row, position in enumerate(oldest_positions):
            if position is not None and isinstance(values[row], list):
                values[row] = values[row][position]
                changed = True
        if changed:
            # dtype=object keeps the column as it was: a column that held
            # lists is an object column already.
            df_copy[feature] = pd.Series(values, index=df_copy.index, dtype=object)

    return df_copy

In [18]:
by_spotify_id = select_oldest_song(by_spotify_id)

In [19]:
by_spotify_id.shape

(474676, 23)

In [20]:
by_spotify_id.to_csv("data/by_spotify_id.csv")

## Group by isrc

The next step is to group all songs by their ISRC (International Standard Recording Code), since this should be the unique identifier of a recording.

In [21]:
# Turn the spotify_id index back into a column, then collect every column into
# one list per ISRC. The order of the names below is the column order of the
# aggregated frame.
isrc_columns = [
    'genres', 'name', 'artists', 'album', 'release_date',
    'release_date_precision', 'uri', 'spotify_id', 'chart_power', 'popularity',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
by_isrc = (
    by_spotify_id
    .reset_index()
    .groupby('isrc')
    .agg({column: list for column in isrc_columns})
)

In [22]:
by_isrc.head().T

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
genres,"[[j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop]]","[[groove, j-pop, jazz, pop, pop]]"
name,[Bala Wala Chi],[Houdou Nisbi],[Nafs Al Sheghlat],[Yalla Kichou Barra],[Ma Tfel]
artists,[Ziad Rahbani],[Ziad Rahbani],[Ziad Rahbani],[Ziad Rahbani],[Ziad Rahbani]
album,[Houdou Nisbi],[Houdou Nisbi],[Houdou Nisbi],[Houdou Nisbi],[Houdou Nisbi]
release_date,[1985-01-01],[1985-01-01],[1985-01-01],[1985-01-01],[1985-01-01]
release_date_precision,[day],[day],[day],[day],[day]
uri,[spotify:track:0fylgLeNObjVvwhd8caHqX],[spotify:track:0yMFpBNCYXqwwOAg23bC8a],[spotify:track:6G8l1kI8QlTD0UDIak5F8H],[spotify:track:21g76Lq5Jg4QvfTDvi4PlH],[spotify:track:0pKxrkFh8fxPKpkO29MYmi]
spotify_id,[0fylgLeNObjVvwhd8caHqX],[0yMFpBNCYXqwwOAg23bC8a],[6G8l1kI8QlTD0UDIak5F8H],[21g76Lq5Jg4QvfTDvi4PlH],[0pKxrkFh8fxPKpkO29MYmi]
chart_power,[nan],[nan],[nan],[nan],[nan]
popularity,[41],[31],[21],[17],[31]


In [23]:
by_isrc = select_oldest_song(by_isrc, True)

In [24]:
by_isrc.isna().any()

genres                    False
name                      False
artists                   False
album                     False
release_date              False
release_date_precision    False
uri                       False
spotify_id                False
chart_power                True
popularity                False
danceability              False
energy                    False
key                       False
loudness                  False
mode                      False
speechiness               False
acousticness              False
instrumentalness          False
liveness                  False
valence                   False
tempo                     False
duration_ms               False
time_signature            False
dtype: bool

In [25]:
by_isrc.head().T

isrc,AEA040700577,AEA040700578,AEA040700579,AEA040700580,AEA040700581
genres,"[j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop]","[groove, j-pop, jazz, pop, pop]"
name,Bala Wala Chi,Houdou Nisbi,Nafs Al Sheghlat,Yalla Kichou Barra,Ma Tfel
artists,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani,Ziad Rahbani
album,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi,Houdou Nisbi
release_date,1985-01-01,1985-01-01,1985-01-01,1985-01-01,1985-01-01
release_date_precision,day,day,day,day,day
uri,spotify:track:0fylgLeNObjVvwhd8caHqX,spotify:track:0yMFpBNCYXqwwOAg23bC8a,spotify:track:6G8l1kI8QlTD0UDIak5F8H,spotify:track:21g76Lq5Jg4QvfTDvi4PlH,spotify:track:0pKxrkFh8fxPKpkO29MYmi
spotify_id,0fylgLeNObjVvwhd8caHqX,0yMFpBNCYXqwwOAg23bC8a,6G8l1kI8QlTD0UDIak5F8H,21g76Lq5Jg4QvfTDvi4PlH,0pKxrkFh8fxPKpkO29MYmi
chart_power,,,,,
popularity,41,31,21,17,31


Create a new feature `year` that contains only the release year.

In [26]:
# The first four characters of release_date are the year, whatever the precision.
by_isrc['year'] = by_isrc['release_date'].str[:4].astype('int64')

In [27]:
by_isrc['year'].unique()

array([1985, 1989, 1984, 1987, 1986, 1988, 1982, 1983, 1980, 1981, 2023])

In [28]:
# Count the songs outside 1980-1989, using the same bounds as the filter in
# the next step (the former `> 1990` check ignored 1990 and pre-1980 songs).
(~by_isrc['year'].between(1980, 1989)).sum()

7

Remove all songs that were not released in the 1980s (1980–1989).

In [29]:
# Flag every row whose year lies outside 1980-1989, report the affected ISRCs
# and the shape before and after, then drop those rows in place.
outside_80s = (by_isrc['year'] < 1980) | (by_isrc['year'] > 1989)
remove_index = by_isrc[outside_80s].index
print(remove_index)
print(by_isrc.shape)
by_isrc.drop(index=remove_index, inplace=True)
print(by_isrc.shape)

Index(['DER412202108', 'DER412202109', 'DER412202110', 'DER412202111',
       'DER412202112', 'DER412202113', 'DER412202114'],
      dtype='object', name='isrc')
(428362, 24)
(428355, 24)


In [30]:
by_isrc.release_date_precision = by_isrc.release_date_precision.astype(str)

Convert song and artist names to lowercase so that the duplicate check below is case-insensitive.

In [31]:
# Normalise the case of both text columns.
for text_column in ('name', 'artists'):
    by_isrc[text_column] = by_isrc[text_column].str.lower()

In [32]:
# Keep only the first row for every (name, artists) pair.
by_isrc = by_isrc.drop_duplicates(subset=['name', 'artists'])

In [33]:
by_isrc.shape

(416154, 24)

In [34]:
by_isrc.isna().any()

genres                    False
name                      False
artists                   False
album                     False
release_date              False
release_date_precision    False
uri                       False
spotify_id                False
chart_power                True
popularity                False
danceability              False
energy                    False
key                       False
loudness                  False
mode                      False
speechiness               False
acousticness              False
instrumentalness          False
liveness                  False
valence                   False
tempo                     False
duration_ms               False
time_signature            False
year                      False
dtype: bool

In [35]:
# Persist the processed frame; the isrc index is written as the first column.
# NOTE(review): `genres` still holds Python lists at this point, so the CSV
# will presumably contain their string representation. Confirm that whoever
# reads the file parses it back.
by_isrc.to_csv('data/by_isrc.csv')