In [48]:
import pandas as pd
import numpy as np
import math


# Location of the zipped Spotify tracks CSV used by this notebook.
url = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/spotify.zip"

# Read the archive straight from the URL into a DataFrame.
df_music = pd.read_csv(filepath_or_buffer=url)

# Show (n_rows, n_columns) as the cell output.
df_music.shape

(232725, 18)

In [49]:
# The dataset spells one genre two ways: "Children’s Music" (typographic apostrophe)
# and "Children's Music" (ASCII apostrophe). Unify them first so that the dummy
# encoding below produces a single column for that genre.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate object and does not
# update df_music when pandas Copy-on-Write is active.
df_music["genre"] = df_music["genre"].replace('Children’s Music', "Children's Music")

# One 0/1 indicator column per genre, for machine-learning use.
df_music_dumies = pd.concat([df_music, df_music['genre'].str.get_dummies()], axis=1)

# Encode "mode" as integer codes.
df_music_dumies['mode'] = df_music_dumies['mode'].factorize()[0]

# Keep only songs lasting between 90 and 900 seconds (both bounds included).
df_music_dumies = df_music_dumies.loc[df_music_dumies['duration_ms'].between(90000, 900000)]

# Discard the Soundtrack and Movie genres.
df_music_dumies = df_music_dumies.loc[~df_music_dumies['genre'].isin(['Soundtrack', 'Movie'])]

In [50]:
# Append one 0/1 indicator column per musical key, for machine-learning use.
df_music_dumies_keys = pd.concat(
    objs=[df_music_dumies, df_music_dumies["key"].str.get_dummies()],
    axis="columns",
)

In [51]:
# Bucket a popularity value into 20-point classes so that a classification model can
# be used instead of a linear regression (approach suggested by Tarik).
def popularity_score(popularity):
  """Return the popularity class, computed as ceil(popularity / 20).

  A popularity of 0 maps to class 0; values in 1-20 map to 1, 21-40 to 2, and so on.
  """
  return math.ceil(popularity / 20)

In [52]:
# Derive the popularity class for every remaining track.
# The score is computed from this frame's own "popularity" column, so only the rows
# that survived the earlier filters are processed and the assignment does not depend
# on index alignment with the unfiltered df_music.
df_music_dumies_keys["popularity_score"] = df_music_dumies_keys["popularity"].apply(popularity_score)

In [53]:
# Columns considered useless from here on (genre and key have already been
# expanded into dummy columns).
cols = [
    'genre',
    'artist_name',
    'track_id',
    'key',
    'time_signature',
    'duration_ms',
]
# Remove them from the frame in place.
df_music_dumies_keys.drop(columns=cols, inplace=True)

In [54]:
# Rows whose popularity score is 0 are excluded: after many tests, the
# machine-learning accuracy score appeared to be better without them.
df_music_without_0_pop = df_music_dumies_keys.loc[
    df_music_dumies_keys["popularity_score"] != 0
]

In [55]:
# Preview the final dataset as the cell output (the rendering below is truncated).
df_music_without_0_pop

Unnamed: 0,track_name,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,...,C,C#,D,D#,E,F,F#,G,G#,popularity_score
135,Be Without You - Kendu Mix,65,0.08300,0.724,0.689,0.000000,0.3040,-5.922,1,0.1350,...,0,0,1,0,0,0,0,0,0,4
136,Desperado,63,0.32300,0.685,0.610,0.000000,0.1020,-5.221,1,0.0439,...,1,0,0,0,0,0,0,0,0,4
137,Ice On My Baby (feat. Kevin Gates) - Remix,62,0.06750,0.762,0.520,0.000004,0.1140,-5.237,1,0.0959,...,0,0,0,0,0,1,0,0,0,4
138,Heaven Falls / Fall on Me,61,0.36000,0.563,0.366,0.002430,0.0955,-6.896,1,0.1210,...,0,0,0,0,0,0,0,0,0,4
139,Love Myself,68,0.59600,0.653,0.621,0.000000,0.0811,-5.721,1,0.0409,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Son Of Slide,39,0.00384,0.687,0.714,0.544000,0.0845,-10.626,0,0.0316,...,0,0,1,0,0,0,0,0,0,2
232721,Burning Fire,38,0.03290,0.785,0.683,0.000880,0.2370,-6.944,1,0.0337,...,0,0,0,0,1,0,0,0,0,2
232722,(I'm Your) Hoochie Coochie Man,47,0.90100,0.517,0.419,0.000000,0.0945,-8.282,0,0.1480,...,0,0,1,0,0,0,0,0,0,3
232723,With My Words,44,0.26200,0.745,0.704,0.000000,0.3330,-7.137,0,0.1460,...,0,0,0,0,0,0,0,0,0,3


In [56]:
# Export the prepared dataset as a zip-compressed CSV.
# Forward slashes keep the path portable: a backslash-separated literal such as
# '..\data' contains the invalid escape "\d" (a warning on recent Python versions)
# and is only treated as a directory separator on Windows.
df_music_without_0_pop.to_csv('../data/dataset_algo.csv.zip', header=True, compression='zip')