In [243]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import cross_val_score
import numpy as np
import math
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

url = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/spotify.zip"
df_music = pd.read_csv(url)
df_music.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [229]:
# There is two columns with similar name: "Children’s Music" and "Children's Music". I rename it before a merge with the following get dummies
df_music["genre"].replace('Children’s Music', "Children's Music", inplace=True)

# Get dummies on genre for Machine Learning Usage
df_music_dumies = pd.concat([df_music , df_music['genre'].str.get_dummies()], axis = 1)

# Factorization of "mode"
df_music_dumies['mode'] = df_music_dumies['mode'].factorize()[0]

In [230]:
# Linear regression first test

X = df_music_dumies[[
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'A Capella',
       'Alternative', 'Anime', 'Blues', "Children's Music",
       'Classical', 'Comedy', 'Country', 'Dance', 'Electronic', 'Folk',
       'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera', 'Pop', 'R&B', 'Rap',
       'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul', 'Soundtrack', 'World']]
       
y = df_music_dumies['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, train_size = 0.8)

modelLR = LinearRegression().fit(X_train,y_train)

print("accuracy score on train set:",modelLR.score(X_train, y_train))
print("accuracy score on test set:",modelLR.score(X_test, y_test))

accuracy score on train set: 0.6235298303234635
accuracy score on test set: 0.6185060306982363


In [231]:
# Get dummies on key for Machine Learning Usage

df_music_dumies_keys = pd.concat([df_music_dumies , df_music_dumies['key'].str.get_dummies()], axis = 1)

In [232]:
X = df_music_dumies_keys[[
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'A Capella',
       'Alternative', 'Anime', 'Blues', "Children's Music",
       'Classical', 'Comedy', 'Country', 'Dance', 'Electronic', 'Folk',
       'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera', 'Pop', 'R&B', 'Rap',
       'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul', 'Soundtrack', 'World', 'A', 'A#', 'B', 'C', 'C#',
       'D', 'D#', 'E', 'F', 'F#', 'G', 'G#']]
       
y = df_music_dumies_keys['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

modelLR = LinearRegression().fit(X_train,y_train)

print("accuracy score on train set:",modelLR.score(X_train, y_train))
print("accuracy score on test set:",modelLR.score(X_test, y_test))

accuracy score on train set: 0.6222983256902899
accuracy score on test set: 0.6250000193339308


In [233]:
cross_val_score(LinearRegression(), X, y, scoring="r2", cv = 6)

array([ 0.10732396, -0.01424236, -0.23472697,  0.13938638, -0.84604192,
        0.40047679])

In [234]:
# Function to split the popularity in 5 classes in order to use a classification Machine Learning method (suggested by Tarik as he says it's better than Linear Regression here)

def popularity_score(popularity):
  popularity = popularity / 20
  popularity = math.ceil(popularity)
  return popularity

In [235]:
# Applying the function

df_music_dumies_keys["popularity_score"] = df_music["popularity"].apply(popularity_score)

In [236]:
cols = ['genre', 'artist_name', 'track_name', 'track_id', 'popularity', 'key', 'time_signature']
df_music_dumies_keys.drop(cols, axis=1, inplace=True)

df_music_dumies_keys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 50 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      232725 non-null  float64
 1   danceability      232725 non-null  float64
 2   duration_ms       232725 non-null  int64  
 3   energy            232725 non-null  float64
 4   instrumentalness  232725 non-null  float64
 5   liveness          232725 non-null  float64
 6   loudness          232725 non-null  float64
 7   mode              232725 non-null  int64  
 8   speechiness       232725 non-null  float64
 9   tempo             232725 non-null  float64
 10  valence           232725 non-null  float64
 11  A Capella         232725 non-null  int64  
 12  Alternative       232725 non-null  int64  
 13  Anime             232725 non-null  int64  
 14  Blues             232725 non-null  int64  
 15  Children's Music  232725 non-null  int64  
 16  Classical         23

In [237]:
df_music_dumies_keys["popularity_score"].value_counts()

3    95006
2    72744
4    31546
1    26128
0     6312
5      989
Name: popularity_score, dtype: int64

In [238]:
# As we can see, there is column with a popularity of 0 and we wonder if it has a positive or negative impact on the Machine learning
# We create two dataset for our tests, one with the 0 pop value and one without this value

df_music_with_0_pop = df_music_dumies_keys
df_music_without_0_pop = df_music_dumies_keys[df_music_dumies_keys['popularity_score'] != 0]

In [239]:
df_music_without_0_pop["popularity_score"].value_counts()

3    95006
2    72744
4    31546
1    26128
5      989
Name: popularity_score, dtype: int64

In [240]:
df_music_dumies_keys.columns

Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode', 'speechiness',
       'tempo', 'valence', 'A Capella', 'Alternative', 'Anime', 'Blues',
       'Children's Music', 'Classical', 'Comedy', 'Country', 'Dance',
       'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera',
       'Pop', 'R&B', 'Rap', 'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul',
       'Soundtrack', 'World', 'A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F',
       'F#', 'G', 'G#', 'popularity_score'],
      dtype='object')

In [245]:
cols = ['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'A Capella',
       'Alternative', 'Anime', 'Blues', "Children's Music",
       'Classical', 'Comedy', 'Country', 'Dance', 'Electronic', 'Folk',
       'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera', 'Pop', 'R&B', 'Rap',
       'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul', 'Soundtrack', 'World', 'A', 'A#', 'B', 'C', 'C#',
       'D', 'D#', 'E', 'F', 'F#', 'G', 'G#']

X = df_music_with_0_pop[cols]
y = df_music_with_0_pop['popularity_score']

In [249]:
# Test with the value 0 in popularity

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2, train_size=0.75)

# Logistic Regression
model = LogisticRegression().fit(X_train,y_train)

print("accuracy score on logistic regression test set:",round(model.score(X_test, y_test), 5))
print("accuracy score on logistic regression train set:",round(model.score(X_train, y_train), 5))
print ("")


# Decision Tree
modelDTR = DecisionTreeClassifier()
modelDTR.fit(X_train, y_train)

print("accuracy score on decision tree classifier test set:",round(modelDTR.score(X_test, y_test), 5))
print("accuracy score on decision tree classifier train set:",round(modelDTR.score(X_train, y_train), 5))
print ("")

# Random Forest
rfc = RandomForestClassifier(n_estimators=15)
rfc.fit(X_train, y_train)

print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

accuracy score on logistic regression test set: 0.41985
accuracy score on logistic regression train set: 0.41954

accuracy score on decision tree classifier test set: 0.56703
accuracy score on decision tree classifier train set: 0.99925

accuracy score on RandomForest classifier test set: 0.67152
accuracy score on RandomForest classifier train set: 0.99191


In [250]:
# Test without the value 0 in popularity

X = df_music_without_0_pop[cols]
y = df_music_without_0_pop['popularity_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2, train_size=0.75)

# Logistic Regression
model = LogisticRegression().fit(X_train,y_train)

print("accuracy score on logistic regression test set:",round(model.score(X_test, y_test), 5))
print("accuracy score on logistic regression train set:",round(model.score(X_train, y_train), 5))
print ("")


# Decision Tree
modelDTR = DecisionTreeClassifier()
modelDTR.fit(X_train, y_train)

print("accuracy score on decision tree classifier test set:",round(modelDTR.score(X_test, y_test), 5))
print("accuracy score on decision tree classifier train set:",round(modelDTR.score(X_train, y_train), 5))
print ("")

# Random Forest
rfc = RandomForestClassifier(n_estimators=15)
rfc.fit(X_train, y_train)

print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

accuracy score on logistic regression test set: 0.41985
accuracy score on logistic regression train set: 0.41954

accuracy score on decision tree classifier test set: 0.56636
accuracy score on decision tree classifier train set: 0.99925

accuracy score on RandomForest classifier test set: 0.66972
accuracy score on RandomForest classifier train set: 0.99209


In [204]:
df_music_dumies_keys.to_csv("..\data\music_dumies.csv.zip", header = True, compression = "zip")

In [251]:
# Create and fit a scaler model
scaler = StandardScaler().fit(X_train)

# Your scaler model can now transform your data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
model = LogisticRegression().fit(X_train_scaled,y_train)

print("accuracy score on logistic regression train set:",round(model.score(X_train_scaled, y_train), 5))
print("accuracy score on logistic regression test set:",round(model.score(X_test_scaled, y_test), 5))
print ("")

# Decision Tree
modelDTR = DecisionTreeClassifier()
modelDTR.fit(X_train_scaled, y_train)

print("accuracy score on decision tree classifier train set:",round(modelDTR.score(X_train_scaled, y_train), 5))
print("accuracy score on decision tree classifier test set:",round(modelDTR.score(X_test_scaled, y_test), 5))
print ("")

# Random Forest
rfc = RandomForestClassifier(n_estimators=15)
rfc.fit(X_train, y_train)

print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score on logistic regression train set: 0.67883
accuracy score on logistic regression test set: 0.67702

accuracy score on decision tree classifier train set: 0.99925
accuracy score on decision tree classifier test set: 0.56521

accuracy score on RandomForest classifier test set: 0.66683
accuracy score on RandomForest classifier train set: 0.99194
