In [549]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import cross_val_score
import numpy as np
import math
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

url = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/spotify.zip"
df_music = pd.read_csv(url)
df_music.shape

(232725, 18)

In [523]:
# There is two genre with similar name: "Children’s Music" and "Children's Music". I rename it before a merge with the following get dummies
df_music["genre"].replace('Children’s Music', "Children's Music", inplace=True)

# Get dummies on genre for Machine Learning Usage
df_music_dumies = pd.concat([df_music , df_music['genre'].str.get_dummies()], axis = 1)

# Factorization of "mode"
df_music_dumies['mode'] = df_music_dumies['mode'].factorize()[0]

In [524]:
df_music_dumies.shape

(232725, 44)

In [525]:
# We remove song with a duration below 90 seconds et above 900 seconds

df_music_dumies = df_music_dumies.loc[df_music_dumies['duration_ms'] >= 90000]
df_music_dumies = df_music_dumies.loc[df_music_dumies['duration_ms'] <= 900000]

In [526]:
df_music_dumies = df_music_dumies[(df_music_dumies['genre'] != 'Soundtrack') & (df_music_dumies['genre'] != 'Movie') ]

In [527]:
df_music_dumies.shape

(209308, 44)

In [528]:
# Linear regression first test

X = df_music_dumies[[
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'A Capella',
       'Alternative', 'Anime', 'Blues', "Children's Music",
       'Classical', 'Comedy', 'Country', 'Dance', 'Electronic', 'Folk',
       'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera', 'Pop', 'R&B', 'Rap',
       'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul', 'Soundtrack', 'World']]
       
y = df_music_dumies['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8)

modelLR = LinearRegression().fit(X_train,y_train)

print("accuracy score on train set:",modelLR.score(X_train, y_train))
print("accuracy score on test set:",modelLR.score(X_test, y_test))

accuracy score on train set: 0.5992642450359075
accuracy score on test set: 0.5999739884976283


In [529]:
# Get dummies on key for Machine Learning Usage

df_music_dumies_keys = pd.concat([df_music_dumies , df_music_dumies['key'].str.get_dummies()], axis = 1)

In [530]:
X = df_music_dumies_keys[[
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'A Capella',
       'Alternative', 'Anime', 'Blues', "Children's Music",
       'Classical', 'Comedy', 'Country', 'Dance', 'Electronic', 'Folk',
       'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera', 'Pop', 'R&B', 'Rap',
       'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul', 'Soundtrack', 'World', 'A', 'A#', 'B', 'C', 'C#',
       'D', 'D#', 'E', 'F', 'F#', 'G', 'G#']]

y = df_music_dumies_keys['popularity']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75)

modelLR = LinearRegression().fit(X_train,y_train)

print("accuracy score on train set:",modelLR.score(X_train, y_train))
print("accuracy score on test set:",modelLR.score(X_test, y_test))

accuracy score on train set: 0.5999068211981908
accuracy score on test set: 0.5993474378219982


In [531]:
cross_val_score(LinearRegression(), X, y, scoring="r2", cv = 6)

array([ 0.08352754,  0.33644026, -0.0979565 ,  0.16003534, -0.35806045,
        0.26022852])

In [532]:
# Function to split the popularity in 5 classes in order to use a classification Machine Learning method (suggested by Tarik as he says it's better than Linear Regression here)

def popularity_score(popularity):
  popularity = popularity / 20
  popularity = math.ceil(popularity)
  return popularity

In [533]:
# Applying the function

df_music_dumies_keys["popularity_score"] = df_music["popularity"].apply(popularity_score)

In [534]:
# We remove useless columns
cols = ['genre', 'artist_name', 'track_name', 'track_id', 'key', 'time_signature']
df_music_dumies_keys.drop(cols, axis=1, inplace=True)

df_music_dumies_keys.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209308 entries, 135 to 232724
Data columns (total 51 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        209308 non-null  int64  
 1   acousticness      209308 non-null  float64
 2   danceability      209308 non-null  float64
 3   duration_ms       209308 non-null  int64  
 4   energy            209308 non-null  float64
 5   instrumentalness  209308 non-null  float64
 6   liveness          209308 non-null  float64
 7   loudness          209308 non-null  float64
 8   mode              209308 non-null  int64  
 9   speechiness       209308 non-null  float64
 10  tempo             209308 non-null  float64
 11  valence           209308 non-null  float64
 12  A Capella         209308 non-null  int64  
 13  Alternative       209308 non-null  int64  
 14  Anime             209308 non-null  int64  
 15  Blues             209308 non-null  int64  
 16  Children's Music  

In [535]:
df_music_dumies_keys["popularity_score"].value_counts()

3    92238
2    61713
4    31368
1    19181
0     3820
5      988
Name: popularity_score, dtype: int64

In [536]:
# As we can see, there is column with a popularity of 0 and we wonder if it has a positive or negative impact on the Machine learning
# We create two dataset for our tests, one with the 0 pop value and one without this value

df_music_with_0_pop = df_music_dumies_keys
df_music_without_0_pop = df_music_dumies_keys[df_music_dumies_keys['popularity_score'] != 0]

In [537]:
df_music_without_0_pop["popularity_score"].value_counts()

3    92238
2    61713
4    31368
1    19181
5      988
Name: popularity_score, dtype: int64

In [538]:
df_music_without_0_pop.shape

(205488, 51)

In [539]:
df_music_dumies_keys.columns

Index(['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode', 'speechiness',
       'tempo', 'valence', 'A Capella', 'Alternative', 'Anime', 'Blues',
       'Children's Music', 'Classical', 'Comedy', 'Country', 'Dance',
       'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera',
       'Pop', 'R&B', 'Rap', 'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul',
       'Soundtrack', 'World', 'A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F',
       'F#', 'G', 'G#', 'popularity_score'],
      dtype='object')

In [540]:
cols = ['acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'A Capella',
       'Alternative', 'Anime', 'Blues', "Children's Music",
       'Classical', 'Comedy', 'Country', 'Dance', 'Electronic', 'Folk',
       'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera', 'Pop', 'R&B', 'Rap',
       'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul', 'Soundtrack', 'World']

X = df_music_with_0_pop[cols]
y = df_music_with_0_pop['popularity_score']

In [541]:
# # Test with the value 0 in popularity

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2, train_size=0.6)

# # Logistic Regression
# model = LogisticRegression().fit(X_train,y_train)

# print("accuracy score on logistic regression test set:",round(model.score(X_test, y_test), 5))
# print("accuracy score on logistic regression train set:",round(model.score(X_train, y_train), 5))
# print ("")


# # Decision Tree
# modelDTR = DecisionTreeClassifier()
# modelDTR.fit(X_train, y_train)

# print("accuracy score on decision tree classifier test set:",round(modelDTR.score(X_test, y_test), 5))
# print("accuracy score on decision tree classifier train set:",round(modelDTR.score(X_train, y_train), 5))
# print ("")

# # Random Forest
# rfc = RandomForestClassifier(n_estimators=15)
# rfc.fit(X_train, y_train)

# print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
# print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

In [542]:
# Test without the value 0 in popularity

X = df_music_without_0_pop[cols]
y = df_music_without_0_pop['popularity_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

# Logistic Regression
model = LogisticRegression().fit(X_train,y_train)

print("accuracy score on logistic regression test set:",round(model.score(X_test, y_test), 5))
print("accuracy score on logistic regression train set:",round(model.score(X_train, y_train), 5))
print ("")


# Decision Tree
modelDTR = DecisionTreeClassifier()
modelDTR.fit(X_train, y_train)

print("accuracy score on decision tree classifier test set:",round(modelDTR.score(X_test, y_test), 5))
print("accuracy score on decision tree classifier train set:",round(modelDTR.score(X_train, y_train), 5))
print ("")

# Random Forest
rfc = RandomForestClassifier(n_estimators=15)
rfc.fit(X_train, y_train)

print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

accuracy score on logistic regression test set: 0.45887
accuracy score on logistic regression train set: 0.46002

accuracy score on decision tree classifier test set: 0.56369
accuracy score on decision tree classifier train set: 0.99926

accuracy score on RandomForest classifier test set: 0.66511
accuracy score on RandomForest classifier train set: 0.99171


In [543]:
# Create and fit a scaler model
scaler = StandardScaler().fit(X_train)

# Your scaler model can now transform your data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
model = LogisticRegression(max_iter=10).fit(X_train_scaled,y_train)

print("accuracy score on logistic regression train set:",round(model.score(X_train_scaled, y_train), 5))
print("accuracy score on logistic regression test set:",round(model.score(X_test_scaled, y_test), 5))
print ("")

# Decision Tree
modelDTR = DecisionTreeClassifier()
modelDTR.fit(X_train_scaled, y_train)

print("accuracy score on decision tree classifier train set:",round(modelDTR.score(X_train_scaled, y_train), 5))
print("accuracy score on decision tree classifier test set:",round(modelDTR.score(X_test_scaled, y_test), 5))
print ("")

# Random Forest
rfc = RandomForestClassifier(n_estimators=15)
rfc.fit(X_train, y_train)

print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score on logistic regression train set: 0.67351
accuracy score on logistic regression test set: 0.67815

accuracy score on decision tree classifier train set: 0.99926
accuracy score on decision tree classifier test set: 0.56243

accuracy score on RandomForest classifier test set: 0.66912
accuracy score on RandomForest classifier train set: 0.99174


In [554]:
# Logistic Regression
model = LogisticRegression(max_iter=20).fit(X_train_scaled,y_train)

print("accuracy score on logistic regression train set:",round(model.score(X_train_scaled, y_train), 5))
print("accuracy score on logistic regression test set:",round(model.score(X_test_scaled, y_test), 5))
print ("")

accuracy score on logistic regression train set: 0.67356
accuracy score on logistic regression test set: 0.67743



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [545]:
# # Random Forest
# rfc = RandomForestClassifier(n_estimators= 9, max_features= 'sqrt', max_depth= 54, criterion= 'entropy')
# rfc.fit(X_train, y_train)

# print("accuracy score on RandomForest classifier test set:",round(rfc.score(X_test, y_test), 5))
# print("accuracy score on RandomForest classifier train set:",round(rfc.score(X_train, y_train), 5))

In [546]:
# param_grid = {'n_estimators': range(5, 21)
#               ,'max_features': ['auto', 'sqrt', 'log2']
#               ,'max_depth': range(50, 55)
#               ,'criterion': ['gini', 'entropy']
#               }

# rando = RandomizedSearchCV(RandomForestClassifier(), param_grid, n_iter=2)
# rando.fit(X_train, y_train)

# print("best score:",rando.best_score_)
# print("best parameters:",rando.best_params_)

In [547]:
# dico = {'max_depth': range(1,51),
#         'min_samples_leaf': range(1,16),
#         'min_samples_split' : (2, 5, 7, 10, 15, 30)}

# rando = RandomizedSearchCV(DecisionTreeClassifier(), dico, n_iter=2)
# rando.fit(X,y)

# print("best score:",rando.best_score_)
# print("best parameters:",rando.best_params_)

In [548]:
# param_grid = {'n_estimators': range(5,11)
#               #,'max_features': ['auto', 'sqrt', 'log2']
#               #,'max_depth': range(50, 55)
#               #,'criterion': ['gini', 'entropy']
#               }

# grid = GridSearchCV(RandomForestClassifier(), param_grid)
# grid.fit(X_train, y_train)

# print("best score:",grid.best_score_)
# print("best parameters:",grid.best_params_)

In [555]:
df_music_without_0_pop.to_csv("..\data\music_dumies.csv.zip", header = True, compression = "zip")