In [2]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt # data visualization
import ipywidgets as widgets # interactive widgets
from ipywidgets import Box

In [4]:
# data loading and descriptive statistics
df = pd.read_csv('SpotifyFeatures.csv')
df["genre"].value_counts()

Comedy              9681
Soundtrack          9646
Indie               9543
Jazz                9441
Pop                 9386
Electronic          9377
Children’s Music    9353
Folk                9299
Hip-Hop             9295
Rock                9272
Alternative         9263
Classical           9256
Rap                 9232
World               9096
Soul                9089
Blues               9023
R&B                 8992
Anime               8936
Reggaeton           8927
Ska                 8874
Reggae              8771
Dance               8701
Country             8664
Opera               8280
Movie               7806
Children's Music    5403
A Capella            119
Name: genre, dtype: int64

In [8]:
df["genre"].replace({"Children's Music": "Children’s Music"}, inplace=True)
df["genre"].value_counts()
# general category
df["genre"].replace({"Soundtrack": "General","Movie":"General","Anime":"General","Children’s Music":"General","Comedy":"General"}, inplace=True)
# dance
df["genre"].replace({"Hip-Hop": "Dance","R&B":"dance","Dance":"Dance","Rap":"Dance","Pop":"Dance"}, inplace=True)
# folk
df["genre"].replace({"Folk": "Folk","Soul":"Folk","Blues":"Folk","Country":"Folk"}, inplace=True)

df["genre"].value_counts()

General        50825
dance          45606
Folk           36075
Indie           9543
Jazz            9441
Electronic      9377
Rock            9272
Alternative     9263
Classical       9256
World           9096
Reggaeton       8927
Ska             8874
Reggae          8771
Opera           8280
A Capella        119
Name: genre, dtype: int64

In [None]:

# Remove duplicates + unnecessary variables
df.drop_duplicates(subset=['track_id'], keep='first',inplace=True)
df.drop(['genre','artist_name','track_name','track_id','key','time_signature'],axis=1, inplace=True)

# Data cleaning and arrangement the data
#time_signature_df=pd.get_dummies(df["time_signature"]) # create dummy variables for the categorical variable "time_signature" 
#df = pd.concat([df,time_signature_df],axis=1) # add the dummy variables
df['mode'] = np.where(df['mode']=='Major', 1, 0) #change songs' mode (minor/major) to numerical

# change songs duration from milliseconds to seconds
df['duration_ms'] = df['duration_ms'] / 1000
df.rename(columns={'duration_ms': 'duration_s'}, inplace=True) # update column label


In [None]:
X= df.loc[:,df.columns !="popularity"]
y = df["popularity"]


In [None]:
# separate the data to training and testing
X_train, X_test, y_train,y_test=train_test_split(X,y,test_size=0.2)
# save as np.array
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train) 
y_test = np.array(y_test) 

In [None]:
# create a linear regression object
model_regression = linear_model.LinearRegression()

# create a random forest regression object
model_random_forest = RandomForestRegressor()

# create a random forest regression object
model_decision_tree = DecisionTreeRegressor()

In [None]:
# The models
model_regression.fit(X_train,y_train)
model_random_forest.fit(X_train,y_train)
model_decision_tree.fit(X_train,y_train)

In [None]:
# estimate the R² score on train data
print("Train data - the R^2 is",model_regression.score(X_train,y_train))
# estimate the R² score on test data
print("Test data - the R^2 is",model_regression.score(X_test,y_test))

In [None]:
# estimate the R² score on train data
print("Train data - the R^2 is",model_random_forest.score(X_train,y_train))
# estimate the R² score on test data
print("Test data - the R^2 is",model_random_forest.score(X_test,y_test))

In [None]:
# estimate the R² score on train data
print("Train data - the R^2 is",model_decision_tree.score(X_train,y_train))
# estimate the R² score on test data
print("Test data - the R^2 is",model_decision_tree.score(X_test,y_test))

In [None]:
import matplotlib.pyplot as plt
test_samples = 10
regression = []
random_forest = []
decision_tree = []
ground_truth = []
for i in range(test_samples): 
    regression.append(model_regression.predict([X_test[i]])) 
    random_forest.append(model_random_forest.predict([X_test[i]]))
    decision_tree.append(model_decision_tree.predict([X_test[i]]))
    ground_truth.append(y_test[i])

plt.plot(range(len(regression)), regression, label='Linear Regression')
plt.plot(range(len(random_forest)), random_forest, label='Random Forest')
plt.plot(range(len(decision_tree)), decision_tree, label='Decision Tree')
plt.plot(range(len(ground_truth)), ground_truth, label='Ground Truth')
plt.xlim([0, test_samples])
plt.ylim([0, 100])
plt.xlabel('songs')
plt.ylabel('popularity')
plt.legend()
plt.show()

In [None]:
# Variables normalization
acousticness = [0,1]
danceability = [0,1]
duration_s = [0,600]
energy = [0,1]
instrumentalness = [0,1]
liveness = [0,1]
loudness= [ -60,0]
speechiness = [0,1]
tempo = [0,250]
valence = [0,1]
mode = [0,1]
features_range = {"acousticness":[0,1],"danceability" : [0,1],"duration_s":[0,600],"energy":[0,1],"instrumentalness":[0,1],"liveness":[0,1],"loudness": [-60,0],"speechiness" : [0,1],"tempo" : [0,250],"valence" : [0,1],"mode" : [0,1]}


In [None]:
regression = [50]
random_forest = [50]
decision_tree = [50]
features = X.shape[1]
widgets_box = []
headers = X.columns
temp_sample =X.iloc[5]

#features_range["acousticness"][0]
for feature in range(features):
    
    temp_widget = widgets.FloatSlider(
    value=temp_sample[feature],
    min=features_range[headers[feature]][0],
    max=features_range[headers[feature]][1],
    step=0.1,
    description=headers[feature],
    disabled=False,
    continuous_update=False,
    orientation='vertical',
    readout=True,
    readout_format='.1f',
    )
    widgets_box.append(temp_widget)


box = Box(children=widgets_box)
box 

In [None]:
for feature in range(features): 
    temp_sample[feature] = widgets_box[feature].value


In [None]:

regression.append(model_regression.predict([temp_sample])) 
random_forest.append(model_random_forest.predict([temp_sample]))
decision_tree.append(model_decision_tree.predict([temp_sample]))

# Plot a simple line chart
plt.plot(range(len(regression)), regression, label='Linear Regression')
plt.plot(range(len(random_forest)), random_forest, label='Random Forest')
plt.plot(range(len(decision_tree)), decision_tree, label='Decision Tree')
plt.xlim([0, len(regression)])
plt.ylim([0, 100])
plt.legend()
plt.show()