In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # data visualization
import seaborn as sns
import ipywidgets as widgets # interactive widgets
from ipywidgets import Box
from sklearn import datasets, linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
# Deep learning
import torch
from torch import nn


In [None]:
# Read the Spotify track-features table from the CSV file into `df`.
csv_path = 'SpotifyFeatures.csv'
df = pd.read_csv(csv_path)


In [None]:
# Collapse the fine-grained genre labels into six broad groups.
#
# Everything is driven by one lookup table that is applied in a single pass and
# assigned back to the column. Calling `replace(..., inplace=True)` on
# `df["genre"]` mutates an intermediate object, which pandas deprecates and
# which does not update `df` under copy-on-write.
#
# Both spellings of "Children's Music" (straight and typographic apostrophe)
# are listed, and "R&B" maps to "Dance" with the same capitalisation as the
# rest of its group, so no separate lower-case "dance" category appears.
GENRE_GROUPS = {
    "General": ["Soundtrack", "Movie", "Anime", "Children's Music", "Children’s Music", "Comedy"],
    "Dance": ["Hip-Hop", "R&B", "Dance", "Rap", "Pop"],
    "Folk": ["Folk", "Soul", "Blues", "Country"],
    "Reggae": ["Reggaeton", "Ska", "Reggae", "World"],
    "Alternative": ["Indie", "Rock", "Alternative", "Electronic", "Jazz"],
    "Classical": ["Classical", "Opera", "A Capella"],
}

# Invert the table: original genre label -> group name.
genre_to_group = {
    genre: group
    for group, members in GENRE_GROUPS.items()
    for genre in members
}

# Labels that are not in the table are kept unchanged.
df["genre"] = df["genre"].map(genre_to_group).fillna(df["genre"])

In [None]:

# Keep the first row seen for every track_id, then discard the identifier and
# text columns that are not used as model inputs.
df = df.drop_duplicates(subset=['track_id'], keep='first')
df = df.drop(columns=['artist_name', 'track_name', 'track_id', 'key'])

# One-hot encode the two categorical columns and append the indicator columns
# to the right of the frame: time signatures first, genres second.
time_signature_df = pd.get_dummies(df["time_signature"])
genre_df = pd.get_dummies(df["genre"])
df = pd.concat([df, time_signature_df, genre_df], axis=1)

# Drop the encoded source columns together with the '0/4' and '1/4' indicators.
df = df.drop(columns=['genre', 'time_signature', '0/4', '1/4'])

# Encode the mode numerically: 1 for 'Major', 0 for every other value.
is_major = df['mode'] == 'Major'
df['mode'] = np.where(is_major, 1, 0)

# Express the track length in seconds instead of milliseconds and relabel the
# column accordingly.
df = df.assign(duration_ms=df['duration_ms'] / 1000)
df = df.rename(columns={'duration_ms': 'duration_s'})

In [None]:
# Data standardization: z-score the wide-range numeric inputs
# (subtract the column mean, divide by the column standard deviation).
#
# The target column "popularity" is intentionally NOT standardized. The plots
# further down fix the y-axis to [0, 100] and start their prediction history at
# 50, i.e. they expect popularity on its original scale; keeping the target in
# raw units also makes the reported RMSE directly interpretable.
#
# NOTE(review): the mean/std are computed on the whole data set, before the
# train/test split performed later in the notebook — confirm this is acceptable.
features = ["duration_s", "loudness", "tempo"]
df[features] = (df[features] - df[features].mean()) / df[features].std()

In [None]:
# Separate the target column from the model inputs:
# y is the "popularity" column, X is every other column of df.
y = df["popularity"]
X = df.drop(columns="popularity")


In [None]:
# Data visualization
#sample = df.sample(1000)
#sns.kdeplot(x = sample["acousticness"])
#sns.scatterplot(x=sample["acousticness"], y=sample["popularity"])
#sns.pairplot(sample[["acousticness","danceability"]])
#correlation_mat = df.corr()
#print(correlation_mat)
#sns.heatmap(correlation_mat)
#sns.scatterplot(sample["loudness"], sample["acousticness"], hue=sample["popularity"])
#sns.scatterplot(sample["acousticness"], sample["danceability"], size=sample["popularity"])
#g = sns.PairGrid(sample[[ "acousticness","popularity","genre"]], hue="genre")
#g.map_diag(sns.histplot)
#g.map_offdiag(sns.scatterplot)
#g.add_legend();

#df sample = diamonds.sample(3000)

In [None]:
# Pair plot of loudness, acousticness and popularity on a random subset of rows.
# `sample` is created here because no earlier cell assigns it (the only
# assignment is inside a commented-out line), so on a fresh kernel this cell
# raised NameError. The subset size is capped at the number of available rows
# and the draw is seeded so the figure is repeatable.
sample = df.sample(n=min(1000, len(df)), random_state=0)
g = sns.pairplot(sample[["loudness", "acousticness", "popularity"]])

In [None]:
# Show how many columns the prepared frame has.
column_count = df.shape[1]
print(column_count)

In [None]:
# Split into train and test, tune on the train set with cross-validation, and
# finally test once on the held-out test set.
#
# This cell is self-contained: it creates its own hold-out split and its own
# linear-regression estimator. The names it used before (X_train, y_train,
# X_test, y_test, model_regression) are only created in later cells, so on
# Restart & Run All the cell stopped with NameError.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train = np.asarray(X_train, dtype=float)
X_test = np.asarray(X_test, dtype=float)
y_train = np.asarray(y_train, dtype=float)
y_test = np.asarray(y_test, dtype=float)

# KFold with 10 shuffled splits (seeded for repeatable folds).
# `folds` is also used by the later grid-search cell.
folds = KFold(n_splits=10, shuffle=True, random_state=0)

# Candidate numbers of features to keep. The grid starts at 1 because RFE
# cannot select zero features.
hyper_params = [{'n_features_to_select': list(range(1, 18))}]

# Recursive feature elimination around a linear regression. RFE fits its own
# clone of the estimator, so the estimator does not need to be fitted first.
rfe = RFE(linear_model.LinearRegression())
model_cv = GridSearchCV(
    estimator=rfe,
    param_grid=hyper_params,
    scoring='r2',
    cv=folds,
    verbose=1,
    return_train_score=True,
)

# fit the model
model_cv.fit(X_train, y_train)

# Per-candidate cross-validation results as a table (plotted in the next cell).
cv_results = pd.DataFrame(model_cv.cv_results_)

# Hold-out RMSE of the best candidate, refitted on the whole training split
# (we want a smaller rmse).
y_pred_reg = model_cv.predict(X_test)
mse1 = mean_squared_error(y_test, y_pred_reg)
rmse1 = np.sqrt(mse1)
print(rmse1)

In [None]:
# Plot the mean cross-validated R^2 against the number of features kept by RFE:
# first the validation-fold score, then the training-fold score.
plt.figure(figsize=(10, 10))

n_selected = cv_results["param_n_features_to_select"]
for score_column in ("mean_test_score", "mean_train_score"):
    plt.plot(n_selected, cv_results[score_column])

plt.title("Optimal Number of Features")
plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.legend(['test score', 'train score'], loc='upper left')

In [None]:
# Separate the data into a training part (80%) and a testing part (20%).
# random_state pins the shuffle, so the split — and every score computed from
# it — is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Store the four pieces as float NumPy arrays. The dtype is given explicitly
# because X combines numeric columns with one-hot indicator columns (which may
# be boolean, depending on the pandas version); without it the conversion can
# fall back to a generic object array.
X_train = np.asarray(X_train, dtype=float)
X_test = np.asarray(X_test, dtype=float)
y_train = np.asarray(y_train, dtype=float)
y_test = np.asarray(y_test, dtype=float)

#Split into train and test, and train multiple models by sampling the train set. Finally, just test once on the test set.

In [None]:
# Instantiate the three regressors compared in this notebook, each with the
# library's default settings.

# decision tree regression object
model_decision_tree = DecisionTreeRegressor()

# random forest regression object
model_random_forest = RandomForestRegressor()

# linear regression object
model_regression = linear_model.LinearRegression()

In [None]:
# Fit the linear regression and the decision tree on the training split.
#
# The random forest is deliberately not fitted here: the next cell replaces
# `model_random_forest` with a 200-tree forest and fits that one, so fitting
# the default forest at this point was an extra training run whose result was
# thrown away before it was ever used.
model_regression.fit(X_train, y_train)
model_decision_tree.fit(X_train, y_train)

In [None]:
# Rebuild the random forest with 200 trees, fit it on the training split and
# report its R^2 score on the training data and on the test data.
model_random_forest = RandomForestRegressor(n_estimators=200)
model_random_forest.fit(X_train, y_train)

train_r2 = model_random_forest.score(X_train, y_train)
print("Train data - the R^2 is", train_r2)

test_r2 = model_random_forest.score(X_test, y_test)
print("Test data - the R^2 is", test_r2)


In [None]:


# Cross-validated recursive feature elimination (RFE) for the linear
# regression, run on the current training split with the `folds` splitter
# defined earlier in the notebook.

# Candidate numbers of features to keep. The grid starts at 1 because RFE
# cannot select zero features; a 0 entry only produces failed fits.
hyper_params = [{'n_features_to_select': list(range(1, 18))}]

# RFE and GridSearchCV work on clones of the estimator they receive, so the
# estimator is passed as-is; fitting it beforehand has no effect on the search.
rfe = RFE(model_regression)
model_cv = GridSearchCV(
    estimator=rfe,
    param_grid=hyper_params,
    scoring='r2',
    cv=folds,
    verbose=1,
    return_train_score=True,
)

# fit the model
model_cv.fit(X_train, y_train)

# Per-candidate cross-validation results, displayed as the cell output.
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

In [None]:
 y_pred_decision_tree= model_cv.predict(X_test)
#model_regression.coef
mse1 = mean_squared_error(y_test,y_pred_decision_tree)
# we want smaller rmse
rmse1 = np.sqrt(mse1)
print(rmse1)

In [None]:
# R² of the linear regression: first on the training split, then on the test split.
train_r2 = model_regression.score(X_train, y_train)
test_r2 = model_regression.score(X_test, y_test)
print("Train data - the R^2 is", train_r2)
print("Test data - the R^2 is", test_r2)

In [None]:
# R² of the random forest: first on the training split, then on the test split.
train_r2 = model_random_forest.score(X_train, y_train)
test_r2 = model_random_forest.score(X_test, y_test)
print("Train data - the R^2 is", train_r2)
print("Test data - the R^2 is", test_r2)

In [None]:
# R² of the decision tree: first on the training split, then on the test split.
train_r2 = model_decision_tree.score(X_train, y_train)
test_r2 = model_decision_tree.score(X_test, y_test)
print("Train data - the R^2 is", train_r2)
print("Test data - the R^2 is", test_r2)

In [None]:
# Test-set predictions of the three fitted models.
y_pred_reg = model_regression.predict(X_test)
y_pred_random_forset = model_random_forest.predict(X_test)
y_pred_decision_tree = model_decision_tree.predict(X_test)

# Actual vs. predicted popularity for the linear regression, with a fitted
# trend line. x and y are passed by keyword: current seaborn releases no
# longer accept them as positional arguments.
sns.regplot(x=y_test, y=y_pred_reg)

# Root-mean-squared error on the test set for each model (we want a smaller
# rmse). mean_squared_error comes from the import cell at the top of the
# notebook.
mse1 = mean_squared_error(y_test, y_pred_reg)
rmse1 = np.sqrt(mse1)

mse2 = mean_squared_error(y_test, y_pred_random_forset)
rmse2 = np.sqrt(mse2)

mse3 = mean_squared_error(y_test, y_pred_decision_tree)
rmse3 = np.sqrt(mse3)

# Label each number so the three values cannot be mixed up in the output.
for model_name, rmse_value in (
    ("Linear Regression", rmse1),
    ("Random Forest", rmse2),
    ("Decision Tree", rmse3),
):
    print(f"{model_name} RMSE: {rmse_value}")


In [None]:
# Compare the three models' predictions with the true popularity for the first
# few songs of the test set.
test_samples = 10

# Slicing never runs past the end of the arrays, so a test set with fewer than
# `test_samples` rows is plotted as far as it goes instead of raising IndexError.
first_songs = X_test[:test_samples]
ground_truth = y_test[:test_samples]

# One predict call per model for all selected songs, rather than one call per
# song inside a Python loop.
regression = model_regression.predict(first_songs)
random_forest = model_random_forest.predict(first_songs)
decision_tree = model_decision_tree.predict(first_songs)

song_index = range(len(ground_truth))
plt.plot(song_index, regression, label='Linear Regression')
plt.plot(song_index, random_forest, label='Random Forest')
plt.plot(song_index, decision_tree, label='Decision Tree')
plt.plot(song_index, ground_truth, label='Ground Truth')
plt.xlim([0, test_samples])
plt.ylim([0, 100])
plt.xlabel('songs')
plt.ylabel('popularity')
plt.legend()
plt.show()

In [None]:
# Value range [min, max] for each adjustable song feature.
# `features_range` is the lookup used for the slider limits in the interactive
# section; the individual variables hold the same ranges, one per feature.
features_range = dict(
    acousticness=[0, 1],
    danceability=[0, 1],
    duration_s=[0, 600],
    energy=[0, 1],
    instrumentalness=[0, 1],
    liveness=[0, 1],
    loudness=[-60, 0],
    speechiness=[0, 1],
    tempo=[0, 250],
    valence=[0, 1],
    mode=[0, 1],
)

# Features with their own specific limits.
duration_s = [0, 600]
loudness = [-60, 0]
tempo = [0, 250]

# Features bounded by 0 and 1 (each variable gets its own list object).
acousticness, danceability, energy, instrumentalness = ([0, 1] for _ in range(4))
liveness, speechiness, valence, mode = ([0, 1] for _ in range(4))


In [None]:
# Interactive "what-if" controls: one vertical slider per model input,
# initialised from one song of the data set.

# Prediction histories; each starts at 50 and is extended by the prediction cell.
regression = [50]
random_forest = [50]
decision_tree = [50]

headers = X.columns
features = X.shape[1]  # number of model inputs (= number of sliders)

# Copy the row so that writing slider values into it later cannot affect X.
temp_sample = X.iloc[5].copy()

# Columns without an entry in `features_range` (the one-hot time-signature and
# genre indicator columns) only take the values 0 and 1. Looking them up
# directly raised KeyError.
default_range = [0, 1]

# NOTE(review): X holds z-scored duration_s / loudness / tempo (see the
# standardization cell), whereas `features_range` gives limits in raw units for
# them — confirm which scale the sliders are meant to use.
widgets_box = []
for position, column in enumerate(headers):
    low, high = features_range.get(column, default_range)
    temp_widget = widgets.FloatSlider(
        # .iloc: the row is labelled by column name, so the value has to be
        # fetched by position explicitly; float() also turns boolean
        # indicator values into 0.0 / 1.0.
        value=float(temp_sample.iloc[position]),
        min=low,
        max=high,
        step=0.1,
        description=column,
        disabled=False,
        continuous_update=False,
        orientation='vertical',
        readout=True,
        readout_format='.1f',
    )
    widgets_box.append(temp_widget)

# Lay the sliders out in one box and display it as the cell output.
box = Box(children=widgets_box)
box

In [None]:
# Copy the current slider positions back into the sample row, slider i -> input i.
# Positional access (.iloc) is used because the row is labelled by column name;
# an integer key inside plain [] is interpreted as a label by current pandas
# rather than as a position.
for position, slider in enumerate(widgets_box):
    temp_sample.iloc[position] = slider.value


In [None]:

# Predict the popularity of the slider-defined song with each model and extend
# the prediction histories.

# The models were fitted on plain NumPy arrays, so pass one float row of shape
# (1, n_features).
sample_row = np.asarray(temp_sample, dtype=float).reshape(1, -1)

# predict() returns a one-element array. Store the scalar inside it so each
# history stays a flat list of numbers; mixing the initial plain number with
# arrays gives a ragged sequence that cannot be plotted as a line.
regression.append(float(model_regression.predict(sample_row)[0]))
random_forest.append(float(model_random_forest.predict(sample_row)[0]))
decision_tree.append(float(model_decision_tree.predict(sample_row)[0]))

# Plot a simple line chart
history_index = range(len(regression))
plt.plot(history_index, regression, label='Linear Regression')
plt.plot(history_index, random_forest, label='Random Forest')
plt.plot(history_index, decision_tree, label='Decision Tree')
plt.xlim([0, len(regression)])
plt.ylim([0, 100])
plt.legend()
plt.show()

In [None]:
# deep learning