# Modeling Section #
## 5. Song popularity prediction [Experimental]

Find a model that best fits to the dataset to predict a song's popularity.

In [1]:
# Collecting packages
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, round
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Configure SparkUI
conf = SparkConf().set('spark.ui.port', '4050')

# Create a URL through you can access the Spark UI
get_ipython().system_raw('./ngrok http 4050 &')

# Access the URL
!curl -s http://localhost:4040/api/tunnels 

# Creating a context. Access SparkUI now in localhost:4050
sc = SparkContext(conf=conf)

# Spark Session
spark = (
    SparkSession.builder                  
      .appName("model_popularity")
      .config('spark.ui.port', '4050')
      .getOrCreate()
)

Collecting data

In [3]:
path_data = "clean_data_music.csv"

schema_music = StructType([
    StructField('Index', IntegerType()),
    StructField('Track_duration', IntegerType()),
    StructField('Popularity', IntegerType()),
    StructField('Danceability', DoubleType()),
    StructField('Energy', DoubleType()),
    StructField('Key', IntegerType()),
    StructField('Loudness', DoubleType()),
    StructField('Acousticness', DoubleType()),
    StructField('Instrumentalness', DoubleType()),
    StructField('Liveness', DoubleType()),
    StructField('Valence', DoubleType()),
    StructField('Tempo', DoubleType()),
    StructField('Year', IntegerType())
])

In [4]:
df = spark.read.csv(
    path_data,
    header=True,
    sep=',',
    schema=schema_music
).drop('index')

In [5]:
df.printSchema()

root
 |-- Track_duration: integer (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- Danceability: double (nullable = true)
 |-- Energy: double (nullable = true)
 |-- Key: integer (nullable = true)
 |-- Loudness: double (nullable = true)
 |-- Acousticness: double (nullable = true)
 |-- Instrumentalness: double (nullable = true)
 |-- Liveness: double (nullable = true)
 |-- Valence: double (nullable = true)
 |-- Tempo: double (nullable = true)
 |-- Year: integer (nullable = true)



See values near zero:

In [6]:
df.createOrReplaceTempView('base_music')

spark.sql(
    """
      select count(popularity) as Popularity_near_zero
           from base_music
           where popularity <= 5
    """
).show()

+--------------------+
|Popularity_near_zero|
+--------------------+
|                2926|
+--------------------+



### First Model: Random Forest regression

In [7]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

train, test = df.randomSplit([0.75, 0.25], seed = 42)
print("train =", train.count(), " test =", test.count())

input_cols = ['Track_duration', 
              'Danceability', 
              'Energy', 
              'Key', 
              'Loudness', 
              'Acousticness', 
              'Instrumentalness', 
              'Liveness', 
              'Valence', 
              'Tempo',
              'Year']

rf = RandomForestRegressor().setParams(
    numTrees=3,
    maxDepth=5,
    labelCol = "Popularity",
    predictionCol = "prediction"
)

assembler = VectorAssembler(
  inputCols = [x for x in train.columns if x in input_cols],
  outputCol = "features"
)

train = 7570  test = 2418


In [8]:
# Train a Random Forest Regression model on the input data and return an
# object to make predictions on new data
model = Pipeline(stages = [assembler, rf]).fit(train)

In [9]:
predicted = model.transform(test)

In [10]:
predicted.show()

+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|Track_duration|Popularity|Danceability|Energy|Key|Loudness|Acousticness|Instrumentalness|Liveness|Valence|  Tempo|Year|            features|        prediction|
+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|         97506|        30|       0.521| 0.765|  8|  -6.985|       0.838|         1.61E-6|   0.333|  0.824|123.591|2008|[97506.0,0.521,0....| 35.19144402829796|
|        105200|         0|       0.485| 0.935|  7|   -3.62|     0.00183|           0.873|  0.0915|   0.97|158.427|2005|[105200.0,0.485,0...|30.532799131713645|
|        106293|        10|       0.716| 0.395| 11|  -8.989|         0.4|         2.95E-6|  0.0921|  0.585|130.536|1966|[106293.0,0.716,0...| 36.47585600436779|
|        107386|        26|       

Tunning the model

In [11]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

data = assembler.transform(df)

train, test = data.randomSplit([0.75, 0.25], seed = 42)
print("train =", train.count(), " test =", test.count())

rf = RandomForestRegressor(
    featuresCol="features",
    labelCol = "Popularity",
    predictionCol = "prediction"
)


train = 7570  test = 2418


In [12]:
# Define the hyperparameter grid
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [2, 3, 4, 5, 7, 10, 15]) \
    .addGrid(rf.maxDepth, [1, 5, 8, 10, 15]) \
    .addGrid(rf.maxBins, [23,27,30, 32] ) \
    .build()

# Cross-validator for regression
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol= "Popularity", metricName="rmse")

crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=5)



# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

In [13]:
best_model_rf = cvModel.bestModel

In [14]:
# Analyze parameters and bias
optimal_paratemers = list(range(3))
optimal_paratemers[0] = best_model_rf.getNumTrees
optimal_paratemers[1] = best_model_rf.getMaxDepth()
optimal_paratemers[2] = best_model_rf.getMaxBins()

In [15]:
best_model_rf.featureImportances

SparseVector(11, {0: 0.1091, 1: 0.0439, 2: 0.0575, 3: 0.0171, 4: 0.0812, 5: 0.0812, 6: 0.0431, 7: 0.0294, 8: 0.0566, 9: 0.027, 10: 0.454})

So the most important features for this model to predict popularity are 'Year', 'Track_duration', 'Loudness' and 'Danceability', in order of most to less relevant.

In [16]:
optimal_paratemers

[7, 5, 27]

In [17]:
prediction = best_model_rf.transform(test)

In [18]:
prediction.show()

+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|Track_duration|Popularity|Danceability|Energy|Key|Loudness|Acousticness|Instrumentalness|Liveness|Valence|  Tempo|Year|            features|        prediction|
+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|         97506|        30|       0.521| 0.765|  8|  -6.985|       0.838|         1.61E-6|   0.333|  0.824|123.591|2008|[97506.0,0.521,0....|27.294247567783657|
|        105200|         0|       0.485| 0.935|  7|   -3.62|     0.00183|           0.873|  0.0915|   0.97|158.427|2005|[105200.0,0.485,0...|25.820714702413056|
|        106293|        10|       0.716| 0.395| 11|  -8.989|         0.4|         2.95E-6|  0.0921|  0.585|130.536|1966|[106293.0,0.716,0...| 34.84142290558858|
|        107386|        26|       

### Second Model: Lasso regression

In [19]:
from pyspark.ml.regression import LinearRegression

# Using only L1 regularization
lasso_regression = LinearRegression(featuresCol="features", labelCol="Popularity", elasticNetParam=1)

# Grid
params_grid = ParamGridBuilder() \
    .addGrid(lasso_regression.regParam, [0.01, 0.1, 1.0, 10]) \
    .build()

cross_validator = CrossValidator(estimator=lasso_regression,
                                 estimatorParamMaps=params_grid,
                                 evaluator=evaluator,
                                 numFolds=5)

In [20]:
cvModel = cross_validator.fit(train)
best_model_lasso = cvModel.bestModel

In [21]:
print("Best Lasso Model\n\r")
print("Coefficients: ", best_model_lasso.coefficients)
print("Bias: {:.2f}".format(best_model_lasso.intercept))

Best Lasso Model

Coefficients:  [4.8777725920173445e-06,17.70209526969712,-7.753719327412031,0.04268624045429491,1.0651030565776527,-2.6922296545157476,-7.931265775565377,-6.526539525541157,-7.977986126727718,0.007775770258983764,-0.29451294961949426]
Bias: 633.82


In [22]:
features_importance = sorted(list(zip(input_cols, 
                                     map(abs, best_model_lasso.coefficients))), 
                                     key=lambda x: x[1], 
                                     reverse=True)

print("Feature Importance:")
for feature, importance in features_importance:
    print("  {}: {:.2f}".format(feature, importance))

Feature Importance:
  Danceability: 17.70
  Valence: 7.98
  Instrumentalness: 7.93
  Energy: 7.75
  Liveness: 6.53
  Acousticness: 2.69
  Loudness: 1.07
  Year: 0.29
  Key: 0.04
  Tempo: 0.01
  Track_duration: 0.00


In [23]:
# Make predictions on the test data
prediction = best_model_lasso.transform(test)
prediction.show()

+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|Track_duration|Popularity|Danceability|Energy|Key|Loudness|Acousticness|Instrumentalness|Liveness|Valence|  Tempo|Year|            features|        prediction|
+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|         97506|        30|       0.521| 0.765|  8|  -6.985|       0.838|         1.61E-6|   0.333|  0.824|123.591|2008|[97506.0,0.521,0....|29.064259778850555|
|        105200|         0|       0.485| 0.935|  7|   -3.62|     0.00183|           0.873|  0.0915|   0.97|158.427|2005|[105200.0,0.485,0...|27.580735510736076|
|        106293|        10|       0.716| 0.395| 11|  -8.989|         0.4|         2.95E-6|  0.0921|  0.585|130.536|1966|[106293.0,0.716,0...|50.503212316571194|
|        107386|        26|       

### Third Model: Decision Tree Regression

In [24]:
from pyspark.ml.regression import DecisionTreeRegressor

# Using only L1 regularization
dtr = DecisionTreeRegressor(featuresCol="features", labelCol="Popularity", predictionCol = "prediction")

# Grid
params_grid = ParamGridBuilder() \
    .addGrid(dtr.maxBins, [23,27,30, 32]) \
    .addGrid(dtr.maxDepth, [1, 5, 8, 10, 15]) \
    .build()

In [25]:
crossval = CrossValidator(estimator=dtr,
                          estimatorParamMaps=params_grid,
                          evaluator=evaluator,
                          numFolds=5)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

In [26]:
best_model_dtr = cvModel.bestModel
predictions = best_model_dtr.transform(test)

In [27]:
predictions.show()

+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|Track_duration|Popularity|Danceability|Energy|Key|Loudness|Acousticness|Instrumentalness|Liveness|Valence|  Tempo|Year|            features|        prediction|
+--------------+----------+------------+------+---+--------+------------+----------------+--------+-------+-------+----+--------------------+------------------+
|         97506|        30|       0.521| 0.765|  8|  -6.985|       0.838|         1.61E-6|   0.333|  0.824|123.591|2008|[97506.0,0.521,0....|27.977777777777778|
|        105200|         0|       0.485| 0.935|  7|   -3.62|     0.00183|           0.873|  0.0915|   0.97|158.427|2005|[105200.0,0.485,0...| 30.79741379310345|
|        106293|        10|       0.716| 0.395| 11|  -8.989|         0.4|         2.95E-6|  0.0921|  0.585|130.536|1966|[106293.0,0.716,0...| 33.98581560283688|
|        107386|        26|       

### Fourth Model: ANN for regression

Using tensor flow for an ANN (Artificial Neural Network model).

In [28]:
df_pandas = df.toPandas()

In [29]:
df_pandas.head()

Unnamed: 0,Track_duration,Popularity,Danceability,Energy,Key,Loudness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Year
0,216270,0,0.617,0.872,8,-12.305,0.0158,0.112,0.408,0.504,111.458,1992
1,237120,64,0.825,0.743,2,-5.995,0.0142,2.1e-05,0.237,0.8,127.045,2009
2,312533,56,0.677,0.665,7,-5.171,0.56,1e-06,0.338,0.706,74.981,1999
3,233400,42,0.683,0.728,9,-8.92,0.568,5.1e-05,0.0384,0.833,75.311,2014
4,448720,0,0.319,0.627,0,-9.611,0.675,7.3e-05,0.289,0.497,85.818,1969


In [30]:
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [31]:
train_ann = train.toPandas()
test_ann = test.toPandas()

In [32]:
train_ann.drop(columns = ['features'], inplace=True)
test_ann.drop(columns = ['features'], inplace=True)

Let's make a few changes in dataset to use sigmoid function for popularity.

In [33]:
df_pandas['Popularity'] = df_pandas['Popularity'].astype(float)

Let's scale the dataset to improve ann results.

In [34]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(df_pandas)

df_scaled = pd.DataFrame(scaler.transform(df_pandas))

In [35]:
df_scaled.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.085069,0.0,0.582489,0.874621,0.727273,0.586519,0.015941,0.113706,0.405322,0.489393,0.418005,0.537313
1,0.099254,0.653061,0.816565,0.745231,0.181818,0.803417,0.014326,2.2e-05,0.230297,0.797213,0.50322,0.791045
2,0.150559,0.571429,0.650011,0.666994,0.636364,0.831741,0.565085,1e-06,0.333675,0.699459,0.218584,0.641791
3,0.096723,0.428571,0.656763,0.730185,0.818182,0.702874,0.573157,5.2e-05,0.027021,0.831531,0.220388,0.865672
4,0.243209,0.0,0.24713,0.628879,0.0,0.679121,0.681129,7.4e-05,0.283521,0.482113,0.27783,0.19403
5,0.069474,0.806122,0.643259,0.712131,0.818182,0.743985,0.039654,1.1e-05,0.027329,0.832571,0.457844,0.313433
6,0.056176,0.795918,0.518343,0.68204,0.545455,0.746597,0.560039,0.0,0.106448,0.297005,0.297364,0.925373
7,0.101585,0.622449,0.428314,0.629882,0.545455,0.746769,0.175578,3.3e-05,0.06479,0.52787,0.592699,0.61194
8,0.114683,0.755102,0.289894,0.654958,0.818182,0.818576,0.083549,0.0,0.074104,0.508111,0.774397,0.373134
9,0.079942,0.0,0.520594,0.683043,1.0,0.839887,0.114024,0.0,0.100307,0.336522,0.618132,0.895522


In [36]:
#df_pandas['Popularity'] = df_pandas['Popularity'].apply(lambda x:x/100)
#df_pandas.head()

In [37]:
train_ann['Popularity'] = train_ann['Popularity'].astype(float)
#train_ann['Popularity'] = train_ann['Popularity'].apply(lambda x:x/100)
test_ann['Popularity'] = test_ann['Popularity'].astype(float)
#test_ann['Popularity'] = test_ann['Popularity'].apply(lambda x:x/100)

scaler.fit(train_ann)
train_ann = pd.DataFrame(scaler.transform(train_ann))

scaler.fit(test_ann)
test_ann = pd.DataFrame(scaler.transform(test_ann))

In [137]:
#df = df_pandas.astype(float)

In [138]:
#train_ann = train_ann.astype(float)
#test_ann = test_ann.astype(float)

Model will consist of 5 layers: 11 neurons input, 8,  6 and 4 neurons hidden layers and an output layer (1 neuron).

In [38]:
model_ann = keras.Sequential([
    keras.layers.Dense(8, input_shape=(11,), activation='relu'),
    keras.layers.Dense(6, activation = 'sigmoid'),
    keras.layers.Dense(4, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid')
])

In [39]:
opt = tf.keras.optimizers.experimental.Adam(
    learning_rate=0.01
)

model_ann.compile(optimizer=opt,
              loss='mse',
              metrics=['mse', 'mae']
            )

In [40]:
X = train_ann.drop(columns = [1]).copy()
y = train_ann[1].copy()

In [41]:
model_ann.fit(X, y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1fc898b02b0>

In [42]:
X_test = test_ann.drop(columns = [1]).copy()
y_test = test_ann[1].copy()

In [43]:
model_ann.evaluate(X_test,  y_test, verbose=2)

76/76 - 0s - loss: 0.0905 - mse: 0.0905 - mae: 0.2657 - 342ms/epoch - 5ms/step


[0.09045311063528061, 0.09045311063528061, 0.2656819522380829]

In [44]:
predictions = model_ann.predict(X_test)



In [45]:
results = pd.DataFrame({'Predictions':predictions[0:30].reshape(30,), 'Test':y_test[0:30]})

In [46]:
results.head(10)

Unnamed: 0,Predictions,Test
0,0.166357,0.309278
1,0.213126,0.0
2,0.472497,0.103093
3,0.336735,0.268041
4,0.329499,0.56701
5,0.221005,0.185567
6,0.34988,0.56701
7,0.429347,0.773196
8,0.269292,0.360825
9,0.472497,0.0


In [47]:
print(f"O valor maximo entre as previsoes e {max(results['Predictions'])} e o minimo e {min(results['Predictions'])}")
print(f"O valor maximo na amostra e {max(results['Test'])} e o minimo e {min(results['Test'])}")

O valor maximo entre as previsoes e 0.4724966287612915 e o minimo e 0.13152974843978882
O valor maximo na amostra e 0.7731958762886598 e o minimo e 0.0


In [62]:
spark.stop()