In [3]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf


from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [61]:
# loading the data
df_genre_clean = pd.read_csv("Resources/cleaned_data.csv")
df_genre_clean.head()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,edm,latin,pop,r&b,rap,rock
0,1.0,0.748,0.916,6.0,-2.634,1.0,0.0583,0.102,0.0,0.0653,0.518,122.036,194754.0,0,0,1,0,0,0
1,1.0,0.726,0.815,11.0,-4.969,1.0,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600.0,0,0,1,0,0,0
2,1.0,0.675,0.931,1.0,-3.432,0.0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616.0,0,0,1,0,0,0
3,1.0,0.718,0.93,7.0,-3.778,1.0,0.102,0.0287,9e-06,0.204,0.277,121.956,169093.0,0,0,1,0,0,0
4,1.0,0.65,0.833,1.0,-4.672,1.0,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052.0,0,0,1,0,0,0


In [62]:
# drop less important columns
columns=['edm','latin','pop','r&b','rap','rock','key','mode']
df_genre_clean = df_genre_clean.drop(columns=columns)
df_genre_clean

Unnamed: 0,track_popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,1.0,0.748,0.916,-2.634,0.0583,0.102000,0.000000,0.0653,0.5180,122.036,194754.0
1,1.0,0.726,0.815,-4.969,0.0373,0.072400,0.004210,0.3570,0.6930,99.972,162600.0
2,1.0,0.675,0.931,-3.432,0.0742,0.079400,0.000023,0.1100,0.6130,124.008,176616.0
3,1.0,0.718,0.930,-3.778,0.1020,0.028700,0.000009,0.2040,0.2770,121.956,169093.0
4,1.0,0.650,0.833,-4.672,0.0359,0.080300,0.000000,0.0833,0.7250,123.976,189052.0
...,...,...,...,...,...,...,...,...,...,...,...
32828,0.0,0.428,0.922,-1.814,0.0936,0.076600,0.000000,0.0668,0.2100,128.170,204375.0
32829,0.0,0.522,0.786,-4.462,0.0420,0.001710,0.004270,0.3750,0.4000,128.041,353120.0
32830,0.0,0.529,0.821,-4.899,0.0481,0.108000,0.000001,0.1500,0.4360,127.989,210112.0
32831,0.0,0.626,0.888,-3.361,0.1090,0.007920,0.127000,0.3430,0.3080,128.008,367432.0


In [63]:
# Sampling our data to ensure we have balance set of data between train and test data

df_genre_clean = df_genre_clean.sample(frac = 0.8,random_state = 420)
df_genre_clean

Unnamed: 0,track_popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
27566,1.0,0.367,0.783,-4.504,0.1540,0.00515,0.000000,0.0545,0.289,88.013,192653.0
17762,0.0,0.333,0.456,-8.960,0.0293,0.72500,0.000000,0.0921,0.331,169.954,209529.0
543,1.0,0.584,0.643,-6.415,0.0395,0.28800,0.000000,0.1860,0.530,90.036,181844.0
4003,1.0,0.802,0.645,-6.181,0.0715,0.27200,0.000000,0.1190,0.376,100.001,157202.0
8860,0.0,0.671,0.803,-7.497,0.2920,0.05200,0.000248,0.1500,0.762,137.997,170013.0
...,...,...,...,...,...,...,...,...,...,...,...
31487,0.0,0.661,0.935,-4.711,0.0796,0.00408,0.000825,0.0780,0.389,127.987,335613.0
28246,0.0,0.736,0.506,-12.549,0.4020,0.75400,0.000001,0.4710,0.424,124.015,230338.0
27383,0.0,0.798,0.832,-5.340,0.0916,0.16700,0.016900,0.1050,0.717,117.967,225890.0
3367,0.0,0.824,0.862,-5.213,0.0403,0.00270,0.000201,0.0917,0.773,128.001,390791.0


In [64]:
# Define features set


X=df_genre_clean.copy()
X=X.drop(columns=["track_popularity"])
X

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
27566,0.367,0.783,-4.504,0.1540,0.00515,0.000000,0.0545,0.289,88.013,192653.0
17762,0.333,0.456,-8.960,0.0293,0.72500,0.000000,0.0921,0.331,169.954,209529.0
543,0.584,0.643,-6.415,0.0395,0.28800,0.000000,0.1860,0.530,90.036,181844.0
4003,0.802,0.645,-6.181,0.0715,0.27200,0.000000,0.1190,0.376,100.001,157202.0
8860,0.671,0.803,-7.497,0.2920,0.05200,0.000248,0.1500,0.762,137.997,170013.0
...,...,...,...,...,...,...,...,...,...,...
31487,0.661,0.935,-4.711,0.0796,0.00408,0.000825,0.0780,0.389,127.987,335613.0
28246,0.736,0.506,-12.549,0.4020,0.75400,0.000001,0.4710,0.424,124.015,230338.0
27383,0.798,0.832,-5.340,0.0916,0.16700,0.016900,0.1050,0.717,117.967,225890.0
3367,0.824,0.862,-5.213,0.0403,0.00270,0.000201,0.0917,0.773,128.001,390791.0


In [65]:
# Define target vector

y=df_genre_clean['track_popularity'].ravel()
y

array([1., 0., 1., ..., 0., 0., 0.])

In [66]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [67]:
X_train.values[0]

array([ 6.45000e-01,  8.14000e-01, -5.93400e+00,  6.67000e-02,
        1.64000e-01,  1.64000e-06,  2.13000e-01,  5.19000e-01,
        1.45036e+02,  1.90345e+05])

In [68]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [69]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [11]:
# save the scaler
from joblib import dump
dump(scaler, 'std_scaler.bin', compress=True)

['std_scaler.bin']

In [70]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [71]:
X_train_scaled

array([[-0.0673396 ,  0.6361474 ,  0.26263051, ...,  0.03133489,
         0.88641992, -0.59294298],
       [-1.03177175,  1.15460428,  0.48808484, ...,  0.85424361,
         1.29311624, -0.46320433],
       [ 0.01532602, -0.13050691,  0.3945876 , ...,  1.59143267,
        -1.21983775,  0.41918539],
       ...,
       [-2.67130642,  0.48722894, -0.34476997, ..., -1.04444683,
         0.33239824,  1.50648708],
       [-0.81133012,  1.43589471,  1.23473652, ...,  1.35570361,
         0.06519554, -0.05292982],
       [ 0.14621324,  0.49274444,  0.70624506, ...,  0.755666  ,
         0.07302728,  0.45996754]])

In [73]:
# Using LogisticRegression Model

LR_Model = LogisticRegression()
LR_Model.fit(X_train_scaled, y_train)
LR_Predict = LR_Model.predict(X_test_scaled)
LR_Accuracy = accuracy_score(y_test, LR_Predict)
print("Accuracy: " + str(LR_Accuracy))

LR_AUC = roc_auc_score(y_test, LR_Predict)
print("AUC: " + str(LR_AUC))

Accuracy: 0.6445865692096848
AUC: 0.5344580043072505


# Data Model Optimization

In [74]:
# Using DecisionTree Model

DT_Model = DecisionTreeClassifier(max_depth=4)
DT_Model.fit(X_train_scaled, y_train)
DT_Predict = DT_Model.predict(X_test_scaled)
DT_Accuracy = accuracy_score(y_test, DT_Predict)
print("Accuracy: " + str(DT_Accuracy))

DT_AUC = roc_auc_score(y_test, DT_Predict)
print("AUC: " + str(DT_AUC))

Accuracy: 0.639409167047358
AUC: 0.5471703756879636


In [75]:
# Using RandomForestClassifier Model

RFC_Model = RandomForestClassifier()
RFC_Model.fit(X_train_scaled, y_train)
RFC_Predict = RFC_Model.predict(X_test_scaled)
RFC_Accuracy = accuracy_score(y_test, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))

RFC_AUC = roc_auc_score(y_test, RFC_Predict)
print("AUC: " + str(RFC_AUC))

Accuracy: 0.7385411908024974
AUC: 0.6797080641301747


In [76]:
# comparing the predicted value to the actual value
predicted_value_df=pd.DataFrame({"Prediction": RFC_Predict, "Actual": y_test})
predicted_value_df

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,1.0,1.0
2,0.0,0.0
3,0.0,1.0
4,0.0,1.0
...,...,...
6562,0.0,0.0
6563,0.0,0.0
6564,0.0,0.0
6565,0.0,0.0


In [77]:
# merge the dataframe
X_test['predicted_value']=RFC_Predict
X_test

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,predicted_value
27094,0.627,0.495,-11.236,0.1270,0.400000,0.000035,0.0714,0.305,120.088,212101.0,0.0
21961,0.883,0.525,-8.054,0.0633,0.189000,0.000000,0.0971,0.312,99.978,174100.0,1.0
24681,0.857,0.867,-4.952,0.0635,0.027100,0.000000,0.1280,0.918,104.382,253133.0,0.0
226,0.845,0.766,-5.727,0.0658,0.212000,0.000000,0.1110,0.782,114.054,195924.0,0.0
14735,0.476,0.990,-2.240,0.1330,0.000361,0.000003,0.4090,0.406,130.970,177853.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
30016,0.699,0.609,-8.201,0.0479,0.012600,0.000003,0.0332,0.545,96.028,190000.0,0.0
25179,0.699,0.764,-6.657,0.0914,0.191000,0.082000,0.2560,0.652,93.997,229381.0,0.0
17145,0.673,0.762,-6.312,0.0430,0.304000,0.000000,0.0560,0.524,90.976,322395.0,0.0
5584,0.717,0.472,-11.145,0.0467,0.787000,0.441000,0.2900,0.786,79.982,156000.0,0.0


In [19]:
# saving the predicted data
predicted_value_df.to_csv("Resources/predicted_data.csv", index=False)

In [20]:
X_test.to_csv("Resources/merged_data.csv", index=False)

In [78]:
# Random Forests in sklearn will automatically calculate feature importance
importances = RFC_Model.feature_importances_
# We can sort the features by their importance
sorted(zip(RFC_Model.feature_importances_, X.columns), reverse=True)

[(0.11054389433245199, 'loudness'),
 (0.10503629979200019, 'energy'),
 (0.10387357117417632, 'duration_ms'),
 (0.10156999649273471, 'tempo'),
 (0.10007038181554355, 'danceability'),
 (0.09907889172911613, 'acousticness'),
 (0.09790956572099928, 'speechiness'),
 (0.0967852468155232, 'valence'),
 (0.09434071644821596, 'liveness'),
 (0.09079143567923881, 'instrumentalness')]

In [42]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  YOUR CODE GOES HERE
number_input_features = len(X_train.values[0])
hidden_nodes_layer1 =  50
hidden_nodes_layer2 = 30
hidden_nodes_layer3 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 50)                550       
                                                                 
 dense_5 (Dense)             (None, 30)                1530      
                                                                 
 dense_6 (Dense)             (None, 5)                 155       
                                                                 
 dense_7 (Dense)             (None, 1)                 6         
                                                                 
Total params: 2241 (8.75 KB)
Trainable params: 2241 (8.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [43]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [44]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [45]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

206/206 - 1s - loss: 0.6092 - accuracy: 0.6693 - 724ms/epoch - 4ms/step
Loss: 0.6092129945755005, Accuracy: 0.6692553758621216


In [27]:
import pickle


# save model
pickle.dump(RFC_Model, open('rf_model.pickle', "wb"))

In [28]:
# load model
loaded_rf_model = pickle.load(open('rf_model.pickle', "rb"))

# # you can use loaded model to compute predictions
rf_prediction = loaded_rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_prediction)
print("Accuracy: " + str(rf_accuracy))

rf_auc = roc_auc_score(y_test, rf_prediction)
print("AUC: " + str(rf_auc))

Accuracy: 0.7251408557941221
AUC: 0.6646027757836803


In [29]:
test=X_train_scaled[:1]
test

array([[-0.0673396 ,  0.6361474 ,  0.73324171,  0.26263051, -1.14085666,
        -0.3987141 , -0.0507052 , -0.37702559,  0.1516965 ,  0.03133489,
         0.88641992, -0.59294298,  2.10539978, -0.42635027, -0.45103583,
        -0.44335312, -0.46329047, -0.42351975]])

In [30]:
result=rf_prediction = loaded_rf_model.predict(test)
result

array([0.])