In [1]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np

# neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from imblearn.pipeline import make_pipeline

# Undersampling 
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
# `display` and `HTML` are part of the public IPython.display module; importing
# them from IPython.core.display is a deprecated internal path.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3-decimal floats, show every column, show up to 200 rows
for option_name, option_value in (
    ('display.float_format', '{:.3f}'.format),
    ('display.max_columns', None),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set_theme()

In [2]:
# Load the two pickled frames from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df_10M, X_all = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('df_10M_clustered.pickle', 'X_clustered.pickle')
)

In [3]:
# `mode` has to be a float column before it can be fed into a tensor
X_all['mode'] = X_all.loc[:, 'mode'].astype('float32')

In [4]:
# target column
y_column = 'in_B100'

# feature columns used as model inputs
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# one flag column per genre
genre_columns = [
    'is_Adult_Standard',
    'is_Rock',
    'is_R&B',
    'is_Country',
    'is_Pop',
    'is_Rap',
    'is_Alternative',
    'is_EDM',
    'is_Metal',
]

# cluster label columns
cluster_columns = ['cluster', 'cluster2']

# remaining columns that are not used as features here
other_columns = ['key', 'time_signature', 'genre', 'release_date']

In [5]:
# Build a registry mapping each subset name to its (X, y) pair.
def _features_and_target(rows):
    """Split an already-filtered slice of X_all into (features, target)."""
    return rows[X_columns], rows[y_column]


clusters = {}

# entire predictive dataset; unlike the subsets below, its features also
# include the genre flag columns
clusters['All'] = (X_all[X_columns + genre_columns], X_all[y_column])

# add genres, keyed by the flag column name without its 'is_' prefix
for genre in genre_columns:
    title = genre[3:]
    # the genre column is used directly as a row mask; filter once, then split
    clusters[title] = _features_and_target(X_all[X_all[genre]])

# add clusters: labels in 'cluster' become 'cluster1_<n>' keys,
# labels in 'cluster2' become 'cluster2_<n>' keys
for key_prefix, cluster_column in (('cluster1_', 'cluster'), ('cluster2_', 'cluster2')):
    for n in sorted(X_all[cluster_column].unique()):
        clusters[key_prefix + str(n)] = _features_and_target(X_all[X_all[cluster_column] == n])

# setup tuning algorithm with a small dataset (fixed seed so the sample is repeatable)
small = X_all.sample(10_000, random_state=42)
X_small, y_small = _features_and_target(small)
clusters['small'] = (X_small, y_small)

In [6]:
# scenarios to check

# metric names (presumably scikit-learn scoring strings; confirm where they are consumed)
metrics = [
    'balanced_accuracy',
    'average_precision',
    'neg_brier_score',
    'f1',
    'f1_micro',
    'f1_macro',
    'f1_weighted',
    'neg_log_loss',
    'precision',
    'recall',
    'roc_auc',
    'jaccard',
]

# keys 'cluster1_0' .. 'cluster1_3'
cluster1_keys = [f'cluster1_{index}' for index in range(4)]

# keys 'cluster2_0' .. 'cluster2_9'
cluster2_keys = [f'cluster2_{index}' for index in range(10)]

# genre names without the 'is_' prefix
genre_keys = [
    'Adult_Standard',
    'Rock',
    'R&B',
    'Country',
    'Pop',
    'Rap',
    'Alternative',
    'EDM',
    'Metal',
]

In [29]:
# set up the neural network

# Seed TensorFlow's global random generator so weight initialisation is
# repeatable, in line with the random_state=42 used for sampling and splitting.
tf.random.set_seed(42)

# start with width of number of features
# (could encode to higher dimensions)
tf_width = len(X_columns)
tf_dropout = 0  # a rate of 0 means the Dropout layers currently drop nothing

# setup
tf_model = Sequential(name='sequential')

# hidden layers: each one is a ReLU Dense layer followed by a Dropout layer
how_many_hidden_layers = 5
for i in range(how_many_hidden_layers):
    tf_model.add(Dense(tf_width, activation="relu", name='dense_'+str(i)))
    tf_model.add(Dropout(tf_dropout, name='dropout_'+str(i)))

# output: a single sigmoid unit for the binary target
tf_model.add(Dense(1, activation='sigmoid', name='output'))

# setup a checkpoint that saves the model to 'tf_model' whenever the monitored
# validation loss improves (monitor spelled out; it is the Keras default)
checkpoint = ModelCheckpoint('tf_model', monitor='val_loss', save_best_only=True)

# compile model
tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

In [57]:
%%time
# train the model

# pick the dataset: the 10,000-row sample for quick checks, otherwise cluster2_7
am_testing = False
dataset = 'small' if am_testing else 'cluster2_7'  # cluster2_7: good amount of hits, about 1M songs
X_, y_ = clusters[dataset]

# split the dataset into train test, then separate validation set for fitting the neural network
# (20% held out for testing, then 30% of the remainder for validation; both splits stratified)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42, stratify=y_)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

# undersample the majority class of the training split only;
# validation and test keep their original class balance
undersample = True
if undersample:
    undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

# convert every split to a float32 NumPy array
X_train, X_val, X_test = (np.asarray(part).astype('float32') for part in (X_train, X_val, X_test))
y_train, y_val, y_test = (np.asarray(part).astype('float32') for part in (y_train, y_val, y_test))

# fit the data; `callbacks` is documented as a list of callbacks, so wrap the checkpoint
tf_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, callbacks=[checkpoint])

Epoch 1/10


INFO:tensorflow:Assets written to: tf_model\assets


Epoch 2/10
Epoch 3/10


INFO:tensorflow:Assets written to: tf_model\assets


Epoch 4/10


INFO:tensorflow:Assets written to: tf_model\assets


Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:tensorflow:Assets written to: tf_model\assets


Wall time: 1min 35s


<keras.callbacks.History at 0x1b203533100>

In [38]:
# predict and check results
# NOTE(review): the recorded report below has support 2000, which matches a 20%
# split of the 10,000-row 'small' sample rather than cluster2_7. Presumably this
# cell was last run out of order; re-run it after training to refresh the output.
y_pred = tf_model.predict(X_test)
report_header = '\nClassification Report\n------------------------------\n'
# label an example positive when its predicted probability exceeds 0.5
print(report_header, classification_report(y_test, y_pred.flatten() > 0.5))


Classification Report
------------------------------
               precision    recall  f1-score   support

         0.0       1.00      0.45      0.62      1994
         1.0       0.00      0.83      0.01         6

    accuracy                           0.45      2000
   macro avg       0.50      0.64      0.32      2000
weighted avg       1.00      0.45      0.62      2000



In [56]:
# summary statistics of the predicted probabilities, one row per prediction column:
# this neural network is not very certain about anything
pd.DataFrame(y_pred).describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,2000.0,0.555,0.141,0.308,0.429,0.538,0.684,0.834


In [55]:
# 95% of predictions are in this range of certainty
# (the 2.5th and 97.5th percentiles of the predicted probabilities)
(
    pd.DataFrame(y_pred)
    .describe(percentiles=[0.025, 0.975])
    .loc[['2.5%', '97.5%']]
)

Unnamed: 0,0
2.5%,0.365
97.5%,0.792
