In [1]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np
import scipy as sp

# neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from imblearn.pipeline import make_pipeline

# Undersampling 
# Note: undersampling was used in at least 1 paper predicting popularity (Gao 2021)
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# jupyter notebook full-width display
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3-decimal floats, never truncate columns, show up to 200 rows
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
import seaborn as sns
sns.set_theme()

In [2]:
# Load the two pickled inputs from the working directory.
# NOTE: unpickling can execute arbitrary code, so only read pickle files that
# were produced by a trusted step of this project.
df_10M, X_all = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('df_10M_clustered.pickle', 'X_clustered.pickle')
)

In [3]:
# target column
y_column = 'in_B100'

# feature columns used as model inputs
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# one indicator column per genre, named 'is_<Genre>'
genre_columns = [
    f'is_{genre_name}'
    for genre_name in (
        'Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop',
        'Rap', 'Alternative', 'EDM', 'Metal',
    )
]

# cluster assignment columns
cluster_columns = ['cluster', 'cluster2']

# remaining columns that are not part of X_columns
other_columns = [
    'key',
    'time_signature',
    'genre',
    'release_date',
]

In [4]:
# create a dict with all 'name': (X, y) key match pairs
clusters = {}


def _feature_label_pair(row_mask):
    """Return the (X, y) pair for the rows of X_all selected by a boolean mask.

    The mask is evaluated once and applied with a single `.loc` selection per
    output, instead of filtering the full frame twice with chained indexing.
    """
    return X_all.loc[row_mask, X_columns], X_all.loc[row_mask, y_column]


# entire predictive dataset
# NOTE: this is the only entry whose features also include the genre indicator
# columns; every other entry uses X_columns only. Use X_all[X_columns] here
# instead if all datasets must share the same feature set.
clusters['All'] = (X_all[X_columns + genre_columns], X_all[y_column])

# add genres (key is the column name without its 'is_' prefix, e.g. 'is_Rock' -> 'Rock')
for genre in genre_columns:
    title = genre[3:]
    clusters[title] = _feature_label_pair(X_all[genre])

# add clusters: one entry per distinct value of each cluster column
for n in sorted(X_all['cluster'].unique()):
    clusters['cluster1_' + str(n)] = _feature_label_pair(X_all['cluster'] == n)

for n in sorted(X_all['cluster2'].unique()):
    clusters['cluster2_' + str(n)] = _feature_label_pair(X_all['cluster2'] == n)

# setup tuning algorithm with a small dataset (fixed seed so the sample is reproducible)
small = X_all.sample(10_000, random_state=42)
X_small = small[X_columns]
y_small = small[y_column]
clusters['small'] = (X_small, y_small)

In [14]:
# scenarios to check

# evaluation metric names (presumably scikit-learn scorer names -- confirm where they are consumed)
metrics = [
    'balanced_accuracy',
    'average_precision',
    'neg_brier_score',
    'f1',
    'f1_micro',
    'f1_macro',
    'f1_weighted',
    'neg_log_loss',
    'precision',
    'recall',
    'roc_auc',
    'jaccard',
]

# keys into the `clusters` dict: 4 entries for the first clustering, 10 for the second
cluster1_keys = [f'cluster1_{idx}' for idx in range(4)]
cluster2_keys = [f'cluster2_{idx}' for idx in range(10)]

# keys into the `clusters` dict for the per-genre datasets
genre_keys = [
    'Adult_Standard',
    'Rock',
    'R&B',
    'Country',
    'Pop',
    'Rap',
    'Alternative',
    'EDM',
    'Metal',
]

In [None]:
# set up the neural network

# start with width of number of features
# (could encode to higher dimensions)
tf_width = len(X_columns)
tf_dropout = 0  # dropout rate applied after every hidden layer (0 disables dropout)

# setup
tf_model = Sequential(name='Sequential')

# hidden layers: each one is a ReLU Dense layer followed by a Dropout layer
how_many_hidden_layers = 5

for i in range(how_many_hidden_layers):
    # both layers belong to the loop body: their names are derived from `i`
    tf_model.add(Dense(tf_width, activation="relu", name=f'Dense_{i}'))
    tf_model.add(Dropout(tf_dropout, name=f'Dropout_{i}'))

# output: a single sigmoid unit for the binary target
tf_model.add(Dense(1, activation='sigmoid', name='Output'))

# compile model
# `tf` comes from `import tensorflow as tf` in the imports cell.
# The metric is named explicitly so its key in the training history stays
# 'auc' even when this cell is re-run in the same kernel.
tf_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')],
)

In [None]:
%%time
# train the model

# pick the dataset
dataset = 'small'
X_, y_ = clusters[dataset]

# split the dataset into train test (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.3, random_state=42, stratify=y_)

# undersample data
# only the training split is resampled; the test split keeps its original class balance
undersample = True
if undersample:
    undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

# The original call passed an undefined `checkpoint` object as the callback.
# No such callback is created in the cells above, so training starts with an
# empty callback list; append e.g. a ModelCheckpoint here if checkpoints are wanted.
callbacks = []

# fit the data
# (uses the X_train / X_test names produced by the split above; the lower-case
# x_train / x_test names referenced previously were never defined)
tf_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=40, callbacks=callbacks)

In [15]:
# choose which entry of `clusters` to work with
dataset = 'cluster1_0'
X_, y_ = clusters[dataset]

# stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y_,
    test_size=0.3,
    random_state=42,
    stratify=y_,
)

# optionally shrink the majority class of the training split; the test split is left as-is
undersample = True
if undersample:
    undersampler = RandomUnderSampler(
        sampling_strategy='majority',
        random_state=42,
    )
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

In [16]:
# sum of the training labels next to their shape; after majority-class
# undersampling the positives should make up half of the rows
y_train.sum(), y_train.shape

(129, (258,))

In [17]:
# number of held-out test labels (the test split is never resampled)
y_test.shape

(370086,)

In [25]:
# sum of the label column over the full dataset
# (the number of positives, assuming the labels are boolean / 0-1 -- confirm)
clusters['All'][1].sum()

21229

In [29]:
# Per-dataset label summary: label sum, non-null label count, and their ratio.
# Author's remark on these numbers: Adult_Standard only looks better because of randomness.
for cluster in clusters:
    labels = clusters[cluster][1]  # second element of each (X, y) pair
    n_positive = labels.sum()
    n_rows = labels.count()
    print(cluster, n_positive, n_rows, n_positive / n_rows)

All 21229 8827719 0.0024048114807460456
Adult_Standard 3678 208860 0.01760988221775352
Rock 6311 657177 0.009603196703475625
R&B 2965 136578 0.02170920646077699
Country 2448 270216 0.009059419131361577
Pop 3904 540126 0.00722794310957073
Rap 2084 415998 0.005009639469420526
Alternative 177 84950 0.002083578575632725
EDM 103 249597 0.0004126652163287219
Metal 142 245399 0.0005786494647492451
cluster1_0 184 1233618 0.00014915476265748393
cluster1_1 14659 3978190 0.0036848415988175527
cluster1_2 5844 2304781 0.002535598826960132
cluster1_3 542 1311130 0.00041338387497807233
cluster2_0 2156 995561 0.0021656131568030487
cluster2_1 3443 1086979 0.003167494496213818
cluster2_2 56 819425 6.834060469231473e-05
cluster2_3 101 755361 0.00013371090114528021
cluster2_4 128 414193 0.00030903467707083415
cluster2_5 3772 934132 0.004037973220058835
cluster2_6 883 317876 0.0027778127320087077
cluster2_7 7848 1730621 0.0045347883794314295
cluster2_8 2401 1217802 0.0019715848717607623
cluster2_9 441 5557