# Import

In [2]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np

# statistics
from sklearn.metrics import r2_score
from scipy.stats import friedmanchisquare, wilcoxon

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
# display/HTML are imported from IPython.display, the public location;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3-decimal floats, no column truncation, up to 200 rows
for option_name, option_value in (
    ('display.float_format', '{:.3f}'.format),
    ('display.max_columns', None),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set_theme()

In [3]:
# Load the two pickled dataframes this notebook works from.
# NOTE(review): df_10M is not referenced again in the visible cells — confirm it still needs loading.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df_10M = pd.read_pickle('df_10M_clustered.pickle')
X_all = pd.read_pickle('X_clustered.pickle')

# Setup inputs for statistical scenarios

In [4]:
# columns for datasets

# target label column
y_column = 'in_B100'

# feature columns fed to the models
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# one indicator column per genre, named 'is_<Genre>'
genre_columns = [
    'is_' + genre_name
    for genre_name in (
        'Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop',
        'Rap', 'Alternative', 'EDM', 'Metal',
    )
]

# clustering labels and the remaining descriptive columns
cluster_columns = ['cluster', 'cluster2']
other_columns = ['key', 'time_signature', 'genre', 'release_date']

In [5]:
# clusters

# create a dict with all 'name': (X, y) key match pairs
clusters = {}


def _select_subset(row_mask):
    """Return the (X, y) pair for the rows of X_all selected by a boolean mask."""
    return (X_all.loc[row_mask, X_columns], X_all.loc[row_mask, y_column])


# entire predictive dataset
clusters['All'] = (X_all[X_columns], X_all[y_column])

# add genres: the key is the indicator column name without its 'is_' prefix
for genre in genre_columns:
    title = genre[3:]
    clusters[title] = _select_subset(X_all[genre])

# add clusters: one entry per distinct label in each clustering column
for n in sorted(X_all['cluster'].unique()):
    clusters['cluster1_' + str(n)] = _select_subset(X_all['cluster'] == n)

for n in sorted(X_all['cluster2'].unique()):
    clusters['cluster2_' + str(n)] = _select_subset(X_all['cluster2'] == n)

# a small dataset for testing (seeded so the same rows are drawn every run)
small = X_all.sample(10_000, random_state=42)
X_small = small[X_columns]
y_small = small[y_column]
clusters['small'] = (X_small, y_small)

# NOTE(review): this list is hard-coded; it assumes 'cluster' has labels 0-3
# and 'cluster2' has labels 0-9 — confirm against the keys actually in `clusters`.
cluster_keys = [
    'All', 
    'Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop', 'Rap', 'Alternative', 'EDM', 'Metal', 
    'cluster1_0', 'cluster1_1', 'cluster1_2', 'cluster1_3', 
    'cluster2_0', 'cluster2_1', 'cluster2_2', 'cluster2_3', 'cluster2_4', 
    'cluster2_5', 'cluster2_6', 'cluster2_7', 'cluster2_8', 'cluster2_9',
    'small'
]

In [6]:
# machine learning algorithms
# note: tensorflow is harder to tune and use in a pipeline, so has been kept separate

# Estimator classes (not instances).
# Order matters: param_by_model is keyed by each class's position in this list.
ML_algorithms = [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    RandomForestClassifier,
    AdaBoostClassifier
]

In [7]:
# hyperparameters

# Logistic regression: C runs from 1e-4 to 1e4 in ten steps per decade.
# NOTE(review): adjacent decades share an endpoint (0.001, 0.01, ..., 1000),
# so seven C values appear twice in the grid.
fractional_decades = [
    step / divisor
    for divisor in (10000, 1000, 100, 10)
    for step in range(1, 11)
]
integer_decades = [
    factor * step
    for factor in (1, 10, 100, 1000)
    for step in range(1, 11)
]
orders_of_magnitude = fractional_decades + integer_decades

params_lr = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': orders_of_magnitude,
    'logisticregression__solver': ['liblinear'],
}

# Decision tree
params_dt = {
    'decisiontreeclassifier__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 100, None],
    'decisiontreeclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
}

# K nearest neighbours: every k from 2 to 19, then every fifth k from 20 to 100
params_knn = {
    'kneighborsclassifier__n_neighbors': list(range(2, 20)) + list(range(20, 101, 5)),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__metric': ['minkowski', 'euclidean', 'manhattan'],
}

# Random forest
params_rf = {
    'randomforestclassifier__n_estimators': [5, 10, 20, 50, 100, 200, 500, 1000, 2000],
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
    'randomforestclassifier__max_depth': [3, 5, 7, 10, 15, 20, 30, 50, 100, None],
    'randomforestclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'randomforestclassifier__bootstrap': [True, False],
}

# AdaBoost
params_ab = {
    'adaboostclassifier__n_estimators': [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
    'adaboostclassifier__learning_rate': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
}

# Search grids keyed by the estimator's position in ML_algorithms.
param_by_model = dict(enumerate([params_lr, params_dt, params_knn, params_rf, params_ab]))

# scoring metrics

# scorer names, in the string form scikit-learn accepts for `scoring`
metrics = [
    'balanced_accuracy',
    'average_precision',
    'neg_brier_score',
    'f1',
    'f1_micro',
    'f1_macro',
    'f1_weighted',
    'neg_log_loss',
    'precision',
    'recall',
    'roc_auc',
    'jaccard',
]

# how many hyperparameter scenarios in the grid search

def how_many_scenarios(n_ML, param_grids=None):
    """Count the hyperparameter combinations in one model's search grid.

    Args:
        n_ML: key of the model's grid (the estimator's position in ML_algorithms).
        param_grids: mapping of key -> {parameter name: list of candidate values}.
            Defaults to the notebook-level ``param_by_model``.

    Returns:
        The product of the candidate-list lengths; 1 for an empty grid.

    Raises:
        KeyError: if ``n_ML`` is not a key of the grid mapping.
    """
    # Resolve the default at call time so the function follows later edits to param_by_model.
    grids = param_by_model if param_grids is None else param_grids
    n_scenarios = 1
    for candidates in grids[n_ML].values():
        n_scenarios *= len(candidates)
    return n_scenarios

# Report the grid size for every algorithm.
# enumerate + __name__ avoids hard-coding the number of models and avoids
# instantiating each estimator just to slice "()" off its repr.
for i, algorithm in enumerate(ML_algorithms):
    print(algorithm.__name__, how_many_scenarios(i))

LogisticRegression 160
DecisionTreeClassifier 160
KNeighborsClassifier 210
RandomForestClassifier 1800
AdaBoostClassifier 162


# Make Predictions Dataframe for Statistics
* split into 5 stratified folds
    * using a consistent random_state to use the same folds between tests
* for each fold:
    * train on undersampled training fold
    * predict on full test fold
    * add the out-of-fold predictions to the predictions dataframe
    
NOTES: 
* Tuning individual models on limited datasets has been investigated in NOTEBOOK 5B (and 5D).
* Random undersampling and oversampling were investigated in NOTEBOOK 5A
    * More oversampling methods like SMOTE were not considered because of the nature of music. For example, interpolating between modes leads to an atonal, non-musical result. Discrete combinations of features are likely to be important in terms of audio features as well. More importantly, with over 20k positive cases in our dataset, we should have enough data for a well trained model.

In [41]:
# show the first row of X_all to see the column layout
X_all.head(1)

Unnamed: 0_level_0,song,artist,genre,release_date,in_B100,is_Adult_Standard,is_Rock,is_R&B,is_Country,is_Pop,is_Rap,is_Alternative,is_EDM,is_Metal,key,mode,time_signature,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,cluster,cluster2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
6SFc2WVQmARn6NDS3LrTR8,I Feel So Free - Lost Souls of Saturn Remix,!!!,dance rock,2016-02-23,False,False,True,False,False,False,False,False,False,False,1,1,4,0.007,0.728,0.817,0.747,0.434,0.113,0.573,0.084,0.436,0.284,3,3


In [47]:
# initialise the dataframe

# one seeded, shuffled 5-fold stratified splitter shared by all test cases
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scenarios to test: the actual label, then one column per prediction scenario
stats_scenarios = [
    'y_actual',
    'y_lr', 'y_dt', 'y_knn', 'y_rf', 'y_ab',
    'y_lr_tuned', 'y_knn_tuned',
    'y_cl_1', 'y_cl_2', 'y_genres',
]

# X_all plus one empty column per scenario; 'y_actual' is then filled from the label
scenario_placeholders = pd.DataFrame(columns=stats_scenarios)
df_predictions = pd.concat([X_all, scenario_placeholders], axis=1)
df_predictions['y_actual'] = df_predictions['in_B100']

In [50]:
# df_predictions.head()

In [39]:
# Collect out-of-fold predictions: every row is predicted exactly once, by a
# model trained on the other folds.
# kfold was built with an integer random_state, so each call to kfold.split()
# yields the same folds; nothing is shuffled a second time.

X_, y_ = X_all[X_columns], X_all[y_column]

# NOTE(review): index 0 selects 'y_actual'. The results below are not yet written
# into df_predictions; pick the scenario column that matches the model before doing so.
n_scenario = 0
predictions_column = stats_scenarios[n_scenario]

fold_actuals = []
fold_predictions = []

for train_i, test_i in kfold.split(X_, y_):

    # split() yields positional row numbers, so rows must be selected with .iloc
    # (plain [] would look the integers up as column / index labels)
    train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
    train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]

    # fit model
    # NOTE(review): the section notes call for training on an undersampled fold;
    # wrap the estimator with make_pipeline(RandomUnderSampler(...), ...) if that is intended here.
    model = KNeighborsClassifier()
    model.fit(train_X, train_y)

    # make predictions, keeping the test rows' index so they can be aligned with X_all later
    y_pred = pd.Series(model.predict(test_X), index=test_X.index)

    # store
    fold_actuals.append(test_y)
    fold_predictions.append(y_pred)

# concatenate once after the loop instead of growing a frame fold by fold
data_y = pd.concat(fold_actuals).to_frame(name='y')
data_yhat = pd.concat(fold_predictions).to_frame(name='y_pred')

# NOTE(review): XTEMP is not assigned in any visible cell; on a fresh kernel this
# line raises NameError. Define it in the notebook or remove this scratch display.
XTEMP

Unnamed: 0,0
0,3
1,5
2,16
3,17
4,25
...,...
1995,9961
1996,9972
1997,9977
1998,9978


In [35]:
# display X_all (scratch inspection of the full frame)
X_all

Unnamed: 0_level_0,song,artist,genre,release_date,in_B100,is_Adult_Standard,is_Rock,is_R&B,is_Country,is_Pop,is_Rap,is_Alternative,is_EDM,is_Metal,key,mode,time_signature,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,cluster,cluster2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
6SFc2WVQmARn6NDS3LrTR8,I Feel So Free - Lost Souls of Saturn Remix,!!!,dance rock,2016-02-23,False,False,True,False,False,False,False,False,False,False,1,1,4,0.007,0.728,0.817,0.747,0.434,0.113,0.573,0.084,0.436,0.284,3,3
6NMSTM4UQMC5emaYKIueyc,The Most Certain Sure (Liv Spencer Remix),!!!,dance rock,2010-08-16,False,False,True,False,False,False,False,False,False,False,10,0,4,0.027,0.703,0.736,0.763,0.847,0.103,0.619,0.051,0.418,0.336,3,3
1FqHPzJuRdupHbcw09tYUa,Except Death,!!!,dance rock,2013-01-01,False,False,True,False,False,False,False,False,False,False,9,1,4,0.010,0.812,0.402,0.776,0.071,0.074,0.670,0.039,0.400,0.912,1,7
7y8aVfDkqt6qirGNivvs0M,One Girl / One Boy,!!!,dance rock,2013-01-01,False,False,True,False,False,False,False,False,False,False,10,0,4,0.003,0.702,0.340,0.851,0.000,0.322,0.706,0.041,0.395,0.870,1,7
70w8loBbdl4qZOH2brrqKF,"When the Going Gets Tough, the Tough Get Karazzee",!!!,dance rock,2004-06-08,False,False,True,False,False,False,False,False,False,False,11,0,4,0.000,0.786,0.587,0.751,0.446,0.053,0.664,0.078,0.418,0.651,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0dojpUA108DBW9rtH90tYp,Buddha’s Festival of Love,,,NaT,False,False,False,False,False,False,False,False,False,False,5,0,4,0.969,0.534,0.243,0.326,0.935,0.252,0.569,0.032,0.348,0.653,0,4
0EDPbjA2MGb8ksFHvPRQkF,Da Music,,,NaT,False,False,False,False,False,False,False,False,False,False,1,1,4,0.005,0.757,0.216,0.714,0.000,0.342,0.700,0.112,0.312,0.743,1,7
0fOFQHkfR26fraeXte9NkW,St. Jak pa la,,,NaT,False,False,False,False,False,False,False,False,False,False,9,0,4,0.512,0.824,0.083,0.610,0.320,0.102,0.561,0.049,0.401,0.923,1,7
0mzP7Bd2PGDA7n1Dt0OQFL,An nou mache,,,NaT,False,False,False,False,False,False,False,False,False,False,6,0,3,0.742,0.631,0.077,0.515,0.000,0.389,0.592,0.089,0.391,0.927,2,1


In [27]:
# NOTE(review): yTEMP is not assigned in any visible cell; on a fresh kernel this
# line raises NameError. Define it in the notebook or remove this scratch display.
yTEMP

Unnamed: 0,y


In [29]:
# split() returns a generator; it must be iterated to obtain the (train, test) index pairs
kfold.split(X_small, y_small)

<generator object _BaseKFold.split at 0x0000024E64E1D660>

In [None]:
# make predictions with base algorithms
# NOTE(review): no prediction code is present in this cell yet, so df_predictions is saved as it stands.


# save dataframe
df_predictions.to_pickle('df_predictions_DEFAULT.pickle')

In [None]:
# make predictions with tuned logistic regression, tuned knn
# NOTE(review): no prediction code is present in this cell yet, so df_predictions is saved as it stands.


# save dataframe
df_predictions.to_pickle('df_predictions_TUNED.pickle')

In [None]:
# make predictions separating into clusters and genres
# NOTE(review): no prediction code is present in this cell yet, so df_predictions is saved as it stands.



# save dataframe
df_predictions.to_pickle('df_predictions_CLUSTERS.pickle')

In [None]:
# save final predictions dataframe
# (writes the current state of df_predictions to 'df_predictions.pickle')
df_predictions.to_pickle('df_predictions.pickle')

# Explore Predictions

In [None]:
# Friedman Test (Wilcoxon if required)


In [None]:
# Histograms 
## NEED TO COMBINE WITH id to compare AF
