# Import

In [2]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np

# statistics
from sklearn.metrics import r2_score
from scipy.stats import friedmanchisquare, wilcoxon

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
# `display` and `HTML` come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: floats rendered with three decimals, no limit on the number
# of displayed columns, at most 200 displayed rows
for option_name, option_value in (
    ('display.float_format', '{:.3f}'.format),
    ('display.max_columns', None),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set_theme()

In [3]:
# Load the two pickled frames used by this notebook (paths are relative to the
# notebook's working directory).
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df_10M = pd.read_pickle('df_10M_clustered.pickle')
X_all = pd.read_pickle('X_clustered.pickle')
# NOTE(review): reset_index() returns a new frame and the result is not assigned,
# so X_all keeps its original index; this line only displays the re-indexed frame.
# If a fresh 0..n-1 index was intended, the result must be assigned back — TODO confirm.
X_all.reset_index()

# Setup inputs for statistical scenarios

In [4]:
# columns for datasets

# target column
y_column = 'in_B100'

# columns used as model inputs
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# genre flag columns (used as row masks when building the per-genre subsets)
genre_columns = [
    'is_Adult_Standard',
    'is_Rock',
    'is_R&B',
    'is_Country',
    'is_Pop',
    'is_Rap',
    'is_Alternative',
    'is_EDM',
    'is_Metal',
]

# cluster label columns
cluster_columns = ['cluster', 'cluster2']

# remaining columns
other_columns = ['key', 'time_signature', 'genre', 'release_date']

In [5]:
# clusters

# Map every subset name to its (X, y) pair, where X holds the X_columns features
# and y the y_column target of the rows belonging to that subset.
clusters = {}

# entire predictive dataset
clusters['All'] = (X_all[X_columns], X_all[y_column])

# add genres: one subset per genre flag, keyed by the flag name without its 'is_' prefix
for genre in genre_columns:
    title = genre[3:]
    members = X_all[X_all[genre]]  # evaluate the row mask once, then split into X and y
    clusters[title] = (members[X_columns], members[y_column])

# add clusters: one subset per label of each clustering column
for prefix, cluster_column in (('cluster1_', 'cluster'), ('cluster2_', 'cluster2')):
    for n in sorted(X_all[cluster_column].unique()):
        members = X_all[X_all[cluster_column] == n]
        clusters[prefix + str(n)] = (members[X_columns], members[y_column])

# a small dataset for testing (capped at the frame length so sampling cannot fail
# on a frame with fewer than 10,000 rows)
small = X_all.sample(min(10_000, len(X_all)), random_state=42)
X_small = small[X_columns]
y_small = small[y_column]
clusters['small'] = (X_small, y_small)

# Fixed ordering of the subset names.
# NOTE(review): the cluster names assume labels 0-3 in 'cluster' and 0-9 in
# 'cluster2' — confirm against the data if the clustering is re-run.
cluster_keys = [
    'All', 
    'Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop', 'Rap', 'Alternative', 'EDM', 'Metal', 
    'cluster1_0', 'cluster1_1', 'cluster1_2', 'cluster1_3', 
    'cluster2_0', 'cluster2_1', 'cluster2_2', 'cluster2_3', 'cluster2_4', 
    'cluster2_5', 'cluster2_6', 'cluster2_7', 'cluster2_8', 'cluster2_9',
    'small'
]

In [6]:
# machine learning algorithms
# note: tensorflow is harder to tune and use in a pipeline, so has been kept separate

# Estimator classes (not instances). The position of each class in this list is the
# integer key of its hyperparameter grid in param_by_model, so keep both in the same order.
ML_algorithms = [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    RandomForestClassifier,
    AdaBoostClassifier
]

In [7]:
# hyperparameters
# Every grid key is prefixed with '<lowercased estimator class name>__'.

# Logistic regression: C takes 1..10 times each decade, from 0.0001 up to 10000.
# The four sub-unit decades are floats, the four remaining decades are ints.
# NOTE: neighbouring decades share their boundary value (0.001, 0.01, ..., 1000),
# so the 80 entries hold 73 distinct values.
orders_of_magnitude = (
    [step / divisor for divisor in (10000, 1000, 100, 10) for step in range(1, 11)]
    + [scale * step for scale in (1, 10, 100, 1000) for step in range(1, 11)]
)
params_lr = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': orders_of_magnitude,
    'logisticregression__solver': ['liblinear'],
}

# Decision tree
params_dt = {
    'decisiontreeclassifier__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 100, None],
    'decisiontreeclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
}

# K nearest neighbours: every k from 2 to 19, then 20 to 100 in steps of 5
params_knn = {
    'kneighborsclassifier__n_neighbors': list(range(2, 20)) + list(range(20, 101, 5)),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__metric': ['minkowski', 'euclidean', 'manhattan'],
}

# Random forest
params_rf = {
    'randomforestclassifier__n_estimators': [5, 10, 20, 50, 100, 200, 500, 1000, 2000],
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
    'randomforestclassifier__max_depth': [3, 5, 7, 10, 15, 20, 30, 50, 100, None],
    'randomforestclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'randomforestclassifier__bootstrap': [True, False],
}

# AdaBoost
params_ab = {
    'adaboostclassifier__n_estimators': [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
    'adaboostclassifier__learning_rate': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
}

# grids keyed by integer position: 0 lr, 1 dt, 2 knn, 3 rf, 4 ab
param_by_model = {
    0: params_lr,
    1: params_dt,
    2: params_knn,
    3: params_rf,
    4: params_ab,
}

# scoring metrics (scorer name strings)

metrics = [
    'balanced_accuracy',
    'average_precision',
    'neg_brier_score',
    'f1',
    'f1_micro',
    'f1_macro',
    'f1_weighted',
    'neg_log_loss',
    'precision',
    'recall',
    'roc_auc',
    'jaccard',
]

# how many hyperparameter scenarios in the grid search

def how_many_scenarios(n_ML):
    """Return the number of hyperparameter combinations in one model's grid.

    Parameters
    ----------
    n_ML : key into the module-level ``param_by_model`` dict.

    Returns
    -------
    int
        Product of the lengths of every candidate list in that grid
        (1 when the grid has no entries).
    """
    candidate_counts = [len(candidates) for candidates in param_by_model[n_ML].values()]
    total = 1
    for count in candidate_counts:
        total = total * count
    return total

# Report the grid size of every algorithm. The class name is read from __name__
# rather than by instantiating each estimator and slicing its repr, and the loop
# follows the length of ML_algorithms instead of a hard-coded 5.
for i, algorithm in enumerate(ML_algorithms):
    print(algorithm.__name__, how_many_scenarios(i))

LogisticRegression 160
DecisionTreeClassifier 160
KNeighborsClassifier 210
RandomForestClassifier 1800
AdaBoostClassifier 162


# Make Predictions Dataframe for Statistics
* split into 5 stratified folds
    * using a consistent random_state to use the same folds between tests
* for each fold:
    * train on undersampled training fold
    * predict on full test fold
    * add the out-of-fold predictions to the predictions dataframe
    
NOTES: 
* Tuning individual models on limited datasets has been investigated in NOTEBOOK 5B (and 5D).
* Random undersampling and oversampling were investigated in NOTEBOOK 5A
    * More oversampling methods like SMOTE were not considered because of the nature of music. For example, interpolating between modes leads to an atonal, non-musical result. Discrete combinations of audio features are likely to be important as well. More importantly, with over 20k positive cases in our dataset, we should have enough data for a well-trained model.

In [117]:
# initialise the dataframe

# use the same stratified split for all test cases
# (shuffle=True with a fixed random_state so the fold assignment is repeatable)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scenarios to test
# one prediction column per scenario; the n_scenario values used elsewhere in the
# notebook are positions in this list
stats_scenarios = [
    'y_lr', 'y_dt', 'y_knn', 'y_rf', 'y_ab', 'y_lr_tuned', 'y_knn_tuned', 'y_cl_1', 'y_cl_2', 'y_genres'
]

# empty frame carrying only the scenario columns (plus a debugging column) ...
df_predictions = pd.DataFrame(columns=stats_scenarios)
df_predictions['y_actual'] = pd.NA  # for debugging
# ... joined column-wise onto X_all, so every X_all row is kept and the new columns start empty
df_predictions = pd.concat([X_all, df_predictions], axis=1)

In [178]:
def evaluate_default_performance(dataframe, X_columns, y_column, n_scenario, cv=None):
    """Return out-of-fold predictions from a default-hyperparameter classifier.

    For every fold of the splitter, the training part is passed through a random
    undersampler of the majority class, a fresh classifier is fitted on it, and
    the complete test part is predicted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    X_columns : list of str
        Names of the feature columns.
    y_column : str
        Name of the target column.
    n_scenario : int
        Selects the classifier: 1 -> DecisionTreeClassifier, 2 or 6 ->
        KNeighborsClassifier, 3 -> RandomForestClassifier, 4 ->
        AdaBoostClassifier, any other value -> LogisticRegression.
    cv : object with a ``split(X, y)`` method, optional
        Cross-validation splitter. Defaults to the notebook-level ``kfold`` so
        that every scenario is evaluated on the same folds.

    Returns
    -------
    pandas.DataFrame
        A single column (labelled 0) of predictions, sorted by an index (named '')
        holding the positional row number of each prediction within ``dataframe``.
        NOTE(review): these positions only coincide with ``dataframe``'s own
        index labels when that index is the default 0..n-1 range — confirm
        before assigning the result back into ``dataframe``.
    """
    # Scenario number -> (estimator class, constructor kwargs). Estimators that
    # draw random numbers are seeded so repeated runs give the same predictions.
    seeded = {'random_state': 42}
    model_by_scenario = {
        1: (DecisionTreeClassifier, seeded),
        2: (KNeighborsClassifier, {}),
        6: (KNeighborsClassifier, {}),
        3: (RandomForestClassifier, seeded),
        4: (AdaBoostClassifier, seeded),
    }
    model_class, model_kwargs = model_by_scenario.get(n_scenario, (LogisticRegression, {}))

    splitter = kfold if cv is None else cv

    # entire dataset for predictions
    X_, y_ = dataframe[X_columns], dataframe[y_column]

    # one predictions frame per fold, concatenated once after the loop
    fold_predictions = []

    for train_i, test_i in splitter.split(X_, y_):

        # train test split for current fold
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y = y_.iloc[train_i]

        # a fresh undersampler and model per fold so no fitted state is shared
        undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
        pipe = make_pipeline(undersampler, model_class(**model_kwargs))
        pipe.fit(train_X, train_y)

        # key each prediction by the positional row number it was made for
        fold_predictions.append(
            pd.DataFrame(pipe.predict(test_X), index=pd.Index(test_i, name=''))
        )

    return pd.concat(fold_predictions, axis=0).sort_index()
    
#     # return the full sorted results, appended into the input dataframe
#     dataframe[stats_scenarios[n_scenario]] = y_pred.sort_index()
#     dataframe['y_actual'] = y_actual.sort_index()  # for debugging, this should always be equal to in_B100 or folds are misaligned
    
#     return dataframe

In [179]:
%%time
# which statistical scenario are we evaluating
# (position in stats_scenarios; 0 is 'y_lr', for which the function uses LogisticRegression)
n_scenario = 0

# out-of-fold predictions for the chosen scenario
# NOTE(review): TEMP is read by later cells; keep those in sync if it is renamed
TEMP = evaluate_default_performance(df_predictions, X_columns, y_column, n_scenario)

Wall time: 20.6 s


In [180]:
# inspect the frame returned by evaluate_default_performance
TEMP

Unnamed: 0,0
,
0,False
1,False
2,True
3,True
4,False
...,...
8827714,False
8827715,True
8827716,False


In [159]:
# should be zero
# NOTE(review): this raises KeyError because evaluate_default_performance, as
# defined in this notebook, returns only the predictions frame, which has neither
# an 'in_B100' nor a 'y_actual' column. The check presumably targeted a version of
# the function that returned the full dataframe — TODO confirm and update.
sum(TEMP['in_B100'] != TEMP['y_actual'])

KeyError: 'in_B100'

In [None]:
# inspect the prediction column of the current scenario
# NOTE(review): no visible cell writes predictions into df_predictions, so this
# column presumably still holds its initial empty values — TODO confirm
df_predictions[stats_scenarios[n_scenario]]

In [173]:
# NOTE(review): scratch/debugging cell. In the visible notebook X_ and y_ are only
# assigned inside evaluate_default_performance, so on a fresh kernel this loop
# raises NameError unless they are defined at top level elsewhere — TODO confirm.
# The loop body is empty: it only leaves train_i / test_i bound to the last fold.
for train_i, test_i in kfold.split(X_, y_):
    pass

tempdataframe = pd.DataFrame()
# NOTE(review): only the empty frame is concatenated, so there is no '' column and
# set_index('') presumably raises KeyError — TODO confirm or remove this cell.
tempdataframe = pd.concat([tempdataframe, ], axis=1).set_index('')
tempdataframe

In [None]:
# inspect TEMP again (same display as the earlier TEMP cell)
TEMP

In [None]:
# make predictions with base algorithms
# TODO: the prediction step is not written yet; this cell currently only saves df_predictions


# save dataframe (relative path, written to the notebook's working directory)
df_predictions.to_pickle('df_predictions_DEFAULT.pickle')

In [None]:
# make predictions with tuned logistic regression, tuned knn
# TODO: the prediction step is not written yet; this cell currently only saves df_predictions


# save dataframe (relative path, written to the notebook's working directory)
df_predictions.to_pickle('df_predictions_TUNED.pickle')

In [None]:
# make predictions separating into clusters and genres
# TODO: the prediction step is not written yet; this cell currently only saves df_predictions



# save dataframe (relative path, written to the notebook's working directory)
df_predictions.to_pickle('df_predictions_CLUSTERS.pickle')

In [None]:
# save final predictions dataframe (relative path, written to the notebook's working directory)
df_predictions.to_pickle('df_predictions.pickle')

# Explore Predictions

In [None]:
# Friedman Test (Wilcoxon if required)


In [None]:
# Histograms 
## NEED TO COMBINE WITH id to compare AF
