# Import

In [1]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set_theme()

In [2]:
# Full clustered dataset (local pickle; only unpickle files you produced yourself)
df_10M = pd.read_pickle('df_10M_clustered.pickle')

# Modelling table: reset_index() moves the stored index into regular column(s)
# and leaves a default 0..n-1 RangeIndex, which the fold code below relies on
X_all = pd.read_pickle('X_clustered.pickle').reset_index()

# Setup inputs for statistical scenarios

In [3]:
# Column groups used to slice the modelling dataframe

# classification target
y_column = 'in_B100'

# audio features passed to the classifiers as `feature_columns`
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# genre flag columns
genre_columns = [
    'is_Adult_Standard',
    'is_Rock',
    'is_R&B',
    'is_Country',
    'is_Pop',
    'is_Rap',
    'is_Alternative',
    'is_EDM',
    'is_Metal',
]

# cluster label columns
cluster_columns = ['cluster', 'cluster2']

# remaining columns that belong to none of the groups above
other_columns = ['key', 'time_signature', 'genre', 'release_date']

In [4]:
# hyperparameters

# Search grids keyed by model number:
#   0 = logistic regression, 1 = decision tree, 2 = k nearest neighbours,
#   3 = random forest, 4 = AdaBoost
# Every parameter name is prefixed with the lower-cased estimator class name,
# i.e. the step name that make_pipeline generates for an unnamed step.

# Candidate C values: 1..10 times every power of ten from 1e-4 up to 1e3.
# NOTE(review): the last value of each decade equals the first value of the
# next one (e.g. 10/10 and 1*1), so 7 of the 80 candidates are searched twice.
orders_of_magnitude = (
    [x / divisor for divisor in (10000, 1000, 100, 10) for x in range(1, 11)]
    + [factor * x for factor in (1, 10, 100, 1000) for x in range(1, 11)]
)

# liblinear is listed as the only solver because both 'l1' and 'l2' are searched
params_lr = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': orders_of_magnitude,
    'logisticregression__solver': ['liblinear'],
}

params_dt = {
    'decisiontreeclassifier__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 100, None],
    'decisiontreeclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
}

# NOTE(review): with KNeighborsClassifier's default p=2, 'minkowski' and
# 'euclidean' should describe the same distance - confirm and drop one
params_knn = {
    'kneighborsclassifier__n_neighbors': list(range(2, 20)) + list(range(20, 101, 5)),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__metric': ['minkowski', 'euclidean', 'manhattan'],
}

params_rf = {
    'randomforestclassifier__n_estimators': [5, 10, 20, 50, 100, 200, 500, 1000, 2000],
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
    'randomforestclassifier__max_depth': [3, 5, 7, 10, 15, 20, 30, 50, 100, None],
    'randomforestclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'randomforestclassifier__bootstrap': [True, False],
}

# NOTE(review): 'SAMME.R' is not accepted by every scikit-learn release -
# confirm against the version this notebook is pinned to
params_ab = {
    'adaboostclassifier__n_estimators': [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000],
    'adaboostclassifier__learning_rate': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
}

param_by_model = {
    0: params_lr,
    1: params_dt,
    2: params_knn,
    3: params_rf,
    4: params_ab,
}

# scoring metrics

metrics = [
    'balanced_accuracy',
    'average_precision',
    'neg_brier_score',
    'f1',
    'f1_micro',
    'f1_macro',
    'f1_weighted',
    'neg_log_loss',
    'precision',
    'recall',
    'roc_auc',
    'jaccard',
]

# how many hyperparameter scenarios in the grid search

def how_many_scenarios(n_ML, param_grids=None):
    """
    Count the hyperparameter combinations an exhaustive grid search would try.

    Parameters
    ----------
    n_ML : hashable
        Key of the model whose grid is counted (0-4 in this notebook).
    param_grids : dict, optional
        Mapping of model key -> {parameter name: list of candidate values}.
        Defaults to the notebook-level ``param_by_model``.

    Returns
    -------
    int
        Product of the lengths of all candidate lists; 1 for an empty grid.

    Raises
    ------
    KeyError
        If `n_ML` is not a key of the grid mapping.
    """
    # fall back to the notebook's grids so existing one-argument calls keep working
    if param_grids is None:
        param_grids = param_by_model

    n_scenarios = 1
    for candidates in param_grids[n_ML].values():
        n_scenarios *= len(candidates)
    return n_scenarios

# grid size for each of the five model grids (keys 0-4 of param_by_model)
for n_ML in (0, 1, 2, 3, 4):
    n_combinations = how_many_scenarios(n_ML)
    print(n_combinations)

160
160
210
1800
162


# Make Predictions Dataframe for Statistics
* split into 5 stratified folds
    * using a consistent random_state to use the same folds between tests
* for each fold:
    * train on undersampled training fold
    * predict on full test fold
    * add out of fold predictions to the predictions dataframe
    
NOTES: 
* Tuning individual models on limited datasets has been investigated in NOTEBOOK 5B (and 5D).
* Random undersampling and oversampling were investigated in NOTEBOOK 5A
    * More involved oversampling methods like SMOTE were not considered because of the nature of music. For example, interpolating between modes leads to an atonal, non-musical result. Discrete combinations of features are likely to be important in terms of audio features as well. More importantly, with over 20k positive cases in our dataset, we should have enough data for a well-trained model.

In [5]:
# initial setup

# One shuffled, seeded 5-fold stratified splitter, shared by all test cases so
# that every scenario is evaluated on the same folds
stratified_5fold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Prediction column per scenario; the list position is the `n_scenario` number
stats_scenarios = [
    # 0-4: default hyperparameters
    'y_lr', 'y_dt', 'y_knn', 'y_rf', 'y_ab',
    # 5-9: grid-searched hyperparameters
    'y_lr_tuned', 'y_dt_tuned', 'y_knn_tuned', 'y_rf_tuned', 'y_ab_tuned',
    # 10-12: separate models per cluster / cluster2 / genre
    'y_cl_1', 'y_cl_2', 'y_genres',
]

In [6]:
# initialise the dataframe: X_all plus one (still empty) column per scenario

# empty frame that only contributes the prediction column names
empty_prediction_columns = pd.DataFrame(columns=stats_scenarios)
empty_prediction_columns['y_actual'] = pd.NA  # for debugging

# column-wise concat keeps every row of X_all; the new columns start out missing
df_predictions = pd.concat([X_all, empty_prediction_columns], axis=1)

### Predict Using Default Algorithms

In [7]:
### OUTDATED FUNCTION, UPDATED BELOW

def evaluate_default_performance(dataframe, kfold, feature_columns, class_column, n_scenario):
    """
    Add out-of-fold predictions of a default-hyperparameter classifier.

    For each fold produced by `kfold`, the majority class of the training part
    is randomly undersampled, a fresh classifier is fitted on it, and the
    complete (not resampled) test part is predicted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Holds `feature_columns` and `class_column`. Predictions are written
        back by index label, so the index must be unique.
    kfold : cross-validation splitter
        Object with a ``split(X, y)`` method yielding positional
        (train, test) index arrays, e.g. ``StratifiedKFold``.
    feature_columns : list of str
        Columns used as model input.
    class_column : str
        Column holding the true class.
    n_scenario : int
        Position in the notebook-level ``stats_scenarios`` list. It names the
        output column and selects the classifier: 1/6 decision tree, 2/7 KNN,
        3/8 random forest, 4/9 AdaBoost, anything else logistic regression.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataframe` with ``stats_scenarios[n_scenario]`` holding the
        predictions and ``'y_actual'`` holding the true classes collected
        from the folds (debugging aid).

    Raises
    ------
    IndexError
        If `n_scenario` is not a valid position in ``stats_scenarios``
        (raised before any model is fitted).
    """
    # resolve the output column first so a bad scenario number fails immediately
    output_column = stats_scenarios[n_scenario]

    # use a copy of the dataframe to leave the original alone
    dataframe = dataframe.copy()

    # entire dataset for predictions
    X_, y_ = dataframe[feature_columns], dataframe[class_column]

    # scenario number -> classifier class; unlisted scenarios use logistic regression
    # NOTE(review): the classifiers are created without random_state, so the
    # tree-based / boosted results are not repeatable between runs
    classifier_by_scenario = {
        1: DecisionTreeClassifier, 6: DecisionTreeClassifier,
        2: KNeighborsClassifier, 7: KNeighborsClassifier,
        3: RandomForestClassifier, 8: RandomForestClassifier,
        4: AdaBoostClassifier, 9: AdaBoostClassifier,
    }
    model_class = classifier_by_scenario.get(n_scenario, LogisticRegression)

    # collect per-fold results and concatenate once after the loop
    fold_predictions = []
    fold_actuals = []

    for train_i, test_i in kfold.split(X_, y_):

        # train test split for current fold (positional indices -> iloc)
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]

        # new sampler and classifier per fold so nothing carries over between fits
        pipe = make_pipeline(
            RandomUnderSampler(sampling_strategy='majority', random_state=42),
            model_class(),
        )
        pipe.fit(train_X, train_y)

        # label predictions with the test rows' own index so they realign with
        # `dataframe` whatever its index looks like
        fold_predictions.append(pd.Series(pipe.predict(test_X), index=test_X.index))
        fold_actuals.append(test_y)  # for debugging

    # assignment aligns on index labels, so fold order does not matter
    dataframe[output_column] = pd.concat(fold_predictions)
    dataframe['y_actual'] = pd.concat(fold_actuals)

    # DEBUGGING: should be zero, otherwise the folds are misaligned
    n_mismatches = (dataframe[class_column] != dataframe['y_actual']).sum()
    if n_mismatches != 0:
        print(f'THERE WERE ERRORS!!!! (compare the {class_column} and y_actual columns)')

    return dataframe

In [8]:
%%time
# Logistic Regression
# scenario 0 -> column 'y_lr' (default hyperparameters, out-of-fold predictions)
n_scenario = 0
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 27.9 s


In [9]:
%%time
# Decision Tree
# scenario 1 -> column 'y_dt' (default hyperparameters, out-of-fold predictions)
n_scenario = 1
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 26.7 s


In [10]:
%%time
# K Nearest Neighbours
# scenario 2 -> column 'y_knn' (default hyperparameters, out-of-fold predictions)
n_scenario = 2
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 17min 55s


In [11]:
%%time
# Random Forest
# scenario 3 -> column 'y_rf' (default hyperparameters, out-of-fold predictions)
n_scenario = 3
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 3min 8s


In [12]:
%%time
# AdaBoost
# scenario 4 -> column 'y_ab' (default hyperparameters, out-of-fold predictions)
n_scenario = 4
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 1min 18s


In [13]:
# save dataframe
# checkpoint: at this point df_predictions holds the five default-model columns
df_predictions.to_pickle('df_predictions_DEFAULT.pickle')

### Make Predictions With Tuned Models

In [209]:
### OUTDATED FUNCTION, UPDATED BELOW

def append_predictions(dataframe, kfold, feature_columns, class_column, n_scenario):
    """
    Add out-of-fold predictions of a default or grid-searched classifier.

    For each fold produced by `kfold`, the majority class of the training part
    is randomly undersampled inside a pipeline and the complete test part is
    predicted. For scenarios 5-9 the pipeline is wrapped in a ``GridSearchCV``
    (scoring ``'roc_auc'``) that is fitted on the training part of the fold;
    otherwise the pipeline is fitted directly.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Holds `feature_columns` and `class_column`. Predictions are written
        back by index label, so the index must be unique.
    kfold : cross-validation splitter
        Object with a ``split(X, y)`` method yielding positional
        (train, test) index arrays, e.g. ``StratifiedKFold``.
    feature_columns : list of str
        Columns used as model input.
    class_column : str
        Column holding the true class.
    n_scenario : int
        Position in the notebook-level ``stats_scenarios`` list. It names the
        output column and selects the classifier: 1/6 decision tree, 2/7 KNN,
        3/8 random forest, 4/9 AdaBoost, anything else logistic regression.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataframe` with ``stats_scenarios[n_scenario]`` holding the
        predictions and ``'y_actual'`` holding the true classes collected
        from the folds (debugging aid).

    Raises
    ------
    IndexError
        If `n_scenario` is not a valid position in ``stats_scenarios``
        (raised before any model is fitted).
    """
    # resolve the output column first so a bad scenario number fails
    # immediately instead of after hours of fitting
    output_column = stats_scenarios[n_scenario]

    # use a copy of the dataframe to leave the original alone
    dataframe = dataframe.copy()

    # entire dataset for predictions
    X_, y_ = dataframe[feature_columns], dataframe[class_column]

    # scenario number -> (key into param_by_model, classifier class);
    # unlisted scenarios use logistic regression (grid 0)
    model_by_scenario = {
        1: (1, DecisionTreeClassifier), 6: (1, DecisionTreeClassifier),
        2: (2, KNeighborsClassifier), 7: (2, KNeighborsClassifier),
        3: (3, RandomForestClassifier), 8: (3, RandomForestClassifier),
        4: (4, AdaBoostClassifier), 9: (4, AdaBoostClassifier),
    }
    n_ML, model_class = model_by_scenario.get(n_scenario, (0, LogisticRegression))

    # scenarios 5-9 are the tuned counterparts of scenarios 0-4
    tune_hyperparameters = n_scenario in (5, 6, 7, 8, 9)

    # collect per-fold results and concatenate once after the loop
    fold_predictions = []
    fold_actuals = []

    for train_i, test_i in kfold.split(X_, y_):

        # train test split for current fold (positional indices -> iloc)
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]

        # new sampler and classifier per fold so nothing carries over between fits
        pipe = make_pipeline(
            RandomUnderSampler(sampling_strategy='majority', random_state=42),
            model_class(),
        )

        if tune_hyperparameters:
            # the search only sees the fold's training rows; predict() then
            # uses the estimator refitted with the best parameters
            estimator = GridSearchCV(
                pipe,
                param_grid=param_by_model[n_ML],
                scoring='roc_auc'
            )
        else:
            estimator = pipe
        estimator.fit(train_X, train_y)

        # label predictions with the test rows' own index so they realign with
        # `dataframe` whatever its index looks like
        fold_predictions.append(pd.Series(estimator.predict(test_X), index=test_X.index))
        fold_actuals.append(test_y)  # for debugging

    # assignment aligns on index labels, so fold order does not matter
    dataframe[output_column] = pd.concat(fold_predictions)
    dataframe['y_actual'] = pd.concat(fold_actuals)

    # DEBUGGING: should be zero, otherwise the folds are misaligned
    n_mismatches = (dataframe[class_column] != dataframe['y_actual']).sum()
    if n_mismatches != 0:
        print(f'THERE WERE ERRORS!!!! (compare the {class_column} and y_actual columns)')

    return dataframe

In [206]:
# this is wrong, they take a lot longer (esp adaboost)
# Rough runtime estimate per model: number of grid combinations multiplied by
# the time one default run took.
# NOTE(review): presumably too low because GridSearchCV cross-validates every
# combination internally instead of fitting it once - confirm.

# number of grid combinations per model (the values printed by how_many_scenarios above)
n_LogisticRegression = 160
n_DecisionTreeClassifier = 160
n_KNeighborsClassifier = 210
n_RandomForestClassifier = 1800
n_AdaBoostClassifier = 162

# KNN and Random Forest will take too long (the original comment ended unfinished with "only try")
# The multipliers look like the wall times of the default runs: seconds divided
# by 60 for LR, tree, random forest and AdaBoost, but already minutes (18) for
# KNN - so every tuple entry is in minutes. TODO confirm where 23/25/186/79 came from.
n_LogisticRegression * 23/60, n_DecisionTreeClassifier * 25/60, n_KNeighborsClassifier * 18, n_RandomForestClassifier  * 186/60, n_AdaBoostClassifier * 79/60

(61.333333333333336, 66.66666666666667, 3780, 5580.0, 213.3)

In [232]:
%%time
# scenario 5 -> column 'y_lr_tuned': logistic regression with grid-searched hyperparameters
n_scenario = 5
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# (checkpoint inside the timed cell, so the hours-long result is on disk as soon as it exists)
df_predictions.to_pickle('df_predictions_TUNED.pickle')

Wall time: 2h 54min 36s


In [233]:
%%time
# scenario 6 -> column 'y_dt_tuned': decision tree with grid-searched hyperparameters
n_scenario = 6
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# (overwrites the checkpoint written after scenario 5; it now holds both tuned columns)
df_predictions.to_pickle('df_predictions_TUNED.pickle')

Wall time: 2h 56min 53s


### These are too time consuming to complete
These were investigated using a smaller fitting dataset in an earlier notebook.

In [None]:
# %%time
# # calculated this from early afternoon until the next morning, and it didn't complete
# # drop adaboost from partially tuned models
# n_scenario = 9
# df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# df_predictions.to_pickle('df_predictions_FULLYTUNED.pickle')

In [None]:
# %%time
# # this should take too long
# n_scenario = 7
# df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# df_predictions.to_pickle('df_predictions_FULLYTUNED.pickle')

In [None]:
# %%time
# # this should take too long
# n_scenario = 8
# df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# df_predictions.to_pickle('df_predictions_FULLYTUNED.pickle')

### Make predictions separating into clusters and genres

In [32]:
### THIS IS THE FINAL APPEND PREDICTIONS FUNCTION
# should be applicable to all scenarios, but has not been tested on all of them (some take hours/days to run)
# works on scenarios 0, 5, 6, 10+

def append_predictions(dataframe, kfold, feature_columns, class_column, n_scenario):
    """
    Append out-of-fold predictions for one scenario to a copy of `dataframe`.

    The scenario number selects the classifier and the fitting strategy:

    * 0-4  default hyperparameters (logistic regression, decision tree, KNN,
      random forest, AdaBoost)
    * 5-9  the same classifiers, tuned per fold with ``GridSearchCV``
      (scoring ``'roc_auc'``, grids from ``param_by_model``)
    * 10   one default logistic regression per value 0-3 of column ``'cluster'``
    * 11   one default logistic regression per value 0-9 of column ``'cluster2'``
    * 12   one default logistic regression per genre flag column

    In every case the majority class of the training rows is randomly
    undersampled inside the pipeline, while the test rows are predicted as
    they are.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Holds `feature_columns`, `class_column` and, for scenarios 10-12, the
        cluster / genre columns. Predictions are written back by index label,
        so the index must be unique.
    kfold : cross-validation splitter
        Object with a ``split(X, y)`` method yielding positional
        (train, test) index arrays, e.g. ``StratifiedKFold``.
    feature_columns : list of str
        Columns used as model input.
    class_column : str
        Column holding the true class.
    n_scenario : int
        Position in the notebook-level ``stats_scenarios`` list; names the
        output column and selects the strategy listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataframe` with ``stats_scenarios[n_scenario]`` holding the
        predictions and ``'y_actual'`` holding the true classes collected
        from the folds (debugging aid).

    Raises
    ------
    IndexError
        If `n_scenario` is not a valid position in ``stats_scenarios``
        (raised before any model is fitted).

    Notes
    -----
    For scenarios 10-12 a test row that belongs to none of the groups gets no
    prediction (missing value), and a row that belongs to several genre flags
    keeps the prediction of the first matching genre in list order.
    """
    # resolve the output column first so a bad scenario number fails
    # immediately instead of after hours of fitting
    output_column = stats_scenarios[n_scenario]

    # use a copy of the dataframe to leave the original alone
    dataframe = dataframe.copy()

    # scenario number -> (key into param_by_model, classifier class);
    # unlisted scenarios (0, 5, 10, 11, 12, ...) use logistic regression (grid 0)
    model_by_scenario = {
        1: (1, DecisionTreeClassifier), 6: (1, DecisionTreeClassifier),
        2: (2, KNeighborsClassifier), 7: (2, KNeighborsClassifier),
        3: (3, RandomForestClassifier), 8: (3, RandomForestClassifier),
        4: (4, AdaBoostClassifier), 9: (4, AdaBoostClassifier),
    }
    n_ML, model_class = model_by_scenario.get(n_scenario, (0, LogisticRegression))

    # scenarios 5-9 are the tuned counterparts of scenarios 0-4
    tune_hyperparameters = n_scenario in (5, 6, 7, 8, 9)

    # scenarios 10-12: one boolean row mask per group, computed once for the
    # whole frame and sliced positionally inside each fold
    if n_scenario == 10:
        group_masks = [(dataframe['cluster'] == i).to_numpy() for i in range(4)]
    elif n_scenario == 11:
        group_masks = [(dataframe['cluster2'] == i).to_numpy() for i in range(10)]
    elif n_scenario == 12:
        # NOTE: could consider adding a 'misc' genre, which is not in these genres
        genre_columns = [
            'is_Adult_Standard', 'is_Rock', 'is_R&B', 'is_Country', 'is_Pop',
            'is_Rap', 'is_Alternative', 'is_EDM', 'is_Metal'
        ]
        group_masks = [dataframe[genre].astype(bool).to_numpy() for genre in genre_columns]
    else:
        group_masks = None

    # entire dataset for predictions
    X_, y_ = dataframe[feature_columns], dataframe[class_column]

    # collect per-fold results and concatenate once after the loop
    fold_predictions = []
    fold_actuals = []

    for train_i, test_i in kfold.split(X_, y_):

        # train test split for current fold (positional indices -> iloc)
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]

        # new sampler and classifier per fold to prevent spillover from refitting
        pipe = make_pipeline(
            RandomUnderSampler(sampling_strategy='majority', random_state=42),
            model_class(),
        )

        # THREE OPTIONS: tune hyperparameters, loop through groups, just fit the pipe
        if tune_hyperparameters:
            # Original author's note: "n_jobs=1 leads to a PicklingError", with
            # https://stackoverflow.com/questions/56884020/spacy-with-joblib-library-generates-pickle-picklingerror-could-not-pickle-the
            # ("I solved by changing the backend from loky to threading in Parallel.")
            # NOTE(review): presumably this refers to a parallel n_jobs setting - confirm
            grid = GridSearchCV(
                pipe,
                param_grid=param_by_model[n_ML],
                scoring='roc_auc'
            )
            grid.fit(train_X, train_y)
            fold_prediction = pd.Series(grid.predict(test_X), index=test_X.index)
        elif group_masks is not None:
            fold_prediction = _predict_per_group(
                pipe, group_masks, train_i, test_i, train_X, train_y, test_X
            )
        else:
            pipe.fit(train_X, train_y)
            fold_prediction = pd.Series(pipe.predict(test_X), index=test_X.index)

        # debugging statement: after fitting the data for the fold
        print('fold complete')

        fold_predictions.append(fold_prediction)
        fold_actuals.append(test_y)  # for debugging

    # assignment aligns on index labels, so fold and group order do not matter
    dataframe[output_column] = pd.concat(fold_predictions)
    dataframe['y_actual'] = pd.concat(fold_actuals)

    # DEBUGGING: should be zero, otherwise the folds are misaligned
    n_mismatches = (dataframe[class_column] != dataframe['y_actual']).sum()
    if n_mismatches != 0:
        print(f'THERE WERE ERRORS!!!! (compare the {class_column} and y_actual columns)')

    return dataframe


def _predict_per_group(pipe, group_masks, train_i, test_i, train_X, train_y, test_X):
    """
    Fit `pipe` on each group's training rows and predict that group's test rows.

    `group_masks` are boolean arrays over the rows of the full frame;
    `train_i` / `test_i` are the fold's positional row indices, so
    ``mask[train_i]`` lines up row by row with `train_X` / `train_y`.
    Returns a Series of predictions labelled with the index of the predicted
    test rows; each label appears at most once.
    """
    group_predictions = []
    for mask in group_masks:
        in_train, in_test = mask[train_i], mask[test_i]

        # nothing to predict for this group in this fold
        if not in_test.any():
            continue

        pipe.fit(train_X[in_train], train_y[in_train])
        group_predictions.append(
            pd.Series(pipe.predict(test_X[in_test]), index=test_X.index[in_test])
        )

    if not group_predictions:
        return pd.Series(dtype=object)

    predictions = pd.concat(group_predictions)

    # a row matched by several masks (overlapping genre flags) was predicted
    # more than once; keep the first so labels stay unique for the write-back
    return predictions[~predictions.index.duplicated(keep='first')]

In [37]:
%%time
# clustering results: cluster 1
# scenario 10 -> column 'y_cl_1': a separate model per value of the 'cluster' column
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario=10)

# save dataframe
# (the same file name is reused after scenarios 11 and 12, so it is overwritten each time)
df_predictions.to_pickle('df_predictions_CLUSTERS.pickle')

fold complete
fold complete
fold complete
fold complete
fold complete
Wall time: 1min


In [38]:
%%time
# clustering results: cluster 2
# scenario 11 -> column 'y_cl_2': a separate model per value of the 'cluster2' column
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario=11)

# save dataframe
# (overwrites the checkpoint written after scenario 10)
df_predictions.to_pickle('df_predictions_CLUSTERS.pickle')

fold complete
fold complete
fold complete
fold complete
fold complete
Wall time: 1min 15s


In [39]:
%%time
# clustering results: genres
# scenario 12 -> column 'y_genres': a separate model per genre flag column
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario=12)

# save dataframe
# (overwrites the checkpoint written after scenario 11)
df_predictions.to_pickle('df_predictions_CLUSTERS.pickle')

fold complete
fold complete
fold complete
fold complete
fold complete
Wall time: 52.8 s


# Save Final Predictions Dataframe

In [40]:
# save final predictions dataframe
# Dropped before saving: the three tuned scenarios whose runs are commented out
# above (KNN, random forest, AdaBoost) and the 'y_actual' debugging column.
# drop() returns a new frame, so df_predictions itself keeps all columns.
df_predictions.drop(['y_knn_tuned', 'y_rf_tuned', 'y_ab_tuned', 'y_actual'], axis=1).to_pickle('df_predictions.pickle')