# Import

In [2]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np

# statistics
from sklearn.metrics import r2_score
from scipy.stats import friedmanchisquare, wilcoxon

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.decomposition import PCA

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set_theme()

In [3]:
df_10M = pd.read_pickle('df_10M_clustered.pickle')
X_all = pd.read_pickle('X_clustered.pickle')
X_all.reset_index()

# Setup inputs for statistical scenarios

In [4]:
# columns for datasets

y_column = 'in_B100'
X_columns = [
    'mode', 'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence'
]
genre_columns = [
    'is_Adult_Standard', 'is_Rock', 'is_R&B', 'is_Country', 'is_Pop',
    'is_Rap', 'is_Alternative', 'is_EDM', 'is_Metal'
]
cluster_columns = ['cluster', 'cluster2']
other_columns = ['key', 'time_signature', 'genre', 'release_date']

In [7]:
# hyperparameters

param_by_model = {}

params_lr = {}
orders_of_magnitude = []
for lst in [[int(x)/10000 for x in range(1, 11)],
            [int(x)/1000 for x in range(1, 11)],
            [int(x)/100 for x in range(1, 11)],
            [int(x)/10 for x in range(1, 11)],
            [1 * x for x in range(1, 11)],
            [10 * x for x in range(1, 11)],
            [100 * x for x in range(1, 11)],
            [1000 * x for x in range(1, 11)]]:
    orders_of_magnitude += lst
params_lr['logisticregression__penalty'] = ['l1', 'l2']
params_lr['logisticregression__C'] = orders_of_magnitude
params_lr['logisticregression__solver'] = ['liblinear']
param_by_model[0] = params_lr

params_dt = {}
params_dt['decisiontreeclassifier__max_depth'] = [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 100, None]
params_dt['decisiontreeclassifier__min_samples_leaf'] = [5, 10, 50, 100, 1000]
params_dt['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
param_by_model[1] = params_dt

params_knn = {}
params_knn['kneighborsclassifier__n_neighbors'] = [x for x in range(2,20)]+[x for x in range(20,101,5)]
params_knn['kneighborsclassifier__weights'] = ['uniform', 'distance']
params_knn['kneighborsclassifier__metric'] = ['minkowski', 'euclidean', 'manhattan']
param_by_model[2] = params_knn

params_rf = {}
params_rf['randomforestclassifier__n_estimators'] = [5, 10, 20, 50, 100, 200, 500, 1000, 2000]
params_rf['randomforestclassifier__max_features'] = ['sqrt', 'log2']
params_rf['randomforestclassifier__max_depth'] = [3, 5, 7, 10, 15, 20, 30, 50, 100, None]
params_rf['randomforestclassifier__min_samples_leaf'] = [5, 10, 50, 100, 1000]
params_rf['randomforestclassifier__bootstrap'] = [True, False]
param_by_model[3] = params_rf

params_ab = {}
params_ab['adaboostclassifier__n_estimators'] = [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
params_ab['adaboostclassifier__learning_rate'] = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0]
params_ab['adaboostclassifier__algorithm'] = ['SAMME', 'SAMME.R']
param_by_model[4] = params_ab

# scoring metrics

metrics = [
    'balanced_accuracy', 'average_precision', 'neg_brier_score', 'f1', 'f1_micro', 
    'f1_macro', 'f1_weighted', 'neg_log_loss', 'precision', 'recall', 'roc_auc', 'jaccard'
]

# how many hyperparameter scenarios in the grid search

def how_many_scenarios(n_ML):
    n_scenarios = 1
    for key in param_by_model[n_ML].keys():
        n_scenarios *=  len(param_by_model[n_ML][key])
    return n_scenarios

for i in range(5):
    print(str(ML_algorithms[i]())[:-2], how_many_scenarios(i))

LogisticRegression 160
DecisionTreeClassifier 160
KNeighborsClassifier 210
RandomForestClassifier 1800
AdaBoostClassifier 162


# Make Predictions Dataframe for Statistics
* split into 5 stratified folds
    * using a consistent random_state to use the same folds between tests
* for each fold:
    * train on undersampled training fold
    * predict on full test fold
    * add out of fold predictions to add to predictions dataframe
    
NOTES: 
* Tuning individual models on limited datasets has been investigated in NOTEBOOK 5B (and 5D).
* Random undersampling and oversampling were investigated in NOTEBOOK 5A
    * More oversampling methods like SMOTE were not considered because the nature of music. For example, interpolating between modes leads to an atonal, non-musical result. Discrete combinations of features are likely to be important in terms of audio features as well. More importantly, with over 20k positive cases in our dataset, we should have enough data for a well trained model.

In [185]:
# initialise the dataframe

# use the same stratified split for all test cases
stratified_5fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scenarios to test
stats_scenarios = [
    'y_lr', 'y_dt', 'y_knn', 'y_rf', 'y_ab', 'y_lr_tuned', 'y_dt_tuned', 'y_knn_tuned', 'y_rf_tuned', 'y_ab_tuned', 'y_cl_1', 'y_cl_2', 'y_genres'
]

df_predictions = pd.DataFrame(columns=stats_scenarios)
df_predictions['y_actual'] = pd.NA  # for debugging
df_predictions = pd.concat([X_all, df_predictions], axis=1)

### Predict Using Default Algorithms

In [311]:
### UPDATE THIS WITH FUTURE FUNCTION

def evaluate_default_performance(dataframe, kfold, feature_columns, class_column, n_scenario):
    
    # entire dataset for predictions
    X_, y_ = dataframe[feature_columns], dataframe[class_column]
    
    # initialise actual y and predicted y as blank dataframes
    y_actual = pd.DataFrame()
    y_pred = pd.DataFrame()

    # loop through folds
    for train_i, test_i in kfold.split(X_, y_):

        # train test split for current fold
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]
        
        # create and fit pipeline
        undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
        
        if n_scenario in [1, 6]:
            model = DecisionTreeClassifier()
        elif n_scenario in [2, 7]:
            model = KNeighborsClassifier()
        elif n_scenario in [3, 8]:
            model = RandomForestClassifier()
        elif n_scenario in [4, 9]:
            model = AdaBoostClassifier()
        else:
            model = LogisticRegression()
        
        pipe = make_pipeline(undersampler, model)
        
        pipe.fit(train_X, train_y)
        
        # append results
        y_pred_temp = pipe.predict(test_X)
        y_pred_temp = pd.concat([
            pd.DataFrame(y_pred_temp),
            pd.DataFrame(test_i, columns=[''])
        ], axis=1).set_index('')
        y_pred = pd.concat([y_pred, y_pred_temp], axis=0)
        y_actual = pd.concat([y_actual, test_y], axis=0)  # for debugging

    # return the full sorted results, appended into the input dataframe
    dataframe[stats_scenarios[n_scenario]] = y_pred.sort_index()
    dataframe['y_actual'] = y_actual.sort_index()  # for debugging, this should always be equal to in_B100 or folds are misaligned
    
    # DEBUGGING: should be zero
    is_ERRORS = sum(dataframe['in_B100'] != dataframe['y_actual'])
    if is_ERRORS != 0:
        print('THERE WERE ERRORS!!!! (compare the in_B100 and y_actual columns)')
    
    return dataframe

In [189]:
%%time
# Logistic Regression
n_scenario = 0
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 22.8 s


In [191]:
%%time
# Decision Tree
n_scenario = 1
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 25.4 s


In [193]:
%%time
# K Nearest Neighbours
n_scenario = 2
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 18min


In [194]:
%%time
# Random Forest
n_scenario = 3
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 3min 6s


In [195]:
%%time
# AdaBoost
n_scenario = 4
df_predictions = evaluate_default_performance(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

Wall time: 1min 19s


In [225]:
# save dataframe
df_predictions.to_pickle('df_predictions_DEFAULT.pickle')

### Make Predictions With Tuned Models

In [209]:
def append_predictions(dataframe, kfold, feature_columns, class_column, n_scenario):
    
    # entire dataset for predictions
    X_, y_ = dataframe[feature_columns], dataframe[class_column]
    
    # initialise actual y and predicted y as blank dataframes
    y_actual = pd.DataFrame()
    y_pred = pd.DataFrame()

    # loop through folds
    for train_i, test_i in kfold.split(X_, y_):

        # train test split for current fold
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]
        
        # create pipeline
        undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
        
        if n_scenario in [1, 6]:
            n_ML = 1
            model = DecisionTreeClassifier()
        elif n_scenario in [2, 7]:
            n_ML = 2
            model = KNeighborsClassifier()
        elif n_scenario in [3, 8]:
            n_ML = 3
            model = RandomForestClassifier()
        elif n_scenario in [4, 9]:
            n_ML = 4
            model = AdaBoostClassifier()
        else:
            n_ML = 0
            model = LogisticRegression()
        
        pipe = make_pipeline(undersampler, model)
        
        # tune hyperparameters if required
        if n_scenario in [5, 6, 7, 8, 9]:
            # create and fit gridsearch
            grid = GridSearchCV(
                pipe,
                param_grid = param_by_model[n_ML]
            )
            grid.fit(train_X, train_y)
            y_pred_temp = grid.predict(test_X)
        else:
            pipe.fit(train_X, train_y)
            y_pred_temp = pipe.predict(test_X)
        
        # append results
        y_pred_temp = pd.concat([
            pd.DataFrame(y_pred_temp),
            pd.DataFrame(test_i, columns=[''])
        ], axis=1).set_index('')
        y_pred = pd.concat([y_pred, y_pred_temp], axis=0)
        y_actual = pd.concat([y_actual, test_y], axis=0)  # for debugging

    # return the full sorted results, appended into the input dataframe
    dataframe[stats_scenarios[n_scenario]] = y_pred.sort_index()
    dataframe['y_actual'] = y_actual.sort_index()  # for debugging, this should always be equal to in_B100 or folds are misaligned
    
    # DEBUGGING: should be zero
    is_ERRORS = sum(dataframe['in_B100'] != dataframe['y_actual'])
    if is_ERRORS != 0:
        print('THERE WERE ERRORS!!!! (compare the in_B100 and y_actual columns)')
    
    return dataframe

In [206]:
n_LogisticRegression = 160
n_DecisionTreeClassifier = 160
n_KNeighborsClassifier = 210
n_RandomForestClassifier = 1800
n_AdaBoostClassifier = 162

# KNN and Random Forest Will Take Too Long, only try 
n_LogisticRegression * 23/60, n_DecisionTreeClassifier * 25/60, n_KNeighborsClassifier * 18, n_RandomForestClassifier  * 186/60, n_AdaBoostClassifier * 79/60

(61.333333333333336, 66.66666666666667, 3780, 5580.0, 213.3)

In [232]:
%%time
n_scenario = 5
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
df_predictions.to_pickle('df_predictions_TUNED.pickle')

Wall time: 2h 54min 36s


In [233]:
%%time
n_scenario = 6
df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
df_predictions.to_pickle('df_predictions_TUNED.pickle')

Wall time: 2h 56min 53s


### These are too time consuming to complete
these were investigated using smaller fitting dataset in earlier notebook

In [None]:
# %%time
# # calculated this from early afternoon until the next morning, and it didn't complete
# # drop adaboost from partially tuned models
# n_scenario = 9
# df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# df_predictions.to_pickle('df_predictions_FULLYTUNED.pickle')

In [None]:
# %%time
# # this should take too long
# n_scenario = 7
# df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# df_predictions.to_pickle('df_predictions_FULLYTUNED.pickle')

In [None]:
# %%time
# # this should take too long
# n_scenario = 8
# df_predictions = append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario)

# save dataframe
# df_predictions.to_pickle('df_predictions_FULLYTUNED.pickle')

### Make predictions separating into clusters and genres

In [379]:
def append_predictions(dataframe, kfold, feature_columns, class_column, n_scenario):
    """
    cluster could be 'genre', 'cluster1', 'cluster2', or False (default)
    """
    # based on the scenario number, do we need to cluster?
    if n_scenario == 10:
        cluster = 'cluster1'
    elif n_scenario == 11:
        cluster = 'cluster2'
    elif n_scenario == 12:
        cluster = 'genre'
    else:
        cluster = False
    
    # entire dataset for predictions
    X_, y_ = dataframe[feature_columns], dataframe[class_column]
    
    # initialise actual y and predicted y as blank dataframes
    y_actual = pd.DataFrame()
    y_pred = pd.DataFrame()

    # loop through folds
    for train_i, test_i in kfold.split(X_, y_):

        # train test split for current fold
        train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
        train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]
        
        # create pipeline
        undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
        
        if n_scenario in [1, 6]:
            n_ML = 1
            model = DecisionTreeClassifier()
        elif n_scenario in [2, 7]:
            n_ML = 2
            model = KNeighborsClassifier()
        elif n_scenario in [3, 8]:
            n_ML = 3
            model = RandomForestClassifier()
        elif n_scenario in [4, 9]:
            n_ML = 4
            model = AdaBoostClassifier()
        else:
            n_ML = 0
            model = LogisticRegression()
        
        pipe = make_pipeline(undersampler, model)
        
        # THREE OPTIONS: tune hyperparameters, loop through clusters, just fit the pipe
        
        # OPTION 1: tune hyperparameters, tune/fit the grid
        if n_scenario in [5, 6, 7, 8, 9]:
            # create and fit gridsearch
            grid = GridSearchCV(
                pipe,
                param_grid = param_by_model[n_ML]
            )
            grid.fit(train_X, train_y)
            y_pred_temp = grid.predict(test_X)
        
        # OPTION 2: loop through clusters, individually fit the pipe
        elif cluster:  
            
            # initialise dataframe to append results
            y_pred_temp = pd.DataFrame()
            
            # genre
            if cluster == 'genre':
                genre_columns = [
                    'is_Adult_Standard', 'is_Rock', 'is_R&B', 'is_Country', 'is_Pop',
                    'is_Rap', 'is_Alternative', 'is_EDM', 'is_Metal'
                ]
                for genre in genre_columns:
                    pass
            # cluster 1
            elif cluster == 'cluster1':
                for i in range(4):
                    pass
            # cluster 2
            elif cluster == 'cluster2':
                
                for i in range(10):
                    # this seems convoluted, but it speeds the code up by a factor of 10
                    cluster_index = dataframe[['cluster2']][dataframe['cluster2'] == i]
                    i_train_cluster = cluster_index.loc[cluster_index.index.isin(train_i)].index
                    i_test_cluster = cluster_index.loc[cluster_index.index.isin(test_i)].index
                    
                    print('cluster indices')
                    
                    # iloc doesn't work on index, it works on position, loc works
                    train_X_c = train_X.loc[i_train_cluster]
                    test_X_c = test_X.loc[i_test_cluster]
                    train_y_c = train_y.loc[i_train_cluster]
                    test_y_c = test_y.loc[i_test_cluster]
                    
                    print('clustered training testing sets')
                    
                    if True:  # JUST FOR TESTING
                        y_pred_cluster = np.array(test_y_c)
                    else:
                        pipe.fit(train_X_c, train_y_c)
                        y_pred_cluster = pipe.predict(test_X_c)
                        
                    y_pred_temp = pd.concat([
                        y_pred_temp,
                        pd.DataFrame(y_pred_cluster, index=i_test_cluster)
                    ], axis=0)
                    
            else:
                print('NO SUCH CLUSTER')  # could raise an error instead
                return dataframe  # do nothing, just return the input dataframe
            
            # sort y_pred_temp by index so it aligns properly
            y_pred_temp = y_pred_temp.sort_index()
            
            print('out of fold predictions: ', y_pred_temp.shape[0])
                
        # OPTION 3: just fit the pipe
        else:  
            pipe.fit(train_X, train_y)
            y_pred_temp = pipe.predict(test_X)
                
        # fitting complete for fold
        # append results
        y_pred_temp = pd.concat([
            pd.DataFrame(y_pred_temp),
            pd.DataFrame(test_i, columns=[''])
        ], axis=1).set_index('')
        y_pred = pd.concat([y_pred, y_pred_temp], axis=0)
        y_actual = pd.concat([y_actual, test_y], axis=0)  # for debugging
    
    # return the full sorted results, appended into the input dataframe
    dataframe[stats_scenarios[n_scenario]] = y_pred.sort_index()
    dataframe['y_actual'] = y_actual.sort_index()  # for debugging, this should always be equal to in_B100 or folds are misaligned
    
    # DEBUGGING: should be zero
    is_ERRORS = sum(dataframe['in_B100'] != dataframe['y_actual'])
    if is_ERRORS != 0:
        print('THERE WERE ERRORS!!!! (compare the in_B100 and y_actual columns)')
    
    return dataframe

In [380]:
%%time
append_predictions(df_predictions, stratified_5fold, X_columns, y_column, n_scenario=11)

cluster indices
clustered training testing sets
out of fold predictions:  198878
cluster indices
clustered training testing sets
out of fold predictions:  416341
cluster indices
clustered training testing sets
out of fold predictions:  580208
cluster indices
clustered training testing sets
out of fold predictions:  731679
cluster indices
clustered training testing sets
out of fold predictions:  814815
cluster indices
clustered training testing sets
out of fold predictions:  1001699
cluster indices
clustered training testing sets
out of fold predictions:  1064762
cluster indices
clustered training testing sets
out of fold predictions:  1411566
cluster indices
clustered training testing sets
out of fold predictions:  1654724
cluster indices
clustered training testing sets
out of fold predictions:  1765544
cluster indices
clustered training testing sets
out of fold predictions:  199898
cluster indices
clustered training testing sets
out of fold predictions:  417071
cluster indices
cluster

Unnamed: 0,id,song,artist,genre,release_date,in_B100,is_Adult_Standard,is_Rock,is_R&B,is_Country,is_Pop,is_Rap,is_Alternative,is_EDM,is_Metal,key,mode,time_signature,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,cluster,cluster2,y_lr,y_dt,y_knn,y_rf,y_ab,y_lr_tuned,y_dt_tuned,y_knn_tuned,y_rf_tuned,y_ab_tuned,y_cl_1,y_cl_2,y_genres,y_actual
0,6SFc2WVQmARn6NDS3LrTR8,I Feel So Free - Lost Souls of Saturn Remix,!!!,dance rock,2016-02-23,False,False,True,False,False,False,False,False,False,False,1,1,4,0.007,0.728,0.817,0.747,0.434,0.113,0.573,0.084,0.436,0.284,3,3,False,False,False,False,False,False,False,,,,,,,False
1,6NMSTM4UQMC5emaYKIueyc,The Most Certain Sure (Liv Spencer Remix),!!!,dance rock,2010-08-16,False,False,True,False,False,False,False,False,False,False,10,0,4,0.027,0.703,0.736,0.763,0.847,0.103,0.619,0.051,0.418,0.336,3,3,True,False,False,False,False,False,False,,,,,,,False
2,1FqHPzJuRdupHbcw09tYUa,Except Death,!!!,dance rock,2013-01-01,False,False,True,False,False,False,False,False,False,False,9,1,4,0.010,0.812,0.402,0.776,0.071,0.074,0.670,0.039,0.400,0.912,1,7,True,True,True,True,True,False,True,,,,,,,False
3,7y8aVfDkqt6qirGNivvs0M,One Girl / One Boy,!!!,dance rock,2013-01-01,False,False,True,False,False,False,False,False,False,False,10,0,4,0.003,0.702,0.340,0.851,0.000,0.322,0.706,0.041,0.395,0.870,1,7,True,True,True,True,True,False,True,,,,,,,False
4,70w8loBbdl4qZOH2brrqKF,"When the Going Gets Tough, the Tough Get Karazzee",!!!,dance rock,2004-06-08,False,False,True,False,False,False,False,False,False,False,11,0,4,0.000,0.786,0.587,0.751,0.446,0.053,0.664,0.078,0.418,0.651,3,9,True,False,True,False,False,False,False,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8827714,0dojpUA108DBW9rtH90tYp,Buddha’s Festival of Love,,,NaT,False,False,False,False,False,False,False,False,False,False,5,0,4,0.969,0.534,0.243,0.326,0.935,0.252,0.569,0.032,0.348,0.653,0,4,False,False,False,False,False,False,False,,,,,,,False
8827715,0EDPbjA2MGb8ksFHvPRQkF,Da Music,,,NaT,False,False,False,False,False,False,False,False,False,False,1,1,4,0.005,0.757,0.216,0.714,0.000,0.342,0.700,0.112,0.312,0.743,1,7,True,False,True,True,True,False,False,,,,,,,False
8827716,0fOFQHkfR26fraeXte9NkW,St. Jak pa la,,,NaT,False,False,False,False,False,False,False,False,False,False,9,0,4,0.512,0.824,0.083,0.610,0.320,0.102,0.561,0.049,0.401,0.923,1,7,False,False,True,False,False,False,True,,,,,,,False
8827717,0mzP7Bd2PGDA7n1Dt0OQFL,An nou mache,,,NaT,False,False,False,False,False,False,False,False,False,False,6,0,3,0.742,0.631,0.077,0.515,0.000,0.389,0.592,0.089,0.391,0.927,2,1,True,False,True,False,False,False,False,,,,,,,False


In [382]:
y_pred.sum()

0    21229
dtype: object

In [383]:
y_actual.sum()

0    21229
dtype: object

In [336]:
# loop through folds
for train_i, test_i in kfold.split(X_, y_):

    # train test split for current fold
    train_X, test_X = X_.iloc[train_i], X_.iloc[test_i]
    train_y, test_y = y_.iloc[train_i], y_.iloc[test_i]

In [337]:
train_i

array([      0,       1,       2, ..., 8827716, 8827717, 8827718])

In [356]:
%%time
i_train_cluster = df_predictions.iloc[train_i][df_predictions.iloc[train_i]['cluster2'] == 0].index
i_test_cluster = df_predictions.iloc[test_i][df_predictions.iloc[test_i]['cluster2'] == 0].index

Wall time: 5.29 s


In [378]:
%%time
cluster_index = df_predictions[genre_columns+cluster_columns]
cluster_index = df_predictions[['cluster2']][df_predictions['cluster2'] == 0]
cluster_index

Wall time: 182 ms


Unnamed: 0,cluster2
21,0
28,0
30,0
53,0
66,0
...,...
8827678,0
8827679,0
8827680,0
8827681,0


In [367]:
%%time
i_train_cluster = cluster_index.iloc[train_i][cluster_index.iloc[train_i]['cluster2'] == 0].index
i_test_cluster = cluster_index.iloc[test_i][cluster_index.iloc[test_i]['cluster2'] == 0].index

AttributeError: 'Index' object has no attribute 'iloc'

In [370]:
%%time
cluster_index = df_predictions[df_predictions.index.isin(test_i) & (df_predictions['cluster2'] == 0)].index
cluster_index

Wall time: 1.12 s


Index([    117,     221,     341,     348,     349,     355,     415,     473,
           475,     477,
       ...
       8826946, 8826970, 8827175, 8827220, 8827337, 8827473, 8827479, 8827516,
       8827526, 8827540],
      dtype='object', length=198669)

In [376]:
%%time
df_predictions.loc[df_predictions.index.isin(test_i) & (df_predictions['cluster2'] == 0)].index

Wall time: 1.12 s


Index([    117,     221,     341,     348,     349,     355,     415,     473,
           475,     477,
       ...
       8826946, 8826970, 8827175, 8827220, 8827337, 8827473, 8827479, 8827516,
       8827526, 8827540],
      dtype='object', length=198669)

In [377]:
%%time
cluster_index.loc[cluster_index.index.isin(test_i) & (cluster_index['cluster2'] == 0)].index

Wall time: 500 ms


Index([    117,     221,     341,     348,     349,     355,     415,     473,
           475,     477,
       ...
       8826946, 8826970, 8827175, 8827220, 8827337, 8827473, 8827479, 8827516,
       8827526, 8827540],
      dtype='object', length=198669)

In [None]:
train_i

In [345]:
%%time
train_X.loc[i_train_cluster]

Wall time: 283 ms


Unnamed: 0,mode,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
21,1,0.032,0.671,0.330,0.788,0.005,0.077,0.690,0.070,0.437,0.552
28,0,0.006,0.686,0.435,0.904,0.008,0.093,0.658,0.046,0.405,0.571
30,0,0.001,0.626,0.561,0.930,0.108,0.126,0.686,0.052,0.405,0.398
53,0,0.001,0.505,0.230,0.992,0.054,0.077,0.703,0.092,0.420,0.609
66,0,0.029,0.474,0.869,0.813,0.094,0.214,0.720,0.120,0.292,0.688
...,...,...,...,...,...,...,...,...,...,...,...
8827678,1,0.002,0.423,0.348,0.844,0.000,0.130,0.734,0.032,0.309,0.202
8827679,0,0.007,0.497,0.364,0.678,0.000,0.265,0.733,0.028,0.346,0.200
8827680,0,0.000,0.461,0.373,0.818,0.000,0.175,0.753,0.028,0.281,0.211
8827681,0,0.114,0.474,0.380,0.734,0.000,0.319,0.725,0.037,0.609,0.560


In [346]:
%%time
test_X.loc[i_test_cluster]

Wall time: 607 ms


Unnamed: 0,mode,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
117,1,0.000,0.314,0.466,0.992,0.130,0.302,0.800,0.215,0.478,0.173
221,0,0.169,0.246,0.378,0.819,0.264,0.084,0.719,0.124,0.701,0.318
341,1,0.001,0.119,0.086,0.988,0.001,0.426,0.782,0.212,0.698,0.328
348,1,0.000,0.251,0.216,0.963,0.001,0.330,0.713,0.070,0.506,0.332
349,1,0.000,0.187,0.162,0.948,0.001,0.084,0.710,0.084,0.582,0.513
...,...,...,...,...,...,...,...,...,...,...,...
8827473,1,0.002,0.504,0.254,0.748,0.000,0.102,0.712,0.054,0.590,0.512
8827479,0,0.045,0.375,0.726,0.809,0.003,0.175,0.769,0.034,0.446,0.359
8827516,0,0.025,0.405,0.373,0.896,0.003,0.371,0.740,0.066,0.419,0.325
8827526,1,0.000,0.436,0.292,0.896,0.000,0.064,0.774,0.046,0.441,0.335


In [303]:
train_X[train_X.index.isin(i_train_cluster)]

Unnamed: 0,mode,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
21,1,0.032,0.671,0.330,0.788,0.005,0.077,0.690,0.070,0.437,0.552
28,0,0.006,0.686,0.435,0.904,0.008,0.093,0.658,0.046,0.405,0.571
30,0,0.001,0.626,0.561,0.930,0.108,0.126,0.686,0.052,0.405,0.398
53,0,0.001,0.505,0.230,0.992,0.054,0.077,0.703,0.092,0.420,0.609
66,0,0.029,0.474,0.869,0.813,0.094,0.214,0.720,0.120,0.292,0.688
...,...,...,...,...,...,...,...,...,...,...,...
8827678,1,0.002,0.423,0.348,0.844,0.000,0.130,0.734,0.032,0.309,0.202
8827679,0,0.007,0.497,0.364,0.678,0.000,0.265,0.733,0.028,0.346,0.200
8827680,0,0.000,0.461,0.373,0.818,0.000,0.175,0.753,0.028,0.281,0.211
8827681,0,0.114,0.474,0.380,0.734,0.000,0.319,0.725,0.037,0.609,0.560


In [289]:
i_train_cluster

Index([     21,      28,      30,      53,      66,      68,      73,      75,
            83,      96,
       ...
       8827661, 8827674, 8827675, 8827676, 8827677, 8827678, 8827679, 8827680,
       8827681, 8827703],
      dtype='object', length=796892)

In [None]:
# save dataframe
df_predictions.to_pickle('df_predictions_CLUSTERS.pickle')

In [None]:
# save final predictions dataframe
df_predictions.to_pickle('df_predictions.pickle')

# Explore Predictions

In [None]:
# Friedman Test (Wilcoxn if required)


In [None]:
# Histograms 
## NEED TO COMBINE WITH id to compare AF
