# Import

In [34]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np
import scipy as sp

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, RandomizedSearchCV 
from sklearn.tree import plot_tree
from imblearn.pipeline import make_pipeline

# Undersampling 
# Note: undersampling was used in at least 1 paper predicting popularity (Gao 2021)
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3-decimal floats, show every column, up to 200 rows
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
import seaborn as sns
sns.set_theme()

In [2]:
# Load the two pickled objects from the notebook's working directory.
# X_all is the frame every later cell selects features, target and cluster labels from.
# NOTE(review): unpickling can execute arbitrary code -- only load files written by a
# trusted earlier step of this project.
df_10M = pd.read_pickle('df_10M_clustered.pickle')
X_all = pd.read_pickle('X_clustered.pickle')

# Create Datasets for Prediction

In [3]:
# target column
y_column = 'in_B100'

# feature columns used as model inputs
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# one flag column per genre; the text after the 'is_' prefix becomes the dataset name
genre_columns = [
    'is_Adult_Standard',
    'is_Rock',
    'is_R&B',
    'is_Country',
    'is_Pop',
    'is_Rap',
    'is_Alternative',
    'is_EDM',
    'is_Metal',
]

# cluster label columns
cluster_columns = ['cluster', 'cluster2']

# remaining columns, not used as features in this notebook section
other_columns = ['key', 'time_signature', 'genre', 'release_date']


In [4]:
# create a dict with all 'name': (X, y) key match pairs
# X is the feature frame (X_columns) and y the target series (y_column) of each subset
clusters = {}

# entire predictive dataset
clusters['All'] = (X_all[X_columns], X_all[y_column])

# add genres: each genre column is used directly as a row mask,
# and the dataset name is the column name without its 'is_' prefix
for genre in genre_columns:
    title = genre[3:]
    subset = X_all[X_all[genre]]  # filter once, then split into X and y
    clusters[title] = (subset[X_columns], subset[y_column])

# add clusters: one dataset per distinct label of each clustering column
for cluster_column, prefix in (('cluster', 'cluster1_'), ('cluster2', 'cluster2_')):
    for n in sorted(X_all[cluster_column].unique()):
        subset = X_all[X_all[cluster_column] == n]
        clusters[prefix + str(n)] = (subset[X_columns], subset[y_column])
    
# OPTIONAL IF TIME PERMITS: consider adding decades or eras of music

In [5]:
# main non-clustered predictive dataset (all data)
# clusters['All'] holds the (features, target) pair built from every row of X_all
X, y = clusters['All']

# Tune Algorithms

In [6]:
# dataset names, in the order they are reported: everything, genres, then both clusterings
cluster_keys = (
    ['All']
    + ['Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop', 'Rap', 'Alternative', 'EDM', 'Metal']
    + ['cluster1_0', 'cluster1_1', 'cluster1_2', 'cluster1_3']
    + ['cluster2_0', 'cluster2_1', 'cluster2_2', 'cluster2_3', 'cluster2_4']
    + ['cluster2_5', 'cluster2_6', 'cluster2_7', 'cluster2_8', 'cluster2_9']
)

# estimator classes (not instances) compared in this notebook
ML_algorithms = [
    LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier,
    RandomForestClassifier, AdaBoostClassifier,
]

In [419]:
# Load the saved results that default_results_by_metric() reads from.
# NOTE(review): pickle.load can execute arbitrary code -- only open files that
# this project wrote itself.
with open('default_results.pickle', 'rb') as f:
    default_results = pickle.load(f)

In [422]:
def default_results_by_metric(class_type='True', metric='f1-score'):
    """Tabulate one metric from ``default_results`` per model and dataset.

    Reads the module-level ``ML_algorithms``, ``cluster_keys`` and
    ``default_results``. For every model class and every dataset name the
    value is taken from ``default_results[model_name][dataset][1]``.

    Parameters
    ----------
    class_type : str
        Top-level key of the stored report, e.g. ``'True'``. The special
        value ``'accuracy'`` is read directly as a scalar.
    metric : str
        Key looked up under ``class_type`` (ignored for ``'accuracy'``).

    Returns
    -------
    pandas.DataFrame
        One row per model: a ``'Model'`` column, one column per entry of
        ``cluster_keys``, then ``'min'``, ``'max'`` and ``'mean'`` computed
        across the dataset columns only.
    """
    rows = []
    for algo in ML_algorithms:
        # The class name is the key into default_results; no need to build an
        # estimator instance just to parse its repr.
        algo_name = algo.__name__
        row = [algo_name]
        for cluster in cluster_keys:
            report = default_results[algo_name][cluster][1]
            if class_type == 'accuracy':
                row.append(report[class_type])
            else:
                row.append(report[class_type][metric])
        rows.append(row)

    # Label the columns with the same keys, in the same order, that were used
    # to collect the values above.
    df_default_results = pd.DataFrame(rows, columns=['Model'] + list(cluster_keys))

    # Compute every summary from the metric columns alone, before any summary
    # column is appended, so 'max' and 'mean' are not skewed by 'min'/'max'.
    scores = df_default_results.iloc[:, 1:]
    row_min = scores.min(axis=1)
    row_max = scores.max(axis=1)
    row_mean = scores.mean(axis=1)

    df_default_results['min'] = row_min
    df_default_results['max'] = row_max
    df_default_results['mean'] = row_mean

    return df_default_results
    

In [466]:
# best performing classification by cluster = Adult_Standard (1st or 2nd for all ML models)
# Drop the trailing min/max/mean columns, put models in columns and datasets in rows,
# then rank the datasets by the chosen model's score.
sortbyfeature = 'LogisticRegression'
(
    default_results_by_metric()
    .iloc[:, :-3]
    .set_index('Model')
    .T
    .sort_values(sortbyfeature, ascending=False)
)

Model,LogisticRegression,DecisionTreeClassifier,KNeighborsClassifier,RandomForestClassifier,AdaBoostClassifier
Adult_Standard,0.077,0.06,0.065,0.08,0.077
R&B,0.072,0.063,0.065,0.081,0.076
Country,0.038,0.03,0.033,0.042,0.038
Rock,0.032,0.029,0.03,0.039,0.036
Pop,0.021,0.019,0.019,0.026,0.022
Rap,0.017,0.013,0.014,0.018,0.017
cluster2_7,0.014,0.012,0.013,0.017,0.015
cluster2_1,0.012,0.011,0.011,0.016,0.014
cluster2_5,0.011,0.01,0.01,0.014,0.012
cluster1_1,0.011,0.01,0.011,0.014,0.012


In [19]:
# setup tuning algorithm with a small dataset
# A fixed seed draws the same rows on every run, so quick tuning tests are comparable;
# the sample size is capped at the frame length so a smaller X_all does not raise.
small = X_all.sample(min(10_000, len(X_all)), random_state=42)
X_small = small[X_columns]
y_small = small[y_column]

### Logistic Regression

In [472]:
%%time
# choose dataset
am_testing = False  # test tuning with small dataset
if am_testing:
    X_, y_ = X_small, y_small
else:
    X_, y_ = clusters['Adult_Standard']  # has the best classification results from STEP 5

# One fixed seed for the undersampler, the CV shuffle and the random search, so
# the reported best score can be reproduced and compared between runs.
seed = 42

# params
scoring = 'f1'
param_grid = {}
param_grid['logisticregression__penalty'] = ['l1', 'l2']
param_grid['logisticregression__C'] = [
    0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100
]

# pipeline: undersample the majority class, then fit the classifier
transform = RandomUnderSampler(sampling_strategy='majority', random_state=seed)
model = LogisticRegression(solver='liblinear')  # liblinear accepts both 'l1' and 'l2'
pipe = make_pipeline(transform, model)

# gridsearch
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # use cv=10 when training the final model

# randomise?
randomise = True  # randomise had the best results in the small dataset, but both performed similarly
if randomise:
    # replaces the fixed C list above with a continuous distribution to draw from
    param_grid['logisticregression__C'] = sp.stats.lognorm(2, scale=0.1) # log distribution centred at 0.1
    lr_grid = RandomizedSearchCV(
        estimator = pipe,
        param_distributions = param_grid,
        cv = cv,
        scoring = scoring, 
        n_jobs = -1,
        n_iter = 1000,  # 1000 should take 30min
        random_state = seed
    )
else:
    lr_grid = GridSearchCV(
        estimator = pipe,
        param_grid = param_grid,
        cv = cv,
        scoring = scoring, 
        n_jobs = -1
    )

# calculate best parameters
lr_grid.fit(X_, y_)

# results
lr_grid.best_params_, lr_grid.best_score_

Wall time: 30min 43s


({'logisticregression__C': 1.1570383472211494,
  'logisticregression__penalty': 'l1'},
 0.07801951769451305)

In [481]:
# huh, that's no better than no tuning (F1 of about 0.078 vs 0.077 untuned)... let's try LogisticRegression's default solver instead of liblinear

In [482]:
%%time
# choose dataset
am_testing = False  # test tuning with small dataset
if am_testing:
    X_, y_ = X_small, y_small
else:
    X_, y_ = clusters['Adult_Standard']  # has the best classification results from STEP 5

# One fixed seed for the undersampler, the CV shuffle and the random search, so
# the reported best score can be reproduced and compared between runs.
seed = 42

# params (only C is searched here; the penalty is left at the estimator's default)
scoring = 'f1'
param_grid = {}
param_grid['logisticregression__C'] = [
    0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100
]

# pipeline: undersample the majority class, then fit the classifier
transform = RandomUnderSampler(sampling_strategy='majority', random_state=seed)
model = LogisticRegression()  # default solver this time, instead of liblinear
pipe = make_pipeline(transform, model)

# gridsearch
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # use cv=10 when training the final model

# randomise?
randomise = True  # randomise had the best results in the small dataset, but both performed similarly
if randomise:
    # replaces the fixed C list above with a continuous distribution to draw from
    param_grid['logisticregression__C'] = sp.stats.lognorm(2, scale=0.1) # log distribution centred at 0.1
    lr_grid = RandomizedSearchCV(
        estimator = pipe,
        param_distributions = param_grid,
        cv = cv,
        scoring = scoring, 
        n_jobs = -1,
        n_iter = 1000,  # 1000 should take 30min
        random_state = seed
    )
else:
    lr_grid = GridSearchCV(
        estimator = pipe,
        param_grid = param_grid,
        cv = cv,
        scoring = scoring, 
        n_jobs = -1
    )

# calculate best parameters
lr_grid.fit(X_, y_)

# results
lr_grid.best_params_, lr_grid.best_score_

Wall time: 31min 2s


({'logisticregression__C': 11.279203301969044}, 0.07790385187982632)

In [483]:
%%time
# gridsearch, not random
# choose dataset
am_testing = False  # test tuning with small dataset
if am_testing:
    X_, y_ = X_small, y_small
else:
    X_, y_ = clusters['Adult_Standard']  # has the best classification results from STEP 5

# One fixed seed for the undersampler and the CV shuffle, so the reported best
# score can be reproduced and compared between runs.
seed = 42

# params: 2 penalties x 16 C values = 32 candidates
scoring = 'f1'
param_grid = {}
param_grid['logisticregression__penalty'] = ['l1', 'l2']
param_grid['logisticregression__C'] = [
    0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100
]

# pipeline: undersample the majority class, then fit the classifier
transform = RandomUnderSampler(sampling_strategy='majority', random_state=seed)
model = LogisticRegression(solver='liblinear')  # liblinear accepts both 'l1' and 'l2'
pipe = make_pipeline(transform, model)

# gridsearch
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # use cv=10 when training the final model
lr_grid = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    cv = cv,
    scoring = scoring, 
    n_jobs = -1
)

# calculate best parameters
lr_grid.fit(X_, y_)

# results
lr_grid.best_params_, lr_grid.best_score_

Wall time: 59.3 s


({'logisticregression__C': 5, 'logisticregression__penalty': 'l1'},
 0.07774724465610885)

In [485]:
# Hmm, grid search vs. random search doesn't seem to make a difference to the best score.
# Maybe switch back to whichever takes fewer steps.
# Maybe add more C values: the grid only tries 32 candidates vs 1000 for the random search.
# Rough check of the run times: the grid search took ~0.98 min for 32 candidates,
# so 1000 candidates at the same rate would take about this many minutes:
.98 * 1000 / 32 # seems to scale linearly

30.625

In [None]:
#### NOTES ####

# time taken for n_iter = 10 >>> 28s

# 0.03 at C=0.009216644946391565
# 0.03 at C=0.005648016650396617
# 0.03 at C=0.01325562486033278
# 0.0747 at C=0.005211080410662897
# 0.03 at C=0.0339936737521536