# Import

In [1]:
import pickle
import spotipy

# math and dataframes
import pandas as pd
import numpy as np
import scipy as sp

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from imblearn.pipeline import make_pipeline

# Undersampling 
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
import seaborn as sns
sns.set_theme()

In [2]:
df_10M = pd.read_pickle('df_10M_clustered.pickle')
X_all = pd.read_pickle('X_clustered.pickle')

In [6]:
# popularity from the sql database
"""
    SELECT id, popularity FROM tracks
"""
df_sql_popularity = pd.read_csv('popularity_by_track_sql.csv')

# SKIP THIS STEP 
# (see note below)
##### Get Popularity for Missing Tracks from B100

NOTE:
* this data will not lead to a consistent, apples-to-apples comparison:
    * popularity is based on how recent the plays are
    * the 'popularity' values in the SQL database and from the API reflect different recency windows, so they are not in sync
    * it is more accurate to use just the SQL database for this prediction

In [None]:
# initialise popularity dataframe for B100 songs

df_B100_songs = pd.read_pickle('df_B100_songs.pickle')

df_B100_popularity = df_B100_songs.copy()
df_B100_popularity['popularity'] = pd.NA
df_B100_popularity = df_B100_popularity[['id', 'popularity']]

# confirm no duplicates
df_B100_popularity.id.duplicated().sum()

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [None]:
# input the temporary token
TEMP_TOKEN = input('Enter token: ')

# create a spotify object
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
def get_popularity(track_id):
    """Fetch one track through the notebook-level `spotify` client and return its 'popularity' field."""
    return spotify.track(track_id)['popularity']

In [None]:
%%time
# loop to GET popularity

counter = 0
start_over_at = 0  # set to the index label where the previous run stopped to resume from there
if start_over_at == 0:
    # fresh run: start with an empty set of failed ids (a resumed run keeps the existing set)
    id_errors = set()

for i, row in df_B100_popularity.iterrows():

    # progress output: the counter every 100 rows, a line break every 1000
    if counter % 100 == 0:
        print(counter, end=' ')
    if counter % 1000 == 0:
        print()

    counter += 1

    if i < start_over_at:  # where we timed out last time
        continue

    # save temp file
    if counter % 1000 == 0:
        df_B100_popularity.to_pickle('df_B100_popularity_TEMP.pickle')

    # does this track already have a popularity value? if so, next row.
    # `i` is an index label from iterrows(), so look it up with .loc (same
    # accessor as the writes below) rather than positionally with .iloc.
    # NOTE(review): assumes index labels are unique -- confirm for df_B100_songs
    if not pd.isna(df_B100_popularity.loc[i, 'popularity']):
        continue

    # current id for lookup in API
    track_id = row['id']

    # lookup song info from API and set the popularity value for that track
    try:
        df_B100_popularity.loc[i, 'popularity'] = get_popularity(track_id)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop; record the failure and show why the lookup failed.
        print(' -- get_popularity() didnt work -- ', track_id, repr(exc))
        id_errors.add(track_id)
        df_B100_popularity.loc[i, 'popularity'] = 0  # set it to zero anyway


# save the dataframe
df_B100_popularity.to_pickle('df_B100_popularity_COMPLETE.pickle')

In [None]:
# how many missing values
len(id_errors)

In [None]:
# how many null
df_B100_popularity.popularity.isnull().sum()

In [None]:
# combine df_sql_popularity and df_B100_popularity into df_popularity
df_popularity = pd.concat([df_B100_popularity, df_sql_popularity]).reset_index(drop=True)
df_popularity.shape

In [None]:
# confirm no duplicates
df_popularity.duplicated(subset='id').sum()

In [None]:
# save the dataframe
df_popularity.to_pickle('df_popularity.pickle')

##### Combine Spotify Popularity with X_all

# START OVER HERE
# Choose a Threshold for 'is_Popular'

In [7]:
df_sql_popularity.describe([0.01, 0.99])['mean':'max'].T

Unnamed: 0,mean,std,min,1%,50%,99%,max
popularity,6.118,10.58,0.0,0.0,1.0,48.0,100.0


In [9]:
# the top percentile is about popularity of 50
# this seems like a good, arbitrary threshold

popularity_threshold = 50

df_sql_popularity['is_Popular'] = df_sql_popularity.popularity > popularity_threshold

In [24]:
# merge with X_all
X_all = pd.merge(X_all, df_sql_popularity, left_index=True, right_on='id').set_index('id')

In [26]:
# X_all.artist.isnull().sum()
X_all.is_Popular.count()

8030875

# Create Datasets for Prediction

In [27]:
y_column = 'is_Popular'
X_columns = [
    'mode', 'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence'
]
genre_columns = [
    'is_Adult_Standard', 'is_Rock', 'is_R&B', 'is_Country', 'is_Pop',
    'is_Rap', 'is_Alternative', 'is_EDM', 'is_Metal'
]
cluster_columns = ['cluster', 'cluster2']
other_columns = ['key', 'time_signature', 'genre', 'release_date']


In [28]:
# create a dict with all 'name': (X, y) key match pairs
clusters = {}

# entire predictive dataset (audio features plus the genre indicator columns)
clusters['All'] = (X_all[X_columns+genre_columns], X_all[y_column])
# clusters['All'] = (X_all[X_columns], X_all[y_column])

# add genres: one dataset per genre flag, keyed by the column name without its 'is_' prefix
for genre in genre_columns:
    title = genre[3:]
    genre_rows = X_all[X_all[genre]]  # filter once, then take X and y from the same rows
    clusters[title] = (genre_rows[X_columns], genre_rows[y_column])

# add clusters: one dataset per distinct value of each cluster column,
# keyed 'cluster1_<n>' for 'cluster' and 'cluster2_<n>' for 'cluster2'
for cluster_column, prefix in (('cluster', 'cluster1_'), ('cluster2', 'cluster2_')):
    for n in sorted(X_all[cluster_column].unique()):
        cluster_rows = X_all[X_all[cluster_column] == n]
        clusters[prefix + str(n)] = (cluster_rows[X_columns], cluster_rows[y_column])

# setup tuning algorithm with a small dataset (fixed seed so the sample is repeatable)
small = X_all.sample(10_000, random_state=42)
X_small = small[X_columns]
y_small = small[y_column]
clusters['small'] = (X_small, y_small)

# Tune Algorithms

In [29]:
cluster_keys = [
    'All', 
    'Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop', 'Rap', 'Alternative', 'EDM', 'Metal', 
    'cluster1_0', 'cluster1_1', 'cluster1_2', 'cluster1_3', 
    'cluster2_0', 'cluster2_1', 'cluster2_2', 'cluster2_3', 'cluster2_4', 
    'cluster2_5', 'cluster2_6', 'cluster2_7', 'cluster2_8', 'cluster2_9',
    'small'
]

In [30]:
# setup model parameters for grid search

ML_algorithms = [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    RandomForestClassifier,
    AdaBoostClassifier
]

param_by_model = {}

params_lr = {}
orders_of_magnitude = []
for lst in [[int(x)/10000 for x in range(1, 11)],
            [int(x)/1000 for x in range(1, 11)],
            [int(x)/100 for x in range(1, 11)],
            [int(x)/10 for x in range(1, 11)],
            [1 * x for x in range(1, 11)],
            [10 * x for x in range(1, 11)],
            [100 * x for x in range(1, 11)],
            [1000 * x for x in range(1, 11)]]:
    orders_of_magnitude += lst
params_lr['logisticregression__penalty'] = ['l1', 'l2']
params_lr['logisticregression__C'] = orders_of_magnitude
params_lr['logisticregression__solver'] = ['liblinear']
param_by_model[0] = params_lr

params_dt = {}
params_dt['decisiontreeclassifier__max_depth'] = [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 100, None]
params_dt['decisiontreeclassifier__min_samples_leaf'] = [5, 10, 50, 100, 1000]
params_dt['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
param_by_model[1] = params_dt

params_knn = {}
params_knn['kneighborsclassifier__n_neighbors'] = [x for x in range(2,20)]+[x for x in range(20,101,5)]
params_knn['kneighborsclassifier__weights'] = ['uniform', 'distance']
params_knn['kneighborsclassifier__metric'] = ['minkowski', 'euclidean', 'manhattan']
param_by_model[2] = params_knn

params_rf = {}
params_rf['randomforestclassifier__n_estimators'] = [5, 10, 20, 50, 100, 200, 500, 1000, 2000]
params_rf['randomforestclassifier__max_features'] = ['sqrt', 'log2']
params_rf['randomforestclassifier__max_depth'] = [3, 5, 7, 10, 15, 20, 30, 50, 100, None]
params_rf['randomforestclassifier__min_samples_leaf'] = [5, 10, 50, 100, 1000]
params_rf['randomforestclassifier__bootstrap'] = [True, False]
param_by_model[3] = params_rf

params_ab = {}
params_ab['adaboostclassifier__n_estimators'] = [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
params_ab['adaboostclassifier__learning_rate'] = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0]
params_ab['adaboostclassifier__algorithm'] = ['SAMME', 'SAMME.R']
param_by_model[4] = params_ab

In [31]:
# how many scenarios in the grid search

def how_many_scenarios(n_ML):
    """Return how many parameter combinations the grid for model index `n_ML` contains.

    The count is the product of the lengths of every candidate list stored in
    `param_by_model[n_ML]`.
    """
    grid = param_by_model[n_ML]
    total = 1
    for candidates in grid.values():
        total = total * len(candidates)
    return total

for i in range(5):
    print(str(ML_algorithms[i]())[:-2], how_many_scenarios(i))

LogisticRegression 160
DecisionTreeClassifier 160
KNeighborsClassifier 210
RandomForestClassifier 1800
AdaBoostClassifier 162


In [32]:
def fit_predict_metric_model(n_ML, dataset='small', n_cv=5, scoring='roc_auc', undersample=True, cv_res_print=False, heatmap=False):
    """Grid-search one model on one dataset and print its hold-out results.

    Parameters
    ----------
    n_ML : int
        Index into `ML_algorithms` / `param_by_model` selecting the model
        class and its parameter grid.
    dataset : str
        Key into `clusters`; the value is the (X, y) pair to use.
    n_cv : int
        Number of stratified folds used by the grid search.
    scoring : str
        Scoring name passed to `GridSearchCV`.
    undersample : bool
        If True, the pipeline randomly undersamples the majority class before
        the model. If False, models 0, 1 and 3 are built with
        `class_weight='balanced'` and the others with default settings.
    cv_res_print : bool
        If True, print the best parameters and the mean/min/max of the mean
        cross-validation test scores.
    heatmap : bool
        If True, plot the confusion matrix of the hold-out predictions.

    Returns
    -------
    None
        Everything is reported through printed output (and an optional plot).
    """
    model_class = ML_algorithms[n_ML]

    # split the dataset into train test (80/20, stratified on the target)
    X_, y_ = clusters[dataset]
    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42, stratify=y_)

    # params
    param_grid = param_by_model[n_ML]

    # pipeline
    if undersample:
        pipe = make_pipeline(
            RandomUnderSampler(sampling_strategy='majority', random_state=42),
            model_class()
        )
    elif n_ML in [0, 1, 3]:
        # these model indices are constructed with class weighting instead of resampling
        pipe = make_pipeline(model_class(class_weight='balanced'))
    else:
        pipe = make_pipeline(model_class())

    # gridsearch -- the fold shuffle is seeded like the split and the sampler,
    # so repeated runs of the same scenario give the same folds
    cv = StratifiedKFold(n_splits=n_cv, shuffle=True, random_state=42)
    grid = GridSearchCV(
        estimator = pipe,
        param_grid = param_grid,
        cv = cv,
        scoring = scoring,
        n_jobs = -1
    )

    # calculate best parameters
    grid.fit(X_train, y_train)

    # print header -- name the model that was actually fitted (index n_ML)
    if undersample:
        undersample_description = 'Undersampled'
    else:
        undersample_description = 'Full Dataset'
    print(
        '\nScenario\n------------------------------\n', model_class.__name__,
        dataset.title(),
        scoring,
        undersample_description
    )

    if cv_res_print:
        # best params, then mean / min / max of the per-candidate mean CV scores
        mean_scores = grid.cv_results_['mean_test_score']
        cv_results = (
            grid.best_params_,
            mean_scores.mean(),
            mean_scores.min(),
            mean_scores.max()
        )
        print('\nCrossvalidation Results\n------------------------------')
        for i in cv_results:
            print(i)

    # print predictions on the hold-out set
    y_pred = grid.predict(X_test)
    print('\nClassification Report\n------------------------------\n', classification_report(y_test, y_pred))

    if heatmap:
        print('\nConfusion Matrix\n------------------------------')
        plt.subplots(figsize=(6, 6))
        # NOTE(review): the tick labels still say 'Billboard Hit'; confirm they
        # match the target stored in `clusters` before publishing the figure.
        sns.heatmap(confusion_matrix(y_test, y_pred), vmin=0, cmap='Blues', annot=True, fmt='.0f', cbar=False,
                   xticklabels=['Not Popular', 'Billboard Hit'], yticklabels=['Not Popular', 'Billboard Hit'])
        # confusion_matrix(y_true, y_pred) puts actual classes on the rows and
        # predicted classes on the columns
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.show()

In [33]:
# scenarios to check

metrics = [
    'balanced_accuracy', 'average_precision', 'neg_brier_score', 'f1', 'f1_micro', 
    'f1_macro', 'f1_weighted', 'neg_log_loss', 'precision', 'recall', 'roc_auc', 'jaccard'
]

cluster1_keys = [
    'cluster1_0', 'cluster1_1', 'cluster1_2', 'cluster1_3'
]

cluster2_keys = [
    'cluster2_0', 'cluster2_1', 'cluster2_2', 'cluster2_3', 'cluster2_4', 
    'cluster2_5', 'cluster2_6', 'cluster2_7', 'cluster2_8', 'cluster2_9',
]

genre_keys = [
    'Adult_Standard', 'Rock', 'R&B', 'Country', 'Pop', 'Rap', 'Alternative', 'EDM', 'Metal'
]

# Let's try this again with Spotify's Popular metric

##### Logistic Regression

In [35]:
%%time
# undersampled
fit_predict_metric_model(0, dataset='small', n_cv=5, scoring='roc_auc', cv_res_print=False, undersample=True, heatmap=False)


Scenario
------------------------------
 LogisticRegression Small roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.53      0.69      1986
        True       0.01      0.79      0.02        14

    accuracy                           0.54      2000
   macro avg       0.50      0.66      0.36      2000
weighted avg       0.99      0.54      0.69      2000

Wall time: 6.06 s


In [36]:
%%time
# not undersampled
fit_predict_metric_model(0, dataset='small', n_cv=5, scoring='roc_auc', cv_res_print=False, undersample=False, heatmap=False)


Scenario
------------------------------
 LogisticRegression Small roc_auc Full Dataset

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.58      0.74      1986
        True       0.01      0.64      0.02        14

    accuracy                           0.58      2000
   macro avg       0.50      0.61      0.38      2000
weighted avg       0.99      0.58      0.73      2000

Wall time: 17.1 s


In [37]:
%%time
# check metrics
undersample = True

for metric in metrics:
    fit_predict_metric_model(0, dataset='small', n_cv=5, scoring=metric, cv_res_print=False, undersample=undersample, heatmap=False)


Scenario
------------------------------
 LogisticRegression Small balanced_accuracy Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.44      0.61      1986
        True       0.01      0.79      0.02        14

    accuracy                           0.44      2000
   macro avg       0.50      0.61      0.32      2000
weighted avg       0.99      0.44      0.61      2000


Scenario
------------------------------
 LogisticRegression Small average_precision Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.54      0.70      1986
        True       0.01      0.79      0.02        14

    accuracy                           0.54      2000
   macro avg       0.50      0.66      0.36      2000
weighted avg       0.99      0.54      0.70      2000


Scenario
------------------------------

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Scenario
------------------------------
 LogisticRegression Small f1_macro Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       0.99      1.00      1.00      1986
        True       0.00      0.00      0.00        14

    accuracy                           0.99      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.99      0.99      0.99      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Scenario
------------------------------
 LogisticRegression Small f1_weighted Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       0.99      1.00      1.00      1986
        True       0.00      0.00      0.00        14

    accuracy                           0.99      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.99      0.99      0.99      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Scenario
------------------------------
 LogisticRegression Small neg_log_loss Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.36      0.53      1986
        True       0.01      0.79      0.02        14

    accuracy                           0.37      2000
   macro avg       0.50      0.57      0.27      2000
weighted avg       0.99      0.37      0.53      2000


Scenario
------------------------------
 LogisticRegression Small precision Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.53      0.69      1986
        True       0.01      0.79      0.02        14

    accuracy                           0.54      2000
   macro avg       0.50      0.66      0.36      2000
weighted avg       0.99      0.54      0.69      2000


Scenario
------------------------------
 LogisticReg

In [38]:
%%time
# check clusters 1
undersample = True

for key in cluster1_keys:
    fit_predict_metric_model(0, dataset=key, n_cv=5, scoring='roc_auc', cv_res_print=False, undersample=undersample, heatmap=False)


Scenario
------------------------------
 LogisticRegression Cluster1_0 roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.65      0.79    216792
        True       0.01      0.68      0.01       696

    accuracy                           0.65    217488
   macro avg       0.50      0.67      0.40    217488
weighted avg       1.00      0.65      0.78    217488


Scenario
------------------------------
 LogisticRegression Cluster1_1 roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       0.99      0.60      0.75    720840
        True       0.02      0.66      0.04      9088

    accuracy                           0.60    729928
   macro avg       0.51      0.63      0.39    729928
weighted avg       0.98      0.60      0.74    729928


Scenario
------------------------------
 Logistic

In [39]:
%%time
# check clusters 2
undersample = True

for key in cluster2_keys:
    fit_predict_metric_model(0, dataset=key, n_cv=5, scoring='roc_auc', cv_res_print=False, undersample=undersample, heatmap=False)


Scenario
------------------------------
 LogisticRegression Cluster2_0 roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       0.99      0.58      0.73    178093
        True       0.02      0.68      0.03      1857

    accuracy                           0.58    179950
   macro avg       0.51      0.63      0.38    179950
weighted avg       0.98      0.58      0.73    179950


Scenario
------------------------------
 LogisticRegression Cluster2_1 roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.66      0.79    200284
        True       0.01      0.69      0.02      1232

    accuracy                           0.66    201516
   macro avg       0.50      0.68      0.41    201516
weighted avg       0.99      0.66      0.79    201516


Scenario
------------------------------
 Logistic

In [40]:
%%time
# check genres
undersample = True

for key in genre_keys:
    fit_predict_metric_model(0, dataset=key, n_cv=5, scoring='roc_auc', cv_res_print=False, undersample=undersample, heatmap=False)


Scenario
------------------------------
 LogisticRegression Adult_Standard roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       1.00      0.66      0.80     41161
        True       0.02      0.68      0.03       365

    accuracy                           0.66     41526
   macro avg       0.51      0.67      0.42     41526
weighted avg       0.99      0.66      0.79     41526


Scenario
------------------------------
 LogisticRegression Rock roc_auc Undersampled

Classification Report
------------------------------
               precision    recall  f1-score   support

       False       0.99      0.55      0.70    128938
        True       0.03      0.71      0.05      2225

    accuracy                           0.55    131163
   macro avg       0.51      0.63      0.38    131163
weighted avg       0.97      0.55      0.69    131163


Scenario
------------------------------
 LogisticRe

# OLDER CODE: no meaningful results

### Logistic Regression

#### huge loop (all night not nearly enough, better now anyway)

In [None]:
%%time
for key in cluster_keys:
    fit_predict_metric_model(0, dataset=key, n_cv=5, scoring='balanced_accuracy')

In [None]:
%%time
for key in cluster_keys:
    fit_predict_metric_model(0, dataset=key, n_cv=5, scoring='average_precision')

#### tons of scenarios

In [None]:
%%time
fit_predict_metric_model(0, dataset='Adult_Standard', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(0, dataset='Adult_Standard', n_cv=5, scoring='precision')

In [None]:
%%time
fit_predict_metric_model(0, dataset='Adult_Standard', n_cv=5, scoring='balanced_accuracy')

In [None]:
%%time
fit_predict_metric_model(0, dataset='Adult_Standard', n_cv=5, scoring='f1')

In [None]:
%%time
fit_predict_metric_model(0, dataset='cluster1_0', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(0, dataset='cluster1_1', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(0, dataset='cluster1_2', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(0, dataset='cluster1_3', n_cv=5, scoring='roc_auc')

##### Other Models

In [None]:
%%time
fit_predict_metric_model(1, dataset='Adult_Standard', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(2, dataset='Adult_Standard', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(3, dataset='Adult_Standard', n_cv=5, scoring='roc_auc')

In [None]:
%%time
fit_predict_metric_model(4, dataset='Adult_Standard', n_cv=5, scoring='roc_auc')

# EVEN OLDER CODE: no meaningful results

### Which ML models did well with default settings?

In [None]:
with open('default_results.pickle', 'rb') as f:
    default_results = pickle.load(f)

In [None]:
ML_algorithms = [
    LogisticRegression,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    RandomForestClassifier,
    AdaBoostClassifier
]

def default_results_by_metric(class_type='True', metric='f1-score'):
    """Convert the saved default-model results into a readable table.

    Builds one row per class in `ML_algorithms` and one column per entry of
    `cluster_keys`, reading each value from
    `default_results[<model name>][<cluster>][1]`.

    Parameters
    ----------
    class_type : str
        Key selected inside each report entry. When it is 'accuracy' the value
        stored under that key is used directly; otherwise `metric` is looked up
        inside it.
    metric : str
        Metric name read when `class_type` is not 'accuracy'.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Model', one per cluster key, then 'min', 'max' and 'mean'
        computed across the cluster columns only.
    """
    score_columns = list(cluster_keys)
    rows = []

    for algo in ML_algorithms:
        algo_name = algo.__name__
        row = [algo_name]
        for cluster in score_columns:
            report = default_results[algo_name][cluster][1]
            if class_type == 'accuracy':
                row.append(report[class_type])
            else:
                row.append(report[class_type][metric])
        rows.append(row)

    # label the columns with the same keys, in the same order, that produced the values
    df_default_results = pd.DataFrame(rows, columns=['Model'] + score_columns)

    # summarise over the score columns only, so 'max' and 'mean' are not
    # contaminated by the summary columns added just before them
    scores = df_default_results[score_columns]
    df_default_results['min'] = scores.min(axis=1)
    df_default_results['max'] = scores.max(axis=1)
    df_default_results['mean'] = scores.mean(axis=1)

    return df_default_results
    

In [None]:
# best performing classification by cluster = Adult_Standard (1st or 2nd for all ML models)
sortbyfeature = 'LogisticRegression'
pd.DataFrame(default_results_by_metric().iloc[:, :-3].set_index('Model').T).sort_values(sortbyfeature, ascending=False)

In [None]:
# adult standard is just better because of randomness... there are more to start with ...
for cluster in clusters:
    print(cluster, clusters[cluster][1].sum(), clusters[cluster][1].count(), clusters[cluster][1].sum() / clusters[cluster][1].count())

### Logistic Regression

In [None]:
# huge list of orders of magnitude for gridsearch
# formatted a bit funny to avoid rounding errors
orders_of_magnitude = []
for lst in [[int(x)/10000 for x in range(1, 11)],
            [int(x)/1000 for x in range(1, 11)],
            [int(x)/100 for x in range(1, 11)],
            [int(x)/10 for x in range(1, 11)],
            [1 * x for x in range(1, 11)],
            [10 * x for x in range(1, 11)],
            [100 * x for x in range(1, 11)],
            [1000 * x for x in range(1, 11)]]:
    orders_of_magnitude += lst

In [None]:
%%time
# LOGISTIC REGRESSION

# dataset: set am_testing to True to try the tuning setup on the small sample
am_testing = False
X_, y_ = (X_small, y_small) if am_testing else clusters['Adult_Standard']  # best classification results from STEP 5

# search space and scoring
scoring = 'f1'
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': orders_of_magnitude,
}

# undersample the majority class, then fit the classifier
pipe = make_pipeline(
    RandomUnderSampler(sampling_strategy='majority'),
    LogisticRegression(solver='liblinear'),
)

# grid search over shuffled, stratified 10-fold CV
cv = StratifiedKFold(n_splits=10, shuffle=True)
lr_grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1)

# fit every candidate and keep the best
lr_grid.fit(X_, y_)

# best parameters and their mean CV score
lr_grid.best_params_, lr_grid.best_score_

In [None]:
# doesn't seem like tuning is doing anything
(
    lr_grid.cv_results_['mean_test_score'].mean(), 
    lr_grid.cv_results_['mean_test_score'].min(), 
    lr_grid.cv_results_['mean_test_score'].max()
)

In [None]:
%%time
# DECISION TREE

# dataset: set am_testing to True to try the tuning setup on the small sample
am_testing = False
X_, y_ = (X_small, y_small) if am_testing else clusters['Adult_Standard']  # best classification results from STEP 5

# search space and scoring
scoring = 'f1'
param_grid = {
    'decisiontreeclassifier__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 100, None],
    'decisiontreeclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'decisiontreeclassifier__criterion': ['gini', 'entropy'],
}

# undersample the majority class, then fit the classifier
pipe = make_pipeline(
    RandomUnderSampler(sampling_strategy='majority'),
    DecisionTreeClassifier(),
)

# grid search over shuffled, stratified 10-fold CV
cv = StratifiedKFold(n_splits=10, shuffle=True)
dt_grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1)

# fit every candidate and keep the best
dt_grid.fit(X_, y_)

# best parameters and their mean CV score
dt_grid.best_params_, dt_grid.best_score_

In [None]:
# tuning isn't useful here either...
(
    dt_grid.cv_results_['mean_test_score'].mean(), 
    dt_grid.cv_results_['mean_test_score'].min(), 
    dt_grid.cv_results_['mean_test_score'].max()
)

In [None]:
%%time
# KNN

# dataset: set am_testing to True to try the tuning setup on the small sample
am_testing = False
X_, y_ = (X_small, y_small) if am_testing else clusters['Adult_Standard']  # best classification results from STEP 5

# search space and scoring: k = 2..19, then 20..100 in steps of 5
scoring = 'f1'
param_grid = {
    'kneighborsclassifier__n_neighbors': list(range(2, 20)) + list(range(20, 101, 5)),
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__metric': ['minkowski', 'euclidean', 'manhattan'],
}

# undersample the majority class, then fit the classifier
pipe = make_pipeline(
    RandomUnderSampler(sampling_strategy='majority'),
    KNeighborsClassifier(),
)

# grid search over shuffled, stratified 10-fold CV
cv = StratifiedKFold(n_splits=10, shuffle=True)
knn_grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1)

# fit every candidate and keep the best
knn_grid.fit(X_, y_)

# best parameters and their mean CV score
knn_grid.best_params_, knn_grid.best_score_

In [None]:
# check min mean and max
(
    knn_grid.cv_results_['mean_test_score'].mean(), 
    knn_grid.cv_results_['mean_test_score'].min(), 
    knn_grid.cv_results_['mean_test_score'].max()
)

In [None]:
%%time
# random forest

# dataset: set am_testing to True to try the tuning setup on the small sample
am_testing = False
X_, y_ = (X_small, y_small) if am_testing else clusters['Adult_Standard']  # best classification results from STEP 5

# search space and scoring
scoring = 'f1'
param_grid = {
    'randomforestclassifier__n_estimators': [5, 10, 20, 50, 100, 200, 500, 1000, 2000],
    'randomforestclassifier__max_features': ['sqrt', 'log2'],
    'randomforestclassifier__max_depth': [3, 5, 7, 10, 15, 20, 30, 50, 100, None],
    'randomforestclassifier__min_samples_leaf': [5, 10, 50, 100, 1000],
    'randomforestclassifier__bootstrap': [True, False],
}

# undersample the majority class, then fit the classifier
pipe = make_pipeline(
    RandomUnderSampler(sampling_strategy='majority'),
    RandomForestClassifier(),
)

# grid search over shuffled, stratified 10-fold CV
cv = StratifiedKFold(n_splits=10, shuffle=True)
rf_grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1)

# fit every candidate and keep the best
rf_grid.fit(X_, y_)

# best parameters and their mean CV score
rf_grid.best_params_, rf_grid.best_score_

In [None]:
# also didn't do great...
(
    rf_grid.cv_results_['mean_test_score'].mean(), 
    rf_grid.cv_results_['mean_test_score'].min(), 
    rf_grid.cv_results_['mean_test_score'].max()
)

In [None]:
%%time
# adaboost

# choose dataset
am_testing = False  # test tuning setup with small dataset
if am_testing:
    X_, y_ = X_small, y_small
else:
    X_, y_ = clusters['Adult_Standard']  # has the best classification results from STEP 5

# params -- keys are '<pipeline step>__<estimator parameter>'; the AdaBoost
# parameters are `learning_rate` and `algorithm` (not `n_learning_rate` /
# `n_algorithm`, which GridSearchCV rejects as invalid parameters)
scoring = 'f1'
param_grid = {}
param_grid['adaboostclassifier__n_estimators'] = [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
param_grid['adaboostclassifier__learning_rate'] = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0]
# NOTE(review): newer scikit-learn releases deprecate 'SAMME.R' -- confirm against the installed version
param_grid['adaboostclassifier__algorithm'] = ['SAMME', 'SAMME.R']

# pipeline: undersample the majority class, then fit the classifier
pipe = make_pipeline(
    RandomUnderSampler(sampling_strategy='majority'),
    AdaBoostClassifier()
)

# gridsearch over shuffled, stratified 10-fold CV
cv = StratifiedKFold(n_splits=10, shuffle=True)
ab_grid = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    cv = cv,
    scoring = scoring,
    n_jobs = -1
)

# calculate best parameters
ab_grid.fit(X_, y_)

# results
ab_grid.best_params_, ab_grid.best_score_

In [None]:
# check min mean and max
(
    ab_grid.cv_results_['mean_test_score'].mean(), 
    ab_grid.cv_results_['mean_test_score'].min(), 
    ab_grid.cv_results_['mean_test_score'].max()
)

### Test Ensemble Methods