In [1]:
import pickle

# math and dataframes
import pandas as pd
import numpy as np

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` is deprecated and breaks on newer IPython.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: 3-decimal floats, never truncate columns, show up to 200 rows
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [2]:
# Load, in order: the backup predictions, the current predictions (per the file
# names), and the frame that X and y are sliced from further down.
df_DEFAULT1, df_DEFAULT2, X_all = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'df_predictions_DEFAULT_BACKUP.pickle',
        'df_predictions_DEFAULT.pickle',
        'X_clustered.pickle',
    )
)

# QA: re-run defaults and make sure all predictions are identical / consistent

In [15]:
# Put the two prediction frames side by side so they can be compared column by column.
# The right-hand frame must be df_DEFAULT2: concatenating df_DEFAULT1 with itself
# would make every comparison pass trivially.
# NOTE(review): axis=1 pairs rows by index label — assumes both frames share the
# same index; confirm.
columns = ['id', 'song', 'y_lr', 'y_dt', 'y_knn', 'y_rf', 'y_ab']
df_CHECK = pd.concat([df_DEFAULT1[columns], df_DEFAULT2[columns]], axis=1)

In [18]:
# Relabel the 14 concatenated columns: the first seven keep their names, the
# second seven get a '2' suffix so each pair can be addressed separately.
first_run = ['id', 'song', 'y_lr', 'y_dt', 'y_knn', 'y_rf', 'y_ab']
second_run = [name + '2' for name in first_run]
columns = first_run + second_run
df_CHECK.columns = columns

In [20]:
# Spot-check one column pair before looping over all of them.
# col1/col2 are set explicitly here so the cell runs on a fresh kernel instead of
# relying on values left behind by the loop in a later cell.
col1, col2 = 'id', 'id2'
df_CHECK[df_CHECK[col1] != df_CHECK[col2]].shape[0]

(0, 14)

In [23]:
# For each column of the first half, count the rows where it disagrees with its
# '2'-suffixed counterpart.
n_pairs = len(columns) // 2
for col in columns[:n_pairs]:
    col1, col2 = col, str(col) + '2'
    # a null on either side compares as "different", so nulls are counted here too
    differs = df_CHECK[col1] != df_CHECK[col2]
    print(col + ':', int(differs.sum()))

id: 0
song: 3777
y_lr: 0
y_dt: 0
y_knn: 0
y_rf: 0
y_ab: 0


In [26]:
# `!=` treats a null as different from everything, including another null, so
# exclude only the rows where BOTH sides are null. Filtering on `song` alone would
# also hide a row whose `song` is null but whose `song2` is populated.
song_differs = df_CHECK['song'] != df_CHECK['song2']
both_null = df_CHECK['song'].isnull() & df_CHECK['song2'].isnull()
df_CHECK[song_differs & ~both_null].shape[0]

0

### THEREFORE: all of the runs match without errors, aligned data via matched seeds FTW

# CHECK: prediction times for already trained models

### NOTE: these predictions are made on the training set, so they say nothing about model quality; they are only here to measure how long prediction takes

In [27]:
# Column-name groups for the dataset

# target column (used as y below)
y_column = 'in_B100'

# feature columns (used as X below)
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# remaining groups are not used in this section
# presumably per-genre indicator flags, judging by the names — TODO confirm
genre_columns = [
    'is_Adult_Standard',
    'is_Rock',
    'is_R&B',
    'is_Country',
    'is_Pop',
    'is_Rap',
    'is_Alternative',
    'is_EDM',
    'is_Metal',
]
cluster_columns = [
    'cluster',
    'cluster2',
]
other_columns = [
    'key',
    'time_signature',
    'genre',
    'release_date',
]

In [29]:
# Objects shared by every model below: one seeded undersampler, and the
# feature frame / target series sliced out of X_all.
undersampler = RandomUnderSampler(random_state=42, sampling_strategy='majority')
X = X_all[X_columns]
y = X_all[y_column]

### Logistic Regression

In [34]:
# Train: shared undersampler followed by a default logistic regression
lr_model = LogisticRegression()
lr_pipe = make_pipeline(undersampler, lr_model)
lr_pipe.fit(X, y)

In [36]:
%%time
# PREDICT
# Only the predict call is timed; the pipeline was fit in the previous cell.
# X is the same frame the pipeline was fit on, so this measures speed, not accuracy.
lr_predictions = lr_pipe.predict(X)

Wall time: 811 ms


### Decision Tree

In [40]:
# Train
# random_state pins the tree's internal randomness so re-running this cell builds
# the same tree (42 matches the seed already used for the undersampler).
dt_pipe = make_pipeline(undersampler, DecisionTreeClassifier(random_state=42))
dt_pipe.fit(X, y)

In [41]:
%%time
# PREDICT
# Only the predict call is timed; the pipeline was fit in the previous cell.
# X is the same frame the pipeline was fit on, so this measures speed, not accuracy.
dt_predictions = dt_pipe.predict(X)

Wall time: 1.62 s


### KNN
* actually longer than training and testing
* maybe partitioning speeds up predictions somehow
  * seems more likely that this is just normal variation
    * assume ~100% of time is taken to predict (because not undersampled)

In [42]:
# Train: shared undersampler followed by a default k-nearest-neighbours classifier
knn_model = KNeighborsClassifier()
knn_pipe = make_pipeline(undersampler, knn_model)
knn_pipe.fit(X, y)

In [43]:
%%time
# PREDICT
# Only the predict call is timed; the pipeline was fit in the previous cell.
# X is the same frame the pipeline was fit on, so this measures speed, not accuracy.
knn_predictions = knn_pipe.predict(X)

Wall time: 20min 43s


### Random Forest

In [44]:
# Train
# random_state pins the forest's bootstrap and feature sampling so re-running this
# cell builds the same model (42 matches the seed already used for the undersampler).
rf_pipe = make_pipeline(undersampler, RandomForestClassifier(random_state=42))
rf_pipe.fit(X, y)

In [45]:
%%time
# PREDICT
# Only the predict call is timed; the pipeline was fit in the previous cell.
# X is the same frame the pipeline was fit on, so this measures speed, not accuracy.
rf_predictions = rf_pipe.predict(X)

Wall time: 2min 9s


### AdaBoost

In [46]:
# Train
# random_state pins the randomness used while boosting so re-running this cell
# builds the same model (42 matches the seed already used for the undersampler).
ab_pipe = make_pipeline(undersampler, AdaBoostClassifier(random_state=42))
ab_pipe.fit(X, y)

In [47]:
%%time
# PREDICT
# Only the predict call is timed; the pipeline was fit in the previous cell.
# X is the same frame the pipeline was fit on, so this measures speed, not accuracy.
ab_predictions = ab_pipe.predict(X)

Wall time: 49.5 s
