# Imports and Notebook Setup

In [1]:
# math and dataframes
import pandas as pd
import numpy as np

# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier

# Pipeline and Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import plot_tree

# jupyter notebook full-width display
# `display` and `HTML` are imported from the public `IPython.display` module:
# the old `IPython.core.display` path is deprecated and newer IPython
# releases no longer export `display` from it.
from IPython.display import display, HTML
# Inject a CSS rule that stretches elements with class `container` to the
# full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display preferences, set through the `pd.options` attribute interface
pd.options.display.float_format = '{:.3f}'.format  # render floats with 3 decimals
pd.options.display.max_columns = None              # never elide columns
pd.options.display.max_rows = 200                  # show up to 200 rows before truncating

# plotting (the stdlib `time` module is imported in this group as well)
import matplotlib.pyplot as plt
import matplotlib as mpl
import time
import seaborn as sns
# activate seaborn's theme; called with no arguments, so the library defaults apply
sns.set_theme()

In [7]:
# Load the two pickled frames this notebook works from, in a fixed order.
# NOTE(review): unpickling executes arbitrary code, so only load pickle
# files that were produced by a trusted step of this project.
# The paths are relative to the notebook's working directory.
df_10M, X_all = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('df_10M_clustered.pickle', 'X_clustered.pickle')
)

# Create Datasets for Prediction

In [8]:
# name of the target column
y_column = 'in_B100'

# feature columns that are handed to the models
X_columns = [
    'mode',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# one indicator column per genre; the text after the 'is_' prefix is the genre name
genre_columns = [
    'is_Adult_Standard',
    'is_Rock',
    'is_R&B',
    'is_Country',
    'is_Pop',
    'is_Rap',
    'is_Alternative',
    'is_EDM',
    'is_Metal',
]

# columns holding the two cluster labellings
cluster_columns = [
    'cluster',
    'cluster2',
]

# remaining columns that belong to none of the groups above
other_columns = [
    'key',
    'time_signature',
    'genre',
    'release_date',
]


In [45]:
def _target_and_features(subset):
    """Split a slice of ``X_all`` into its ``(y, X)`` modelling pair.

    Parameters
    ----------
    subset : pandas.DataFrame
        Rows to model; must contain ``y_column`` and every name in
        ``X_columns``.

    Returns
    -------
    tuple
        ``(subset[y_column], subset[X_columns])``.
    """
    return subset[y_column], subset[X_columns]


# create a dict with all 'name': (y, X) key match pairs
clusters = {}

# entire predictive dataset
clusters['All'] = _target_and_features(X_all)

# add genres: each is_<Genre> column is used directly as a row mask
# (assumes these columns are boolean -- TODO confirm against the pickle)
for genre in genre_columns:
    title = genre[3:]  # drop the 'is_' prefix to get the dictionary key
    clusters[title] = _target_and_features(X_all[X_all[genre]])

# add clusters: one entry per distinct label of each clustering column.
# Each subset is filtered once and then split into y and X; the stale
# `title = genre[3:]` assignments that were copy-pasted into these loops
# are gone, so they no longer depend on a loop variable leaked from above.
for prefix, column in (('cluster1_', 'cluster'), ('cluster2_', 'cluster2')):
    for n in sorted(X_all[column].unique()):
        clusters[prefix + str(n)] = _target_and_features(X_all[X_all[column] == n])

In [48]:
# preview the feature frame (X, the second element of the (y, X) tuple) of the full dataset
clusters['All'][1].head()

Unnamed: 0_level_0,mode,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6SFc2WVQmARn6NDS3LrTR8,1,0.007,0.728,0.817,0.747,0.434,0.113,0.573,0.084,0.436,0.284
6NMSTM4UQMC5emaYKIueyc,0,0.027,0.703,0.736,0.763,0.847,0.103,0.619,0.051,0.418,0.336
1FqHPzJuRdupHbcw09tYUa,1,0.01,0.812,0.402,0.776,0.071,0.074,0.67,0.039,0.4,0.912
7y8aVfDkqt6qirGNivvs0M,0,0.003,0.702,0.34,0.851,0.0,0.322,0.706,0.041,0.395,0.87
70w8loBbdl4qZOH2brrqKF,0,0.0,0.786,0.587,0.751,0.446,0.053,0.664,0.078,0.418,0.651


In [46]:
# names of every dataset registered in `clusters`, in insertion order
list(clusters)

['All',
 'Adult_Standard',
 'Rock',
 'R&B',
 'Country',
 'Pop',
 'Rap',
 'Alternative',
 'EDM',
 'Metal',
 'cluster1_0',
 'cluster1_1',
 'cluster1_2',
 'cluster1_3',
 'cluster2_0',
 'cluster2_1',
 'cluster2_2',
 'cluster2_3',
 'cluster2_4',
 'cluster2_5',
 'cluster2_6',
 'cluster2_7',
 'cluster2_8',
 'cluster2_9']