In [None]:
# Reload edited modules automatically before each cell executes, so changes to
# local helper modules (e.g. data_prep) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures as interactive widgets (uses the ipympl backend).
%matplotlib widget

# Explicit Classification
In this notebook, I attempt to classify whether songs are explicit or not using the Spotify dataset.

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from data_prep import DataPrep

In [None]:
# Read the raw Spotify export, then show its (rows, columns) shape and first rows.
df = pd.read_csv('spotify.csv')
print(df.shape)
df.head(n=5)

### Use DataPrep class to vectorise all data

In [None]:
dp = DataPrep(df)

# Columns removed from the frame before any encoding takes place.
columns_to_drop = [
    'Artist URI(s)', 'Album URI', 'Album Artist URI(s)', 'Album Image URL',
    'Disc Number', 'Track Preview URL', 'ISRC', 'Added By', 'Added At',
    'Copyrights', 'Album Genres',
]

# Categorical columns, i.e. the ones mapped to integers.
cat_columns = ['Artist Name(s)', 'Label', 'Decade', 'Key']

dp.prepare_data(
    drop_columns=columns_to_drop,
    cat_columns=cat_columns,
    text_columns=['Track Name', 'Album Name'],  # columns to encode using sent2vec
    n_components_text=2,                        # number of features to reduce text columns down to
)

In [None]:
# `data` is the frame held by the DataPrep instance after prepare_data();
# every later cell works from it.
data = dp.df
print(data.shape)
data.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the numeric columns of the prepared frame.
# `np` comes from the notebook's top import cell.
corr = dp.df.select_dtypes(np.number).corr()

fig,ax = plt.subplots(figsize=(12,10))
# Draw on the axes created above rather than relying on pyplot's "current axes".
sns.heatmap(corr, cmap='coolwarm', annot=False, fmt=".1f", linewidths=0.01, ax=ax)
# Derive the counts from the data so the title cannot drift out of date when the
# dataset or the preprocessing changes.
ax.set_title(f'Correlation Matrix of {len(dp.df)} Samples with {corr.shape[0]} Features')
fig.tight_layout()

### Quick example of classification

First, let's define all the different types of features and split them into lists, so that different feature groups can eventually be compared.

In [None]:
from cross_validation import *
from data_loading import *

In [None]:
variable = 'Explicit' #define the variable to classify

# Numeric / audio-feature columns. Each name appears once; the target is filtered
# out on the next line so that changing `variable` keeps this list valid.
# NOTE(review): 'Key' is also listed in cat_columns -- confirm which group it belongs to.
float_columns = ['Popularity','Track Duration (ms)', 'Explicit', 'Danceability', 'Energy', 'Key', 'Loudness',
                 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness','Valence', 'Tempo', 'Time Signature']
float_columns = [i for i in float_columns if i!=variable]

# Column groups picked out of the prepared frame by substring match on the column name.
genre_columns = [i for i in data.columns if 'Genre' in i and i!=variable]
# Explicit `and`: the previous `'Album Artist' in i!=variable` was a chained
# comparison that only happened to evaluate the same way.
album_columns = [i for i in data.columns if 'Album Artist' in i and i!=variable]
cat_columns   = [i for i in cat_columns if i!=variable]
text_columns  = [i for i in data.columns if 'Album Name' in i or 'Track Name' in i]

Define some train test splits

In [None]:
# A single stratified split on the target column, requested with test_size=0.20.
train_test_splits = create_train_test_splits(
    data,
    stratified=True,
    dependent_column=variable,
    n_splits=1,
    test_size=0.20,
)

Now create train data and test data for this split

In [None]:
# Materialise the feature/label arrays for the first (and only) split,
# with normalisation switched on.
(train_x, train_y), (test_x, test_y) = get_train_and_test_set(
    data,
    train_test_splits[0],
    dependent_column=variable,
    normalise=True,
)

# Sanity check: shapes of the train pair, then of the test pair.
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

Now train a simple model using this data

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Baseline model: fit and predict. A fixed random_state makes the fitted tree --
# and therefore the feature importances read from `model` further down the
# notebook -- reproducible across kernel restarts.
model = DecisionTreeClassifier(random_state=42)
model.fit(train_x,train_y)
predict = model.predict(test_x)

# Macro averaging computes each metric per class and takes the unweighted mean.
print('acc:',accuracy_score(test_y,predict))
print('f1:',f1_score(test_y,predict,average='macro'))
print('precision:',precision_score(test_y,predict,average='macro'))
print('recall:',recall_score(test_y,predict,average='macro'))

Let's run cross-validation to get a better understanding and compare several models.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier
from xgboost import XGBClassifier

In [None]:
# Model classes to compare (passed as classes, not instances) and the
# normalisation settings each one is evaluated with.
architectures = [
    XGBClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
]
normalise = [True, False]

In [None]:
# Ten stratified splits on the target column for cross-validation,
# each requested with test_size=0.20.
train_test_splits = create_train_test_splits(
    data,
    stratified=True,
    dependent_column=variable,
    n_splits=10,
    test_size=0.20,
)

In [None]:
# Evaluate every (architecture, normalisation) pairing on the same set of splits.
reports = []
for arch in tqdm(architectures):
    for norm in normalise:
        # train the same architecture n_splits times and average the results
        reports.append(
            perform_cross_validation(
                dataset=data,
                architecture=arch,
                splits=train_test_splits,
                dependent_column=variable,
                normalise=norm,
            )
        )

# One row per (architecture, normalisation) pairing.
results = pd.DataFrame(reports)

`results` is a dataframe containing the classification metrics. Essentially, the models perform largely the same regardless of normalisation. Explicit is a difficult field to predict.

In [None]:
# Rank the configurations by mean F1 (best first) and keep only the summary columns.
results_copy = results.copy().sort_values(by='f1 mean',ascending=False)
results_copy = results_copy[['arch','normalise','f1 mean', 'f1 std', 'acc mean', 'acc std']]

# Collapse each mean/std pair into a single "mean ± std" string.
results_copy['f1'] = [
    f'{mean:.3f} ± {std:.3f}'
    for mean, std in zip(results_copy['f1 mean'], results_copy['f1 std'])
]
results_copy['acc'] = [
    f'{mean:.3f} ± {std:.3f}'
    for mean, std in zip(results_copy['acc mean'], results_copy['acc std'])
]

# Show the five best rows without the raw mean/std columns
# (results_copy itself keeps them).
(
    results_copy
    .drop(columns=['f1 mean', 'f1 std', 'acc mean', 'acc std'])
    .reset_index(drop=True)
    .head()
)

### Varying the Features
It could be that some features hurt the classification. Let's look at the feature importances of the original decision tree.

In [None]:
import matplotlib.pyplot as plt

# Feature names are reconstructed from the frame's columns minus the target and
# the 'Track URI' identifier.
# NOTE(review): this assumes get_train_and_test_set keeps exactly these columns,
# in this order -- the length check below catches a mismatch that would
# otherwise mislabel the plot.
names  = [i for i in data.columns if i not in [variable,'Track URI']]
imp    = model.feature_importances_
assert len(names) == len(imp), (
    f'{len(names)} feature names but {len(imp)} importances: '
    'names no longer line up with the training matrix'
)

# Sort by importance (largest first) and keep the 20 best; `imp` and `names`
# are deliberately overwritten with the truncated, sorted versions.
asort  = imp.argsort()[::-1]
imp    = imp[asort][:20]
names  = np.array(names)[asort][:20]

fig,ax = plt.subplots(figsize=(5,3))
ax.scatter(np.arange(1,len(imp)+1),imp,zorder=10,edgecolor='k',s=50)
ax.set_xticks(np.arange(1,len(imp)+1))
ax.set_xticklabels(names,rotation=90)
ax.set_ylabel('Importance')
# Title follows the configured target and the number of points actually drawn.
ax.set_title(f'Top {len(imp)} most important features for "{variable}" Classification')
ax.grid(zorder=-1)
fig.tight_layout()
fig.set_dpi(150)

Select the top-20 features

In [None]:
# Rank features straight from the fitted tree instead of re-sorting the `imp` /
# `names` arrays that the plotting cell overwrote: this cell now gives the same
# selection regardless of whether (or how often) the plot cell was run.
# NOTE(review): assumes the training matrix columns are data.columns minus the
# target and 'Track URI', in that order -- confirm against get_train_and_test_set.
feature_names = [i for i in data.columns if i not in [variable,'Track URI']]
top20       = np.argsort(model.feature_importances_)[::-1][:20]
cols        = [feature_names[i] for i in top20] + [variable,'Track URI']
data_top20  = dp.df[cols]

# NOTE(review): test_size is 0.50 here but 0.20 for the all-features run above,
# so the two result tables are not directly comparable -- confirm this is intended.
# NOTE(review): the features were chosen with a tree fitted on one training split,
# so rows in these test folds may have influenced the selection.
train_test_splits = create_train_test_splits(data, stratified=True,dependent_column=variable,
                                             n_splits=10,test_size=0.50)

In [None]:
# Same architecture x normalisation grid as before, restricted to the top-20 features.
top20_reports = []
for arch in tqdm(architectures):
    for norm in normalise:
        # train the same architecture n_splits times and average the results
        top20_reports.append(
            perform_cross_validation(
                dataset=data_top20,
                architecture=arch,
                splits=train_test_splits,
                dependent_column=variable,
                normalise=norm,
            )
        )

# One row per (architecture, normalisation) pairing.
top20_results = pd.DataFrame(top20_reports)

In [None]:
# Display the cross-validation metrics for the top-20-feature runs
# (one row per architecture / normalisation pairing).
top20_results

### Confusion Matrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the 'cm mean' entry of the first row of `results`, i.e. the first
# configuration evaluated in the all-features cross-validation loop.
fig, ax = plt.subplots(figsize=(3, 3))
cm = ConfusionMatrixDisplay(results['cm mean'].iloc[0])
cm.plot(ax=ax)
fig.tight_layout()