# Tenor saxophone

* Sample duration = 5s
* model = SVM
* target = TENOR

Distinguish samples that contain tenor saxophone from those that do not. Samples containing other brass and reed instruments are excluded. Start with a reduced feature set, then apply PCA.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from scipy.stats import uniform

from pickles_to_pandas import pickles_to_pandas


In [3]:
# Load the labeled 5-second feature records into one DataFrame via the
# project helper.
features_path = './data/5s/labeled/features_r02'
df = pickles_to_pandas(features_path)

In [6]:
# Reproducibility / evaluation settings used by train_test_split.
RANDOM_SEED = 0
TEST_SIZE = 0.2

# Number of principal components kept by PCA.
NUM_COMPONENTS = 34

# Count of label columns; subtracted from the column count when the feature
# columns are sliced out positionally.
NUM_LABEL_COLS = 13

In [7]:
# Keep only the rows whose 'excl' flag is the string '0'.
not_excluded = df['excl'] == '0'
df_filtered = df[not_excluded]
df_filtered.shape

(5411, 10790)

In [8]:
# Keep only the columns whose name matches one of the alternatives below.
# DataFrame.filter(regex=...) uses a substring search, and a trailing `_*`
# means "zero or more underscores", so e.g. `mfcc_*` keeps every column whose
# name contains "mfcc".
regx = 'filename|zeros|mfcc_*|spc_*|spr_*|spf_*|sop|alto|tenr|tora|bari|clrt|othr|trmp|trmb|otrb|ext|iowa'
df_filtered = df_filtered.filter(regex=regx)

In [9]:
# Keep only rows where every other instrument label column is the string '0'
# (the notebook header describes these as the other brass and reed
# instruments to exclude).
other_instruments = ['alto', 'sop', 'bari', 'clrt', 'tora',
                     'othr', 'trmp', 'trmb', 'otrb']
for label in other_instruments:
    df_filtered = df_filtered[df_filtered[label] == '0']

# Feature columns are taken positionally: column 0 is skipped and the slice
# leaves out the last NUM_LABEL_COLS + 1 columns.
# NOTE(review): iloc's stop index is already exclusive, so the extra "- 1"
# omits one more trailing column than NUM_LABEL_COLS alone would -- confirm
# this matches the actual column layout.
num_x_cols = df_filtered.shape[1] - NUM_LABEL_COLS - 1
data = df_filtered.iloc[:, 1:num_x_cols].to_numpy()

# Standardize every feature column.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
data = scaler.fit_transform(data)

# The label: the 'tenr' column as a flat 1-D array.
target = df_filtered['tenr'].to_numpy()

print(data.shape)
print(target.shape)

(3483, 9912)
(3483,)


In [10]:
# Configure PCA to keep NUM_COMPONENTS principal components.
pca = PCA(n_components=NUM_COMPONENTS)

In [11]:
# Learn the principal axes from the standardized feature matrix.
# NOTE(review): this fits on every row, before the train/test split, so the
# held-out rows influence the projection -- consider fitting on the training
# rows only.
pca.fit(data)

PCA(n_components=34)

In [12]:
# print(pca.explained_variance_ratio_)

In [13]:
# print(pca.singular_values_)

In [14]:
# Project the standardized features onto the fitted principal components.
d = pca.transform(data)

In [15]:
# d

In [16]:
# Hold out a TEST_SIZE fraction of the PCA-projected samples for evaluation;
# the fixed seed keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    d, target, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

## Test plain vanilla SVM

In [17]:
# Baseline: SVC with all default hyperparameters.
clf = SVC().fit(x_train, y_train)

# Report held-out accuracy, and keep the predictions for the confusion
# matrix and classification report that follow.
print(clf.score(x_test, y_test))
y_pred = clf.predict(x_test)

0.8852223816355811


In [18]:
# Confusion matrix for the baseline SVC; scikit-learn puts true classes on
# the rows and predicted classes on the columns.
idx = cols = [0, 1]
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
pd.DataFrame(data=c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,348,40
1,40,269


In [19]:
# Per-class precision / recall / F1 for the baseline SVC.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       388
           1       0.87      0.87      0.87       309

    accuracy                           0.89       697
   macro avg       0.88      0.88      0.88       697
weighted avg       0.89      0.89      0.89       697



## Determine good value for C using random search

In [20]:
# Search space for the SVM regularization strength C.  scipy's
# uniform(loc, scale) spans [loc, loc + scale], so candidates are drawn
# from [0.1, 3.1].
model_params = {
    'C': uniform(0.1, 3.0)
}

# Randomized search: 10 sampled values of C, each scored with 5-fold
# cross-validation on the training split.  The sampler is seeded so the
# selected C -- and every score derived from it afterwards -- is reproducible
# between runs, consistent with the seeded train/test split.
svm_model = SVC()
clf = RandomizedSearchCV(svm_model, model_params, n_iter=10,
                         cv=5, random_state=RANDOM_SEED)
model = clf.fit(x_train, y_train)

# Full parameter dict of the best estimator; params['C'] is reused later.
params = model.best_estimator_.get_params()

In [22]:
# Train a fresh SVC with the C chosen by the randomized search.
best_c = params['C']
clf = SVC(C=best_c).fit(x_train, y_train)

# Report held-out accuracy, and keep the predictions for the confusion
# matrix and classification report that follow.
print(clf.score(x_test, y_test))
y_pred = clf.predict(x_test)


0.890961262553802


In [23]:
# Confusion matrix for the tuned SVC; scikit-learn puts true classes on the
# rows and predicted classes on the columns.
idx = cols = [0, 1]
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
pd.DataFrame(data=c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,352,36
1,40,269


In [24]:
# Per-class precision / recall / F1 for the tuned SVC.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       388
           1       0.88      0.87      0.88       309

    accuracy                           0.89       697
   macro avg       0.89      0.89      0.89       697
weighted avg       0.89      0.89      0.89       697



## Test ensemble approach with bagging classifier

In [25]:
# Ensemble of 400 SVCs (each using the tuned C), every one trained on a
# random subset of NUM_COMPONENTS - 4 of the principal components; the
# ensemble's own randomness is seeded with 42.
#
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, so the positional form is the one spelling
# that works on both sides of the rename.
clf = BaggingClassifier(SVC(C=params['C']),
                        n_estimators=400,
                        max_features=NUM_COMPONENTS - 4,
                        random_state=42)
model = clf.fit(x_train, y_train)

In [26]:
# Report the bagged ensemble's held-out accuracy, and keep its predictions
# for the confusion matrix and classification report that follow.
print(clf.score(x_test, y_test))
y_pred = clf.predict(x_test)

0.8938307030129125


In [27]:
# Confusion matrix for the bagged ensemble; scikit-learn puts true classes on
# the rows and predicted classes on the columns.
idx = cols = [0, 1]
c_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
pd.DataFrame(data=c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,352,36
1,38,271


In [28]:
# Per-class precision / recall / F1 for the bagged ensemble.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       388
           1       0.88      0.88      0.88       309

    accuracy                           0.89       697
   macro avg       0.89      0.89      0.89       697
weighted avg       0.89      0.89      0.89       697

