# Tenor saxophone

* Sample duration = 5s
* model = SVM
* target = TENOR

Distinguish between samples with tenor saxophone from those without tenor saxophone. Other brass and reed instruments excluded.

In [32]:
from IPython.display import Audio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, \
                                    RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from scipy.stats import uniform

In [61]:
df = pd.read_pickle('./data/5s/labeled/features_r03/all_data.pkl')

In [62]:
NUM_LABEL_COLS = 13
NUM_COMPONENTS = 32
TEST_SIZE = 0.2
RANDOM_SEED = 0

In [63]:
# df_filtered = df[df['excl'] == '0']  # exclude records we want to exclude
df_filtered = df
df_filtered.shape

(8436, 10790)

In [64]:
# exclude records we want to exclude
df_filtered = df_filtered[df_filtered['alto'] == '0'] 
df_filtered = df_filtered[df_filtered['sop'] == '0']
df_filtered = df_filtered[df_filtered['bari'] == '0']
df_filtered = df_filtered[df_filtered['clrt'] == '0']
df_filtered = df_filtered[df_filtered['tora'] == '0']
df_filtered = df_filtered[df_filtered['othr'] == '0']
df_filtered = df_filtered[df_filtered['trmp'] == '0']
df_filtered = df_filtered[df_filtered['trmb'] == '0']
df_filtered = df_filtered[df_filtered['otrb'] == '0']

num_x_cols = df_filtered.shape[1] - NUM_LABEL_COLS - 1  
# ^ last bit to adjust for zero indexing
data = df_filtered.iloc[:, 1:num_x_cols].to_numpy() 
# ^ These are the features
scaler = StandardScaler()
scaler.fit(data)
data = scaler.transform(data)

target = df_filtered[['tenr']].to_numpy().ravel()  # << This is the label

print(data.shape)
print(target.shape)

(5827, 10775)
(5827,)


In [65]:
pca = PCA(n_components=NUM_COMPONENTS)

In [66]:
pca.fit(data)

PCA(n_components=32)

In [67]:
# print(pca.explained_variance_ratio_)

In [68]:
# print(pca.singular_values_)

In [69]:
d = pca.transform(data)

In [70]:
# d

In [71]:
x_train, x_test, y_train, y_test = \
     train_test_split(d, target, test_size=TEST_SIZE, random_state=RANDOM_SEED)

## Test plain vanilla SVM

In [72]:
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8670668953687821


In [73]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,515,79
1,76,496


In [74]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       594
           1       0.86      0.87      0.86       572

    accuracy                           0.87      1166
   macro avg       0.87      0.87      0.87      1166
weighted avg       0.87      0.87      0.87      1166



## Determine good value for C using random search

In [18]:
model_params = {
    'C': uniform(0.1, 4.0)
}

# RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, 
#                    scoring=None, n_jobs=None, iid='deprecated', 
#                    refit=True, cv=None, verbose=0, 
#                    pre_dispatch='2*n_jobs', random_state=None, 
#                    error_score=nan, return_train_score=False)
svm_model = SVC()
clf = RandomizedSearchCV(svm_model, model_params, n_iter=10, 
                         cv=5)
model = clf.fit(x_train, y_train)
params = model.best_estimator_.get_params()

In [22]:
params

{'C': 3.523320111823472,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [19]:
clf = SVC(C=params['C'])
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))


0.896414342629482


In [20]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,443,53
1,51,457


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       496
           1       0.90      0.90      0.90       508

    accuracy                           0.90      1004
   macro avg       0.90      0.90      0.90      1004
weighted avg       0.90      0.90      0.90      1004



## Test ensemble approach with bagging classifier

#### Determine good parameters to use with bagging classifier. Start with grid search.

In [25]:
# Parameters
p = {'n_estimators': [100, 200, 300, 400, 500],
     'max_features': [NUM_COMPONENTS - 2, NUM_COMPONENTS - 3, 
                      NUM_COMPONENTS - 4, NUM_COMPONENTS - 5,
                      NUM_COMPONENTS - 6, NUM_COMPONENTS - 7]}

base_estimator = SVC(C=params['C'])
estimator = BaggingClassifier(base_estimator=base_estimator)
# Question: Should we include C here, or ... ? There's no reason it should differ
# for different values of n_estimators, but what about max_features? Might different
# values for C be best for different max_features? Maybe. Dunno. Maybe we should
# search for optimal C *after* we've found the best value for max_features?

# GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, 
#              iid='deprecated', refit=True, cv=None, verbose=0, 
#              pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
clf = GridSearchCV(estimator, p, n_jobs=-1)
search = clf.fit(x_train, y_train)
search.best_params_

# {'max_features': 26, 'n_estimators': 200}

{'max_features': 26, 'n_estimators': 200}

#### Run bagging classifier with good params

In [26]:
# BaggingClassifier(base_estimator=None, n_estimators=10, *, 
#                   max_samples=1.0, max_features=1.0, 
#                   bootstrap=True, bootstrap_features=False, 
#                   oob_score=False, warm_start=False, 
#                   n_jobs=None, random_state=None, verbose=0) 
clf = BaggingClassifier(base_estimator=SVC(C=params['C']),
                        n_estimators=search.best_params_['n_estimators'], 
                        max_features=search.best_params_['max_features'],
                        random_state=RANDOM_SEED,
                        n_jobs=-1)
model = clf.fit(x_train, y_train)

In [27]:
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.896414342629482


In [28]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,441,55
1,49,459


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       496
           1       0.89      0.90      0.90       508

    accuracy                           0.90      1004
   macro avg       0.90      0.90      0.90      1004
weighted avg       0.90      0.90      0.90      1004



### Try weakening our learners. Maybe bagging will work better then.

* Reduce training set size (increase test set size)
* Do not optimize C
* Explore wider range of parameters in grid search

In [32]:
x_train, x_test, y_train, y_test = \
     train_test_split(d, target, test_size=0.4, random_state=RANDOM_SEED)

In [41]:
# Parameters
p = {'n_estimators': [100, 200, 300, 400, 500, 600],
     'max_features': [NUM_COMPONENTS - 2, NUM_COMPONENTS - 3, 
                      NUM_COMPONENTS - 4, NUM_COMPONENTS - 5,
                      NUM_COMPONENTS - 6, NUM_COMPONENTS - 7]}

base_estimator = SVC()  # do not pass in C
estimator = BaggingClassifier(base_estimator=base_estimator)

# GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, 
#              iid='deprecated', refit=True, cv=None, verbose=0, 
#              pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
clf = GridSearchCV(estimator, p, n_jobs=-1)
search = clf.fit(x_train, y_train)
search.best_params_

# {'max_features': 25, 'n_estimators': 600}

{'max_features': 25, 'n_estimators': 600}

In [34]:
# BaggingClassifier(base_estimator=None, n_estimators=10, *, 
#                   max_samples=1.0, max_features=1.0, 
#                   bootstrap=True, bootstrap_features=False, 
#                   oob_score=False, warm_start=False, 
#                   n_jobs=None, random_state=None, verbose=0) 
clf = BaggingClassifier(base_estimator=SVC(),
                        n_estimators=search.best_params_['n_estimators'], 
                        max_features=search.best_params_['max_features'],
                        random_state=RANDOM_SEED,
                        n_jobs=-1)
model = clf.fit(x_train, y_train)

In [35]:
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8725099601593626


In [36]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,864,117
1,139,888


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       981
           1       0.88      0.86      0.87      1027

    accuracy                           0.87      2008
   macro avg       0.87      0.87      0.87      2008
weighted avg       0.87      0.87      0.87      2008



In [7]:
pca = PCA(0.90).fit(data)
print(pca.n_components_)
d90 = pca.transform(data)

826


NameError: name 'RANDOM_STATE' is not defined

In [10]:
x_train, x_test, y_train, y_test = \
     train_test_split(d90, target,
                      test_size=TEST_SIZE,
                      random_state=RANDOM_SEED)

In [11]:
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8645418326693227


In [12]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,425,71
1,65,443


In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       496
           1       0.86      0.87      0.87       508

    accuracy                           0.86      1004
   macro avg       0.86      0.86      0.86      1004
weighted avg       0.86      0.86      0.86      1004



In [14]:
pca = PCA(0.85).fit(data)
print(pca.n_components_)
d85 = pca.transform(data)

x_train, x_test, y_train, y_test = \
     train_test_split(d85, target,
                      test_size=TEST_SIZE,
                      random_state=RANDOM_SEED)

497


In [15]:
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8675298804780877


In [16]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,425,71
1,62,446


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       496
           1       0.86      0.88      0.87       508

    accuracy                           0.87      1004
   macro avg       0.87      0.87      0.87      1004
weighted avg       0.87      0.87      0.87      1004



In [18]:
pca = PCA(0.75).fit(data)
print(pca.n_components_)
d75 = pca.transform(data)

x_train, x_test, y_train, y_test = \
     train_test_split(d75, target,
                      test_size=TEST_SIZE,
                      random_state=RANDOM_SEED)

199


In [19]:
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8705179282868526


In [20]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,427,69
1,61,447


In [21]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       496
           1       0.87      0.88      0.87       508

    accuracy                           0.87      1004
   macro avg       0.87      0.87      0.87      1004
weighted avg       0.87      0.87      0.87      1004



In [22]:
pca = PCA(0.50).fit(data)
print(pca.n_components_)
d50 = pca.transform(data)

x_train, x_test, y_train, y_test = \
     train_test_split(d50, target,
                      test_size=TEST_SIZE,
                      random_state=RANDOM_SEED)

24


In [23]:
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8665338645418327


In [24]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,428,68
1,66,442


In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       496
           1       0.87      0.87      0.87       508

    accuracy                           0.87      1004
   macro avg       0.87      0.87      0.87      1004
weighted avg       0.87      0.87      0.87      1004

