# Tenor saxophone

* Sample duration = 5s
* model = SVM
* target = TENOR

Distinguish between samples with tenor saxophone from those without tenor saxophone. Other brass and reed instruments excluded.

In [30]:
import pickle
from IPython.display import Audio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, \
    precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, \
                                    RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from scipy.stats import uniform

In [3]:
df = pd.read_pickle('./data/5s/labeled/features_r02/all_data.pkl')

In [4]:
NUM_LABEL_COLS = 13
NUM_COMPONENTS = 44
TEST_SIZE = 0.2
RANDOM_SEED = 0

In [5]:
df_filtered = df
df_filtered.shape

(9080, 10790)

In [6]:
# exclude records we want to exclude
df_filtered = df_filtered[df_filtered['alto'] == '0'] 
df_filtered = df_filtered[df_filtered['sop'] == '0']
df_filtered = df_filtered[df_filtered['bari'] == '0']
df_filtered = df_filtered[df_filtered['clrt'] == '0']
df_filtered = df_filtered[df_filtered['tora'] == '0']
df_filtered = df_filtered[df_filtered['othr'] == '0']
df_filtered = df_filtered[df_filtered['trmp'] == '0']
df_filtered = df_filtered[df_filtered['trmb'] == '0']
df_filtered = df_filtered[df_filtered['otrb'] == '0']

num_x_cols = df_filtered.shape[1] - NUM_LABEL_COLS 
# ^ last bit to adjust for zero indexing
data = df_filtered.iloc[:, 1:num_x_cols].to_numpy() 
# ^ These are the features
scaler = StandardScaler()
scaler.fit(data)
data = scaler.transform(data)

target = df_filtered[['tenr']].to_numpy().ravel()  # << This is the label

print(data.shape)
print(target.shape)

(6434, 10776)
(6434,)


In [7]:
pca = PCA(n_components=NUM_COMPONENTS)

In [8]:
pca.fit(data)

PCA(n_components=44)

In [9]:
d = pca.transform(data)

In [10]:
x_train, x_test, y_train, y_test = \
     train_test_split(d, target, test_size=TEST_SIZE, random_state=RANDOM_SEED)

## Test plain vanilla SVM

In [11]:
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8803418803418803


In [12]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,527,70
1,84,606


In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       597
           1       0.90      0.88      0.89       690

    accuracy                           0.88      1287
   macro avg       0.88      0.88      0.88      1287
weighted avg       0.88      0.88      0.88      1287



## Determine good value for C using random search

In [16]:
model_params = {
    'C': uniform(0.1, 10.0)
}

# RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, 
#                    scoring=None, n_jobs=None, iid='deprecated', 
#                    refit=True, cv=None, verbose=0, 
#                    pre_dispatch='2*n_jobs', random_state=None, 
#                    error_score=nan, return_train_score=False)
svm_model = SVC()
clf = RandomizedSearchCV(svm_model, model_params, n_iter=10, cv=5)
model = clf.fit(x_train, y_train)
params = model.best_estimator_.get_params()

In [17]:
params
"""
{'C': 2.5374473415003695,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}
 """

{'C': 2.5374473415003695,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [14]:
clf = SVC(C=2.5374473415003695)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))


0.8873348873348873


In [15]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,528,69
1,76,614


In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88       597
           1       0.90      0.89      0.89       690

    accuracy                           0.89      1287
   macro avg       0.89      0.89      0.89      1287
weighted avg       0.89      0.89      0.89      1287



## Test ensemble approach with bagging classifier

#### Determine good parameters to use with bagging classifier. Start with grid search.

In [21]:
# Parameters
param_grid = {'max_features': [NUM_COMPONENTS - 2, NUM_COMPONENTS - 3, 
                               NUM_COMPONENTS - 4, NUM_COMPONENTS - 5,
                               NUM_COMPONENTS - 6, NUM_COMPONENTS - 7, 
                               NUM_COMPONENTS - 8, NUM_COMPONENTS - 9,
                               NUM_COMPONENTS - 10, NUM_COMPONENTS - 11,
                               NUM_COMPONENTS - 12, NUM_COMPONENTS - 13]}

base_estimator = SVC(C=params['C'])
estimator = BaggingClassifier(base_estimator=base_estimator, n_estimators=100)
# Question: Should we include C here, or ... ? There's no reason it should differ
# for different values of n_estimators, but what about max_features? Might different
# values for C be best for different max_features? Maybe. Dunno. Maybe we should
# search for optimal C *after* we've found the best value for max_features?

# GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, 
#              iid='deprecated', refit=True, cv=None, verbose=0, 
#              pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
clf = GridSearchCV(estimator, param_grid, n_jobs=-1)
search = clf.fit(x_train, y_train)
search.best_params_

# {'max_features': 34}

{'max_features': 34}

#### Run bagging classifier with good params

In [34]:
# BaggingClassifier(base_estimator=None, n_estimators=10, *, 
#                   max_samples=1.0, max_features=1.0, 
#                   bootstrap=True, bootstrap_features=False, 
#                   oob_score=False, warm_start=False, 
#                   n_jobs=None, random_state=None, verbose=0) 
clf = BaggingClassifier(base_estimator=SVC(C=2.5374473415003695),
                        n_estimators=500, 
                        max_features=34,
                        random_state=RANDOM_SEED,
                        n_jobs=-1)
model = clf.fit(x_train, y_train)

In [35]:
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.891996891996892


In [36]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,524,73
1,66,624


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       597
           1       0.90      0.90      0.90       690

    accuracy                           0.89      1287
   macro avg       0.89      0.89      0.89      1287
weighted avg       0.89      0.89      0.89      1287



In [33]:
fn = 'scikit_models/tenor_svm_bagging_tuned_c.pkl'
with open(fn, 'wb') as fh:
    pickle.dump(clf, fh)


### Try weakening our learners. Maybe bagging will work better then.

* Reduce training set size (increase test set size)
* Do not optimize C
* Explore wider range of parameters in grid search

In [32]:
x_train, x_test, y_train, y_test = \
     train_test_split(d, target, test_size=0.4, random_state=RANDOM_SEED)

In [41]:
# Parameters
p = {'n_estimators': [100, 200, 300, 400, 500, 600],
     'max_features': [NUM_COMPONENTS - 2, NUM_COMPONENTS - 3, 
                      NUM_COMPONENTS - 4, NUM_COMPONENTS - 5,
                      NUM_COMPONENTS - 6, NUM_COMPONENTS - 7]}

base_estimator = SVC()  # do not pass in C
estimator = BaggingClassifier(base_estimator=base_estimator)

# GridSearchCV(estimator, param_grid, *, scoring=None, n_jobs=None, 
#              iid='deprecated', refit=True, cv=None, verbose=0, 
#              pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)
clf = GridSearchCV(estimator, p, n_jobs=-1)
search = clf.fit(x_train, y_train)
search.best_params_

# {'max_features': 25, 'n_estimators': 600}

{'max_features': 25, 'n_estimators': 600}

In [34]:
# BaggingClassifier(base_estimator=None, n_estimators=10, *, 
#                   max_samples=1.0, max_features=1.0, 
#                   bootstrap=True, bootstrap_features=False, 
#                   oob_score=False, warm_start=False, 
#                   n_jobs=None, random_state=None, verbose=0) 
clf = BaggingClassifier(base_estimator=SVC(),
                        n_estimators=search.best_params_['n_estimators'], 
                        max_features=search.best_params_['max_features'],
                        random_state=RANDOM_SEED,
                        n_jobs=-1)
model = clf.fit(x_train, y_train)

In [35]:
y_pred = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8725099601593626


In [36]:
c_matrix = confusion_matrix(y_test, y_pred)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,864,117
1,139,888


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       981
           1       0.88      0.86      0.87      1027

    accuracy                           0.87      2008
   macro avg       0.87      0.87      0.87      2008
weighted avg       0.87      0.87      0.87      2008

