# Tenor Saxophone Ensemble Models

## Setup

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.decomposition import PCA 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from scipy.stats import uniform
 
from pickles_to_pandas import pickles_to_pandas

In [4]:
df = pickles_to_pandas('./data/5s/labeled/features_r02')

In [16]:
NUM_LABEL_COLS = 13
NUM_COMPONENTS = 30

In [5]:
df_filtered = df[df['excl'] == '0']  # exclude records we want to exclude
df_filtered.shape

(5411, 10790)

# Clayton's Work:

### Remove all other labeled instruments and scale

In [123]:
df_filtered = df_filtered[df_filtered['alto'] == '0']  # exclude records we want to exclude
df_filtered = df_filtered[df_filtered['sop'] == '0']
df_filtered = df_filtered[df_filtered['bari'] == '0']
df_filtered = df_filtered[df_filtered['clrt'] == '0']
df_filtered = df_filtered[df_filtered['tora'] == '0']
df_filtered = df_filtered[df_filtered['othr'] == '0']
df_filtered = df_filtered[df_filtered['trmp'] == '0']
df_filtered = df_filtered[df_filtered['trmb'] == '0']
df_filtered = df_filtered[df_filtered['otrb'] == '0']

num_x_cols = df_filtered.shape[1] - NUM_LABEL_COLS - 1  # last bit to adjust for zero indexing
data = df_filtered.iloc[:, 1:num_x_cols].to_numpy() # << These are the features
scaler = StandardScaler()
scaler.fit(data)
data = scaler.transform(data)

target = df_filtered[['tenr']].to_numpy().ravel()  # << This is the label

print(data.shape)
print(target.shape)

(3483, 10776)
(3483,)


In [80]:
x_train, x_test, y_train, y_test = \
     train_test_split(data, target, test_size=0.30, random_state=0)

### Find best C value

In [8]:
model_params = {
    'C': uniform(0.1, 3.0)
}

# RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, 
#                    scoring=None, n_jobs=None, iid='deprecated', 
#                    refit=True, cv=None, verbose=0, 
#                    pre_dispatch='2*n_jobs', random_state=None, 
#                    error_score=nan, return_train_score=False)
svm_model = SVC()
clf = RandomizedSearchCV(svm_model, model_params, n_iter=10, 
                         cv=5, random_state=1)
model = clf.fit(x_train, y_train)
c_value = model.best_estimator_.get_params()["C"]
print(c_value)

2.2609734803264745


### Test with best found C value

In [9]:
# manual_c_value = 1.716
clf = SVC(C=c_value)
clf.fit(x_train, y_train)
y_pred_svm = clf.predict(x_test)
print(clf.score(x_test, y_test))

0.8794489092996556


In [10]:
c_matrix = confusion_matrix(y_test, y_pred_svm)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,432,55
1,50,334


In [12]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       487
           1       0.86      0.87      0.86       384

    accuracy                           0.88       871
   macro avg       0.88      0.88      0.88       871
weighted avg       0.88      0.88      0.88       871



# Matthew's Work:

### Reduce dimensionality with PCA

In [81]:
pca = PCA(n_components=NUM_COMPONENTS)

In [82]:
pca.fit(data)

PCA(n_components=30)

In [83]:
d = pca.transform(data)

In [84]:
x_train, x_test, y_train, y_test = \
     train_test_split(d, target, test_size=0.30, random_state=0)

### Testing different ensemble methods: Vanilla Random Forest Classifier

In [56]:
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
y_pred_forest = forest.predict(x_test)
print(forest.score(x_test, y_test))

0.8599311136624569


In [57]:
c_matrix = confusion_matrix(y_test, y_pred_forest)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,438,49
1,73,311


In [58]:
print(classification_report(y_test, y_pred_forest))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       487
           1       0.86      0.81      0.84       384

    accuracy                           0.86       871
   macro avg       0.86      0.85      0.86       871
weighted avg       0.86      0.86      0.86       871



### Random Search for Random Forest Classifier Optimal Hyperparameters

In [59]:
# distributions = dict(n_estimators=[25, 50, 150, 200, 300, 400],
#                     max_features=[10, 15, 20, 25, 27],
#                     max_depth=[100, 200, 300, 400, 500])
# clf = RandomizedSearchCV(forest, distributions, random_state=0)
# search = clf.fit(x_train, y_train)
# search.best_params_

# {'n_estimators': 300, 'max_features': 15, 'max_depth': 300}

{'n_estimators': 300, 'max_features': 15, 'max_depth': 300}

### Testing different ensemble methods: Optimized Random Forest Classifier

In [77]:
forest = RandomForestClassifier(n_estimators=300, max_features=15, max_depth=300)
forest.fit(x_train, y_train)
y_pred_forest = forest.predict(x_test)
print(forest.score(x_test, y_test))

0.8292682926829268


In [78]:
c_matrix = confusion_matrix(y_test, y_pred_forest)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,335,53
1,66,243


In [79]:
print(classification_report(y_test, y_pred_forest))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       388
           1       0.82      0.79      0.80       309

    accuracy                           0.83       697
   macro avg       0.83      0.82      0.83       697
weighted avg       0.83      0.83      0.83       697



### Messing with feature importances

In [70]:
feature_rank = forest.feature_importances_
feature_rank = np.sort(feature_rank, axis=0)
print(feature_rank)

[0.01220197 0.01224287 0.01276152 0.01304759 0.01322074 0.01383302
 0.01454386 0.01479519 0.01503899 0.01543083 0.01605616 0.01652149
 0.01917002 0.02095806 0.02158267 0.02233261 0.02473193 0.02492037
 0.02644653 0.02748147 0.02764621 0.02980129 0.03386331 0.0458015
 0.05236817 0.05241007 0.05716693 0.09343718 0.12033709 0.12985034]


### Testing different ensemble methods: Vanilla Extra Trees Classifier

In [71]:
extra = ExtraTreesClassifier()
extra.fit(x_train, y_train)
y_pred_extra = extra.predict(x_test)
print(extra.score(x_test, y_test))

0.8679678530424799


In [54]:
c_matrix = confusion_matrix(y_test, y_pred_extra)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,412,75
1,61,323


In [55]:
print(classification_report(y_test, y_pred_extra))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       487
           1       0.81      0.84      0.83       384

    accuracy                           0.84       871
   macro avg       0.84      0.84      0.84       871
weighted avg       0.84      0.84      0.84       871



### Random Search for Extra Trees Classifier Optimal Hyperparameters

In [72]:
# distributions = dict(n_estimators=[25, 50, 150, 200, 300, 400],
#                     max_features=[10, 15, 20, 25, 27],
#                     max_depth=[100, 200, 300, 400, 500])
# clf = RandomizedSearchCV(extra, distributions, random_state=0)
# search = clf.fit(x_train, y_train)
# search.best_params_

# {'n_estimators': 300, 'max_features': 15, 'max_depth': 400}

{'n_estimators': 300, 'max_features': 15, 'max_depth': 400}

### Testing different ensemble methods: Optimized Extra Trees Classifier

In [94]:
extra = ExtraTreesClassifier(n_estimators=300, max_features=15, max_depth=400)
extra.fit(x_train, y_train)
y_pred_extra = extra.predict(x_test)
print(extra.score(x_test, y_test))

0.8861244019138756


In [95]:
c_matrix = confusion_matrix(y_test, y_pred_extra)
idx = cols = [0, 1]
pd.DataFrame(c_matrix, index=idx, columns=cols)

Unnamed: 0,0,1
0,520,56
1,63,406


In [96]:
print(classification_report(y_test, y_pred_extra))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       576
           1       0.88      0.87      0.87       469

    accuracy                           0.89      1045
   macro avg       0.89      0.88      0.88      1045
weighted avg       0.89      0.89      0.89      1045

