## The purpose of this notebook is to evaluate the discriminative capabilities of the representations learned by the TiFGAN's discriminator by performing keyword detection on the SC09 dataset

* We train a Logistic Regression as well as a Random Forest classifier on top of those features and report our performance on the SC09 test set.

### Import packages

In [1]:
import os
os.chdir(os.path.join("/", "home", "c-matsty", "Bi-TiFGAN---TensorFlow-1.14", "src"))

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score

from feature_evaluation.utils import load_data, load_data_labels

### Define paths

In [2]:
# Root folder of the SpeechCommands data; the label folders and the saved
# discriminator feature files used below are resolved relative to it.
data_dir = os.path.join(
    "/media",
    "datastore",
    "c-matsty-data",
    "datasets",
    "SpeechCommands",
)

#### Discriminator features path

In [3]:
# Name of the sub-folder of data_dir from which the discriminator feature files are read.
discr_features_dirname = "Discriminator_features"

#### Label paths

In [4]:
# Directories handed to load_data_labels for the training and test splits.
training_label_dir = os.path.join(
    data_dir, "SpeechCommands_Preproc_2_training", "labels"
)
test_label_dir = os.path.join(
    data_dir, "SpeechCommands_Preproc_2_test", "labels"
)

### Load data

In [5]:
# Load the labels of the training and test splits with the project helper.
# NOTE(review): the feature matrices are loaded further down from separate .npy
# files, so the order of the labels returned here must match the row order of
# those files — verify against the feature-export code.
y_tr = load_data_labels(training_label_dir)
y_ts = load_data_labels(test_label_dir)

100%|██████████| 165/165 [00:00<00:00, 4426.92it/s]
100%|██████████| 20/20 [00:00<00:00, 3860.56it/s]


In [6]:
# Map every distinct training label to an integer id; np.unique returns the
# labels sorted, so the ids follow that sorted order.
label_dict = {
    label_name: label_id
    for label_id, label_name in enumerate(np.unique(y_tr))
}

### Prepare data for training

#### Turn labels from strings to integer identifiers

In [7]:
# Replace every label string by its integer id from label_dict (element-wise).
# NOTE: dict.get yields None for a label that is not a key of label_dict, i.e. a
# test label that never occurs in the training labels.
y_tr = np.vectorize(label_dict.get)(y_tr)
y_ts = np.vectorize(label_dict.get)(y_ts)

#### Reshape labels

In [8]:
# Collapse the label arrays to 1-D copies so they can be used as flat target
# vectors by the masks, classifiers and metrics below.
y_tr = y_tr.flatten()
y_ts = y_ts.flatten()

### Define sample weights

In [9]:
# Number of training examples per class id, in the order of label_dict.values().
class_counts = [int((y_tr == class_id).sum()) for class_id in label_dict.values()]
# Weight of a class = size of the largest class / size of this class, so the
# most frequent class gets weight 1.0 and rarer classes get larger weights.
class_weights = [max(class_counts) / count for count in class_counts]
# Look-up table: class id -> class weight.
class_weight_dict = dict(zip(label_dict.values(), class_weights))

In [10]:
# Display the label -> integer id mapping.
label_dict

{'eight': 0,
 'five': 1,
 'four': 2,
 'nine': 3,
 'one': 4,
 'seven': 5,
 'six': 6,
 'three': 7,
 'two': 8,
 'zero': 9}

In [11]:
# One weight per training example: the weight of the class that example belongs to.
sample_weight = [class_weight_dict[class_id] for class_id in y_tr]

## Train and test using the 5th conv layer's features

In [12]:
# Feature files for this section.
# NOTE(review): the section title says "5th layer" while the file names say
# "4th_conv" — presumably the file index is 0-based; confirm.
training_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_training_discriminator_features_4th_conv.npy",
)
test_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_test_discriminator_features_4th_conv.npy",
)

#### Load data

In [13]:
# Read the saved training and test feature arrays from their .npy files.
X_tr = np.load(training_input_path)
X_ts = np.load(test_input_path)

In [14]:
# Keep the first axis (one row per example) and merge all remaining axes into a
# single feature axis, giving the 2-D matrices the classifiers are fitted on.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_ts = X_ts.reshape(X_ts.shape[0], -1)

#### Normalization

In [15]:
# Global (scalar) mean and standard deviation over the whole training matrix.
mean, std = X_tr.mean(), X_tr.std()
# Scale both splits with the training statistics only, so no test-set
# information enters the normalization.
X_tr = (X_tr - mean) / std
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [16]:
# Multinomial logistic regression; max_iter caps the solver iterations and
# random_state fixes the seed.
multi_class = 'multinomial'
model = LogisticRegression(
    multi_class=multi_class,
    max_iter=15000,
    random_state=0,
)

In [None]:
# Fit on the training features; sample_weight weights each example by its class weight.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Predict the test split and summarise per-class precision / recall / F1.
y_preds = model.predict(X_ts)
class_report_5_lg = classification_report(y_true=y_ts, y_pred=y_preds)

In [45]:
# Test-set report of the logistic regression fitted in this section.
print(class_report_5_lg)

              precision    recall  f1-score   support

           0       0.16      0.21      0.18       257
           1       0.08      0.08      0.08       270
           2       0.17      0.13      0.15       253
           3       0.15      0.16      0.15       259
           4       0.16      0.09      0.11       248
           5       0.06      0.08      0.07       239
           6       0.09      0.13      0.10       244
           7       0.16      0.15      0.15       267
           8       0.03      0.02      0.03       264
           9       0.07      0.06      0.06       250

    accuracy                           0.11      2551
   macro avg       0.11      0.11      0.11      2551
weighted avg       0.11      0.11      0.11      2551



#### Train Random Forest classifier

In [None]:
# Random forest with 400 trees and a fixed seed; replaces the logistic regression in `model`.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest on the training features, weighting each example by its class weight.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Predict the test split with the forest and build its per-class report.
y_preds = model.predict(X_ts)
class_report_5_rf = classification_report(y_true=y_ts, y_pred=y_preds)

In [46]:
# Test-set report of the random forest fitted in this section.
print(class_report_5_rf)

              precision    recall  f1-score   support

           0       0.15      0.12      0.13       257
           1       0.07      0.10      0.08       270
           2       0.37      0.28      0.32       253
           3       0.15      0.22      0.18       259
           4       0.28      0.31      0.30       248
           5       0.02      0.02      0.02       239
           6       0.02      0.02      0.02       244
           7       0.25      0.31      0.28       267
           8       0.04      0.02      0.03       264
           9       0.09      0.04      0.06       250

    accuracy                           0.15      2551
   macro avg       0.15      0.15      0.14      2551
weighted avg       0.15      0.15      0.14      2551



## Repeat for 4th conv layer

In [None]:
# Same feature folder as before; the paths now point to the "3th_conv" feature files.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_training_discriminator_features_3th_conv.npy",
)
test_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_test_discriminator_features_3th_conv.npy",
)

#### Load data

In [None]:
# Read the training feature array for this section.
X_tr = np.load(training_input_path)

In [None]:
# Read the test feature array for this section.
X_ts = np.load(test_input_path)

In [None]:
# One row per example; all remaining axes are merged into a single feature axis.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_ts = X_ts.reshape(X_ts.shape[0], -1)

#### Normalization

In [None]:
# Scalar mean / standard deviation of the training matrix, applied to both splits.
mean, std = X_tr.mean(), X_tr.std()
X_tr = (X_tr - mean) / std
X_ts = (X_ts - mean) / std

### Normalize data (repeated — the features were already normalized in the previous cell)

In [None]:
# NOTE(review): this cell repeats the normalization of the preceding cell. At this
# point X_tr already has (approximately) zero mean and unit standard deviation, so
# the second pass changes the data only by floating-point rounding; the cell looks
# like an accidental duplicate.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Fresh multinomial logistic regression for this section's features.
multi_class = 'multinomial'
model = LogisticRegression(
    multi_class=multi_class,
    max_iter=15000,
    random_state=0,
)

In [None]:
# Fit on this section's training features with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the logistic regression.
y_preds = model.predict(X_ts)
class_report_4_lg = classification_report(y_true=y_ts, y_pred=y_preds)

In [47]:
# Test-set report of the logistic regression fitted in this section.
print(class_report_4_lg)

              precision    recall  f1-score   support

           0       0.14      0.17      0.15       257
           1       0.07      0.07      0.07       270
           2       0.19      0.15      0.17       253
           3       0.17      0.17      0.17       259
           4       0.17      0.14      0.16       248
           5       0.07      0.08      0.08       239
           6       0.07      0.08      0.07       244
           7       0.17      0.16      0.17       267
           8       0.08      0.06      0.07       264
           9       0.05      0.05      0.05       250

    accuracy                           0.12      2551
   macro avg       0.12      0.12      0.12      2551
weighted avg       0.12      0.12      0.12      2551



#### Train Random Forest classifier

In [None]:
# New random forest (400 trees, fixed seed) for this section's features.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the random forest.
y_preds = model.predict(X_ts)
class_report_4_rf = classification_report(y_true=y_ts, y_pred=y_preds)

In [48]:
# Test-set report of the random forest fitted in this section.
print(class_report_4_rf)

              precision    recall  f1-score   support

           0       0.09      0.08      0.08       257
           1       0.06      0.07      0.07       270
           2       0.32      0.25      0.28       253
           3       0.18      0.21      0.19       259
           4       0.22      0.19      0.21       248
           5       0.03      0.04      0.03       239
           6       0.09      0.15      0.11       244
           7       0.27      0.25      0.26       267
           8       0.05      0.03      0.04       264
           9       0.09      0.04      0.06       250

    accuracy                           0.13      2551
   macro avg       0.14      0.13      0.13      2551
weighted avg       0.14      0.13      0.13      2551



## Repeat for 3rd conv layer

In [50]:
# Same feature folder as before; the paths now point to the "2th_conv" feature files.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_training_discriminator_features_2th_conv.npy",
)
test_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_test_discriminator_features_2th_conv.npy",
)

#### Load data

In [51]:
# Read the training feature array for this section.
X_tr = np.load(training_input_path)

In [52]:
# Read the test feature array for this section.
X_ts = np.load(test_input_path)

In [53]:
# One row per example; all remaining axes are merged into a single feature axis.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_ts = X_ts.reshape(X_ts.shape[0], -1)

### Normalize data

In [54]:
# Scalar mean / standard deviation of the training matrix, applied to both splits.
mean, std = X_tr.mean(), X_tr.std()
X_tr = (X_tr - mean) / std
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [55]:
# Fresh multinomial logistic regression for this section's features.
multi_class = 'multinomial'
model = LogisticRegression(
    multi_class=multi_class,
    max_iter=15000,
    random_state=0,
)

In [None]:
# Fit on this section's training features with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the logistic regression.
y_preds = model.predict(X_ts)
class_report_3_lg = classification_report(y_true=y_ts, y_pred=y_preds)

In [88]:
# Test-set report of the logistic regression fitted in this section.
print(class_report_3_lg)

              precision    recall  f1-score   support

           0       0.14      0.16      0.15       257
           1       0.06      0.06      0.06       270
           2       0.16      0.15      0.16       253
           3       0.14      0.14      0.14       259
           4       0.13      0.11      0.12       248
           5       0.07      0.08      0.07       239
           6       0.07      0.07      0.07       244
           7       0.16      0.15      0.15       267
           8       0.08      0.08      0.08       264
           9       0.07      0.07      0.07       250

    accuracy                           0.11      2551
   macro avg       0.11      0.11      0.11      2551
weighted avg       0.11      0.11      0.11      2551



#### Train Random Forest classifier

In [None]:
# New random forest (400 trees, fixed seed) for this section's features.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the random forest.
y_preds = model.predict(X_ts)
class_report_3_rf = classification_report(y_true=y_ts, y_pred=y_preds)

In [89]:
# Test-set report of the random forest fitted in this section.
print(class_report_3_rf)

              precision    recall  f1-score   support

           0       0.10      0.12      0.11       257
           1       0.07      0.06      0.06       270
           2       0.34      0.25      0.29       253
           3       0.15      0.12      0.13       259
           4       0.23      0.16      0.19       248
           5       0.01      0.01      0.01       239
           6       0.12      0.34      0.18       244
           7       0.27      0.21      0.24       267
           8       0.14      0.08      0.10       264
           9       0.07      0.04      0.05       250

    accuracy                           0.14      2551
   macro avg       0.15      0.14      0.13      2551
weighted avg       0.15      0.14      0.13      2551



In [None]:
# Score the fitted forest on the data it was trained on; comparing this with the
# test report shows how far training and test performance differ.
y_preds = model.predict(X_tr)
class_report_3_rf_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [None]:
# Training-set report of the random forest fitted in this section.
print(class_report_3_rf_tr)

## Repeat for 2nd conv layer

In [None]:
# Same feature folder as before; the paths now point to the "1th_conv" feature files.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_training_discriminator_features_1th_conv.npy",
)
test_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_test_discriminator_features_1th_conv.npy",
)

#### Load data

In [None]:
# Read the training feature array for this section.
X_tr = np.load(training_input_path)

In [None]:
# Read the test feature array for this section.
X_ts = np.load(test_input_path)

In [None]:
# One row per example; all remaining axes are merged into a single feature axis.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_ts = X_ts.reshape(X_ts.shape[0], -1)

### Normalize data

In [None]:
# Scalar mean / standard deviation of the training matrix, applied to both splits.
mean, std = X_tr.mean(), X_tr.std()
X_tr = (X_tr - mean) / std
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Fresh multinomial logistic regression for this section's features.
multi_class = 'multinomial'
model = LogisticRegression(
    multi_class=multi_class,
    max_iter=15000,
    random_state=0,
)

In [None]:
# Fit on this section's training features with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the logistic regression.
y_preds = model.predict(X_ts)
class_report_2_lg = classification_report(y_true=y_ts, y_pred=y_preds)

In [90]:
# Test-set report of the logistic regression fitted in this section.
print(class_report_2_lg)

              precision    recall  f1-score   support

           0       0.16      0.16      0.16       257
           1       0.09      0.09      0.09       270
           2       0.22      0.23      0.22       253
           3       0.12      0.11      0.12       259
           4       0.15      0.17      0.16       248
           5       0.07      0.08      0.08       239
           6       0.10      0.10      0.10       244
           7       0.16      0.15      0.15       267
           8       0.08      0.08      0.08       264
           9       0.08      0.07      0.08       250

    accuracy                           0.12      2551
   macro avg       0.12      0.12      0.12      2551
weighted avg       0.12      0.12      0.12      2551



In [None]:
# Score the logistic regression on its own training data for comparison with the test report.
y_preds = model.predict(X_tr)
class_report_2_lg_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [None]:
# Training-set report, then the balanced accuracy of the same training predictions.
print(
    class_report_2_lg_tr,
    balanced_accuracy_score(y_tr, y_preds),
    sep="\n",
)

#### Train Random Forest classifier

In [None]:
# New random forest (400 trees, fixed seed) for this section's features.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the random forest.
y_preds = model.predict(X_ts)
class_report_2_rf = classification_report(y_true=y_ts, y_pred=y_preds)

In [91]:
# Test-set report of the random forest fitted in this section.
print(class_report_2_rf)

              precision    recall  f1-score   support

           0       0.03      0.04      0.04       257
           1       0.06      0.04      0.05       270
           2       0.30      0.19      0.23       253
           3       0.16      0.08      0.11       259
           4       0.12      0.09      0.10       248
           5       0.06      0.02      0.03       239
           6       0.12      0.60      0.20       244
           7       0.21      0.07      0.11       267
           8       0.08      0.03      0.05       264
           9       0.05      0.02      0.02       250

    accuracy                           0.12      2551
   macro avg       0.12      0.12      0.09      2551
weighted avg       0.12      0.12      0.09      2551



In [None]:
# Score the forest on its own training data for comparison with the test report.
y_preds = model.predict(X_tr)
class_report_2_rf_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [87]:
# Training-set report, then the balanced accuracy of the same training predictions.
print(
    class_report_2_rf_tr,
    balanced_accuracy_score(y_tr, y_preds),
    sep="\n",
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2087
           1       1.00      1.00      1.00      2082
           2       1.00      1.00      1.00      2107
           3       1.00      1.00      1.00      2097
           4       1.00      1.00      1.00      2117
           5       1.00      1.00      1.00      2131
           6       1.00      1.00      1.00      2117
           7       1.00      1.00      1.00      2081
           8       1.00      1.00      1.00      2103
           9       1.00      1.00      1.00      2122

    accuracy                           1.00     21044
   macro avg       1.00      1.00      1.00     21044
weighted avg       1.00      1.00      1.00     21044

1.0


## Repeat for 1st conv layer but without pooling

In [97]:
# Same feature folder as before; only the test file is addressed by a fixed name
# here (the training features are collected by name pattern in the next cell).
# NOTE(review): the file name ends in ".npy.npy" — confirm that the file on disk
# really carries the doubled extension.
discr_features_dirname = "Discriminator_features"
test_input_path = os.path.join(data_dir, discr_features_dirname, "unpooled_1st_ts.npy.npy")

#### Load data

In [None]:
# Collect the unpooled 1st-layer training features: every file in the feature
# folder whose name contains tr_input_file_pattern is loaded and the arrays are
# stacked along the first axis.
# os.listdir returns names in arbitrary order, so the names are sorted before
# loading; otherwise the stacking order could differ between runs or machines
# and the rows of X_tr would no longer line up with y_tr.
# NOTE(review): plain lexicographic sorting assumes the chunk names sort in the
# order the chunks were exported (e.g. zero-padded indices) — confirm against
# the feature-export code.
tr_input_file_pattern = "unpooled_1st_tr"
X_tr = []
for file_name in sorted(os.listdir(os.path.join(data_dir, discr_features_dirname))):
    if tr_input_file_pattern not in file_name:
        continue
    training_input_path = os.path.join(data_dir, discr_features_dirname, file_name)
    X_tr.append(np.load(training_input_path))
X_tr = np.vstack(X_tr)

In [None]:
# Read the test feature array for this section.
X_ts = np.load(test_input_path)

In [None]:
# One row per example; all remaining axes are merged into a single feature axis.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_ts = X_ts.reshape(X_ts.shape[0], -1)

### Normalize data

In [None]:
# Scalar mean / standard deviation of the training matrix, applied to both splits.
mean, std = X_tr.mean(), X_tr.std()
X_tr = (X_tr - mean) / std
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Fresh multinomial logistic regression for this section's features.
multi_class = 'multinomial'
model = LogisticRegression(
    multi_class=multi_class,
    max_iter=15000,
    random_state=0,
)

In [None]:
# Fit on this section's training features with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the logistic regression.
# NOTE(review): the later "1st conv layer" section assigns class_report_1_lg again,
# so this report is overwritten once that section has run.
y_preds = model.predict(X_ts)
class_report_1_lg = classification_report(y_true=y_ts, y_pred=y_preds)

In [None]:
# Test-set report of the logistic regression fitted in this section.
print(class_report_1_lg)

In [None]:
# Score the logistic regression on its own training data for comparison with the test report.
y_preds = model.predict(X_tr)
class_report_1_lg_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [None]:
# Training-set report of the logistic regression fitted in this section.
print(class_report_1_lg_tr)

#### Train Random Forest classifier

In [None]:
# New random forest (400 trees, fixed seed) for this section's features.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the random forest.
# NOTE(review): the later "1st conv layer" section assigns class_report_1_rf again,
# so this report is overwritten once that section has run.
y_preds = model.predict(X_ts)
class_report_1_rf = classification_report(y_true=y_ts, y_pred=y_preds)

In [None]:
# Test-set report of the random forest fitted in this section.
print(class_report_1_rf)

In [None]:
# Score the forest on its own training data for comparison with the test report.
y_preds = model.predict(X_tr)
class_report_1_rf_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [None]:
# Training-set report, then the balanced accuracy of the same training predictions.
print(
    class_report_1_rf_tr,
    balanced_accuracy_score(y_tr, y_preds),
    sep="\n",
)

## Repeat for 1st conv layer

In [None]:
# Same feature folder as before; the paths now point to the "0th_conv" feature files.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_training_discriminator_features_0th_conv.npy",
)
test_input_path = os.path.join(
    data_dir,
    discr_features_dirname,
    "SC09_test_discriminator_features_0th_conv.npy",
)

#### Load data

In [None]:
# Read the training feature array for this section.
X_tr = np.load(training_input_path)

In [None]:
# Read the test feature array for this section.
X_ts = np.load(test_input_path)

In [None]:
# One row per example; all remaining axes are merged into a single feature axis.
X_tr = X_tr.reshape(X_tr.shape[0], -1)
X_ts = X_ts.reshape(X_ts.shape[0], -1)

### Normalize data

In [None]:
# Scalar mean / standard deviation of the training matrix, applied to both splits.
mean, std = X_tr.mean(), X_tr.std()
X_tr = (X_tr - mean) / std
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Fresh multinomial logistic regression for this section's features.
multi_class = 'multinomial'
model = LogisticRegression(
    multi_class=multi_class,
    max_iter=15000,
    random_state=0,
)

In [None]:
# Fit on this section's training features with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the logistic regression.
y_preds = model.predict(X_ts)
class_report_1_lg = classification_report(y_true=y_ts, y_pred=y_preds)

In [None]:
# Test-set report of the logistic regression fitted in this section.
print(class_report_1_lg)

In [None]:
# Score the logistic regression on its own training data for comparison with the test report.
y_preds = model.predict(X_tr)
class_report_1_lg_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [None]:
# Training-set report of the logistic regression fitted in this section.
print(class_report_1_lg_tr)

#### Train Random Forest classifier

In [None]:
# New random forest (400 trees, fixed seed) for this section's features.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest with the per-example class weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split predictions and per-class report for the random forest.
y_preds = model.predict(X_ts)
class_report_1_rf = classification_report(y_true=y_ts, y_pred=y_preds)

In [None]:
# Test-set report of the random forest fitted in this section.
print(class_report_1_rf)

In [None]:
# Score the forest on its own training data for comparison with the test report.
y_preds = model.predict(X_tr)
class_report_1_rf_tr = classification_report(y_true=y_tr, y_pred=y_preds)

In [None]:
# Training-set report, then the balanced accuracy of the same training predictions.
print(
    class_report_1_rf_tr,
    balanced_accuracy_score(y_tr, y_preds),
    sep="\n",
)