## The purpose of this notebook is to evaluate the discriminative capabilities of the representations learned by the TiFGAN's discriminator by performing keyword detection on the SC09 dataset

* We train a Logistic Regression as well as a Random Forest classifier on top of those features and report our performance on the SC09 test set.

### Import packages

In [None]:
import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score

### Define paths

In [None]:
# Root directory of the SpeechCommands data. This is an absolute,
# machine-specific path; adjust it for the environment the notebook runs in.
data_dir = os.path.join("/media", "datastore", "c-matsty-data", "datasets", "SpeechCommands")

#### Discriminator features path

In [None]:
# Pre-computed feature arrays (.npy) for the training and test splits.
# The file names indicate they hold the discriminator's last conv layer output.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_training_discriminator_features_last_conv.npy")
test_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_test_discriminator_features_last_conv.npy")

#### Label paths

In [None]:
# Directories with the label files of each split; both are passed to load_labels.
training_label_dir = os.path.join(data_dir, "SpeechCommands_Preproc_2_training", "labels")
test_label_dir = os.path.join(data_dir, "SpeechCommands_Preproc_2_test", "labels")

In [None]:
def load_labels(labels_path, *, sort_files=False):
    """Load every label file in a directory and stack the results row-wise.

    Each directory entry is read with ``np.load``, given a trailing axis of
    length one, and all loaded arrays are combined with ``np.vstack``.

    Args:
        labels_path: Directory containing the label files. Every entry of the
            directory is loaded, so it should contain label files only.
        sort_files: When True, files are read in sorted file-name order, which
            makes the row order reproducible. The default (False) keeps the
            order returned by ``os.listdir``, which is arbitrary and may
            differ between machines and filesystems.

    Returns:
        The stacked labels as produced by ``np.vstack``.

    Raises:
        ValueError: If ``labels_path`` has no entries.
    """
    file_names = os.listdir(labels_path)
    if not file_names:
        # np.vstack would raise ValueError as well, but with a message that
        # does not point at the empty directory.
        raise ValueError("no label files found in {!r}".format(labels_path))
    if sort_files:
        file_names = sorted(file_names)
    # NOTE(review): the rows must line up with the samples of the separately
    # stored feature arrays; confirm the features were written in the same
    # file order before relying on either ordering.
    Y = [
        np.load(os.path.join(labels_path, file_name))[..., np.newaxis]
        for file_name in file_names
    ]
    return np.vstack(Y)

### Load data

In [None]:
# Load the training-split feature array from disk.
X_tr = np.load(training_input_path)

In [None]:
# Load the test-split feature array from disk.
X_ts = np.load(test_input_path)

In [None]:
# Stack the per-file labels of each split.
# NOTE(review): row order follows os.listdir inside load_labels; it has to match
# the sample order of the feature arrays loaded above - confirm.
y_tr = load_labels(training_label_dir)
y_ts = load_labels(test_label_dir)

In [None]:
# Map every distinct training label to an integer id. np.unique returns the
# values sorted, so ids follow the sorted label order.
label_dict = {value: index  for index, value in enumerate(np.unique(y_tr))}

### Prepare data for training

In [None]:
def global_average_pooling(X):
    """Average ``X`` over axes 1 and 2, keeping both as length-one axes.

    The mean is taken over axis 1 first and over axis 2 afterwards, so the
    result has the same number of dimensions as the input.
    """
    pooled = np.asarray(X)
    for axis in (1, 2):
        pooled = np.mean(pooled, axis=axis, keepdims=True)
    return pooled

#### Turn labels from strings to integer identifiers

In [None]:
# Replace every label with its integer id from label_dict.
# NOTE(review): dict.get returns None for a label that is absent from the
# training set, so an unseen test label does not raise a KeyError here.
y_tr = np.vectorize(label_dict.get)(y_tr)
y_ts = np.vectorize(label_dict.get)(y_ts)

#### Reshape input arrays and labels

In [None]:
# Flatten each sample into a single feature vector; the first axis is kept
# as the sample axis.
X_tr = X_tr.reshape((X_tr.shape[0], -1))
X_ts = X_ts.reshape((X_ts.shape[0], -1))

In [None]:
# Collapse the label arrays to 1-D.
y_tr = y_tr.flatten()
y_ts = y_ts.flatten()

### Normalize data

In [None]:
# Standardize with one scalar mean/std computed over all training values
# (not per feature). The test split reuses the training statistics.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

### Define sample weights

In [None]:
# Class weights relative to the largest class: the most frequent class gets
# weight 1 and rarer classes get proportionally larger weights.
class_counts = [int(np.count_nonzero(y_tr == class_idx)) for class_idx in label_dict.values()]
class_weights = [max(class_counts) / n_samples for n_samples in class_counts]
class_weight_dict = dict(zip(label_dict.values(), class_weights))

In [None]:
# Display the label -> integer id mapping as the cell output.
label_dict

### Train and test logistic regression classifier

In [None]:
# Multinomial logistic regression with an elastic-net penalty (l1_ratio=0.5),
# fitted with the SAGA solver; random_state is fixed for repeatable runs.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -
# confirm against the installed version.
multi_class = 'multinomial'
model = LogisticRegression(multi_class=multi_class, max_iter=15000, random_state=0, penalty='elasticnet', solver='saga', l1_ratio=0.5)

In [None]:
# Per-sample weights: each training sample receives the weight of its class.
sample_weight = [class_weight_dict[label] for label in y_tr]

In [None]:
# Fit on the training split using the class-based sample weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation: per-class report followed by balanced accuracy.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

### Train and test RandomForest classifier

In [None]:
# Random forest with 100 trees and a fixed seed; replaces the previous `model`.
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [None]:
# Fit the forest on the same features and sample weights as the logistic regression.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation of the random forest.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

## Repeat on the last conv layer features, this time with global average pooling

In [None]:
# Same last-conv feature files as in the first experiment (paths re-declared unchanged).
training_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_training_discriminator_features_last_conv.npy")
test_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_test_discriminator_features_last_conv.npy")

#### Load data

In [None]:
# Reload the raw features: X_tr / X_ts were flattened and normalized in place earlier.
X_tr = np.load(training_input_path)
X_ts = np.load(test_input_path)

#### Global average pooling

In [None]:
# Average every sample over axes 1 and 2.
# Presumably these are the spatial axes of the feature maps - TODO confirm layout.
X_tr = global_average_pooling(X_tr)
X_ts = global_average_pooling(X_ts)

In [None]:
# Drop the pooled length-one axes so each sample becomes a flat feature vector.
X_tr = X_tr.reshape((X_tr.shape[0], -1))
X_ts = X_ts.reshape((X_ts.shape[0], -1))

#### Normalization

In [None]:
# Standardize with scalar training-set statistics; the test split reuses them.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Multinomial logistic regression with scikit-learn's default penalty and solver
# (no elastic-net / SAGA here, unlike the first experiment).
multi_class = 'multinomial'
model = LogisticRegression(multi_class=multi_class, max_iter=15000, random_state=0)

In [None]:
# `sample_weight` is the list built in the first experiment; y_tr has not changed since.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation: per-class report and balanced accuracy.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

In [None]:
# Same metrics on the training split, for comparison with the test scores.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

#### Train Random Forest classifier

In [None]:
# Random forest with 400 trees (the first experiment used 100) and a fixed seed.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit on the pooled, normalized training features with the class-based sample weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation of the random forest.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

In [None]:
# Training-split evaluation of the random forest.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

## Repeat for 4th conv layer

In [None]:
# Feature files tagged `3th_conv`; presumably a zero-based index, i.e. the
# 4th conv layer named in the section title - confirm.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_training_discriminator_features_3th_conv.npy")
test_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_test_discriminator_features_3th_conv.npy")

#### Load data

In [None]:
# Load the training-split features for this layer (overwrites X_tr).
X_tr = np.load(training_input_path)

In [None]:
# Load the test-split features for this layer (overwrites X_ts).
X_ts = np.load(test_input_path)

#### Global average pooling

In [None]:
# Average every sample over axes 1 and 2 (kept as length-one axes).
X_tr = global_average_pooling(X_tr)
X_ts = global_average_pooling(X_ts)

In [None]:
# Flatten the pooled output to one feature vector per sample.
X_tr = X_tr.reshape((X_tr.shape[0], -1))
X_ts = X_ts.reshape((X_ts.shape[0], -1))

#### Normalization

In [None]:
# Standardize with scalar training-set statistics; the test split reuses them.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

### Normalize data

In [None]:
# NOTE(review): this repeats the normalization cell directly above. X_tr is
# already standardized at this point, so this second pass should be close to
# a no-op; it looks like a leftover duplicate cell.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Multinomial logistic regression with scikit-learn's default penalty and solver.
multi_class = 'multinomial'
model = LogisticRegression(multi_class=multi_class, max_iter=15000, random_state=0)

In [None]:
# Fit on this layer's features; `sample_weight` comes from the first experiment.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation: per-class report and balanced accuracy.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

In [None]:
# Same metrics on the training split, for comparison with the test scores.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

#### Train Random Forest classifier

In [None]:
# Random forest with 100 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [None]:
# Fit the forest on this layer's features with the class-based sample weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation of the random forest.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

In [None]:
# Training-split evaluation of the random forest.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

## Repeat for 3rd conv layer

In [None]:
# Feature files tagged `2th_conv`; presumably a zero-based index, i.e. the
# 3rd conv layer named in the section title - confirm.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_training_discriminator_features_2th_conv.npy")
test_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_test_discriminator_features_2th_conv.npy")

#### Load data

In [None]:
# Load the training-split features for this layer (overwrites X_tr).
X_tr = np.load(training_input_path)

In [None]:
# Load the test-split features for this layer (overwrites X_ts).
X_ts = np.load(test_input_path)

#### Global average pooling

In [None]:
# Average every sample over axes 1 and 2 (kept as length-one axes).
X_tr = global_average_pooling(X_tr)
X_ts = global_average_pooling(X_ts)

In [None]:
# Flatten the pooled output to one feature vector per sample.
X_tr = X_tr.reshape((X_tr.shape[0], -1))
X_ts = X_ts.reshape((X_ts.shape[0], -1))

#### Normalize data

In [None]:
# Standardize with scalar training-set statistics; the test split reuses them.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Multinomial logistic regression with an elastic-net penalty (l1_ratio=0.5)
# and the SAGA solver, the same configuration as the first experiment.
multi_class = 'multinomial'
model = LogisticRegression(multi_class=multi_class, max_iter=15000, random_state=0, penalty='elasticnet', solver='saga', l1_ratio=0.5)

In [None]:
# Fit on this layer's features; `sample_weight` comes from the first experiment.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation: per-class report and balanced accuracy.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

In [None]:
# Same metrics on the training split, for comparison with the test scores.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

#### Train Random Forest classifier

In [None]:
# Random forest with 100 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [None]:
# Fit the forest on this layer's features with the class-based sample weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation of the random forest.
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))

In [None]:
# Training-split evaluation of the random forest.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

## Repeat for 2nd conv layer

In [None]:
# Feature files tagged `1th_conv`; presumably a zero-based index, i.e. the
# 2nd conv layer named in the section title - confirm.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_training_discriminator_features_1th_conv.npy")
test_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_test_discriminator_features_1th_conv.npy")

#### Load data

In [None]:
# Load the training-split features for this layer (overwrites X_tr).
X_tr = np.load(training_input_path)

In [None]:
# Load the test-split features for this layer (overwrites X_ts).
X_ts = np.load(test_input_path)

#### Global average pooling

In [None]:
# Average every sample over axes 1 and 2 (kept as length-one axes).
X_tr = global_average_pooling(X_tr)
X_ts = global_average_pooling(X_ts)

In [None]:
# Flatten the pooled output to one feature vector per sample.
X_tr = X_tr.reshape((X_tr.shape[0], -1))
X_ts = X_ts.reshape((X_ts.shape[0], -1))

#### Normalize data

In [None]:
# Standardize with scalar training-set statistics; the test split reuses them.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Multinomial logistic regression with scikit-learn's default penalty and solver.
multi_class = 'multinomial'
model = LogisticRegression(multi_class=multi_class, max_iter=15000, random_state=0)

In [None]:
# Fit on this layer's features; `sample_weight` comes from the first experiment.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation: per-class report, balanced accuracy, and the
# confusion matrix (rows are true classes, columns are predicted classes).
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))
print(confusion_matrix(y_ts, y_preds))

In [None]:
# Same report and balanced accuracy on the training split.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

#### Train Random Forest classifier

In [None]:
# Random forest with 400 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest on this layer's features with the class-based sample weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation of the random forest, including the confusion matrix
# (rows are true classes, columns are predicted classes).
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))
print(confusion_matrix(y_ts, y_preds))

In [None]:
# Training-split evaluation of the random forest.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

## Repeat for 1st conv layer

In [None]:
# Feature files tagged `0th_conv`; presumably a zero-based index, i.e. the
# 1st conv layer named in the section title - confirm.
discr_features_dirname = "Discriminator_features"
training_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_training_discriminator_features_0th_conv.npy")
test_input_path = os.path.join(data_dir, discr_features_dirname, "SC09_test_discriminator_features_0th_conv.npy")

#### Load data

In [None]:
# Load the training-split features for this layer (overwrites X_tr).
X_tr = np.load(training_input_path)

In [None]:
# Load the test-split features for this layer (overwrites X_ts).
X_ts = np.load(test_input_path)

In [None]:
# No global average pooling in this section: every sample's full feature
# array is flattened directly.
# NOTE(review): the other per-layer sections pool first - confirm this is intended.
X_tr = X_tr.reshape((X_tr.shape[0], -1))
X_ts = X_ts.reshape((X_ts.shape[0], -1))

#### Normalize data

In [None]:
# Standardize with scalar training-set statistics; the test split reuses them.
mean = X_tr.mean()
std = X_tr.std()
X_tr = (X_tr - mean) / std 
X_ts = (X_ts - mean) / std

#### Train logistic regression classifier

In [None]:
# Multinomial logistic regression with scikit-learn's default penalty and solver.
multi_class = 'multinomial'
model = LogisticRegression(multi_class=multi_class, max_iter=15000, random_state=0)

In [None]:
# Fit on this layer's features; `sample_weight` comes from the first experiment.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation: per-class report, balanced accuracy, and the
# confusion matrix (rows are true classes, columns are predicted classes).
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))
print(confusion_matrix(y_ts, y_preds))

In [None]:
# Same report and balanced accuracy on the training split.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))

#### Train Random Forest classifier

In [None]:
# Random forest with 400 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=400, random_state=0)

In [None]:
# Fit the forest on this layer's features with the class-based sample weights.
model = model.fit(X_tr, y_tr, sample_weight=sample_weight)

In [None]:
# Test-split evaluation of the random forest, including the confusion matrix
# (rows are true classes, columns are predicted classes).
y_preds = model.predict(X_ts)
print(classification_report(y_ts, y_preds))
print(balanced_accuracy_score(y_ts, y_preds))
print(confusion_matrix(y_ts, y_preds))

In [None]:
# Training-split evaluation of the random forest.
y_preds = model.predict(X_tr)
print(classification_report(y_tr, y_preds))
print(balanced_accuracy_score(y_tr, y_preds))