# The purpose of this notebook is to evaluate different representations by using T-SNE for dimensionality reduction and visualization

We evaluate the following:
1. TiFGAN features
2. Post-hoc encoder features
3. BiTiFGAN features
4. TiFGAN discriminator features
5. MFCC features
6. FBANK features

## Import packages

In [1]:
import os
# Switch the working directory to the project's `src` folder before the
# remaining imports run; the project-local `feature_evaluation` import below
# is presumably resolved relative to it — confirm.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
os.chdir(os.path.join("/", "home", "c-matsty", "Bi-TiFGAN---TensorFlow-1.14", "src"))

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from sklearn.manifold import TSNE

from feature_evaluation.utils import load_data, load_data_labels

In [2]:
# Expose only the GPU with index 2 to CUDA-based libraries in this process.
# NOTE(review): nothing in the visible cells uses a GPU — confirm this is
# still required (and that it is set before any CUDA library initialises).
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

## Path to datasets with SpeechCommands representations

In [3]:
# Root folder of the SpeechCommands datasets; the train/test directories used
# below are built relative to it.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
data_dir = os.path.join("/media", "datastore", "c-matsty-data", "datasets", "SpeechCommands")

## 1. TiFGAN feature visualization with T-SNE

### Paths to the input data, labels and actors

In [4]:
# Training split: dataset root followed by its three sub-folders
# (inputs, labels, actors), derived in one pass.
train_dir = os.path.join(data_dir, "SpeechCommands_Preproc_2_training")
train_input_path, train_labels_path, train_actors_path = (
    os.path.join(train_dir, subfolder)
    for subfolder in ("input_data", "labels", "actors")
)

In [5]:
# Test split: dataset root followed by its three sub-folders
# (inputs, labels, actors), derived in one pass.
test_dir = os.path.join(data_dir, "SpeechCommands_Preproc_2_test")
test_input_path, test_labels_path, test_actors_path = (
    os.path.join(test_dir, subfolder)
    for subfolder in ("input_data", "labels", "actors")
)

### Load data

In [6]:
# Load the training and test inputs with the project helper `load_data`
# (it prints "Loading data" and shows a progress bar, see the output below).
# Presumably returns an array with one sample per entry along axis 0 — the
# later reshape relies on `.shape[0]`; confirm against the helper.
X_tr = load_data(train_input_path)
X_ts = load_data(test_input_path)

  1%|          | 1/165 [00:00<00:31,  5.13it/s]

Loading data


100%|██████████| 165/165 [00:32<00:00,  5.02it/s]
  5%|▌         | 1/20 [00:00<00:03,  5.20it/s]

Loading data


100%|██████████| 20/20 [00:03<00:00,  5.05it/s]


### Load labels

In [7]:
print("-Read label meta-data for training and test dataset samples.")
# Raw labels for both splits, read with the project helper.
y_tr = load_data_labels(train_labels_path)
y_ts = load_data_labels(test_labels_path)
# Map every distinct training label to its position in np.unique's sorted output.
label_dict = {value: index for index, value in enumerate(np.unique(y_tr))}
# Replace each raw label by its integer index via an element-wise dict lookup.
# NOTE(review): `dict.get` returns None for a label that never occurs in the
# training split, so an unseen test label would not get an integer code.
y_tr = np.vectorize(label_dict.get)(y_tr)
y_ts = np.vectorize(label_dict.get)(y_ts)

100%|██████████| 165/165 [00:00<00:00, 4766.85it/s]
100%|██████████| 20/20 [00:00<00:00, 4155.24it/s]

-Read label meta-data for training and test dataset samples.





### Load actors

In [71]:
print("-Read actor meta-data for training and test dataset samples.")
# Actor (speaker) identifiers are read with the same helper as the labels and
# are kept as returned (no integer encoding is applied here).
a_tr = load_data_labels(train_actors_path)
a_ts = load_data_labels(test_actors_path)

100%|██████████| 165/165 [00:00<00:00, 3910.28it/s]
100%|██████████| 20/20 [00:00<00:00, 3977.34it/s]

-Read actor meta-data for training and test dataset samples.





### Prepare data for T-SNE

In [9]:
# Collapse all trailing dimensions so every sample becomes one flat feature
# vector (rows = samples), which is the 2-D layout passed to TSNE below.
X_tr = np.reshape(X_tr, (len(X_tr), -1))
X_ts = np.reshape(X_ts, (len(X_ts), -1))

### Train T-SNE model and transform data to 2-D representation

In [None]:
# t-SNE is stochastic: pin `random_state` so the training embedding (and every
# plot derived from it) is identical after Restart & Run All.
# NOTE(review): perplexity=2100 is far above the 5-50 range suggested in the
# scikit-learn documentation and must stay below the number of samples —
# confirm this value is intentional.
X_tr_embedded = TSNE(n_components=2, perplexity=2100, n_jobs=20, random_state=0).fit_transform(X_tr)

In [None]:
# t-SNE is stochastic: pin `random_state` so the test embedding (and every
# plot derived from it) is identical after Restart & Run All.
# NOTE(review): perplexity=250 is well above the 5-50 range suggested in the
# scikit-learn documentation and must stay below the number of samples —
# confirm this value is intentional.
X_ts_embedded = TSNE(n_components=2, perplexity=250, n_jobs=20, random_state=0).fit_transform(X_ts)

### TiFGAN features embedded on a 2-D space by T-SNE - Colored by keyword

In [None]:
def scatterplot_colored_labels(X_embedded, labels, title, legend=True):
    """Scatter-plot a 2-D embedding, drawing each distinct label in its own colour.

    Parameters
    ----------
    X_embedded : array-like, shape (n_samples, >=2)
        Embedded points; column 0 is plotted on the x-axis and column 1 on
        the y-axis.
    labels : array-like, shape (n_samples,)
        Label of every row of ``X_embedded``; rows are grouped by equality
        with each distinct label value.
    title : str
        Figure title.
    legend : bool, default True
        Whether to draw a legend with one "Class <label>" entry per label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    Colours are drawn from matplotlib's named colours (``pltc.cnames``) with a
    fixed seed, so a given number of distinct labels always yields the same
    colour assignment. The draw uses a private random generator, so the global
    NumPy random state is left untouched.
    """
    # Accept plain Python sequences as well: the boolean mask below only works
    # element-wise on NumPy arrays.
    X_embedded = np.asarray(X_embedded)
    labels = np.asarray(labels)

    # Set plot size (global default, applies to the figure created below)
    plt.rcParams['figure.figsize'] = (10, 10)

    # Get unique labels
    unique_labels = np.unique(labels)

    # Candidate colours: every named matplotlib colour
    all_colours = list(pltc.cnames)
    # Sample without replacement whenever there are enough colours
    replace = len(unique_labels) > len(all_colours)
    # A local generator with the historical seed reproduces the same colour
    # choice as before without reseeding np.random for the rest of the notebook.
    rng = np.random.RandomState(30000)
    selected_colours = rng.choice(all_colours, replace=replace, size=len(unique_labels))

    # One scatter call per label so that each gets its own legend entry
    for colour, unique_label in zip(selected_colours, unique_labels):
        X_with_label = X_embedded[labels == unique_label]
        plt.scatter(X_with_label[:, 0], X_with_label[:, 1], label="Class {}".format(str(unique_label)),
                    c=colour, edgecolors='black')

    if legend:
        plt.legend()

    # Add the title first so that tight_layout() reserves room for it.
    plt.title(title, size=16)
    plt.tight_layout()
    plt.show()

In [None]:
# Training embedding, one colour per encoded keyword class.
scatterplot_colored_labels(
    X_embedded=X_tr_embedded,
    labels=y_tr,
    title="Training TiFGAN features embedded on a 2-D space - coloured by class",
)

In [None]:
# Test embedding, one colour per encoded keyword class.
scatterplot_colored_labels(
    X_embedded=X_ts_embedded,
    labels=y_ts,
    title="Test TiFGAN features embedded on a 2-D space - Colored by class",
)

### TiFGAN features embedded on a 2-D space by T-SNE - Colored by speaker

In [None]:
# Training embedding coloured by speaker. The title previously said "Test"
# although this cell plots the training embedding and training actors.
# The legend is disabled for this speaker plot.
scatterplot_colored_labels(X_tr_embedded, a_tr, "Training TiFGAN features embedded on a 2-D space - Colored by speaker", legend=False)

In [None]:
# Test embedding, one colour per speaker; legend disabled.
scatterplot_colored_labels(
    X_embedded=X_ts_embedded,
    labels=a_ts,
    title="Test TiFGAN features embedded on a 2-D space - Colored by speaker",
    legend=False,
)