In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml

from sklearn.decomposition import PCA
from sklearn.utils import resample
from tensorflow.keras.datasets.fashion_mnist import load_data
from umap import UMAP

#Import all the algorithms
from umap import UMAP
from openTSNE import TSNE as OpenTSNE
from tqdm import tqdm

import time
%matplotlib inline

In [None]:
def data_size_scaling(algorithm, algorithm_name, data, sizes=[100, 1000, 2500, 10000, 25000, 70000, 100000], n_runs=5):
    """Measure how the fitting time of `algorithm` grows with dataset size.

    For every entry of `sizes`, `n_runs` subsamples of that size are drawn
    from `data` with `resample` (called with its default settings) and a
    single fit is timed on each of them. Only the fit call is timed; drawing
    the subsample is excluded from the measurement.

    Parameters
    ----------
    algorithm : object
        Estimator to benchmark. `fit_transform` is called when
        `algorithm_name` contains 'UMAP', otherwise `fit` is called.
    algorithm_name : str
        Label of the algorithm; only used to choose between the two calls.
    data : array-like
        Dataset the subsamples are drawn from.
    sizes : sequence of int
        Subsample sizes to benchmark. The sequence is only read, never
        modified, so the shared list default is safe.
    n_runs : int
        Number of timed repetitions per size.

    Returns
    -------
    pandas.DataFrame
        One row per timed run with the columns 'dataset size' and
        'runtime (s)'.
    """
    use_fit_transform = 'UMAP' in algorithm_name
    result = []
    for size in tqdm(sizes):
        for _ in range(n_runs):
            subsample = resample(data, n_samples=size)
            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments the way time.time() can.
            start_time = time.perf_counter()
            if use_fit_transform:
                algorithm.fit_transform(subsample)
            else:
                algorithm.fit(subsample)
            elapsed_time = time.perf_counter() - start_time
            # Release the subsample before the next one is allocated.
            del subsample
            result.append((size, elapsed_time))
    return pd.DataFrame(result, columns=('dataset size', 'runtime (s)'))

In [None]:
# Download both benchmark datasets from OpenML and keep only their feature
# matrices. Both versions are pinned so that a re-run fetches the same data.
# Despite its name, `digits` holds the 'mnist_784' dataset.
digits = fetch_openml('mnist_784', version=1)
X_d = digits.data
fashion = fetch_openml('Fashion-MNIST', version=1)
X_f = fashion.data

In [None]:
# Project each mean-centred dataset onto its principal axes with a thin SVD
# (scores = U * singular values), keeping at most the first 784 components.
#
# np.asarray(...) guarantees that plain float ndarrays reach np.linalg.svd,
# whatever container type X_d / X_f arrived in.
x_mean_rem_d = np.asarray(X_d, dtype=float)
x_mean_rem_d = x_mean_rem_d - x_mean_rem_d.mean(axis=0)
U, lambd, V = np.linalg.svd(x_mean_rem_d, full_matrices=False)
# Broadcasting scales column j of U by lambd[j]; this equals
# np.dot(U, np.diag(lambd)) without materialising the dense diagonal matrix.
X784_d = (U * lambd)[:, :784]

x_mean_rem_f = np.asarray(X_f, dtype=float)
x_mean_rem_f = x_mean_rem_f - x_mean_rem_f.mean(axis=0)
U, lambd, V = np.linalg.svd(x_mean_rem_f, full_matrices=False)
X784_f = (U * lambd)[:, :784]

In [None]:
# Estimators to benchmark. The order is significant: the benchmark loop
# assigns its labels to these entries by position.
methods = [
    # UMAP, random initialisation
    UMAP(random_state=42, init='random'),
    # UMAP, library-default initialisation
    UMAP(random_state=42),
    # openTSNE, FFT negative gradient, random initialisation
    OpenTSNE(
        negative_gradient_method='fft',
        initialization='random',
        random_state=42,
        n_jobs=-1,
    ),
    # openTSNE, FFT negative gradient, library-default initialisation
    OpenTSNE(
        negative_gradient_method='fft',
        random_state=42,
        n_jobs=-1,
    ),
    # openTSNE, 'bh' negative gradient, random initialisation
    OpenTSNE(
        negative_gradient_method='bh',
        initialization='random',
        random_state=42,
        n_jobs=-1,
    ),
    # openTSNE, 'bh' negative gradient, library-default initialisation
    OpenTSNE(
        negative_gradient_method='bh',
        random_state=42,
        n_jobs=-1,
    ),
]

In [None]:
# Run the size-scaling benchmark for every configured estimator on both
# projected datasets and collect one timing DataFrame per label.

# Labels, listed in the same order as the entries of `methods`.
method_names = [
    'UMAP_random_init',
    'UMAP_le_init',
    'OpenTSNE_random_init_fft',
    'OpenTSNE_pca_init_fft',
    'OpenTSNE_random_init_bh',
    'OpenTSNE_pca_init_bh',
]
assert len(method_names) == len(methods), "every entry of `methods` needs exactly one label"

performance_data_d = {}
performance_data_f = {}

# Iterate over `methods` (the list defined in this notebook); the previously
# used name `methods_n` is not defined anywhere and fails on a fresh kernel.
# The index `i` stays bound in the notebook namespace in case other cells
# read it.
for i, (alg_name, algo) in enumerate(zip(method_names, methods)):
    performance_data_d[alg_name] = data_size_scaling(algo, alg_name, X784_d)
    performance_data_f[alg_name] = data_size_scaling(algo, alg_name, X784_f)

    print(f"[{time.asctime(time.localtime())}] Completed {alg_name}")

In [None]:
# Runtime versus dataset size for the MNIST ('mnist_784') benchmark: one
# scatter plus second-order polynomial fit (order=2) per algorithm.
for alg_name, perf_data in performance_data_d.items():
    # x, y and data are passed by keyword: newer seaborn releases no longer
    # accept them positionally, while keywords work in older releases too.
    sns.regplot(x='dataset size', y='runtime (s)', data=perf_data, order=2, label=alg_name)
plt.title('Runtime scaling on MNIST')
plt.legend()
plt.xlim(0, 110000)

In [None]:
# Runtime versus dataset size for the Fashion-MNIST benchmark: one scatter
# plus second-order polynomial fit (order=2) per algorithm.
for alg_name, perf_data in performance_data_f.items():
    # x, y and data are passed by keyword: newer seaborn releases no longer
    # accept them positionally, while keywords work in older releases too.
    sns.regplot(x='dataset size', y='runtime (s)', data=perf_data, order=2, label=alg_name)
plt.title('Runtime scaling on Fashion-MNIST')
plt.legend()
plt.xlim(0, 110000)