# Comparison of UMAP and PCA

In the following, neither PCA nor UMAP seems to help KNN - why not?

In [None]:
from numpy.random import normal
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
from scipy.special import gamma
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import umap



# Experiment configuration.
N = 10000     # number of samples drawn for each dimensionality
N_broad = 2   # number of leading feature columns left at unit variance
scale = 20    # additional factor by which the remaining columns are shrunk

# Per-dimensionality results: the dimensionality itself and the F1 scores
# obtained on raw, PCA-reduced and UMAP-reduced features.
dims, scores, pca_scores, umap_scores = [], [], [], []


def avg_distance(k):
    """Return the mean of the chi distribution with ``k`` degrees of freedom.

    This is the expected Euclidean norm of a ``k``-dimensional standard
    normal vector: ``sqrt(2) * Gamma((k + 1) / 2) / Gamma(k / 2)``.

    The gamma ratio is evaluated in log space. Calling ``gamma`` directly
    overflows to ``inf`` once ``k`` reaches a few hundred, which turns the
    ratio into ``nan``; the log-space form stays finite.

    Args:
        k: Degrees of freedom (number of dimensions). Expected to be a
            positive scalar.

    Returns:
        The expected norm as a float.
    """
    # Local imports keep this block self-contained; the file-level imports
    # only provide ``sqrt`` and ``gamma``.
    from math import exp
    from scipy.special import gammaln

    return sqrt(2) * exp(gammaln((k + 1) / 2) - gammaln(k / 2))

def _knn_f1(train_features, train_labels, test_features, test_labels):
    """Fit a default ``KNeighborsClassifier`` and score it on held-out data.

    Args:
        train_features: Feature matrix used to fit the classifier.
        train_labels: Labels matching ``train_features``.
        test_features: Feature matrix the fitted classifier predicts on.
        test_labels: True labels matching ``test_features``.

    Returns:
        The F1 score of the predictions against ``test_labels``.
    """
    knn = KNeighborsClassifier()
    knn.fit(train_features, train_labels)
    return f1_score(test_labels, knn.predict(test_features))


# For each total dimensionality, build a synthetic dataset and compare kNN on
# the raw features, on a PCA projection and on a 2-D UMAP embedding.
for dim in range(N_broad + 1, 30):
    n_narrow = dim - N_broad

    # The first N_broad columns are standard normal. The remaining columns are
    # standard normal divided by avg_distance(n_narrow) and by `scale`, so
    # they have a much smaller spread. The broad columns are drawn first to
    # keep the random-number order of the original code.
    broad = normal(size=(N, N_broad))
    narrow = normal(size=(N, n_narrow)) / avg_distance(n_narrow) / scale
    X = np.hstack([broad, narrow])

    # The label is the sign of the first *narrow* column.
    y = (X[:, N_broad] > 0).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Baseline: kNN on the unreduced features.
    dims.append(dim)
    scores.append(_knn_f1(X_train, y_train, X_test, y_test))

    # PCA: a fractional n_components with the full SVD solver keeps as many
    # components as needed to explain that share of the variance.
    pca = PCA(n_components=.99, svd_solver="full")
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    pca_scores.append(_knn_f1(X_train_pca, y_train, X_test_pca, y_test))

    # UMAP: fit on the training split only. fit_transform returns the fitted
    # training embedding directly, so the training data is not projected a
    # second time; the test split is then mapped into the same embedding.
    umap_model = umap.UMAP(init='random',
                           n_components=2,
                           n_neighbors=100,
                           min_dist=0.1,
                           spread=2,
                           metric='euclidean')
    X_train_umap = umap_model.fit_transform(X_train)
    X_test_umap = umap_model.transform(X_test)
    umap_scores.append(_knn_f1(X_train_umap, y_train, X_test_umap, y_test))

# Draw one score-versus-dimension curve per feature representation on the
# current axes, then label and display the figure.
ax = plt.gca()
for curve, caption in (
    (scores, "Non-reduced scores"),
    (pca_scores, "PCA-reduced scores"),
    (umap_scores, "UMAP-reduced scores"),
):
    ax.plot(dims, curve, label=caption)
ax.set_ylim([0.5, 1])
ax.set_xlabel("Dimension")
ax.set_ylabel("Score")
ax.set_title("kNN on {} samples".format(N))
ax.legend()
plt.show()