In [1]:
import os,glob
import numpy as np
import pandas as pd
from itertools import combinations
from collections import Counter

from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import scipy.stats
import scipy.io
import scipy.sparse
from scipy.stats import ortho_group
from scipy.spatial.distance import cdist

from mlxtend.data import loadlocal_mnist
from tensorflow import keras

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as pathces
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import seaborn as sns

import helper_functions as utils

In [2]:
def get_mnist(l_vec,data='Train'):
    """Load one split of MNIST through keras and flatten each image to a row.

    Parameters
    ----------
    l_vec : list
        Kept for call-site compatibility; this function does not use it.
    data : str
        Which split to return: 'Train' or 'Test' (case-sensitive).

    Returns
    -------
    X : ndarray
        One row per image, with all remaining axes flattened into columns.
    y : ndarray
        The label array returned by keras for the chosen split.

    Raises
    ------
    ValueError
        If `data` is neither 'Train' nor 'Test'.
    """
    # Validate first so that a typo fails immediately with a clear message
    # instead of surfacing as an unbound-variable error at the return.
    if data not in ('Train','Test'):
        raise ValueError(
            "data must be 'Train' or 'Test', got {!r}".format(data))

    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
    if data=='Train':
        images, y = x_train, y_train
    else:
        images, y = x_test, y_test

    # Use the actual number of images rather than a hard-coded split size.
    X = images.reshape(len(images),-1)
    return X,y

def get_vecs(betas,n_clusters=4):
    """Turn a k-means clustering of the rows of `betas` into indicator columns.

    KMeans is fitted with `n_clusters + 1` clusters (random_state=0). The
    cluster holding the most rows is discarded; each remaining cluster, taken
    in ascending label order, gets one column whose entries are 1 for the rows
    assigned to that cluster and 0 elsewhere.

    Parameters
    ----------
    betas : array-like
        Passed unchanged to `KMeans.fit`; its rows are the items clustered.
    n_clusters : int
        Number of indicator columns in the result.

    Returns
    -------
    ndarray of shape (number of rows in betas, n_clusters)
    """
    assignments = KMeans(n_clusters=n_clusters+1,random_state=0).fit(betas).labels_

    # The most populated cluster is the one that gets dropped.
    dominant = Counter(assignments).most_common(1)[0][0]
    kept = [lab for lab in np.unique(assignments) if lab!=dominant]

    indicators = np.zeros((len(assignments),n_clusters))
    for column, lab in enumerate(kept):
        indicators[assignments==lab,column] = 1
    return indicators

def get_projected_data_n(phi,n_clusters=3):
    """Project the train and test matrices onto cluster-indicator vectors.

    For every element of `phi`, `get_vecs` builds an indicator matrix; the
    notebook-level arrays `X` and `X_test` are each multiplied by it, and the
    per-element products are concatenated column-wise.

    NOTE: `X` and `X_test` are read from the enclosing notebook namespace, so
    they must be defined before this function is called.

    Parameters
    ----------
    phi : iterable
        Each element is handed to `get_vecs` as its `betas` argument.
    n_clusters : int
        Forwarded to `get_vecs`.

    Returns
    -------
    (data, data_test) : tuple of ndarrays
        Column-wise stacks of the projections of `X` and `X_test`.
    """
    train_blocks, test_blocks = [], []
    for phi_i in phi:
        membership = get_vecs(phi_i,n_clusters=n_clusters)
        train_blocks.append(np.dot(X,membership))
        test_blocks.append(np.dot(X_test,membership))

    return np.hstack(train_blocks), np.hstack(test_blocks)

def get_clf_accuracy(data,data_test,y,y_test):
    """Fit a logistic-regression classifier and report train/test accuracy.

    The features are standardised with a StandardScaler fitted on the training
    features only; the same scaler is then applied to the test features.

    Parameters
    ----------
    data, data_test : array-like
        Training and test feature matrices.
    y, y_test : array-like
        Labels matching `data` and `data_test`.

    Returns
    -------
    (train_acc, test_acc) : tuple
        `accuracy_score` of the fitted model on the training and test sets.
    """
    standardizer = StandardScaler()
    train_features = standardizer.fit_transform(data)
    test_features = standardizer.transform(data_test)

    model = LogisticRegression(random_state=0,max_iter=1000)
    model.fit(train_features, y)

    train_acc = accuracy_score(y,model.predict(train_features))
    test_acc = accuracy_score(y_test,model.predict(test_features))
    return train_acc,test_acc


In [3]:
# --- Experiment configuration -------------------------------------------
# k and d are forwarded to utils.compute_diff_vectors_n below; their exact
# meaning is defined in helper_functions (not visible here).
k = 10 # 10=np.ceil(np.log2(784)).astype(int)
d = 20
# number of clusters in kmeans clustering step in downstream analysis
n_clusters = 10
# number of differential vectors chosen for each digit
num_diff = 10
# scale of the Gaussian noise added to the train and test features
sigma = 0.0002

# Seed NumPy's global generator so the noise drawn below (and therefore the
# reported accuracies) is the same on every Restart & Run All. Seed 0 matches
# the random_state=0 used by the estimators in this notebook.
np.random.seed(0)

# digit classes; one group of training rows is formed per entry
l_vec = list(range(10))
# accumulates {"Train Acc", "Test Acc"} per method for the summary table
Results = {}

# --- Data ------------------------------------------------------------------
X,y = get_mnist(l_vec)
y_org = np.copy(y)
X = X+sigma*np.random.normal(0,1,X.shape)

X_test,y_test = get_mnist(l_vec,data='Test')
X_test = X_test+sigma*np.random.normal(0,1,X_test.shape)

# --- Differential vectors: one training sub-matrix per digit ---------------
results = utils.compute_diff_vectors_n([X[y_org==i,:] for i in l_vec],k,d)

# DiSC: for each digit take the first num_diff rows of its "V_diff" entry
# (transposed), cluster them, and classify on the projected data.
beta_ = [results["V_diff"][i][:num_diff,:].T for i in range(len(l_vec))]
data,data_test = get_projected_data_n(beta_,n_clusters=n_clusters)

train_acc,test_acc = get_clf_accuracy(data,data_test,y,y_test)
print("DiSC - Train Acc: {:.4f}, Test Acc: {}".format(train_acc,test_acc))
Results["DiSC (Ours)"] = {"Train Acc":train_acc, "Test Acc":test_acc}

# Diffusion Maps: identical pipeline, using the "U_diff" entries instead.
beta_ = [results["U_diff"][i][:num_diff,:].T for i in range(len(l_vec))]
data,data_test = get_projected_data_n(beta_,n_clusters=n_clusters)

train_acc,test_acc = get_clf_accuracy(data,data_test,y,y_test)
print("Diffusion Maps - Train Acc: {}, Test Acc: {}".format(train_acc,test_acc))
# Plain key: the former .format(...) call had no placeholder to fill, so it
# never changed the string.
Results["DiffusionMaps"] = {"Train Acc":train_acc, "Test Acc":test_acc}

# Baseline "Entire Data": cluster the columns of X directly (rows of X.T,
# scaled by 1/255) and project onto the resulting indicator vectors.
beta_1 = get_vecs(X.T/255,n_clusters=n_clusters)
data = np.dot(X,beta_1)
data_test = np.dot(X_test,beta_1)
train_acc,test_acc = get_clf_accuracy(data,data_test,y,y_test)
print("Entire Data - Train Acc: {}, Test Acc: {}".format(train_acc,test_acc))
Results["Entire Data"] = {"Train Acc":train_acc, "Test Acc":test_acc}

# Baseline "ElasticNet": fit a regression whose target is -3 for label 4 and
# 3 for every other label, then cluster the fitted coefficients (one value
# per feature, reshaped to a column).
# NOTE(review): why digit 4 and the values -3/3 were chosen is not visible
# here -- confirm this target is intended for a 10-class comparison.
regr = ElasticNet(random_state=0,l1_ratio=0.01,alpha=0.01)
y_EN = np.where(y==4,-3,3)
regr.fit(X/255, y_EN)
beta_1 = get_vecs(regr.coef_[:,None],n_clusters=n_clusters)
data = np.dot(X,beta_1)
data_test = np.dot(X_test,beta_1)
train_acc,test_acc = get_clf_accuracy(data,data_test,y,y_test)
print("ElasticNet - Train Acc: {}, Test Acc: {}".format(train_acc,test_acc))
Results["ElasticNet"] = {"Train Acc":train_acc, "Test Acc":test_acc}

# Baseline "EN_logistic": SGD-trained classifier with an elastic-net penalty;
# the transposed coefficient matrix is what gets clustered.
# NOTE(review): scikit-learn deprecated loss='log' in 1.1 in favour of
# loss='log_loss' and removed it in 1.3 -- update the argument if this
# notebook is run on a newer scikit-learn.
clf_EN_logistic = SGDClassifier(penalty='elasticnet',loss='log',
                                l1_ratio=0.02,max_iter=500,alpha=0.3,random_state=0)
clf_EN_logistic.fit(X/255,y)
beta_1 = get_vecs(clf_EN_logistic.coef_.T,n_clusters=n_clusters)
data = np.dot(X,beta_1)
data_test = np.dot(X_test,beta_1)
train_acc,test_acc = get_clf_accuracy(data,data_test,y,y_test)
print("EN_logistic - Train Acc: {}, Test Acc: {}".format(train_acc,test_acc))
Results["EN_logistic"] = {"Train Acc":train_acc, "Test Acc":test_acc}

# Summary table: one column per method, rows "Train Acc" / "Test Acc".
df = pd.DataFrame.from_dict(Results)
display(df)

DiSC - Train Acc: 0.9004, Test Acc: 0.8975
Diffusion Maps - Train Acc: 0.8692833333333333, Test Acc: 0.8742
Entire Data - Train Acc: 0.7510333333333333, Test Acc: 0.7623
ElasticNet - Train Acc: 0.6956166666666667, Test Acc: 0.6996
EN_logistic - Train Acc: 0.8183333333333334, Test Acc: 0.8242


Unnamed: 0,DiSC (Ours),DiffusionMaps,Entire Data,ElasticNet,EN_logistic
Train Acc,0.90045,0.869283,0.751033,0.695617,0.818333
Test Acc,0.8975,0.8742,0.7623,0.6996,0.8242
