In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import time
import cv2 as cv2
import _pickle as pickle

import plotly.express as ex
import plotly.graph_objects as go

import scipy.linalg as la
from scipy.fftpack import dct, idct
import scipy.stats

from gensim import corpora, models
from nltk.corpus import stopwords

In [17]:
def get_pca(M, axis=0):
    cov = M.T @ M
    if axis==1:
        cov = M @ M.T

    cov = np.nan_to_num(cov, 0)
    cov = cov / (M.shape[1]-1)
    s, u = la.eigh(cov)
    u /= np.linalg.norm(u, axis=axis)

    return lambda k: M @ u[:, -k:] if axis==0 else M @ u[-k:, :].T

def get_svd(M, axis=0):
    M = np.nan_to_num(M, 0)
    u, s, v = la.svd(M.T)
    u /= np.linalg.norm(u, axis=axis)

    return lambda k: (u[:, :k].T @ M.T).T  if axis==0 else M @ u[:k, :].T
def get_dct(M, axis=0):
    D = dct(M, axis=axis)
    sum_features = np.sum(D,axis=axis)
    idx = np.argsort(sum_features)
    return lambda k: idct(D[:, idx[:k]],axis=axis) if axis==0 else idct(D[idx[:k], :],axis=axis)

def rp(M, axis=0):
    R = np.random.normal(0, 1, (M.shape[1],M.shape[1]))
    if axis==1:
        R = np.random.normal(0, 1, (M.shape[0],M.shape[0]))
    
    R /= np.linalg.norm(R, axis=axis)
    return lambda k: M @ R[:, :k] if axis==0 else R[:k, :] @ M

def srp(M, axis=0):
    R = np.random.uniform(0,1, (M.shape[1], M.shape[1]))
    if axis==1:
        R = np.random.uniform(0,1, (M.shape[0], M.shape[0]))
    R2 = np.copy(R)
    R2[R < 1/6.0] = 1
    R2[R > 5/6.0] = -1
    R2[np.abs(R2) != 1] = 0
    R2*= np.sqrt(3)
    return lambda k: M @ R2[:, :k] if axis==0 else R2[:k, :] @ M


In [7]:
def rand_measure(x, frac=1, num=100, fn="l2-norm"):
    """
    Randomly choose *num* data points and compute the average distance between them.
    """
    arr = np.random.permutation(x)
    if arr.shape[0] > num:
        arr = arr[:num, :]
        n = int(num / 2)
    else:
        n = int(arr.shape[0] / 2)
    if fn == "l2-norm":
        return np.sqrt(frac) * np.mean(np.linalg.norm(arr[:n, :] - arr[n:,:], axis=0))
    else:
        return np.mean(np.sum(arr*arr,axis=1))

def measure(x, frac=1, fn="l2-norm"):
    """
    Compute the average distance between all data points
    """
    if fn == "l2-norm":
        d = np.diff(x, axis=0)
        return mean_confidence_interval(np.sqrt(frac) * np.linalg.norm(d, axis=1))
    else:
        return mean_confidence_interval([np.dot(x[i],x[i+1]) for i in range(0, len(x), 2)])

def norm(x, axis=1):
    """
    Normalise the data matrix, such that every row is zero mean and unit variance
    """
    if axis!=1:
        return (x-np.mean(x)) / np.std(x)
    else:
        return (x-np.mean(x, axis=axis, keepdims=True)) / np.std(x, axis=axis, keepdims=True)
def l2_norm(x,axis=1):
    """
    Normalise the data matrix, such that every row has unit length
    """
    n = np.linalg.norm(x, axis=axis, keepdims=True)
    n[n[:,0]==0] = 1
    return x / n

def timer(f):
    """
    Measure execution time in second
    """
    if f == 0:
        return time.time()
    else:
        return time.time() - f

        


def mean_confidence_interval(data, confidence=0.95):
    """
    Compute the confidence interval
    """
    a = data
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h

def sp_noise(image, prob=0.1):
    """
    Apply salt-and-pepper noise on the image
    """
    samp = np.random.uniform(0,1,(image.shape))
    image[samp < prob] = 0
    image[samp >= 1-prob] = 255
    return image

def normalize(query, lowercase=True, removeSpecialChar=True, removeStop=True):
    """
    Normalise the documents
    """
    query = query.split(" ")  
    
    sw = stopwords.words("english")
    tokens = []
    for t in query:
        if (removeStop and t in sw):
            continue
        elif  (removeSpecialChar and not t.isalpha()):
            continue
        
        if (lowercase):
            t = t.lower()
        tokens.append(t)

    return tokens



In [4]:
def unpickle(file):
    with open(file, 'rb') as fo:
        d = pickle.load(fo, encoding='bytes')
    return d

def load(num=1):
    data_p = []
    for dname, dirs, files in os.walk("../data/cifar-10-python"):
        for fname in files:
            if "data_batch" in fname:
                fpath = os.path.join(dname, fname)
                data_p.append(fpath)
    return [unpickle(p) for p in data_p]

def load_news(num=1):
    data_p = []
    for dname, dirs, files in os.walk("../data/20_newsgroups/"):
        for fname in files:
            fpath = os.path.join(dname, fname)
            data_p.append(fpath)
            if len(data_p) >= num:
                return data_p

def load_jpeg(num=1):
    data_p = []
    for dname, dirs, files in os.walk("../data/test_v2/test/"):
        for fname in files:
            fpath = os.path.join(dname, fname)
            data_p.append(fpath)
            if len(data_p) >= num:
                return data_p
    return data_p


In [48]:
def cifar(N, data, ks, o_diff=0):
    n = data.shape[1]

    if o_diff == 0:
        o_diff, _, _ = measure(data)
    p_diff, d_diff, r_diff, sr_diff = [],[],[],[]
    p_min, d_min, r_min, sr_min = [],[],[],[]
    p_max, d_max, r_max, sr_max = [],[],[],[]
    p_time, d_time, r_time, sr_time = [],[],[],[]

    t = 0

    pca = get_pca(data)
    for k in ks:
        frac = n / k

        t = timer(0)
        p=pca(k)
        t = timer(t)
        m, mmin, mmax = measure(norm(p), frac=frac)
        p_diff.append(m)
        p_min.append(mmin)
        p_max.append(mmax)
        p_time.append(t)

        t = timer(0)
        p = get_dct(data)(k)
        t = timer(t)    
        m, mmin, mmax = measure(norm(p), frac=frac)
        d_diff.append(m)
        d_min.append(mmin)
        d_max.append(mmax)
        d_time.append(t)

        t = timer(0)
        p = rp2(data,k)
        t = timer(t)    
        m, mmin, mmax = measure(norm(p), frac=frac)
        r_diff.append(m)
        r_min.append(mmin)
        r_max.append(mmax)
        r_time.append(t)

        t = timer(0)
        p = srp2(data,k)
        t = timer(t)    
        m, mmin, mmax = measure(norm(p), frac=frac)
        sr_diff.append(m)
        sr_min.append(mmin)
        sr_max.append(mmax)
        sr_time.append(t)

    p_diff = np.asarray(p_diff) - o_diff
    d_diff = np.asarray(d_diff) - o_diff
    r_diff = np.asarray(r_diff) - o_diff
    sr_diff = np.asarray(sr_diff) - o_diff

    p_min  = np.asarray(p_min) - o_diff
    d_min  = np.asarray(d_min) - o_diff
    r_min  = np.asarray(r_min) - o_diff
    sr_min = np.asarray(sr_min) - o_diff

    p_max   = np.asarray(p_max) - o_diff
    d_max   = np.asarray(d_max) - o_diff
    r_max   = np.asarray(r_max) - o_diff
    sr_max = np.asarray(sr_max) - o_diff

    return dict(pca=[p_diff, p_min, p_max, p_time], dct=[d_diff, d_min, d_max, d_time],rp=[r_diff, r_min, r_max, r_time],srp=[sr_diff, sr_min, sr_max, sr_time])

    
def txt(N, data, ks):
    n = data.shape[1]

    o_diff, _, _ = measure(data, fn="inner")
    # o_diff=0
    p_diff, d_diff, r_diff, sr_diff = [],[],[],[]
    p_min, d_min, r_min, sr_min = [],[],[],[]
    p_max, d_max, r_max, sr_max = [],[],[],[]
    p_time, d_time, r_time, sr_time = [],[],[],[]

    t = 0
    # pca = get_pca(data)
    svd = get_svd(data)
    for k in ks:
        frac = n / k

        t = timer(0)
        # p = pca(k)
        p = svd(k)
        t = timer(t)
        m, mmin, mmax = measure(l2_norm(p), fn="inner")
        p_diff.append(m)
        p_min.append(mmin)
        p_max.append(mmax)
        p_time.append(t)

        t = timer(0)
        p = rp2(data,k)
        t = timer(t)    
        m, mmin, mmax = measure(l2_norm(p), fn="inner")
        r_diff.append(m)
        r_min.append(mmin)
        r_max.append(mmax)
        r_time.append(t)

        t = timer(0)
        p = srp2(data,k)
        t = timer(t)    
        m, mmin, mmax = measure(l2_norm(p), fn="inner")
        sr_diff.append(m)
        sr_min.append(mmin)
        sr_max.append(mmax)
        sr_time.append(t)

    p_diff = np.asarray(p_diff) - o_diff
    r_diff = np.asarray(r_diff) - o_diff
    sr_diff = np.asarray(sr_diff) - o_diff

    p_min  = np.asarray(p_min) - o_diff
    r_min  = np.asarray(r_min) - o_diff
    sr_min = np.asarray(sr_min) - o_diff

    p_max   = np.asarray(p_max) - o_diff
    r_max   = np.asarray(r_max) - o_diff
    sr_max = np.asarray(sr_max) - o_diff

    return dict(pca=[p_diff, p_min, p_max, p_time],rp=[r_diff, r_min, r_max, r_time],srp=[sr_diff, sr_min, sr_max, sr_time])

    






# Test on CIFAR data 

In [8]:
# Noiseless, grayscale
N = 1000
d = load()
data = d[0][b"data"][:N].astype(np.float32).reshape((N, 32,32,3))
data = np.asarray([cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) for img in data])
data = data.reshape((1000,-1))
data = norm(data)

np.random.seed(2)
dict1 = cifar(N, data, np.arange(2,325))

np.random.seed(20)
dict2 = cifar(N, data, np.arange(2,325))

np.random.seed(202)
dict3 = cifar(N, data, np.arange(2,325))

np.random.seed(2020)
dict4 = cifar(N, data, np.arange(2,325))

np.random.seed(20200)
dict5 = cifar(N, data, np.arange(2,325))



In [None]:
# Noiseless, colour scale
N = 1000
d = load()
data = d[0][b"data"][:N].astype(np.float32)
data = data.reshape((1000,-1))
data = norm(data)

np.random.seed(2)
dict1 = cifar(N, data, np.arange(2,325))

np.random.seed(20)
dict2 = cifar(N, data, np.arange(2,325))

np.random.seed(202)
dict3 = cifar(N, data, np.arange(2,325))

np.random.seed(2020)
dict4 = cifar(N, data, np.arange(2,325))

np.random.seed(20200)
dict5 = cifar(N, data, np.arange(2,325))



In [18]:
# Noisy
N = 1000
d = load()
data = d[0][b"data"][:N].astype(np.float32).reshape((N, 32,32,3))
noisy_data = np.asarray([sp_noise(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)) for img in data])
data = np.asarray([cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) for img in data])

data = np.reshape(data, (N, -1))
noisy_data = np.reshape(noisy_data, (N, -1))
data = norm(data)
noisy_data = norm(noisy_data)
o_diff, _, _  = measure(data)

np.random.seed(2020)
dict1 = cifar(N, data, np.arange(2,325),o_diff)

np.random.seed(20)
dict2 = cifar(N, data, np.arange(2,325),o_diff)

np.random.seed(202)
dict3 = cifar(N, data, np.arange(2,325),o_diff)

np.random.seed(2020)
dict4 = cifar(N, data, np.arange(2,325),o_diff)

np.random.seed(20200)
dict5 = cifar(N, data, np.arange(2,325),o_diff)



In [None]:
# Average over the 5 repetitions
dict1["pca"][0] = np.mean(np.asarray([dict1["pca"][0],dict2["pca"][0],dict3["pca"][0],dict4["pca"][0],dict5["pca"][0]]), axis=0)
dict1["dct"][0] = np.mean(np.asarray([dict1["dct"][0],dict2["dct"][0],dict3["dct"][0],dict4["dct"][0],dict5["dct"][0]]), axis=0)
dict1["rp"][0] = np.mean(np.asarray([dict1["rp"][0],dict2["rp"][0],dict3["rp"][0],dict4["rp"][0],dict5["rp"][0]]), axis=0)
dict1["srp"][0] = np.mean(np.asarray([dict1["srp"][0],dict2["srp"][0],dict3["srp"][0],dict4["srp"][0],dict5["srp"][0]]), axis=0)


In [None]:
_, p1 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["rp"][0])
_, p2 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["srp"][0])
_, p3 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["rp"][0])
_, p4 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["srp"][0])

print(p1)
print(p2)
print(p3)
print(p4)

In [41]:
# Average the results over a fixed interval
avg = 10

p_diff = np.nan_to_num(dict1["pca"][0],0)
d_diff = np.nan_to_num(dict1["dct"][0],0)
r_diff = np.nan_to_num(dict1["rp"][0],0)
sr_diff = np.nan_to_num(dict1["srp"][0],0)

p_min = np.nan_to_num(dict1["pca"][1],0)
d_min = np.nan_to_num(dict1["dct"][1],0)
r_min = np.nan_to_num(dict1["rp"][1],0)
sr_min = np.nan_to_num(dict1["srp"][1],0)

p_max = np.nan_to_num(dict1["pca"][2],0)
d_max = np.nan_to_num(dict1["dct"][2],0)
r_max = np.nan_to_num(dict1["rp"][2],0)
sr_max = np.nan_to_num(dict1["srp"][2],0)

p_time = dict1["pca"][3]
d_time = dict1["dct"][3]
r_time = dict1["rp"][3]
sr_time = dict1["srp"][3]

p_diff = [np.mean(p_diff[i:i+avg]) for i in range(0, len(p_diff), avg)] 
d_diff = [np.mean(d_diff[i:i+avg]) for i in range(0, len(d_diff), avg)] 
r_diff = [np.mean(r_diff[i:i+avg]) for i in range(0, len(r_diff), avg)] 
sr_diff = [np.mean(sr_diff[i:i+avg]) for i in range(0, len(sr_diff), avg)] 

p_min = [np.mean(p_min[i:i+avg]) for i in range(0, len(p_min), avg)] 
d_min = [np.mean(d_min[i:i+avg]) for i in range(0, len(d_min), avg)] 
r_min = [np.mean(r_min[i:i+avg]) for i in range(0, len(r_min), avg)] 
sr_min = [np.mean(sr_min[i:i+avg]) for i in range(0, len(sr_min), avg)] 

p_max = [np.mean(p_max[i:i+avg]) for i in range(0, len(p_max), avg)] 
d_max = [np.mean(d_max[i:i+avg]) for i in range(0, len(d_max), avg)] 
r_max = [np.mean(r_max[i:i+avg]) for i in range(0, len(r_max), avg)] 
sr_max = [np.mean(sr_max[i:i+avg]) for i in range(0, len(sr_max), avg)] 


p_time = [np.mean(p_time[i:i+avg]) for i in range(0, len(p_time), avg)] 
d_time = [np.mean(d_time[i:i+avg]) for i in range(0, len(d_time), avg)] 
r_time= [np.mean(r_time[i:i+avg]) for i in range(0, len(r_time), avg)] 
sr_time = [np.mean(sr_time[i:i+avg]) for i in range(0, len(sr_time), avg)] 




In [36]:
# Apply median filter and compute error
mf = np.asarray([cv2.medianBlur(d, 3) for d in noisy_data])
m_diff, _, _, = measure(mf)
o_diff, _, _, = measure(data)
m_val = m_diff - o_diff

In [39]:
fig = go.Figure()
ks = np.arange(2,325,avg)
fig.add_trace(go.Scatter(x=ks, y=dict1["pca"][0], name="PCA", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=dict1["dct"][0], name="DCT", marker_symbol="square", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=dict1["rp"][0], name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=dict1["srp"][0], name="SRP", marker_symbol="diamond", mode="markers"))

fig.update_layout(
    title_text="Euclidean error using RP, SRP, PCA and DCT",
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension',dtick=25)
fig.update_yaxes(title_text='Error')

fig.show()

In [238]:
fig = go.Figure()
# ks = np.arange(1,325, 10)

fig.add_trace(go.Scatter(x=ks, y=p_diff, name="PCA", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=d_diff, name="DCT", marker_symbol="square", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_diff, name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_diff, name="SRP", marker_symbol="diamond", mode="markers"))

fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(p_min)+list(p_max[::-1]), fill="toself", fillcolor='rgba(0,255,0,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))
fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(d_min)+list(d_max[::-1]), fill="toself", fillcolor='rgba(255,0,0,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(r_min)+list(r_max[::-1]), fill="toself", fillcolor='rgba(0,0,255,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(sr_min)+list(sr_max[::-1]), fill="toself", fillcolor='rgba(100,100,100,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.update_layout(
    title_text="Euclidean error using RP, SRP, PCA and DCT with confidence interval (at 95%)" ,
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=25)
fig.update_yaxes(title_text='Error')

fig.show()

In [43]:
fig = go.Figure()
ks = np.arange(2,325, avg)
fig.add_trace(go.Scatter(x=ks, y=p_diff, name="PCA", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=d_diff, name="DCT", marker_symbol="square", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_diff, name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_diff, name="SRP", marker_symbol="diamond", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=[m_val]*len(ks), name="Median filter", marker_symbol="diamond", mode="lines"))

fig.update_layout(
    title_text="Euclidean error on noisy data using RP, SRP, PCA, DCT, and Median filter",
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=25)
fig.update_yaxes(title_text='Error')

fig.show()

In [39]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=ks, y=p_time,  name="PCA", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=d_time,  name="DCT", marker_symbol="square", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_time,  name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_time, name="SRP", marker_symbol="diamond", mode="markers"))

fig.update_layout(
    title_text="Execution time using RP, SRP, PCA and DCT",
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=25)
fig.update_yaxes(title_text='sec', type="log")

fig.show()

# Testing on the newsgroup data

In [10]:
N = 1000
d = load_news(N)
data = []
for i in range(N):
    with open(d[i], "r") as f:
        data.append(normalize(f.read()))

In [11]:
dictionary = corpora.Dictionary(data)
corpus = [dictionary.doc2bow(doc) for doc in data]

D = len(dictionary)
bow = np.zeros((N, D), dtype=np.float32)
for i in range(len(data)):
    for j, f in corpus[i]:
        bow[i, j] = f


In [18]:
bow = bow / np.linalg.norm(bow, axis=1, keepdims=True)
dict2 = txt(N, bow, np.arange(1, 700))


In [63]:
_, p1 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["rp"][0])
_, p2 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["srp"][0])
_, p3 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["rp"][0])
_, p4 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["srp"][0])

print(p1)
print(p2)
print(p3)
print(p4)

3.034927181213963e-102


In [19]:
avg = 20

p_diff = np.nan_to_num(dict2["pca"][0],0)
r_diff = np.nan_to_num(dict2["rp"][0],0)
sr_diff = np.nan_to_num(dict2["srp"][0],0)

p_min = np.nan_to_num(dict2["pca"][1],0)
r_min = np.nan_to_num(dict2["rp"][1],0)
sr_min = np.nan_to_num(dict2["srp"][1],0)

p_max = np.nan_to_num(dict2["pca"][2],0)
r_max = np.nan_to_num(dict2["rp"][2],0)
sr_max = np.nan_to_num(dict2["srp"][2],0)

p_time = dict2["pca"][3]
r_time = dict2["rp"][3]
sr_time = dict2["srp"][3]

p_diff = [np.mean(p_diff[i:i+avg]) for i in range(0, len(p_diff), avg)] 
r_diff = [np.mean(r_diff[i:i+avg]) for i in range(0, len(r_diff), avg)] 
sr_diff = [np.mean(sr_diff[i:i+avg]) for i in range(0, len(sr_diff), avg)] 

p_min = [np.mean(p_min[i:i+avg]) for i in range(0, len(p_min), avg)] 
r_min = [np.mean(r_min[i:i+avg]) for i in range(0, len(r_min), avg)] 
sr_min = [np.mean(sr_min[i:i+avg]) for i in range(0, len(sr_min), avg)] 

p_max = [np.mean(p_max[i:i+avg]) for i in range(0, len(p_max), avg)] 
r_max = [np.mean(r_max[i:i+avg]) for i in range(0, len(r_max), avg)] 
sr_max = [np.mean(sr_max[i:i+avg]) for i in range(0, len(sr_max), avg)] 


p_time = [np.mean(p_time[i:i+avg]) for i in range(0, len(p_time), avg)] 
r_time= [np.mean(r_time[i:i+avg]) for i in range(0, len(r_time), avg)] 
sr_time = [np.mean(sr_time[i:i+avg]) for i in range(0, len(sr_time), avg)] 




In [21]:
fig = go.Figure()

ks = np.arange(1, 700, avg)
fig.add_trace(go.Scatter(x=ks, y=p_diff, name="SVD", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_diff, name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_diff, name="SRP", marker_symbol="diamond", mode="markers"))

# fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(p_min)+list(p_max[::-1]), fill="toself", fillcolor='rgba(0,255,0,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

# fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(r_min)+list(r_max[::-1]), fill="toself", fillcolor='rgba(0,0,255,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

# fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(sr_min)+list(sr_max[::-1]), fill="toself", fillcolor='rgba(100,100,100,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.update_layout(
    title_text="Average error using RP, SRP and PCA(SVD) " ,
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=50)
fig.update_yaxes(title_text='Error', dtick=0.05)

fig.show()

# Testing on twitter data

In [22]:
twitter = pd.read_csv("../data/Twitter/training.1600000.processed.noemoticon.csv", encoding='latin-1')
cols = twitter.columns

In [23]:
docs = [normalize(doc) for doc in twitter[cols[-1]]]

In [24]:
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

D = len(dictionary)
N = len(docs)



In [25]:
bow = np.zeros((N, D), dtype=np.float32)
for i in range(N):
    for j, f in corpus[i]:
        bow[i, j] = f

In [26]:
nor = np.linalg.norm(bow, axis=1, keepdims=True)
nor[nor[:,0]==0] = 1

bow = bow / nor



In [27]:

bow2 = bow[:, :10000]
dict2 = txt(N, bow2, np.arange(1, 700))

In [None]:
_, p1 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["rp"][0])
_, p2 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["srp"][0])
_, p3 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["rp"][0])
_, p4 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["srp"][0])

print(p1)
print(p2)
print(p3)
print(p4)

In [28]:
avg = 20

p_diff = np.nan_to_num(dict2["pca"][0],0)
r_diff = np.nan_to_num(dict2["rp"][0],0)
sr_diff = np.nan_to_num(dict2["srp"][0],0)

p_min = np.nan_to_num(dict2["pca"][1],0)
r_min = np.nan_to_num(dict2["rp"][1],0)
sr_min = np.nan_to_num(dict2["srp"][1],0)

p_max = np.nan_to_num(dict2["pca"][2],0)
r_max = np.nan_to_num(dict2["rp"][2],0)
sr_max = np.nan_to_num(dict2["srp"][2],0)

p_time = dict2["pca"][3]
r_time = dict2["rp"][3]
sr_time = dict2["srp"][3]

p_diff = [np.mean(p_diff[i:i+avg]) for i in range(0, len(p_diff), avg)] 
r_diff = [np.mean(r_diff[i:i+avg]) for i in range(0, len(r_diff), avg)] 
sr_diff = [np.mean(sr_diff[i:i+avg]) for i in range(0, len(sr_diff), avg)] 

p_min = [np.mean(p_min[i:i+avg]) for i in range(0, len(p_min), avg)] 
r_min = [np.mean(r_min[i:i+avg]) for i in range(0, len(r_min), avg)] 
sr_min = [np.mean(sr_min[i:i+avg]) for i in range(0, len(sr_min), avg)] 

p_max = [np.mean(p_max[i:i+avg]) for i in range(0, len(p_max), avg)] 
r_max = [np.mean(r_max[i:i+avg]) for i in range(0, len(r_max), avg)] 
sr_max = [np.mean(sr_max[i:i+avg]) for i in range(0, len(sr_max), avg)] 


p_time = [np.mean(p_time[i:i+avg]) for i in range(0, len(p_time), avg)] 
r_time= [np.mean(r_time[i:i+avg]) for i in range(0, len(r_time), avg)] 
sr_time = [np.mean(sr_time[i:i+avg]) for i in range(0, len(sr_time), avg)] 




In [30]:
fig = go.Figure()

ks = np.arange(1,700, avg)

fig.add_trace(go.Scatter(x=ks, y=p_diff, name="SVD", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_diff, name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_diff, name="SRP", marker_symbol="diamond", mode="markers"))

fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(p_min)+list(p_max[::-1]), fill="toself", fillcolor='rgba(0,255,0,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(r_min)+list(r_max[::-1]), fill="toself", fillcolor='rgba(0,0,255,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.add_trace(go.Scatter(x=list(ks)+list(ks[::-1]), y=list(sr_min)+list(sr_max[::-1]), fill="toself", fillcolor='rgba(100,100,100,0.2)', line=dict(color='rgba(255,255,255,0)'), hoverinfo="skip", showlegend=False))

fig.update_layout(
    title_text="Average error using RP, SRP and PCA(SVD)" ,
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=50)
fig.update_yaxes(title_text='Error')

fig.show()

# Testing on handwriting data

In [16]:
imgs = load_jpeg(N)

In [17]:
data = [cv2.resize(cv2.imread(imgs[i], cv2.IMREAD_GRAYSCALE), dsize=(50, 125), interpolation=cv2.INTER_CUBIC) for i in range(len(imgs))]
data = np.asarray([d.reshape(-1) for d in data])

In [18]:

np.random.seed(2020)
dict1 = cifar(N, data, np.arange(2,325))

np.random.seed(20)
dict2 = cifar(N, data, np.arange(2,325))

np.random.seed(202)
dict3 = cifar(N, data, np.arange(2,325))

np.random.seed(2020)
dict4 = cifar(N, data, np.arange(2,325))

np.random.seed(20200)
dict5 = cifar(N, data, np.arange(2,325))



In [None]:
# Average over the 5 repetitions
dict1["pca"][0] = np.mean(np.asarray([dict1["pca"][0],dict2["pca"][0],dict3["pca"][0],dict4["pca"][0],dict5["pca"][0]]), axis=0)
dict1["dct"][0] = np.mean(np.asarray([dict1["dct"][0],dict2["dct"][0],dict3["dct"][0],dict4["dct"][0],dict5["dct"][0]]), axis=0)
dict1["rp"][0] = np.mean(np.asarray([dict1["rp"][0],dict2["rp"][0],dict3["rp"][0],dict4["rp"][0],dict5["rp"][0]]), axis=0)
dict1["srp"][0] = np.mean(np.asarray([dict1["srp"][0],dict2["srp"][0],dict3["srp"][0],dict4["srp"][0],dict5["srp"][0]]), axis=0)


In [21]:
_, p1 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["rp"][0])
_, p2 = scipy.stats.mannwhitneyu(dict2["pca"][0],dict1["srp"][0])
_, p3 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["rp"][0])
_, p4 = scipy.stats.mannwhitneyu(dict2["dct"][0],dict1["srp"][0])

print(p1)
print(p2)
print(p3)
print(p4)

1.6386802704221208e-107


In [20]:
avg = 10

p_diff = np.nan_to_num(dict1["pca"][0],0)
d_diff = np.nan_to_num(dict1["dct"][0],0)
r_diff = np.nan_to_num(dict1["rp"][0],0)
sr_diff = np.nan_to_num(dict1["srp"][0],0)

p_min = np.nan_to_num(dict1["pca"][1],0)
d_min = np.nan_to_num(dict1["dct"][1],0)
r_min = np.nan_to_num(dict1["rp"][1],0)
sr_min = np.nan_to_num(dict1["srp"][1],0)

p_max = np.nan_to_num(dict1["pca"][2],0)
d_max = np.nan_to_num(dict1["dct"][2],0)
r_max = np.nan_to_num(dict1["rp"][2],0)
sr_max = np.nan_to_num(dict1["srp"][2],0)

p_time = dict1["pca"][3]
d_time = dict1["dct"][3]
r_time = dict1["rp"][3]
sr_time = dict1["srp"][3]

p_diff = [np.mean(p_diff[i:i+avg]) for i in range(0, len(p_diff), avg)] 
d_diff = [np.mean(d_diff[i:i+avg]) for i in range(0, len(d_diff), avg)] 
r_diff = [np.mean(r_diff[i:i+avg]) for i in range(0, len(r_diff), avg)] 
sr_diff = [np.mean(sr_diff[i:i+avg]) for i in range(0, len(sr_diff), avg)] 

p_min = [np.mean(p_min[i:i+avg]) for i in range(0, len(p_min), avg)] 
d_min = [np.mean(d_min[i:i+avg]) for i in range(0, len(d_min), avg)] 
r_min = [np.mean(r_min[i:i+avg]) for i in range(0, len(r_min), avg)] 
sr_min = [np.mean(sr_min[i:i+avg]) for i in range(0, len(sr_min), avg)] 

p_max = [np.mean(p_max[i:i+avg]) for i in range(0, len(p_max), avg)] 
d_max = [np.mean(d_max[i:i+avg]) for i in range(0, len(d_max), avg)] 
r_max = [np.mean(r_max[i:i+avg]) for i in range(0, len(r_max), avg)] 
sr_max = [np.mean(sr_max[i:i+avg]) for i in range(0, len(sr_max), avg)] 


p_time = [np.mean(p_time[i:i+avg]) for i in range(0, len(p_time), avg)] 
d_time = [np.mean(d_time[i:i+avg]) for i in range(0, len(d_time), avg)] 
r_time= [np.mean(r_time[i:i+avg]) for i in range(0, len(r_time), avg)] 
sr_time = [np.mean(sr_time[i:i+avg]) for i in range(0, len(sr_time), avg)] 




In [21]:
fig = go.Figure()
# fig.add_trace(go.Scatter(x=ks, y=p_diff, name="PCA", marker_symbol="circle", mode="markers", error_y=dict(type="data", symmetric=False, array=p_max, arrayminus=p_min)))
ks = np.arange(2,325, avg)
fig.add_trace(go.Scatter(x=ks, y=p_diff, name="PCA", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=d_diff, name="DCT", marker_symbol="square", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_diff, name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_diff, name="SRP", marker_symbol="diamond", mode="markers"))
# fig.add_trace(go.Scatter(x=ks, y=[m_val]*len(ks), name="Median filter", marker_symbol="diamond", mode="lines"))

fig.update_layout(
    title_text="Euclidean error using RP, SRP, PCA and DCT",
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=25)
fig.update_yaxes(title_text='Error')

fig.show()

In [23]:
fig = go.Figure()
# ks = np.arange(2,325, 50)
fig.add_trace(go.Scatter(x=ks, y=p_time,  name="PCA", marker_symbol="circle", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=d_time,  name="DCT", marker_symbol="square", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=r_time,  name="RP", marker_symbol="cross", mode="markers"))
fig.add_trace(go.Scatter(x=ks, y=sr_time, name="SRP", marker_symbol="diamond", mode="markers"))

fig.update_layout(
    title_text="Execution time using RP, SRP, PCA and DCT",
    width=800,
)

fig.update_xaxes(title_text='Reduced dimension', dtick=25)
fig.update_yaxes(title_text='sec')

fig.show()