In [46]:
import numpy as np
from PIL import Image
import math
import sklearn.feature_selection
import sklearn.decomposition

In [28]:
# Read the comma-separated training matrix from disk into a numpy array.
data = np.loadtxt(fname='data/03-train-x.csv', delimiter=',')

In [13]:
def make_image_matrix(rows, cols, images, fileName):
    """Tile flattened square grayscale images into one grid image and save it.

    Args:
        rows: Number of tile rows in the grid.
        cols: Number of tile columns in the grid.
        images: Sequence of 1-D arrays, each one a flattened square image in
            column-major order (it is reshaped with ``order='F'``). Every
            image is assumed to have the same length as ``images[0]``. Only
            the first ``rows * cols`` images are used; if fewer are supplied,
            the remaining grid cells are left black.
        fileName: Path handed to ``Image.save`` for the finished grid.

    Raises:
        ValueError: If ``len(images[0])`` is not a perfect square.
        IndexError: If ``images`` is empty.
    """
    n_pixels = len(images[0])
    side = int(round(math.sqrt(n_pixels)))
    if side * side != n_pixels:
        raise ValueError(
            "expected flattened square images, got length {}".format(n_pixels)
        )

    # 8-bit grayscale canvas, initialised to black.
    matrix = Image.new('L', (cols * side, rows * side))

    # Never index past the end of `images` when the grid has spare cells.
    n_tiles = min(rows * cols, len(images))
    for idx in range(n_tiles):
        row, col = divmod(idx, cols)
        # Clip to the 8-bit range and build the tile as uint8 so it already
        # matches the canvas and no mode conversion is needed when pasting.
        pixels = np.clip(np.asarray(images[idx]), 0, 255).astype(np.uint8)
        pixels = np.ascontiguousarray(pixels.reshape((side, side), order='F'))
        tile = Image.fromarray(pixels)
        matrix.paste(tile, box=(col * side, row * side))
    matrix.save(fileName)

In [19]:
# Preview: write the first 100 rows of `data` as a 10x10 mosaic.
make_image_matrix(rows=10, cols=10, images=data, fileName="letters.png")

In [73]:
# Pick ONE reduction method: leave exactly one `method`/`selector` pair
# uncommented, or use `method = None` to search on the unreduced vectors.
# `method` is also inserted into the name of the result image written later.

#method = None

#method = "variance"
#selector = sklearn.feature_selection.VarianceThreshold(threshold=(0.3))

#method = "krank-svd"
#selector = sklearn.decomposition.TruncatedSVD(n_components=15, random_state=0)

method = "NMF"
# Fixed seed so the randomized parts of the fit are repeatable across runs.
selector = sklearn.decomposition.NMF(n_components=10, random_state=0)

In [74]:
# Fit the chosen selector and project every sample; with no method selected
# the search simply runs on the original vectors.
if method is None:
    reduced = data
else:
    reduced = selector.fit_transform(data)

# Shapes before and after the reduction.
print(data.shape)
print(reduced.shape)

(88800, 784)
(88800, 10)


In [75]:
# Nearest-neighbour search: rank every sample by its Euclidean distance to a
# query, here the first sample of the data set.
q = data[0]
if method is not None:
    # Project the query into the reduced space. transform() takes a batch, so
    # the query is wrapped in a list and comes back as a single-row matrix.
    # NOTE(review): the query's own distance shown in the output below is
    # small but not zero, so transform() of a sample need not reproduce its
    # fit_transform() row exactly - confirm this is acceptable.
    q = selector.transform([q])

# Vectorised distances, computed one block of rows at a time so the temporary
# difference matrix stays small even when searching on the unreduced data.
# Broadcasting handles both a 1-D query and a single-row 2-D query.
block_size = 4096
dists = np.empty(len(reduced))
for start in range(0, len(reduced), block_size):
    stop = start + block_size
    dists[start:stop] = np.linalg.norm(reduced[start:stop] - q, axis=1)

# (sample index, distance) pairs, then the same pairs ordered nearest-first.
results = list(enumerate(dists))
sorted_result = sorted(results, key=lambda x: x[1])

sorted_result[:10]

[(0, 0.009103312874038254),
 (65412, 1.07404393003565),
 (81951, 1.1630267433922208),
 (35030, 1.2369516691116147),
 (84084, 1.274611136223378),
 (2046, 1.322295837351508),
 (30740, 1.365389278980342),
 (60963, 1.3671336025238574),
 (70822, 1.37294149245441),
 (2877, 1.3797039422389763)]

In [76]:
# Collect the original (unreduced) vectors of the 100 best matches and write
# them as a 10x10 mosaic whose file name records the method used.
imgs = [data[idx] for idx, _dist in sorted_result[:100]]
make_image_matrix(10, 10, imgs, "imgs-{}.png".format(method))