In [1]:
import dask
import h5py

import numpy as np
import dask.array as da

import memory_profiler
from dask.diagnostics import ProgressBar
%load_ext memory_profiler

In [2]:
def vectorized_RBF_kernel(X, sigma):
    # % This is equivalent to computing the kernel on every pair of examples
    X2 = np.sum(np.multiply(X, X), 1) # sum colums of the matrix
    K0 = X2 + X2.T - 2 * X * X.T
    K = np.power(np.exp(-1.0 / sigma ** 2), K0)
    return K

def vectorized_RBF_kernel2(X, sigma):
    # % This is equivalent to computing the kernel on every pair of examples
    X2 = np.sum(np.multiply(X, X), 1) # sum colums of the matrix
    K0 = X2 + X2.T - 2 * X.dot(X.T) 
    print(K0)
    K = np.power(np.exp(-1.0 / sigma ** 2), K0)
    return K

In [3]:
def load_features_from_file(path):
    with open(path, 'r') as features_file:
        return [[float(feature) for feature in feature_set.split()] for feature_set in features_file.readlines()]

In [4]:
def read_hdf5_from_file(file, path):
    f = h5py.File(file)     # HDF5 file
    return f[path]          # Pointer on on-disk array

In [5]:
features_matrix = np.matrix(load_features_from_file(
    '../../dataset/condensed_features/all_features-music4all.txt'
    ))

In [6]:
features_matrix_slice = features_matrix[:400]
features_matrix_slice.shape

(400, 120)

In [7]:
vectorized_RBF_kernel2(features_matrix_slice, 0.1)

[[-7.27595761e-12  4.36420881e+04  1.34811375e+04 ...  6.06565623e+03
   4.17865990e+03  5.86061907e+04]
 [ 4.36420881e+04  2.91038305e-11  3.02426916e+04 ...  4.42278724e+04
   3.13091022e+04  9.56625761e+03]
 [ 1.34811375e+04  3.02426916e+04  0.00000000e+00 ...  1.77228353e+04
   7.90285750e+03  4.19229694e+04]
 ...
 [ 6.06565623e+03  4.42278724e+04  1.77228353e+04 ... -3.63797881e-11
   1.09344929e+04  6.16629751e+04]
 [ 4.17865990e+03  3.13091022e+04  7.90285750e+03 ...  1.09344929e+04
  -7.27595761e-12  4.08263440e+04]
 [ 5.86061907e+04  9.56625761e+03  4.19229694e+04 ...  6.16629751e+04
   4.08263440e+04  8.73114914e-11]]


matrix([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.99999999]])

In [8]:
vectorized_RBF_kernel(features_matrix_slice, 0.1)

matrix([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.99999999]])

In [9]:
X = features_matrix
X.shape

(109269, 120)

In [10]:
!rm *.hdf5

In [11]:
sigma = 0.1
with ProgressBar():
    print("Making X2")
    # X2 = np.sum(np.multiply(X, X), 1) # sum colums of the matrix
    X2 = np.sum(np.multiply(X,X), 1)
    X2_da = da.from_array(X2, chunks=(10000,10000))
    X_da = da.from_array(X,   chunks=(10000,10000))

    print("Making K0")
    # K0 = X2 + X2.T - 2 * X * X.T
    da.to_hdf5("K0_part1.hdf5", "/data", X2_da + X2_da.T)
    da.to_hdf5("K0_part2.hdf5", "/data", -2 * X_da.dot(X_da.T))

    # abre esses dois arquivos
    K0_part1_file = read_hdf5_from_file("K0_part1.hdf5", "/data")
    K0_part2_file = read_hdf5_from_file("K0_part2.hdf5", "/data")
    
    # le eles como chunks
    K0_dask1 = da.from_array(K0_part1_file, chunks=(10000, 10000))
    K0_dask2 = da.from_array(K0_part2_file, chunks=(10000, 10000))
    
    # Exporta a soma deles
    da.to_hdf5("K_0.hdf5", "/data", K0_dask1 + K0_dask2)
    
    K0_file = read_hdf5_from_file("K_0.hdf5", "/data")
    K0_dask = da.from_array(K0_file, chunks=(10000, 10000))
    
    print("Making K")
    # K = np.power(np.exp(-1.0 / sigma ** 2), K0)
    da.to_hdf5("K.hdf5", "/data", da.power(da.exp(-1.0/sigma ** 2), K0_dask))    

Making X2
Making K0
[                                        ] | 0% Completed |  0.0s

  result = blockwise(


[########################################] | 100% Completed | 13min 30.9s


  intermediate = blockwise(


[########################################] | 100% Completed | 16min 52.5s
[########################################] | 100% Completed | 46min 14.1s
Making K
[########################################] | 100% Completed | 25min 25.2s


In [2]:
f = h5py.File('K.hdf5') # HDF5 file
d = f['/data']          # Pointer on on-disk array
d.shape                 # d can be very large

(109269, 109269)

In [3]:
x = da.from_array(d, chunks=(10000, 10000))
x

Unnamed: 0,Array,Chunk
Bytes,95.52 GB,800.00 MB
Shape,"(109269, 109269)","(10000, 10000)"
Count,122 Tasks,121 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 95.52 GB 800.00 MB Shape (109269, 109269) (10000, 10000) Count 122 Tasks 121 Chunks Type float64 numpy.ndarray",109269  109269,

Unnamed: 0,Array,Chunk
Bytes,95.52 GB,800.00 MB
Shape,"(109269, 109269)","(10000, 10000)"
Count,122 Tasks,121 Chunks
Type,float64,numpy.ndarray


In [9]:
print(x[:10].shape)

(10, 109269)


In [16]:
y = da.count_nonzero(x==0)
print(y.compute())

11939595562
