# Understanding how to control memory allocation
##### (Of functions that are called lots of times and generate arrays everytime they are called)



This notebook compares two versions of a function:  **```compute_grad```** and **```compute_grad_with_dot!```**.

The idea was to have a type that has "placeholders" for the quantities that are computed inside ```compute_grad``` (sampling quantities, as well as V_hat, H_hat) to avoid allocating memory at every update of the parameters of the model (at every call to  ```compute_grad```).



In [1]:
# Import Distributions to generate random numbers W matrix of the RBM
using Distributions
using MNIST
using BenchmarkTools
using Combinatorics


Use "const Partition = Vector{Int}" instead.

Use "const YoungDiagram = Array{Int,2}" instead.

Use "const SkewDiagram = Tuple{Partition,Partition}" instead.


In [2]:
X_train, y_train = MNIST.traindata()
X_test, y_test = MNIST.testdata()

T = Float32
X_train = Array{T}((X_train - minimum(X_train))/(maximum(X_train) - minimum(X_train)))
y_train = Array{T}(y_train)
X_test = Array{T}(X_test - minimum(X_test))/(maximum(X_test) - minimum(X_test)) 
y_test = Array{T}(y_test);

Stacktrace:
 [1] [1mdepwarn[22m[22m[1m([22m[22m::String, ::Symbol[1m)[22m[22m at [1m./deprecated.jl:64[22m[22m
 [2] [1mArray[22m[22m[1m([22m[22m::Type{Float64}, ::Int64, ::Int64[1m)[22m[22m at [1m./deprecated.jl:51[22m[22m
 [3] [1mtraindata[22m[22m[1m([22m[22m[1m)[22m[22m at [1m/Users/macpro/.julia/v0.6/MNIST/src/MNIST.jl:88[22m[22m
 [4] [1minclude_string[22m[22m[1m([22m[22m::String, ::String[1m)[22m[22m at [1m./loading.jl:485[22m[22m
 [5] [1mexecute_request[22m[22m[1m([22m[22m::ZMQ.Socket, ::IJulia.Msg[1m)[22m[22m at [1m/Users/macpro/.julia/v0.6/IJulia/src/execute_request.jl:157[22m[22m
 [6] [1meventloop[22m[22m[1m([22m[22m::ZMQ.Socket[1m)[22m[22m at [1m/Users/macpro/.julia/v0.6/IJulia/src/eventloop.jl:8[22m[22m
 [7] [1m(::IJulia.##9#12)[22m[22m[1m([22m[22m[1m)[22m[22m at [1m./task.jl:335[22m[22m
while loading In[2], in expression starting on line 1
Stacktrace:
 [1] [1mdepwarn[22m[22m[1m([22m[22m

## Define basic types

In [3]:
function sigmoid(x::Float32)
    return 1/(1 + exp(-x))
end

type RBM{T <: Real}
    n_vis::Int
    n_hid::Int
    W::Matrix{T}  
    vis_bias::Vector{T}     
    hid_bias::Vector{T}   
    trained::Bool
    n_epochs_trained::Int
end

function initialize_RBM(n_vis, n_hid, sigma, T)
    
    return RBM{T}(n_vis,                                   # num visible units 
                  n_hid,                                   # num hidden unnits
                  rand(Normal(0,sigma), n_hid, n_vis),     # weight matrix
                  zeros(n_vis),                            # visible vector  
                  zeros(n_hid),                            # Hidden vector
                  false,0)                                 # trained
end

function Base.show{T}(io::IO, rbm::RBM{T})
    n_vis = size(rbm.vis_bias, 1)
    n_hid = size(rbm.hid_bias, 1)
    trained = rbm.trained
    print(io, "RBM{$T}(n_vis=$n_vis, n_hid=$n_hid, trained=$trained)")
end


type CDK{T}
    K::Int
    batch_size::Int
    
    # Placeholders needed for the gradients of the parameters of the RBM
    grad_W::Matrix{T}         
    grad_vis_bias::Vector{T}     
    grad_hid_bias::Vector{T}   
    
    # Placeholders needed for performing CDK in a minibatch
    H::Matrix{T}
    V_hat::Matrix{T}
    H_hat::Matrix{T}
    rec_error::Float64 # This is probably irrelevant, allo
    
    # Placeholders needed for performing sampling in a minibatch
    V_sampling::Matrix{T}
    H_sampling::Matrix{T}   
    H_aux::Matrix{T}  
    V_aux::Matrix{T}  


end

function initialize_CDK(rbm::RBM, K, batch_size)
    """
    This function initializes a CDK type that will be used as placeholder for the
    memory needed for the gibbs sampling process needed at every minibatch update.
    """
    T = eltype(rbm.vis_bias)
    grad_W = zeros(T, size(rbm.W))
    grad_vis_bias = zeros(T, size(rbm.vis_bias))
    grad_hid_bias = zeros(T, size(rbm.hid_bias))
    V_hat = zeros(T, rbm.n_vis, batch_size)
    H_hat = zeros(T, rbm.n_hid, batch_size)
    H = zeros(T, rbm.n_hid, batch_size)
    V_sampling = zeros(T, rbm.n_vis, batch_size)
    H_sampling = zeros(T, rbm.n_hid, batch_size)
    H_aux = zeros(T, rbm.n_hid, batch_size)
    V_aux = zeros(T, rbm.n_vis, batch_size)

    cdk = CDK(K, batch_size, 
              grad_W, grad_vis_bias,grad_hid_bias,
              H, V_hat, H_hat, 0.,
              V_sampling, H_sampling, H_aux,V_aux)
    return cdk
end

function update_params!(rbm::RBM, opt::CDK, lr)
    rbm.W .+= lr .* opt.grad_W 
    rbm.vis_bias .+= lr .* opt.grad_vis_bias
    rbm.hid_bias .+= lr .* opt.grad_hid_bias
end

update_params! (generic function with 1 method)

# Test fit without .= 

In [4]:

function fit!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)
        
    T = eltype(X)
    lr = T(lr)
    n_samples = size(X)[2]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = Vector(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = Vector{T}([])
        
    ###### Initialize Optimizer, CDK, PCDK, ....#######
    #cdk = initialize_CDK(rbm, K, batch_size)  
    
    for epoch in 1:n_epochs
        rec_error = Float32(0.)
        
        # should  it be more efficient to Shuffle indicies not the whole data?
        # then access is not contiguous though
        if shuffle_data==true
            shuffle!(sample_perm)
            X .= X[:,sample_perm]
        end
        
        for minibatch_ind in indicies          
            partial_fit!(rbm, X[:, minibatch_ind], lr, opt)
            rec_error += opt.rec_error
        end
        
        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained +=1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

function partial_fit!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad!(rbm, X, opt)
    update_params!(rbm, opt, lr)    
end

function compute_grad!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    batch_size = size(X)[2]
    
    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        if k==1       
            opt.H .= sigmoid.(rbm.W * X .+ rbm.hid_bias)
            opt.V_hat .= sigmoid.(rbm.W'* opt.H .+ rbm.vis_bias) .> rand(T,rbm.n_vis, batch_size)
            opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) 
        else
            opt.V_hat .= sigmoid.(rbm.W'* opt.H_hat .+ rbm.vis_bias) .> rand(T,rbm.n_vis, batch_size)
            opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) 
        end               
    end   
   
    opt.grad_W =  (opt.H * X' .-  opt.H_hat * opt.V_hat')./ batch_size; 
    opt.grad_vis_bias = vec(sum((X .- opt.V_hat), 2))./ batch_size;
    opt.grad_hid_bias = vec(sum((opt.H .- opt.H_hat), 2))./ batch_size;
    
    opt.rec_error = sqrt(sum((X.-opt.V_hat).^2))
end

compute_grad! (generic function with 1 method)

In [5]:
#@time A_mul_B!(cdk.H_hat,rbm.W, X_train[:,1:500])

In [6]:
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [7]:
# function partial_fit!(rbm::RBM, X::Matrix, K::Integer, lr::Real, optimizer::CDK)
@benchmark partial_fit!(rbm, X_train[:,1:500], 0.1, cdk)

BenchmarkTools.Trial: 
  memory estimate:  10.81 MiB
  allocs estimate:  61
  --------------
  minimum time:     13.835 ms (0.00% GC)
  median time:      14.890 ms (0.00% GC)
  mean time:        15.344 ms (2.18% GC)
  maximum time:     24.032 ms (3.29% GC)
  --------------
  samples:          324
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [8]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05
@benchmark fit!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

210.7846
209.95836
208.93597
208.024
207.10504
206.2031
205.46944
204.74767


BenchmarkTools.Trial: 
  memory estimate:  1.27 GiB
  allocs estimate:  7929
  --------------
  minimum time:     1.758 s (2.15% GC)
  median time:      1.790 s (2.11% GC)
  mean time:        1.781 s (2.13% GC)
  maximum time:     1.797 s (2.10% GC)
  --------------
  samples:          3
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

### Using .= to update the params

In [9]:

function fit_with_dot!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)
        
    T = eltype(X)
    lr = T(lr)
    n_samples = size(X)[2]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = Vector(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = Vector{T}([])
            
    for epoch in 1:n_epochs
        rec_error = Float32(0.)
        
        # should  it be more efficient to Shuffle indicies not the whole data?
        # then access is not contiguous though
        if shuffle_data==true
            shuffle!(sample_perm)
            X .= X[:,sample_perm]
        end
        
        for minibatch_ind in indicies          
            partial_fit_with_dot!(rbm, X[:, minibatch_ind], lr, opt)
            rec_error += opt.rec_error
        end
        
        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained +=1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

function partial_fit_with_dot!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad_with_dot!(rbm, X, opt)
    update_params!(rbm, opt, lr)    
end

function compute_grad_with_dot!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    batch_size = size(X)[2]
    
    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        opt.V_sampling .= rand(T, rbm.n_vis, batch_size)
        
        if k==1       
            opt.H .= sigmoid.(rbm.W * X .+ rbm.hid_bias)
            opt.V_hat .= sigmoid.(rbm.W'* opt.H .+ rbm.vis_bias) .> opt.V_sampling
            opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) 
        else
            opt.V_hat .= sigmoid.(rbm.W'* opt.H_hat .+ rbm.vis_bias) .> opt.V_sampling
            opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) 
        end               
    end   
   
    opt.grad_W .=  (opt.H * X' .-  opt.H_hat * opt.V_hat')./ batch_size; 
    opt.grad_vis_bias .= vec(sum((X .- opt.V_hat), 2))./ batch_size;
    opt.grad_hid_bias .= vec(sum((opt.H .- opt.H_hat), 2))./ batch_size;
    opt.rec_error = sqrt(sum((X .- opt.V_hat).^2))
end

compute_grad_with_dot! (generic function with 1 method)

In [10]:
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [11]:
@benchmark partial_fit_with_dot!(rbm, X_train[:,1:500], 0.1, cdk)

BenchmarkTools.Trial: 
  memory estimate:  10.75 MiB
  allocs estimate:  57
  --------------
  minimum time:     14.147 ms (0.00% GC)
  median time:      15.545 ms (0.00% GC)
  mean time:        15.688 ms (2.21% GC)
  maximum time:     21.027 ms (3.58% GC)
  --------------
  samples:          317
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [12]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05
@benchmark fit_with_dot!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

210.84888
209.96886
208.89296
208.09567
207.08444
206.23053
205.44463
204.65509


BenchmarkTools.Trial: 
  memory estimate:  1.26 GiB
  allocs estimate:  7458
  --------------
  minimum time:     1.868 s (2.10% GC)
  median time:      1.895 s (2.05% GC)
  mean time:        1.897 s (2.05% GC)
  maximum time:     1.928 s (2.02% GC)
  --------------
  samples:          3
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# Making propup inplace

### Inplace matrix multiplication

**A_mul_B!(Y, A, B) → Y**

Calculates the matrix-matrix or matrix-vector product 𝐴 · 𝐵 and stores the result in Y, overwriting the existing value of Y. Note that Y must not be aliased with either A or B.

In [13]:

function fit_with_dot2!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)
        
    T = eltype(X)
    lr = T(lr)
    n_samples = size(X)[2]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = Vector(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = Vector{T}([])
            
    for epoch in 1:n_epochs
        rec_error = Float32(0.)
        
        # should  it be more efficient to Shuffle indicies not the whole data?
        # then access is not contiguous though
        if shuffle_data==true
            shuffle!(sample_perm)
            X .= X[:,sample_perm]
        end
        
        for minibatch_ind in indicies          
            partial_fit_with_dot2!(rbm, X[:, minibatch_ind], lr, opt)
            rec_error += opt.rec_error
        end
        
        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained +=1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

function partial_fit_with_dot2!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad_with_dot2!(rbm, X, opt)
    update_params!(rbm, opt, lr)    
end

function compute_grad_with_dot2!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    batch_size = size(X)[2]
    
    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        #opt.V_sampling .= rand(T, rbm.n_vis, batch_size)
        rand!(opt.V_sampling)
        
        if k==1       
            opt.H .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, X) .+ rbm.hid_bias)
            opt.V_hat .= sigmoid.(rbm.W'* opt.H .+ rbm.vis_bias) .> opt.V_sampling
            opt.H_hat .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, opt.V_hat)  .+ rbm.hid_bias) 
        else
            opt.V_hat .= sigmoid.(rbm.W'* opt.H_hat .+ rbm.vis_bias) .> opt.V_sampling
            opt.H_hat .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, opt.V_hat)  .+ rbm.hid_bias) 
        end        
    end   
   
    opt.grad_W .=  (opt.H * X' .-  opt.H_hat * opt.V_hat')./ batch_size; 
    opt.grad_vis_bias .= vec(sum((X .- opt.V_hat), 2))./ batch_size;
    opt.grad_hid_bias .= vec(sum((opt.H .- opt.H_hat), 2))./ batch_size;
    opt.rec_error = sqrt(sum((X .- opt.V_hat).^2))
end

compute_grad_with_dot2! (generic function with 1 method)

In [14]:
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [15]:
@benchmark partial_fit_with_dot2!(rbm, X_train[:,1:500], 0.1, cdk)

BenchmarkTools.Trial: 
  memory estimate:  7.64 MiB
  allocs estimate:  47
  --------------
  minimum time:     13.719 ms (0.00% GC)
  median time:      14.347 ms (0.00% GC)
  mean time:        14.557 ms (1.61% GC)
  maximum time:     23.497 ms (3.61% GC)
  --------------
  samples:          342
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [16]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05
@benchmark fit_with_dot2!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

209.84149
208.83263
207.87555
206.95923
206.08691
205.26257
204.49213
203.83333


BenchmarkTools.Trial: 
  memory estimate:  917.24 MiB
  allocs estimate:  6259
  --------------
  minimum time:     1.736 s (1.49% GC)
  median time:      1.751 s (1.48% GC)
  mean time:        1.755 s (1.48% GC)
  maximum time:     1.779 s (1.48% GC)
  --------------
  samples:          3
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [17]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05
@benchmark fit_with_dot2!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

203.11678
202.40695
201.84895
201.26213
200.67061
200.11891
199.67332
199.16504


BenchmarkTools.Trial: 
  memory estimate:  917.24 MiB
  allocs estimate:  6258
  --------------
  minimum time:     1.716 s (1.50% GC)
  median time:      1.725 s (1.49% GC)
  mean time:        1.724 s (1.49% GC)
  maximum time:     1.732 s (1.48% GC)
  --------------
  samples:          3
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [None]:
expand(:(A_mul_Bt(cdk.H, X_train[:,1:500])))

In [None]:
expand(:(cdk.H*X_train[:,1:500]'))

## placeholders for all computations

In [86]:

function fit_with_dot3!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)
        
    T = eltype(X)
    lr = T(lr)
    n_samples = size(X)[2]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = Vector(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = Vector{T}([])
            
    for epoch in 1:n_epochs
        rec_error = Float32(0.)
        
        # should  it be more efficient to Shuffle indicies not the whole data?
        # then access is not contiguous though
        if shuffle_data==true
            shuffle!(sample_perm)
            X .= X[:,sample_perm]
        end
        
        for minibatch_ind in indicies          
            partial_fit_with_dot2!(rbm, X[:, minibatch_ind], lr, opt)
            rec_error += opt.rec_error
        end
        
        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained +=1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

function partial_fit_with_dot3!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad_with_dot3!(rbm, X, opt)
    update_params!(rbm, opt, lr)    
end

function compute_grad_with_dot3!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    batch_size = size(X)[2]
    
    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        #opt.V_sampling .= rand(T, rbm.n_vis, batch_size)
        rand!(opt.V_sampling)
        
        if k==1       
            opt.H .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, X) .+ rbm.hid_bias)
            opt.V_hat .= sigmoid.(At_mul_B!(opt.V_aux, rbm.W, opt.H).+ rbm.vis_bias) .> opt.V_sampling
            opt.H_hat .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, opt.V_hat)  .+ rbm.hid_bias) 
        else
            opt.V_hat .= sigmoid.(At_mul_B!(opt.V_hat, rbm.W, opt.H_hat) .+ rbm.vis_bias) .> opt.V_sampling
            opt.H_hat .= sigmoid.(A_mul_B!(opt.H_hat, rbm.W, opt.V_hat)  .+ rbm.hid_bias) 
        end        
    end   
   
    # opt.grad_W .=  (opt.H * X' .-  opt.H_hat * opt.V_hat')./ batch_size; 
    opt.grad_W .=  (A_mul_Bt!(opt.grad_W, opt.H , X) .-  A_mul_Bt!(opt.grad_W, opt.H_hat , opt.V_hat))./ batch_size; 
    opt.grad_vis_bias .= vec(sum((X .- opt.V_hat), 2))./ batch_size;
    opt.grad_hid_bias .= vec(sum((opt.H .- opt.H_hat), 2))./ batch_size;
    opt.rec_error = sqrt(sum((X .- opt.V_hat).^2))
end

compute_grad_with_dot3! (generic function with 1 method)

In [87]:
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [88]:
size(A_mul_Bt( cdk.H ,  X_train[:,1:500]))

(20, 784)

In [89]:
@benchmark partial_fit_with_dot3!(rbm, X_train[:,1:500], 0.1, cdk)

BenchmarkTools.Trial: 
  memory estimate:  4.53 MiB
  allocs estimate:  39
  --------------
  minimum time:     13.070 ms (0.00% GC)
  median time:      13.357 ms (0.00% GC)
  mean time:        14.156 ms (1.40% GC)
  maximum time:     23.972 ms (0.00% GC)
  --------------
  samples:          351
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [90]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05
@benchmark fit_with_dot3!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

LoadError: [91mDimensionMismatch("A has dimensions (20,500) but B has dimensions (784,500)")[39m