# Understanding how to control memory allocation
##### (Of functions that are called many times and generate arrays every time they are called)



This notebook compares three versions of the same gradient computation:  **```compute_grad!```**, **```compute_grad_with_dot!```** and **```compute_grad_with_dot2!```**.

The idea is to have a type with preallocated "placeholders" for the quantities that are computed inside ```compute_grad!``` (the sampling buffers, as well as V_hat and H_hat), so that memory does not have to be allocated at every update of the parameters of the model (that is, at every call to ```compute_grad!```).



In [1]:
# Import Distributions to draw the random initial W matrix of the RBM
using Distributions
using MNIST
using BenchmarkTools
using Combinatorics

In [2]:
# Load MNIST and rescale the features of each split to [0, 1], stored as Float32.
X_train, y_train = MNIST.traindata()
X_test, y_test = MNIST.testdata()

T = Float32
X_train = Array{T}((X_train .- minimum(X_train)) ./ (maximum(X_train) - minimum(X_train)))
y_train = Array{T}(y_train)
# The whole normalized expression must sit inside Array{T}(...): dividing a converted
# array by the (wider) scalar range afterwards would promote the result again, leaving
# X_test with a different element type than X_train.
X_test = Array{T}((X_test .- minimum(X_test)) ./ (maximum(X_test) - minimum(X_test)))
y_test = Array{T}(y_test);

## Define basic types

In [3]:
"""
    sigmoid(x::Real)

Return the logistic function `1 / (1 + exp(-x))` of the scalar `x`.

The integer literals promote to the type of `exp(-x)`, so a `Float32` argument yields a
`Float32` result and a `Float64` argument a `Float64` result. Apply it elementwise to
arrays with `sigmoid.(A)`.
"""
function sigmoid(x::Real)
    return 1 / (1 + exp(-x))
end

"""
    RBM{T <: Real}

Parameters and training state of a restricted Boltzmann machine whose weights and
biases have element type `T`.

Declared with `mutable struct` (the `type` keyword is deprecated) because the training
functions in this file reassign `trained` and `n_epochs_trained`.
"""
mutable struct RBM{T <: Real}
    n_vis::Int               # number of visible units
    n_hid::Int               # number of hidden units
    W::Matrix{T}             # weight matrix; built as n_hid x n_vis by initialize_RBM
    vis_bias::Vector{T}      # bias of the visible units
    hid_bias::Vector{T}      # bias of the hidden units
    trained::Bool            # set to true at the end of a fit
    n_epochs_trained::Int    # incremented once per training epoch
end

"""
    initialize_RBM(n_vis, n_hid, sigma, T)

Build an untrained `RBM{T}` with `n_vis` visible and `n_hid` hidden units.

The `n_hid x n_vis` weight matrix is drawn from `Normal(0, sigma)`, both bias vectors
start at zero, `trained` is `false` and `n_epochs_trained` is `0`. The arrays are
created with the default element type and handed to the `RBM{T}` constructor.
"""
function initialize_RBM(n_vis, n_hid, sigma, T)
    weights = rand(Normal(0, sigma), n_hid, n_vis)
    visible_bias = zeros(n_vis)
    hidden_bias = zeros(n_hid)
    not_trained_yet = false
    epochs_done = 0

    return RBM{T}(n_vis, n_hid, weights, visible_bias, hidden_bias,
                  not_trained_yet, epochs_done)
end

# One-line display of an RBM: element type, layer sizes (read from the lengths of the
# bias vectors) and whether it has been trained.
# The element type is taken from the weight matrix instead of a `show{T}` method
# parameter, because that parameter syntax is deprecated; dispatch still covers every
# RBM{T} and the printed text is unchanged.
function Base.show(io::IO, rbm::RBM)
    T = eltype(rbm.W)
    n_vis = size(rbm.vis_bias, 1)
    n_hid = size(rbm.hid_bias, 1)
    trained = rbm.trained
    print(io, "RBM{$T}(n_vis=$n_vis, n_hid=$n_hid, trained=$trained)")
end


"""
    CDK{T}

State of a contrastive-divergence (CD-K) optimizer: the number of Gibbs steps `K`, the
minibatch size, and preallocated arrays that the gradient functions in this file write
into.

Declared with `mutable struct` (the `type` keyword is deprecated) because `rec_error`,
and in `compute_grad!` also the gradient fields, are reassigned on every call.
"""
mutable struct CDK{T}
    K::Int              # number of Gibbs sampling steps per gradient computation
    batch_size::Int     # number of columns the minibatch buffers below are sized for

    # Placeholders needed for the gradients of the parameters of the RBM
    grad_W::Matrix{T}
    grad_vis_bias::Vector{T}
    grad_hid_bias::Vector{T}

    # Placeholders needed for performing CDK in a minibatch
    H::Matrix{T}        # hidden activations computed from the data
    V_hat::Matrix{T}    # sampled visible reconstruction
    H_hat::Matrix{T}    # hidden activations computed from V_hat
    rec_error::Float64  # reconstruction error of the last minibatch (scalar, nothing to preallocate)

    # Placeholders needed for performing sampling in a minibatch
    V_sampling::Matrix{T}   # uniform random numbers that V_hat is thresholded against
    H_sampling::Matrix{T}   # NOTE(review): not used by the gradient functions visible in this file
    H_aux::Matrix{T}        # output buffer for the in-place product W * V

end

"""
    initialize_CDK(rbm::RBM, K, batch_size)

Create the `CDK` object that serves as placeholder for the memory needed by the Gibbs
sampling process at every minibatch update.

All arrays are zero-filled and use the element type of `rbm.vis_bias`. The gradient
buffers have the sizes of `rbm.W`, `rbm.vis_bias` and `rbm.hid_bias`; the minibatch and
sampling buffers have `batch_size` columns and `rbm.n_vis` or `rbm.n_hid` rows.
`rec_error` starts at `0.0`.
"""
function initialize_CDK(rbm::RBM, K, batch_size)
    # The description above used to be a bare string inside the body, where it was
    # evaluated and discarded; placed before the definition it is a real docstring.
    T = eltype(rbm.vis_bias)
    grad_W = zeros(T, size(rbm.W))
    grad_vis_bias = zeros(T, size(rbm.vis_bias))
    grad_hid_bias = zeros(T, size(rbm.hid_bias))
    V_hat = zeros(T, rbm.n_vis, batch_size)
    H_hat = zeros(T, rbm.n_hid, batch_size)
    H = zeros(T, rbm.n_hid, batch_size)
    V_sampling = zeros(T, rbm.n_vis, batch_size)
    H_sampling = zeros(T, rbm.n_hid, batch_size)
    H_aux = zeros(T, rbm.n_hid, batch_size)

    return CDK(K, batch_size,
               grad_W, grad_vis_bias, grad_hid_bias,
               H, V_hat, H_hat, 0.,
               V_sampling, H_sampling, H_aux)
end

"""
    update_params!(rbm::RBM, opt::CDK, lr)

Add `lr` times the gradients stored in `opt` to the weights and biases of `rbm`,
writing into the existing parameter arrays.
"""
function update_params!(rbm::RBM, opt::CDK, lr)
    rbm.W .= rbm.W .+ lr .* opt.grad_W
    rbm.vis_bias .= rbm.vis_bias .+ lr .* opt.grad_vis_bias
    rbm.hid_bias .= rbm.hid_bias .+ lr .* opt.grad_hid_bias
end

update_params! (generic function with 1 method)

# Test fit without .= 

In [None]:

"""
    fit!(rbm, X, batch_size, n_epochs, lr, shuffle_data, opt)

Train `rbm` for `n_epochs` passes over the columns of `X`, calling `partial_fit!` on
consecutive minibatches of at most `batch_size` columns.

# Arguments
- `rbm::RBM`: model updated in place; `n_epochs_trained` is incremented after every
  epoch and `trained` is set to `true` at the end.
- `X::Matrix`: training data, one sample per column. It is only read.
- `batch_size::Integer`: number of columns per minibatch.
- `n_epochs::Integer`: number of passes over `X`.
- `lr::Real`: learning rate; converted to `eltype(X)`.
- `shuffle_data::Bool`: when `true`, the column order is reshuffled before every epoch.
- `opt`: optimizer state forwarded to `partial_fit!`; its `rec_error` field is read
  after every minibatch.

# Returns
A `Vector{eltype(X)}` with the mean per-minibatch reconstruction error of each epoch.
Each value is also printed.

NOTE(review): the buffers in `opt` are sized for a fixed number of columns, so a final
minibatch shorter than that presumably fails inside `compute_grad!` — confirm before
using an `X` whose column count is not a multiple of `batch_size`.
"""
function fit!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)

    T = eltype(X)
    lr = T(lr)
    n_samples = size(X, 2)
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = collect(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = T[]

    for epoch in 1:n_epochs
        # opt.rec_error is a Float64, so accumulate in Float64 from the start instead
        # of letting the accumulator change type after the first minibatch.
        rec_error = 0.0

        # Shuffle the column order, not the data: X is left untouched (it stays
        # aligned with whatever labels the caller holds) and the dataset is not
        # copied every epoch.
        if shuffle_data
            shuffle!(sample_perm)
        end

        for minibatch_ind in indicies
            # Without shuffling, keep the plain range slice.
            minibatch = shuffle_data ? X[:, sample_perm[minibatch_ind]] : X[:, minibatch_ind]
            partial_fit!(rbm, minibatch, lr, opt)
            rec_error += opt.rec_error
        end

        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained += 1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

"""
    partial_fit!(rbm::RBM, X::Matrix, lr::Real, opt::CDK)

Run one training step on the minibatch `X`: `compute_grad!` stores the gradients in
`opt`, then `update_params!` applies them to `rbm` with learning rate `lr`.
Returns whatever `update_params!` returns.
"""
function partial_fit!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad!(rbm, X, opt)
    return update_params!(rbm, opt, lr)
end

"""
    compute_grad!(rbm::RBM, X::Matrix, opt::CDK)

Baseline gradient computation. Runs `opt.K` Gibbs steps starting from the minibatch `X`
and stores the resulting gradients and the reconstruction error in `opt`.

`opt.H`, `opt.V_hat` and `opt.H_hat` are overwritten in place, while the three gradient
fields are rebound to newly created arrays. A fresh random matrix is drawn on every
Gibbs step. Returns the reconstruction error.
"""
function compute_grad!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    n_cols = size(X, 2)

    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        # The first step starts from the data-driven hidden activations,
        # every later step from the previous reconstruction.
        if k == 1
            opt.H .= sigmoid.(rbm.W * X .+ rbm.hid_bias)
        end
        hidden = k == 1 ? opt.H : opt.H_hat

        opt.V_hat .= sigmoid.(rbm.W' * hidden .+ rbm.vis_bias) .> rand(T, rbm.n_vis, n_cols)
        opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias)
    end

    positive_phase = opt.H * X'
    negative_phase = opt.H_hat * opt.V_hat'
    opt.grad_W = (positive_phase .- negative_phase) ./ n_cols
    opt.grad_vis_bias = vec(sum(X .- opt.V_hat, 2)) ./ n_cols
    opt.grad_hid_bias = vec(sum(opt.H .- opt.H_hat, 2)) ./ n_cols

    opt.rec_error = sqrt(sum((X .- opt.V_hat).^2))
end

In [None]:
#@time A_mul_B!(cdk.H_hat,rbm.W, X_train[:,1:500])

In [None]:
# Fresh model (784 visible units, 20 hidden units, sigma = 0.01, Float32 parameters)
# and CD-K buffers (K = 2 Gibbs steps, minibatches of 500 columns) for the benchmarks.
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [None]:
# partial_fit!(rbm::RBM, X::Matrix, lr::Real, opt::CDK)
# `$` evaluates each argument once, before the timing starts, so the measurement
# contains neither global-variable access nor the copy made by X_train[:,1:500].
@benchmark partial_fit!($rbm, $(X_train[:,1:500]), 0.1, $cdk)

In [None]:
n_epochs = 1
batch_size = 500
K = 1   # NOTE(review): not passed to fit!; the Gibbs step count is the K stored in cdk
lr = 0.05
# `$` keeps global-variable access out of the measurement.
@benchmark fit!($rbm, $X_train, $batch_size, $n_epochs, $lr, false, $cdk)

### Using .= to update the params

In [None]:

"""
    fit_with_dot!(rbm, X, batch_size, n_epochs, lr, shuffle_data, opt)

Train `rbm` for `n_epochs` passes over the columns of `X`, calling
`partial_fit_with_dot!` on consecutive minibatches of at most `batch_size` columns.

# Arguments
- `rbm::RBM`: model updated in place; `n_epochs_trained` is incremented after every
  epoch and `trained` is set to `true` at the end.
- `X::Matrix`: training data, one sample per column. It is only read.
- `batch_size::Integer`: number of columns per minibatch.
- `n_epochs::Integer`: number of passes over `X`.
- `lr::Real`: learning rate; converted to `eltype(X)`.
- `shuffle_data::Bool`: when `true`, the column order is reshuffled before every epoch.
- `opt`: optimizer state forwarded to `partial_fit_with_dot!`; its `rec_error` field is
  read after every minibatch.

# Returns
A `Vector{eltype(X)}` with the mean per-minibatch reconstruction error of each epoch.
Each value is also printed.

NOTE(review): the buffers in `opt` are sized for a fixed number of columns, so a final
minibatch shorter than that presumably fails inside `compute_grad_with_dot!` — confirm
before using an `X` whose column count is not a multiple of `batch_size`.
"""
function fit_with_dot!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)

    T = eltype(X)
    lr = T(lr)
    n_samples = size(X, 2)
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = collect(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = T[]

    for epoch in 1:n_epochs
        # opt.rec_error is a Float64, so accumulate in Float64 from the start instead
        # of letting the accumulator change type after the first minibatch.
        rec_error = 0.0

        # Shuffle the column order, not the data: X is left untouched (it stays
        # aligned with whatever labels the caller holds) and the dataset is not
        # copied every epoch.
        if shuffle_data
            shuffle!(sample_perm)
        end

        for minibatch_ind in indicies
            # Without shuffling, keep the plain range slice.
            minibatch = shuffle_data ? X[:, sample_perm[minibatch_ind]] : X[:, minibatch_ind]
            partial_fit_with_dot!(rbm, minibatch, lr, opt)
            rec_error += opt.rec_error
        end

        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained += 1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

"""
    partial_fit_with_dot!(rbm::RBM, X::Matrix, lr::Real, opt::CDK)

Run one training step on the minibatch `X`: `compute_grad_with_dot!` stores the
gradients in `opt`, then `update_params!` applies them to `rbm` with learning rate `lr`.
Returns whatever `update_params!` returns.
"""
function partial_fit_with_dot!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad_with_dot!(rbm, X, opt)
    return update_params!(rbm, opt, lr)
end

"""
    compute_grad_with_dot!(rbm::RBM, X::Matrix, opt::CDK)

Same computation as the baseline gradient function, but every result array, including
the three gradient fields, is written into the existing arrays of `opt` with `.=`.

The random numbers of each Gibbs step are copied into `opt.V_sampling` before they are
used. Returns the reconstruction error, which is also stored in `opt.rec_error`.
"""
function compute_grad_with_dot!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    n_cols = size(X, 2)

    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        opt.V_sampling .= rand(T, rbm.n_vis, n_cols)

        # The first step starts from the data-driven hidden activations,
        # every later step from the previous reconstruction.
        if k == 1
            opt.H .= sigmoid.(rbm.W * X .+ rbm.hid_bias)
        end
        hidden = k == 1 ? opt.H : opt.H_hat

        opt.V_hat .= sigmoid.(rbm.W' * hidden .+ rbm.vis_bias) .> opt.V_sampling
        opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias)
    end

    positive_phase = opt.H * X'
    negative_phase = opt.H_hat * opt.V_hat'
    opt.grad_W .= (positive_phase .- negative_phase) ./ n_cols
    opt.grad_vis_bias .= vec(sum(X .- opt.V_hat, 2)) ./ n_cols
    opt.grad_hid_bias .= vec(sum(opt.H .- opt.H_hat, 2)) ./ n_cols
    opt.rec_error = sqrt(sum((X .- opt.V_hat).^2))
end

In [None]:
# Fresh model (784 visible units, 20 hidden units, sigma = 0.01, Float32 parameters)
# and CD-K buffers (K = 2 Gibbs steps, minibatches of 500 columns) for the benchmarks.
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [None]:
# `$` evaluates each argument once, before the timing starts, so the measurement
# contains neither global-variable access nor the copy made by X_train[:,1:500].
@benchmark partial_fit_with_dot!($rbm, $(X_train[:,1:500]), 0.1, $cdk)

In [None]:
n_epochs = 1
batch_size = 500
K = 1   # NOTE(review): not passed to fit_with_dot!; the Gibbs step count is the K stored in cdk
lr = 0.05
# `$` keeps global-variable access out of the measurement.
@benchmark fit_with_dot!($rbm, $X_train, $batch_size, $n_epochs, $lr, false, $cdk)

# Making propup inplace

### Inplace matrix multiplication

**A_mul_B!(Y, A, B) → Y**

Calculates the matrix-matrix or matrix-vector product 𝐴 · 𝐵 and stores the result in Y, overwriting the existing value of Y. Note that Y must not be aliased with either A or B.

In [None]:

"""
    fit_with_dot2!(rbm, X, batch_size, n_epochs, lr, shuffle_data, opt)

Train `rbm` for `n_epochs` passes over the columns of `X`, calling
`partial_fit_with_dot2!` on consecutive minibatches of at most `batch_size` columns.

# Arguments
- `rbm::RBM`: model updated in place; `n_epochs_trained` is incremented after every
  epoch and `trained` is set to `true` at the end.
- `X::Matrix`: training data, one sample per column. It is only read.
- `batch_size::Integer`: number of columns per minibatch.
- `n_epochs::Integer`: number of passes over `X`.
- `lr::Real`: learning rate; converted to `eltype(X)`.
- `shuffle_data::Bool`: when `true`, the column order is reshuffled before every epoch.
- `opt`: optimizer state forwarded to `partial_fit_with_dot2!`; its `rec_error` field
  is read after every minibatch.

# Returns
A `Vector{eltype(X)}` with the mean per-minibatch reconstruction error of each epoch.
Each value is also printed.

NOTE(review): the buffers in `opt` are sized for a fixed number of columns, so a final
minibatch shorter than that presumably fails inside `compute_grad_with_dot2!` — confirm
before using an `X` whose column count is not a multiple of `batch_size`.
"""
function fit_with_dot2!(rbm::RBM, 
              X::Matrix, 
              batch_size::Integer,
              n_epochs::Integer,
              lr::Real,
              shuffle_data::Bool,
              opt)

    T = eltype(X)
    lr = T(lr)
    n_samples = size(X, 2)
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = collect(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = T[]

    for epoch in 1:n_epochs
        # opt.rec_error is a Float64, so accumulate in Float64 from the start instead
        # of letting the accumulator change type after the first minibatch.
        rec_error = 0.0

        # Shuffle the column order, not the data: X is left untouched (it stays
        # aligned with whatever labels the caller holds) and the dataset is not
        # copied every epoch.
        if shuffle_data
            shuffle!(sample_perm)
        end

        for minibatch_ind in indicies
            # Without shuffling, keep the plain range slice.
            minibatch = shuffle_data ? X[:, sample_perm[minibatch_ind]] : X[:, minibatch_ind]
            partial_fit_with_dot2!(rbm, minibatch, lr, opt)
            rec_error += opt.rec_error
        end

        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained += 1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

"""
    partial_fit_with_dot2!(rbm::RBM, X::Matrix, lr::Real, opt::CDK)

Run one training step on the minibatch `X`: `compute_grad_with_dot2!` stores the
gradients in `opt`, then `update_params!` applies them to `rbm` with learning rate `lr`.
Returns whatever `update_params!` returns.
"""
function partial_fit_with_dot2!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    compute_grad_with_dot2!(rbm, X, opt)
    return update_params!(rbm, opt, lr)
end

"""
    compute_grad_with_dot2!(rbm::RBM, X::Matrix, opt::CDK)

Gradient computation that additionally makes the hidden-unit pre-activation in-place:
the products `rbm.W * X` and `rbm.W * opt.V_hat` are written into `opt.H_aux` with
`A_mul_B!`, and the random numbers are generated directly inside `opt.V_sampling` with
`rand!`.

All results (`H`, `V_hat`, `H_hat`, the three gradients) are written into the existing
arrays of `opt`. Returns the reconstruction error, which is also stored in
`opt.rec_error`.
"""
function compute_grad_with_dot2!(rbm::RBM, X::Matrix,  opt::CDK)

    n_cols = size(X, 2)

    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        rand!(opt.V_sampling)

        # The first step starts from the data-driven hidden activations,
        # every later step from the previous reconstruction.
        if k == 1
            opt.H .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, X) .+ rbm.hid_bias)
        end
        hidden = k == 1 ? opt.H : opt.H_hat

        opt.V_hat .= sigmoid.(rbm.W' * hidden .+ rbm.vis_bias) .> opt.V_sampling
        # H_aux is only a scratch buffer: it is neither W nor V_hat, as A_mul_B! requires.
        opt.H_hat .= sigmoid.(A_mul_B!(opt.H_aux, rbm.W, opt.V_hat) .+ rbm.hid_bias)
    end

    opt.grad_W .= (opt.H * X' .- opt.H_hat * opt.V_hat') ./ n_cols
    opt.grad_vis_bias .= squeeze(sum(X .- opt.V_hat, 2), 2) ./ n_cols
    opt.grad_hid_bias .= squeeze(sum(opt.H .- opt.H_hat, 2), 2) ./ n_cols
    opt.rec_error = sqrt(sum((X .- opt.V_hat).^2))
end

In [None]:
# `$cdk` is interpolated so that access to the global variable is not part of the timing.
@benchmark squeeze(sum(($cdk.H .- $cdk.H), 2),2)

In [None]:
# `$cdk` is interpolated so that access to the global variable is not part of the timing.
@benchmark vec(sum(($cdk.H .- $cdk.H), 2))

In [None]:
# Fresh model (784 visible units, 20 hidden units, sigma = 0.01, Float32 parameters)
# and CD-K buffers (K = 2 Gibbs steps, minibatches of 500 columns) for the benchmarks.
rbm = initialize_RBM(784, 20, 0.01, Float32);
cdk = initialize_CDK(rbm, 2, 500);

In [None]:
# `$` evaluates each argument once, before the timing starts, so the measurement
# contains neither global-variable access nor the copy made by X_train[:,1:500].
@benchmark partial_fit_with_dot2!($rbm, $(X_train[:,1:500]), 0.1, $cdk)

In [None]:
n_epochs = 1
batch_size = 500
K = 1   # NOTE(review): not passed to fit_with_dot2!; the Gibbs step count is the K stored in cdk
lr = 0.05
# `$` keeps global-variable access out of the measurement.
@benchmark fit_with_dot2!($rbm, $X_train, $batch_size, $n_epochs, $lr, false, $cdk)