In [1]:
using ITensors
using CUDA

In [2]:
CUDA.devices()

CUDA.DeviceIterator() for 2 devices:
0. Tesla V100-SXM2-32GB
1. Tesla V100-SXM2-32GB

In [3]:
#monitoring multiples gpu functions:

function memory_info_all_gpus(print_info = true)
    
    percentages = []

    scale = 1/(1024^3) #converty bytes to GB
    for (i, dev) in enumerate(CUDA.NVML.devices())

        name = CUDA.NVML.name(dev) 
        mem_info = CUDA.NVML.memory_info(dev)
        total = round(mem_info.total*scale, sigdigits=4)
        used = round(mem_info.used*scale, sigdigits=4)
        free = round(mem_info.free*scale, sigdigits=4)
        percentage= round(used*100/total, sigdigits=4)
        
        print_info ? println("$name #$i memory usage: $percentage % ($used GB/ $total GB)" ) : nothing
        
        append!(percentages, percentage)
    end
    
    return percentages
end

function clean_all_gpus(Deep_cleaning = false)
    for i=reverse(0:length(CUDA.devices()) - 1)
        global current_gpu = i
        CUDA.device!(current_gpu)
        Deep_cleaning ? GC.gc(true) : nothing #This could be very slow.
        CUDA.reclaim()
    end
end

clean_all_gpus (generic function with 2 methods)

In [4]:
@time memory_info_all_gpus()
@time CUDA.memory_status() #It just print the one that is in use.

Tesla V100-SXM2-32GB #1 memory usage: 0.8259 % (0.2643 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 0.8259 % (0.2643 GB/ 32.0 GB)
  0.154710 seconds (46.23 k allocations: 3.479 MiB, 15.55% gc time, 45.80% compilation time)
Effective GPU memory usage: 0.95% (309.812 MiB/31.739 GiB)
Memory pool usage: 0 bytes (0 bytes reserved)
  0.577573 seconds (538.73 k allocations: 36.108 MiB, 1.97% gc time, 78.53% compilation time)


In [53]:
function Create_H_MPO(t,U, N, sites = []) 

    if length(sites) == 0
        sites = siteinds("Electron",N)    
    end
    
    os = OpSum() 
    for j=1:N-1 
        os += -t,"Cdagup",j,"Cup",j+1 
        os += -t,"Cdagup",j+1,"Cup",j
        os += -t,"Cdagdn",j,"Cdn",j+1 
        os += -t,"Cdagdn",j+1,"Cdn",j
    end 

    for j=1:N
        os += U,"Nup * Ndn",j
    end
    # Convert these terms to an MPO 
    H = MPO(os,sites)
    return H, sites
end

#Custom observer to measure the use of GPU:

mutable struct DemoObserver <: AbstractObserver
    energy_tol::Float64
    last_energy::Float64

    DemoObserver(energy_tol=0.0) = new(energy_tol,1000.0)
end

function ITensors.checkdone!(o::DemoObserver;kwargs...)
    
    CUDA.reclaim()
    memory_info_all_gpus() #Print GPU percentage of use. 
        
    sw = kwargs[:sweep]
    energy = kwargs[:energy]
    if abs(energy-o.last_energy)/abs(energy) < o.energy_tol
    println("Stopping DMRG after sweep $sw")
    return true
    end
    # Otherwise, update last_energy and keep going
    o.last_energy = energy
    return false
end

In [54]:
clean_all_gpus(true)
memory_info_all_gpus()

Tesla V100-SXM2-32GB #1 memory usage: 10.39 % (3.325 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)


2-element Vector{Any}:
 10.39
  2.071

In [55]:
N = 60
t = 1
U = 1

nsweeps = 100
maxdim = [1000] #maxdim - integer or array of integers specifying the maximum size allowed for the bond dimension or rank of the MPS being optimized
cutoff = [1E-10] #maxdim - integer or array of integers specifying the maximum size allowed for the bond dimension or rank of the MPS being optimized
# DMRG_observer = DMRGObserver(;energy_tol=10e-8, minsweeps=10, energy_type=Float64)
DMRG_observer = DemoObserver(10e-8)

DemoObserver(1.0e-7, 1000.0)

In [56]:
H, sites = Create_H_MPO(t,U, N) 
Initial_Guess = randomMPS(sites);
# @time energy_ground_state, psi_ground_state = dmrg(H,Initial_Guess; nsweeps, maxdim, cutoff, observer = DMRG_observer, outputlevel = 1) 

In [57]:
H = NDTensors.cu(H)
Initial_Guess = NDTensors.cu(Initial_Guess)

# H = NDTensors.cu(H; storagemode=CUDA.UnifiedMemory)
# Initial_Guess = NDTensors.cu(Initial_Guess; storagemode=CUDA.UnifiedMemory)

MPS
[1] ((dim=4|id=730|"Electron,Site,n=1"), (dim=1|id=565|"Link,l=1"))
[2] ((dim=1|id=565|"Link,l=1"), (dim=4|id=40|"Electron,Site,n=2"), (dim=1|id=330|"Link,l=2"))
[3] ((dim=1|id=330|"Link,l=2"), (dim=4|id=252|"Electron,Site,n=3"), (dim=1|id=12|"Link,l=3"))
[4] ((dim=1|id=12|"Link,l=3"), (dim=4|id=842|"Electron,Site,n=4"), (dim=1|id=811|"Link,l=4"))
[5] ((dim=1|id=811|"Link,l=4"), (dim=4|id=706|"Electron,Site,n=5"), (dim=1|id=472|"Link,l=5"))
[6] ((dim=1|id=472|"Link,l=5"), (dim=4|id=614|"Electron,Site,n=6"), (dim=1|id=182|"Link,l=6"))
[7] ((dim=1|id=182|"Link,l=6"), (dim=4|id=446|"Electron,Site,n=7"), (dim=1|id=640|"Link,l=7"))
[8] ((dim=1|id=640|"Link,l=7"), (dim=4|id=620|"Electron,Site,n=8"), (dim=1|id=829|"Link,l=8"))
[9] ((dim=1|id=829|"Link,l=8"), (dim=4|id=506|"Electron,Site,n=9"), (dim=1|id=834|"Link,l=9"))
[10] ((dim=1|id=834|"Link,l=9"), (dim=4|id=163|"Electron,Site,n=10"), (dim=1|id=581|"Link,l=10"))
[11] ((dim=1|id=581|"Link,l=10"), (dim=4|id=453|"Electron,Site,n=11"), (d

In [58]:
@time energy_ground_state, psi_ground_state = dmrg(H,Initial_Guess; nsweeps, maxdim, cutoff, observer = DMRG_observer, outputlevel = 1) 

After sweep 1 energy=-61.598267913887405  maxlinkdim=16 maxerr=3.34E-16 time=0.458
Tesla V100-SXM2-32GB #1 memory usage: 10.39 % (3.325 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 2 energy=-63.487609988293784  maxlinkdim=172 maxerr=9.98E-11 time=1.115
Tesla V100-SXM2-32GB #1 memory usage: 16.05 % (5.137 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 3 energy=-63.60981980908561  maxlinkdim=243 maxerr=1.00E-10 time=2.075
Tesla V100-SXM2-32GB #1 memory usage: 23.77 % (7.606 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 4 energy=-63.65560593732036  maxlinkdim=286 maxerr=9.99E-11 time=2.729
Tesla V100-SXM2-32GB #1 memory usage: 19.96 % (6.387 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 5 energy=-63.67791747944098  maxlinkdim=332 maxerr=1.00E-10 time=3.688
Tesla V100-SXM2-32GB #1 memory usage: 50.53 % (16.17

(-63.71610197701952, MPS
[1] ((dim=4|id=214|"Link,l=1"), (dim=4|id=730|"Electron,Site,n=1"))
[2] ((dim=16|id=633|"Link,l=2"), (dim=4|id=40|"Electron,Site,n=2"), (dim=4|id=214|"Link,l=1"))
[3] ((dim=4|id=252|"Electron,Site,n=3"), (dim=63|id=632|"Link,l=3"), (dim=16|id=633|"Link,l=2"))
[4] ((dim=4|id=842|"Electron,Site,n=4"), (dim=164|id=493|"Link,l=4"), (dim=63|id=632|"Link,l=3"))
[5] ((dim=4|id=706|"Electron,Site,n=5"), (dim=302|id=928|"Link,l=5"), (dim=164|id=493|"Link,l=4"))
[6] ((dim=4|id=614|"Electron,Site,n=6"), (dim=399|id=788|"Link,l=6"), (dim=302|id=928|"Link,l=5"))
[7] ((dim=4|id=446|"Electron,Site,n=7"), (dim=515|id=438|"Link,l=7"), (dim=399|id=788|"Link,l=6"))
[8] ((dim=4|id=620|"Electron,Site,n=8"), (dim=585|id=763|"Link,l=8"), (dim=515|id=438|"Link,l=7"))
[9] ((dim=4|id=506|"Electron,Site,n=9"), (dim=657|id=497|"Link,l=9"), (dim=585|id=763|"Link,l=8"))
[10] ((dim=4|id=163|"Electron,Site,n=10"), (dim=733|id=493|"Link,l=10"), (dim=657|id=497|"Link,l=9"))
[11] ((dim=4|id=453|

Here is clear that ITensors - DMRG does not works automatically with multiple GPUS. 

In [59]:
clean_all_gpus(true)
memory_info_all_gpus()

Tesla V100-SXM2-32GB #1 memory usage: 16.4 % (5.249 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)


2-element Vector{Any}:
 16.4
  2.071

In [60]:
#Final version ? 

global current_gpu = 0 #the code always starts running with GPU 1.
CUDA.device!(current_gpu)

function ITensors.checkdone!(o::DemoObserver;kwargs...)
    
    CUDA.reclaim() #It is the faster way to clean.
    percentage = memory_info_all_gpus() #Print GPU percentage of use. 
    
    if percentage[current_gpu + 1] >= 90
        
        global current_gpu = current_gpu + 1
        
        # current_gpu == length(CUDA.devices()) ? clean_all_gpus() : @time CUDA.device!(current_gpu) 
        
        if current_gpu == length(CUDA.devices()) 
            global current_gpu = 0 
        end
        
        CUDA.device!(current_gpu)
    end    
        
    sw = kwargs[:sweep]
    energy = kwargs[:energy]
    if abs(energy-o.last_energy)/abs(energy) < o.energy_tol
    println("Stopping DMRG after sweep $sw")
    return true
    end
    # Otherwise, update last_energy and keep going
    o.last_energy = energy
    return false
end

In [61]:
@time energy_ground_state, psi_ground_state = dmrg(H,Initial_Guess; nsweeps, maxdim, cutoff, observer = DMRG_observer, outputlevel = 1) 

After sweep 1 energy=-61.598267913887405  maxlinkdim=16 maxerr=3.34E-16 time=0.446
Tesla V100-SXM2-32GB #1 memory usage: 16.4 % (5.249 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 2 energy=-63.487609988293784  maxlinkdim=172 maxerr=9.98E-11 time=1.044
Tesla V100-SXM2-32GB #1 memory usage: 23.92 % (7.655 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 3 energy=-63.60981980908561  maxlinkdim=243 maxerr=1.00E-10 time=2.018
Tesla V100-SXM2-32GB #1 memory usage: 46.78 % (14.97 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 4 energy=-63.65560593732036  maxlinkdim=286 maxerr=9.99E-11 time=2.685
Tesla V100-SXM2-32GB #1 memory usage: 26.36 % (8.436 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 2.071 % (0.6628 GB/ 32.0 GB)
After sweep 5 energy=-63.67791747944098  maxlinkdim=332 maxerr=1.00E-10 time=3.671
Tesla V100-SXM2-32GB #1 memory usage: 51.66 % (16.53 

(-63.71610197701952, MPS
[1] ((dim=4|id=686|"Link,l=1"), (dim=4|id=730|"Electron,Site,n=1"))
[2] ((dim=16|id=568|"Link,l=2"), (dim=4|id=40|"Electron,Site,n=2"), (dim=4|id=686|"Link,l=1"))
[3] ((dim=4|id=252|"Electron,Site,n=3"), (dim=63|id=808|"Link,l=3"), (dim=16|id=568|"Link,l=2"))
[4] ((dim=4|id=842|"Electron,Site,n=4"), (dim=164|id=668|"Link,l=4"), (dim=63|id=808|"Link,l=3"))
[5] ((dim=4|id=706|"Electron,Site,n=5"), (dim=302|id=869|"Link,l=5"), (dim=164|id=668|"Link,l=4"))
[6] ((dim=4|id=614|"Electron,Site,n=6"), (dim=399|id=895|"Link,l=6"), (dim=302|id=869|"Link,l=5"))
[7] ((dim=4|id=446|"Electron,Site,n=7"), (dim=515|id=921|"Link,l=7"), (dim=399|id=895|"Link,l=6"))
[8] ((dim=4|id=620|"Electron,Site,n=8"), (dim=585|id=967|"Link,l=8"), (dim=515|id=921|"Link,l=7"))
[9] ((dim=4|id=506|"Electron,Site,n=9"), (dim=657|id=235|"Link,l=9"), (dim=585|id=967|"Link,l=8"))
[10] ((dim=4|id=163|"Electron,Site,n=10"), (dim=733|id=612|"Link,l=10"), (dim=657|id=235|"Link,l=9"))
[11] ((dim=4|id=453|