In [1]:
using ITensors
using CUDA

In [2]:
CUDA.devices()

CUDA.DeviceIterator() for 2 devices:
0. Tesla V100-SXM2-32GB
1. Tesla V100-SXM2-32GB

In [3]:
#monitoring multiples gpu functions:

function memory_info_all_gpus(print_info = true)
    
    percentages = []

    scale = 1/(1024^3) #converty bytes to GB
    for (i, dev) in enumerate(CUDA.NVML.devices())

        name = CUDA.NVML.name(dev) 
        mem_info = CUDA.NVML.memory_info(dev)
        total = round(mem_info.total*scale, sigdigits=4)
        used = round(mem_info.used*scale, sigdigits=4)
        free = round(mem_info.free*scale, sigdigits=4)
        percentage= round(used*100/total, sigdigits=4)
        
        print_info ? println("$name #$i memory usage: $percentage % ($used GB/ $total GB)" ) : nothing
        
        append!(percentages, percentage)
    end
    
    return percentages
end

function clean_all_gpus(Deep_cleaning = false)
    for i=reverse(0:length(CUDA.devices()) - 1)
        global current_gpu = i
        CUDA.device!(current_gpu)
        Deep_cleaning ? GC.gc(true) : nothing #This could be very slow.
        CUDA.reclaim()
    end
end

clean_all_gpus (generic function with 2 methods)

In [4]:
@time memory_info_all_gpus()
@time CUDA.memory_status() #It just print the one that is in use.

Tesla V100-SXM2-32GB #1 memory usage: 0.8259 % (0.2643 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 0.8259 % (0.2643 GB/ 32.0 GB)
  0.121026 seconds (46.23 k allocations: 3.477 MiB, 46.53% compilation time)
Effective GPU memory usage: 0.95% (309.812 MiB/31.739 GiB)
Memory pool usage: 0 bytes (0 bytes reserved)
  0.774600 seconds (538.69 k allocations: 36.105 MiB, 28.96% gc time, 86.25% compilation time)


In [21]:
function Create_H_MPO(t,U, N, sites = []) 

    if length(sites) == 0
        sites = siteinds("Electron",N)    
    end
    
    os = OpSum() 
    for j=1:N-1 
        os += -t,"Cdagup",j,"Cup",j+1 
        os += -t,"Cdagup",j+1,"Cup",j
        os += -t,"Cdagdn",j,"Cdn",j+1 
        os += -t,"Cdagdn",j+1,"Cdn",j
    end 

    for j=1:N
        os += U,"Nup * Ndn",j
    end
    # Convert these terms to an MPO 
    H = MPO(os,sites)
    return H, sites
end

#Custom observer to measure the use of GPU:

mutable struct DemoObserver <: AbstractObserver
    energy_tol::Float64
    last_energy::Float64

    DemoObserver(energy_tol=0.0) = new(energy_tol,1000.0)
end

function ITensors.checkdone!(o::DemoObserver;kwargs...)
    
    CUDA.reclaim()
    memory_info_all_gpus() #Print GPU percentage of use. 
        
    sw = kwargs[:sweep]
    energy = kwargs[:energy]
    if abs(energy-o.last_energy)/abs(energy) < o.energy_tol
    println("Stopping DMRG after sweep $sw")
    return true
    end
    # Otherwise, update last_energy and keep going
    o.last_energy = energy
    return false
end

In [22]:
clean_all_gpus(true)
memory_info_all_gpus()

Tesla V100-SXM2-32GB #1 memory usage: 2.407 % (0.7702 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)


2-element Vector{Any}:
 2.407
 1.76

In [23]:
N = 140
t = 1
U = 1

nsweeps = 100
maxdim = [1500] #maxdim - integer or array of integers specifying the maximum size allowed for the bond dimension or rank of the MPS being optimized
cutoff = [1E-10] #maxdim - integer or array of integers specifying the maximum size allowed for the bond dimension or rank of the MPS being optimized
# DMRG_observer = DMRGObserver(;energy_tol=10e-8, minsweeps=10, energy_type=Float64)
DMRG_observer = DemoObserver(10e-8)

DemoObserver(1.0e-7, 1000.0)

In [24]:
H, sites = Create_H_MPO(t,U, N) 
Initial_Guess = randomMPS(sites);
# @time energy_ground_state, psi_ground_state = dmrg(H,Initial_Guess; nsweeps, maxdim, cutoff, observer = DMRG_observer, outputlevel = 1) 

In [25]:
H = NDTensors.cu(H)
Initial_Guess = NDTensors.cu(Initial_Guess);

# H = NDTensors.cu(H; storagemode=CUDA.UnifiedMemory)
# Initial_Guess = NDTensors.cu(Initial_Guess; storagemode=CUDA.UnifiedMemory);

In [26]:
@time energy_ground_state, psi_ground_state = dmrg(H,Initial_Guess; nsweeps, maxdim, cutoff, observer = DMRG_observer, outputlevel = 1) 

After sweep 1 energy=-145.020342982526  maxlinkdim=16 maxerr=3.70E-16 time=1.084
Tesla V100-SXM2-32GB #1 memory usage: 2.7 % (0.864 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 2 energy=-148.80206937737617  maxlinkdim=168 maxerr=1.00E-10 time=2.532
Tesla V100-SXM2-32GB #1 memory usage: 20.09 % (6.428 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 3 energy=-149.05029474281758  maxlinkdim=219 maxerr=1.00E-10 time=4.607
Tesla V100-SXM2-32GB #1 memory usage: 31.02 % (9.928 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 4 energy=-149.1879160956947  maxlinkdim=251 maxerr=1.00E-10 time=4.933
Tesla V100-SXM2-32GB #1 memory usage: 31.81 % (10.18 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 5 energy=-149.28202884310565  maxlinkdim=365 maxerr=9.99E-11 time=5.586
Tesla V100-SXM2-32GB #1 memory usage: 25.07 % (8.022 GB/ 3

LoadError: Out of GPU memory trying to allocate 1.565 GiB
Effective GPU memory usage: 82.25% (26.105 GiB/31.739 GiB)
Memory pool usage: 20.735 GiB (25.719 GiB reserved)


Here is clear that ITensors - DMRG does not works automatically with multiple GPUS. 

In [18]:
clean_all_gpus(true)
memory_info_all_gpus()

Tesla V100-SXM2-32GB #1 memory usage: 2.309 % (0.739 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)


2-element Vector{Any}:
 2.309
 1.76

In [19]:
#Final version ? 

global current_gpu = 0 #the code always starts running with GPU 1.
CUDA.device!(current_gpu)

function ITensors.checkdone!(o::DemoObserver;kwargs...)
    
    CUDA.reclaim() #It is the faster way to clean.
    percentage = memory_info_all_gpus() #Print GPU percentage of use. 
    
    if percentage[current_gpu + 1] >= 90
        
        global current_gpu = current_gpu + 1
        
        current_gpu == length(CUDA.devices()) ? CUDA.reclaim() : @time CUDA.device!(current_gpu) 
        
        # if current_gpu == length(CUDA.devices()) 
        #     global current_gpu = 0 
        # end
        
        CUDA.device!(current_gpu)
    end    
        
    sw = kwargs[:sweep]
    energy = kwargs[:energy]
    if abs(energy-o.last_energy)/abs(energy) < o.energy_tol
    println("Stopping DMRG after sweep $sw")
    return true
    end
    # Otherwise, update last_energy and keep going
    o.last_energy = energy
    return false
end

In [20]:
@time energy_ground_state, psi_ground_state = dmrg(H,Initial_Guess; nsweeps, maxdim, cutoff, observer = DMRG_observer, outputlevel = 1) 

After sweep 1 energy=-5.377057396686317  maxlinkdim=16 maxerr=0.00E+00 time=0.034
Tesla V100-SXM2-32GB #1 memory usage: 2.309 % (0.739 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 2 energy=-5.635730087143738  maxlinkdim=62 maxerr=7.56E-11 time=0.041
Tesla V100-SXM2-32GB #1 memory usage: 2.309 % (0.739 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 3 energy=-5.640267766121047  maxlinkdim=60 maxerr=6.30E-11 time=0.040
Tesla V100-SXM2-32GB #1 memory usage: 2.407 % (0.7702 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 4 energy=-5.641159604795807  maxlinkdim=59 maxerr=5.02E-11 time=0.040
Tesla V100-SXM2-32GB #1 memory usage: 2.505 % (0.8015 GB/ 32.0 GB)
Tesla V100-SXM2-32GB #2 memory usage: 1.76 % (0.5632 GB/ 32.0 GB)
After sweep 5 energy=-5.641799391896577  maxlinkdim=59 maxerr=4.53E-11 time=0.040
Tesla V100-SXM2-32GB #1 memory usage: 2.602 % (0.8327 GB/ 32

(-5.644196619655373, MPS
[1] ((dim=4|id=431|"Link,l=1"), (dim=4|id=929|"Electron,Site,n=1"))
[2] ((dim=16|id=847|"Link,l=2"), (dim=4|id=800|"Electron,Site,n=2"), (dim=4|id=431|"Link,l=1"))
[3] ((dim=4|id=748|"Electron,Site,n=3"), (dim=34|id=795|"Link,l=3"), (dim=16|id=847|"Link,l=2"))
[4] ((dim=4|id=545|"Electron,Site,n=4"), (dim=16|id=851|"Link,l=4"), (dim=34|id=795|"Link,l=3"))
[5] ((dim=4|id=626|"Electron,Site,n=5"), (dim=4|id=437|"Link,l=5"), (dim=16|id=851|"Link,l=4"))
[6] ((dim=4|id=86|"Electron,Site,n=6"), (dim=4|id=437|"Link,l=5"))
)