-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu_bind
executable file
·37 lines (33 loc) · 1.26 KB
/
gpu_bind
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
#
# gpu_bind - per-rank launcher helper invoked by mpibind.
#
# Picks one CPU core for the calling MPI rank based on its node-local rank,
# asks nvidia-smi which GPU has affinity with that core, exports the answer as
# CUDA_VISIBLE_DEVICES and MPIBIND_GPU_UUID, writes a one-line summary to
# "$tmp_logfile.rankinfo.<global rank>", and finally runs the wrapped command
# pinned to the chosen core with numactl.
#
# Usage (through mpibind only):
#   gpu_bind <command> [args...]
#
# Environment read:
#   tmp_logfile               prefix for the per-rank info file (required)
#   LMOD_FAMILY_MPI           "cray-mpich" or "openmpi"
#   PMI_RANK, PMI_LOCAL_RANK, PMI_LOCAL_SIZE        (cray-mpich)
#   OMPI_COMM_WORLD_RANK, OMPI_COMM_WORLD_LOCAL_RANK,
#   OMPI_COMM_WORLD_LOCAL_SIZE                      (openmpi)
#   MPICH_OFI_NIC_POLICY      set to "GPU" when empty or unset
#
# Exit status:
#   1  tmp_logfile is not set
#   2  LMOD_FAMILY_MPI is neither cray-mpich nor openmpi
#   3  rank or CPU topology values are missing or unusable
#   otherwise the status of the final numactl invocation

if [[ -z "$tmp_logfile" ]]; then
  echo "ERROR: tmp_logfile is undefined, it should be set by mpibind!" >&2
  echo "($0 is not intended to be called directly)" >&2
  exit 1
fi

# Map the MPI-specific rank variables onto common names. The selector is
# quoted so an unset or empty LMOD_FAMILY_MPI falls cleanly into the default
# arm instead of producing a test syntax error.
case "$LMOD_FAMILY_MPI" in
  cray-mpich)
    export MPICH_GPU_SUPPORT_ENABLED=1
    grank=$PMI_RANK
    lrank=$PMI_LOCAL_RANK
    lsize=$PMI_LOCAL_SIZE
    ;;
  openmpi)
    grank=$OMPI_COMM_WORLD_RANK
    lrank=$OMPI_COMM_WORLD_LOCAL_RANK
    lsize=$OMPI_COMM_WORLD_LOCAL_SIZE
    ;;
  *)
    >&2 echo "Error: mpibind requires cray-mpich for binding GPU applications"
    exit 2
    ;;
esac

# Respect a caller-provided NIC policy; default to GPU otherwise.
if [[ -z "$MPICH_OFI_NIC_POLICY" ]]; then
  export MPICH_OFI_NIC_POLICY=GPU
fi

# Socket count and cores per socket as reported by lscpu. The C locale keeps
# the field labels that the patterns below rely on.
nsock=$(LC_ALL=C lscpu | grep Socket | awk '{print $2}')
cpsock=$(LC_ALL=C lscpu | grep "per sock" | awk '{print $4}')

# Everything used in the arithmetic below must be a plain decimal integer;
# otherwise the shell would abort with a cryptic arithmetic error.
for var_name in nsock cpsock lsize lrank; do
  if [[ ! "${!var_name}" =~ ^[0-9]+$ ]]; then
    echo "Error: $0: ${var_name}='${!var_name}' is not a non-negative integer" >&2
    exit 3
  fi
done
if (( nsock < 1 || cpsock < 1 || lsize < 1 )); then
  echo "Error: $0: nsock=$nsock, cpsock=$cpsock and lsize=$lsize must all be >= 1" >&2
  exit 3
fi

# Ranks per socket, rounded up. (Adding the raw remainder instead of rounding
# overshoots as soon as there are more than two sockets.)
rank_per_sock=$(( (lsize + nsock - 1) / nsock ))

# Distance between consecutive candidate cores.
stride=$(( cpsock / rank_per_sock ))
if (( stride < 1 )); then
  echo "Error: $0: $rank_per_sock ranks per socket exceed $cpsock cores per socket" >&2
  exit 3
fi

# Candidate cores: 0, stride, 2*stride, ... below cpsock.
# NOTE(review): the candidates only span core ids 0..cpsock-1, so with more
# than one socket there may be fewer candidates than local ranks - confirm
# whether the upper bound should be nsock*cpsock.
cores=()
for (( core = 0; core < cpsock; core += stride )); do
  cores+=("$core")
done

if (( lrank >= ${#cores[@]} )); then
  echo "Error: $0: local rank $lrank has no candidate core (${#cores[@]} available)" >&2
  exit 3
fi
my_core=${cores[lrank]}

# GPU index with affinity to the chosen core: the line(s) of
# "nvidia-smi topo -c" output that start with a digit.
CUDA_VISIBLE_DEVICES=$(nvidia-smi topo -c "$my_core" | grep -E "^[0-9]+")
export CUDA_VISIBLE_DEVICES

# UUID of that GPU, extracted from its "GPU <index>:" line in "nvidia-smi -L".
MPIBIND_GPU_UUID=$(nvidia-smi -L \
  | grep "GPU $CUDA_VISIBLE_DEVICES:" \
  | grep -E -o "GPU-[a-f0-9]+-[a-f0-9]+-[a-f0-9]+-[a-f0-9]+-[a-f0-9]+")
export MPIBIND_GPU_UUID

echo "rank: $grank, cores: $my_core, CUDA DEVICE: $CUDA_VISIBLE_DEVICES, UUID: $MPIBIND_GPU_UUID" > "$tmp_logfile.rankinfo.$grank"

# Run the wrapped command pinned to the chosen core. "$@" keeps every
# argument intact, including ones containing whitespace or glob characters.
numactl -C "$my_core" "$@"