Skip to content

Commit

Permalink
Add PGO+LTO Makefile
Browse files Browse the repository at this point in the history
Adds a convenient way to enable PGO+LTO on Julia and LLVM together:

1. `cd contrib/pgo-lto`
2. `make -j$(nproc) stage1`
3. `make clean-profiles`
4. `./stage1.build/julia -O3 -e 'using Pkg; Pkg.add("LoopVectorization"); Pkg.test("LoopVectorization")'`
5. `make -j$(nproc) stage2`

This quite often results in spectacular speedups for time-to-first-X, as
it reduces the time spent in LLVM optimization passes by 25% or even 30%.

Example 1:

```julia
using LoopVectorization
function f!(a, b)
    @turbo for i in eachindex(a)
        a[i] *= b[i]
    end
    return a
end
f!(rand(1), rand(1))
```

```console
$ time ./julia -O3 lv.jl
```

Without PGO+LTO: 14.801s
With PGO+LTO: 11.978s (-19%)

Example 2:

```console
$ time ./julia -e 'using Pkg; Pkg.test("Unitful");'
```

Without PGO+LTO: 1m47.688s
With PGO+LTO: 1m35.704s (-11%)

Example 3 (taken from issue #45395, which is almost only LLVM):

```console
$ JULIA_LLVM_ARGS=-time-passes ./julia script-45395.jl
```

Without PGO+LTO:

```
===-------------------------------------------------------------------------===
                      ... Pass execution timing report ...
===-------------------------------------------------------------------------===
  Total Execution Time: 101.0130 seconds (98.6253 wall clock)

   ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Name ---
  53.6961 ( 54.7%)   0.1050 (  3.8%)  53.8012 ( 53.3%)  53.8045 ( 54.6%)  Unroll loops
  25.5423 ( 26.0%)   0.0072 (  0.3%)  25.5495 ( 25.3%)  25.5444 ( 25.9%)  Global Value Numbering
   7.1995 (  7.3%)   0.0526 (  1.9%)   7.2521 (  7.2%)   7.2517 (  7.4%)  Induction Variable Simplification
   5.0541 (  5.1%)   0.0098 (  0.3%)   5.0639 (  5.0%)   5.0561 (  5.1%)  Combine redundant instructions #2
```

With PGO+LTO:

```
===-------------------------------------------------------------------------===
                      ... Pass execution timing report ...
===-------------------------------------------------------------------------===
  Total Execution Time: 72.6507 seconds (70.1337 wall clock)

   ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Name ---
  36.0894 ( 51.7%)   0.0825 (  2.9%)  36.1719 ( 49.8%)  36.1738 ( 51.6%)  Unroll loops
  16.5713 ( 23.7%)   0.0129 (  0.5%)  16.5843 ( 22.8%)  16.5794 ( 23.6%)  Global Value Numbering
   5.9047 (  8.5%)   0.0395 (  1.4%)   5.9442 (  8.2%)   5.9438 (  8.5%)  Induction Variable Simplification
   4.7566 (  6.8%)   0.0078 (  0.3%)   4.7645 (  6.6%)   4.7575 (  6.8%)  Combine redundant instructions #2
```

Or -28% time spent in LLVM.

---

Finally there's a significant reduction in binary sizes. For libLLVM.so:

```
79M	usr/lib/libLLVM-13jl.so (before)
67M	usr/lib/libLLVM-13jl.so (after)
```

And it can be reduced by another 2MB with `--icf=safe` when using LLD as
a linker anyway.

Turn into makefile

Newline

Use two out of source builds

Ignore profiles + build dirs

Add --icf=safe

stage0 setup prebuilt clang with [cd]tors->init/fini patch
  • Loading branch information
haampie committed Jun 28, 2022
1 parent 4015e0d commit 229819b
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 0 deletions.
4 changes: 4 additions & 0 deletions contrib/pgo-lto/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
profiles
stage0*
stage1*
stage2*
78 changes: 78 additions & 0 deletions contrib/pgo-lto/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Command-only targets. `install` is included because it runs a sub-make and
# never creates a file called `install`. stage0, stage1 and stage2 are
# deliberately NOT phony: their recipes end in `touch $@`, so they are stamp
# files.
.PHONY: top clean clean-profiles install

# One build directory per stage, all next to this Makefile.
STAGE0_BUILD:=$(CURDIR)/stage0.build
STAGE1_BUILD:=$(CURDIR)/stage1.build
STAGE2_BUILD:=$(CURDIR)/stage2.build

# Directory holding the stage0 clang/lld/llvm-* binaries. Keeps its trailing
# slash, so tools are referenced as $(STAGE0_TOOLS)clang (no extra `/`).
STAGE0_TOOLS:=$(STAGE0_BUILD)/usr/tools/

# Where the instrumented stage1 build writes raw profiles, and the merged
# profile consumed by stage2.
PROFILE_DIR:=$(CURDIR)/profiles
PROFILE_FILE:=$(PROFILE_DIR)/merged.prof
# Snapshot of the raw profiles present when this Makefile is parsed (`:=`).
PROFRAW_FILES:=$(wildcard $(PROFILE_DIR)/*.profraw)
# This Makefile lives two levels below the Julia source root.
JULIA_ROOT:=$(CURDIR)/../..
# Demangler used by `make top`; taken from PATH.
CXXFILT:=c++filt
LLVM_PROFDATA:=$(STAGE0_TOOLS)llvm-profdata

# Value handed to clang as `-mllvm -vp-counters-per-site=N` by the stage1
# PGO_CFLAGS / PGO_CXXFLAGS.
# When building a single libLLVM.so we need to increase -vp-counters-per-site
# significantly
COUNTERS_PER_SITE:=6

# Hint printed at the end of the stage1 recipe with
# `echo "$(AFTER_STAGE1_MESSAGE)"`. The text therefore lands inside a
# double-quoted shell string, so both backticks and double quotes must be
# backslash-escaped: an unescaped `"` would close the shell string and vanish
# from the output, turning the suggested command into the invalid
# `Pkg.add(LoopVectorization)`.
# A `$\` at the end of a line continues the definition on the next line.
AFTER_STAGE1_MESSAGE:=Run \`make clean-profiles\` to start with a clean slate. $\
Then run Julia to collect realistic profile data, for example: \`julia -O3 -e $\
'using Pkg; Pkg.add(\"LoopVectorization\"); Pkg.test(\"LoopVectorization\")'\`. This $\
should produce about 15MB of data in $(PROFILE_DIR). Note that running extensive $\
scripts may result in counter overflows, which can be detected by running $\
\`make top\`. Afterwards run \`make stage2\`.
# Variable overrides placed on the sub-make command line by the stage1 and
# stage2 recipes: the stage0 clang/lld/llvm-ar/llvm-ranlib binaries plus the
# per-stage PGO flags.
# Must stay recursively expanded (`=`): PGO_CFLAGS, PGO_CXXFLAGS and
# PGO_LDFLAGS are target-specific, so they only have a value while a stage
# recipe is being expanded.
TOOLCHAIN_FLAGS = \
    "CC=$(STAGE0_TOOLS)clang" \
    "CXX=$(STAGE0_TOOLS)clang++" \
    "LD=$(STAGE0_TOOLS)ld.lld" \
    "AR=$(STAGE0_TOOLS)llvm-ar" \
    "RANLIB=$(STAGE0_TOOLS)llvm-ranlib" \
    "CFLAGS+=$(PGO_CFLAGS)" \
    "CXXFLAGS+=$(PGO_CXXFLAGS)" \
    "LDFLAGS+=$(PGO_LDFLAGS)"
# Set up the build directory of a stage by running the `configure` target of
# the top-level Julia Makefile with O=<directory>. One rule serves all three
# stages; $@ is whichever directory is being created.
$(STAGE0_BUILD) $(STAGE1_BUILD) $(STAGE2_BUILD):
	$(MAKE) --directory=$(JULIA_ROOT) O=$@ configure
# stage0: install the clang/LLVM/lld toolchain from the Julia deps into
# $(STAGE0_BUILD) (with USE_BINARYBUILDER_LLVM=1 exported to the sub-make),
# then patch its profiling runtime. `stage0` is a stamp file that is touched
# only when every step of the `&&` chain succeeded.
#
# Turn [cd]tors into init/fini_array sections in libclang_rt, since lld
# doesn't do that, and otherwise the profile constructor is not executed.
#
# The search root is spelled out as $(STAGE0_BUILD): the only prerequisite is
# order-only, so `$<` expands to nothing here. The objcopy binary is taken
# from $(STAGE0_TOOLS); `$(STAGE0_BUILD)objcopy` has no path separator and
# names a file that does not exist.
# NOTE(review): `{} +` hands every match to a single llvm-objcopy call; this
# assumes exactly one libclang_rt.profile-*.a is found -- confirm.
stage0: export USE_BINARYBUILDER_LLVM=1
stage0: | $(STAGE0_BUILD)
	$(MAKE) -C $(STAGE0_BUILD)/deps install-clang install-llvm install-lld install-llvm-tools && \
	find $(STAGE0_BUILD) -name 'libclang_rt.profile-*.a' -exec $(STAGE0_TOOLS)llvm-objcopy --rename-section .ctors=.init_array --rename-section .dtors=.fini_array {} + && \
	touch $@
# stage1: instrumented build. Compiles with the stage0 toolchain using
# -fprofile-generate (raw profiles go to $(PROFILE_DIR)) and links with lld
# and ThinLTO. `stage1` is a stamp file touched after a successful build; the
# follow-up instructions are printed afterwards.

# The stage1 build directory is only configured once stage0 is in place.
$(STAGE1_BUILD): stage0

# USE_BINARYBUILDER_LLVM=0 is exported to the sub-make -- presumably so that
# LLVM itself is compiled (and instrumented) instead of downloaded; confirm
# against the top-level build system.
stage1: export USE_BINARYBUILDER_LLVM=0
stage1: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-generate=$(PROFILE_DIR)
stage1: PGO_CXXFLAGS:=-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)
stage1: PGO_CFLAGS:=-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE)
stage1: | $(STAGE1_BUILD)
	$(MAKE) --directory=$(STAGE1_BUILD) $(TOOLCHAIN_FLAGS) && touch $@
	@echo "$(AFTER_STAGE1_MESSAGE)"
# stage2: optimized build. Recompiles with the stage0 toolchain using
# -fprofile-use with the merged profile, links with lld and ThinLTO, and adds
# --icf=safe at link time. Needs $(PROFILE_FILE) to be up to date; the build
# directory is an order-only prerequisite. `stage2` is a stamp file.
stage2: export USE_BINARYBUILDER_LLVM=0
stage2: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-use=$(PROFILE_FILE) -Wl,--icf=safe
stage2: PGO_CXXFLAGS:=-fprofile-use=$(PROFILE_FILE)
stage2: PGO_CFLAGS:=-fprofile-use=$(PROFILE_FILE)
stage2: $(PROFILE_FILE) | $(STAGE2_BUILD)
	$(MAKE) --directory=$(STAGE2_BUILD) $(TOOLCHAIN_FLAGS) && touch $@
# install: run the `install` target of the finished stage2 build tree,
# passing USE_BINARYBUILDER_LLVM=0 on the sub-make command line.
install: stage2
	$(MAKE) --directory=$(STAGE2_BUILD) USE_BINARYBUILDER_LLVM=0 install
# Merge every raw profile in $(PROFILE_DIR) into the single file that stage2
# passes to -fprofile-use.
#
# $(PROFRAW_FILES) stays in the prerequisite list so that new raw profiles
# trigger a re-merge, but the recipe globs the directory itself: the variable
# is a snapshot taken when the Makefile was parsed, so profiles written later
# in the same make run (for example while stage1 is being built) would be
# missing from it and llvm-profdata would be called without inputs.
$(PROFILE_FILE): stage1 $(PROFRAW_FILES)
	$(LLVM_PROFDATA) merge -output=$@ $(PROFILE_DIR)/*.profraw
# top: print the 50 top functions recorded in the merged profile, with C++
# symbol names demangled by $(CXXFILT). Re-merges the profile first if needed.
top: $(PROFILE_FILE)
	$(LLVM_PROFDATA) show --topn=50 $(PROFILE_FILE) | $(CXXFILT)
# clean-profiles: delete the whole profile directory, i.e. every raw profile
# and the merged profile inside it.
clean-profiles:
	rm -rf $(PROFILE_DIR)
# clean: remove the three stage stamp files and the merged profile so that
# the stages are considered out of date again. The build directories and the
# raw profiles are left untouched.
clean:
	rm -f $(PROFILE_FILE) stage0 stage1 stage2

0 comments on commit 229819b

Please sign in to comment.