From 6f56ffdcab19434a2614b23c9dbd2a69478d97ec Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Fri, 10 Jun 2022 21:58:39 +0200 Subject: [PATCH] Add PGO+LTO Makefile Adds a convenient way to enable PGO+LTO on Julia and LLVM together: 1. `cd contrib/pgo-lto` 2. `make -j$(nproc) stage1` 3. `make clean-profiles` 4. `./stage1.build/julia -O3 -e 'using Pkg; Pkg.add("LoopVectorization"); Pkg.test("LoopVectorization")'` 5. `make -j$(nproc) stage2` This results quite often in spectacular speedups for time to first X as it reduces the time spent in LLVM optimization passes by 25 or even 30%. Example 1: ```julia using LoopVectorization function f!(a, b) @turbo for i in eachindex(a) a[i] *= b[i] end return a end f!(rand(1), rand(1)) ``` ```console $ time ./julia -O3 lv.jl ``` Without PGO+LTO: 14.801s With PGO+LTO: 11.978s (-19%) Example 2: ```console $ time ./julia -e 'using Pkg; Pkg.test("Unitful");' ``` Without PGO+LTO: 1m47.688s With PGO+LTO: 1m35.704s (-11%) Example 3 (taken from issue #45395, which is almost only LLVM): ```console $ JULIA_LLVM_ARGS=-time-passes ./julia script-45395.jl ``` Without PGO+LTO: ``` ===-------------------------------------------------------------------------=== ... Pass execution timing report ... ===-------------------------------------------------------------------------=== Total Execution Time: 101.0130 seconds (98.6253 wall clock) ---User Time--- --System Time-- --User+System-- ---Wall Time--- --- Name --- 53.6961 ( 54.7%) 0.1050 ( 3.8%) 53.8012 ( 53.3%) 53.8045 ( 54.6%) Unroll loops 25.5423 ( 26.0%) 0.0072 ( 0.3%) 25.5495 ( 25.3%) 25.5444 ( 25.9%) Global Value Numbering 7.1995 ( 7.3%) 0.0526 ( 1.9%) 7.2521 ( 7.2%) 7.2517 ( 7.4%) Induction Variable Simplification 5.0541 ( 5.1%) 0.0098 ( 0.3%) 5.0639 ( 5.0%) 5.0561 ( 5.1%) Combine redundant instructions #2 ``` Wit PGO+LTO: ``` ===-------------------------------------------------------------------------=== ... Pass execution timing report ... ===-------------------------------------------------------------------------=== Total Execution Time: 72.6507 seconds (70.1337 wall clock) ---User Time--- --System Time-- --User+System-- ---Wall Time--- --- Name --- 36.0894 ( 51.7%) 0.0825 ( 2.9%) 36.1719 ( 49.8%) 36.1738 ( 51.6%) Unroll loops 16.5713 ( 23.7%) 0.0129 ( 0.5%) 16.5843 ( 22.8%) 16.5794 ( 23.6%) Global Value Numbering 5.9047 ( 8.5%) 0.0395 ( 1.4%) 5.9442 ( 8.2%) 5.9438 ( 8.5%) Induction Variable Simplification 4.7566 ( 6.8%) 0.0078 ( 0.3%) 4.7645 ( 6.6%) 4.7575 ( 6.8%) Combine redundant instructions #2 ``` Or -28% time spent in LLVM. --- Finally there's a significant reduction in binary sizes. For libLLVM.so: ``` 79M usr/lib/libLLVM-13jl.so (before) 67M usr/lib/libLLVM-13jl.so (after) ``` And it can be reduced by another 2MB with `--icf=safe` when using LLD as a linker anways. Turn into makefile Newline Use two out of source builds Ignore profiles + build dirs Add --icf=safe stage0 setup prebuilt clang with [cd]tors->init/fini patch --- contrib/pgo-lto/.gitignore | 4 ++ contrib/pgo-lto/Makefile | 78 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 contrib/pgo-lto/.gitignore create mode 100644 contrib/pgo-lto/Makefile diff --git a/contrib/pgo-lto/.gitignore b/contrib/pgo-lto/.gitignore new file mode 100644 index 0000000000000..978d8f2ca86dd --- /dev/null +++ b/contrib/pgo-lto/.gitignore @@ -0,0 +1,4 @@ +profiles +stage0* +stage1* +stage2* diff --git a/contrib/pgo-lto/Makefile b/contrib/pgo-lto/Makefile new file mode 100644 index 0000000000000..04f0dd236b1b6 --- /dev/null +++ b/contrib/pgo-lto/Makefile @@ -0,0 +1,78 @@ +.PHONY: top clean clean-profiles + +STAGE0_BUILD:=$(CURDIR)/stage0.build +STAGE1_BUILD:=$(CURDIR)/stage1.build +STAGE2_BUILD:=$(CURDIR)/stage2.build + +STAGE0_TOOLS:=$(STAGE0_BUILD)/usr/tools/ + +PROFILE_DIR:=$(CURDIR)/profiles +PROFILE_FILE:=$(PROFILE_DIR)/merged.prof +PROFRAW_FILES:=$(wildcard $(PROFILE_DIR)/*.profraw) +JULIA_ROOT:=$(CURDIR)/../.. +CXXFILT:=c++filt +LLVM_PROFDATA:=$(STAGE0_TOOLS)llvm-profdata + +# When building a single libLLVM.so we need to increase -vp-counters-per-site +# significantly +COUNTERS_PER_SITE:=6 + +AFTER_STAGE1_MESSAGE:=Run \`make clean-profiles\` to start with a clean slate. $\ + Then run Julia to collect realistic profile data, for example: \`julia -O3 -e $\ + 'using Pkg; Pkg.add("LoopVectorization"); Pkg.test("LoopVectorization")'\`. This $\ + should produce about 15MB of data in $(PROFILE_DIR). Note that running extensive $\ + scripts may result in counter overflows, which can be detected by running $\ + \`make top\`. Afterwards run \`make stage2\`. + +TOOLCHAIN_FLAGS = $\ + "CC=$(STAGE0_TOOLS)clang" $\ + "CXX=$(STAGE0_TOOLS)clang++" $\ + "LD=$(STAGE0_TOOLS)ld.lld" $\ + "AR=$(STAGE0_TOOLS)llvm-ar" $\ + "RANLIB=$(STAGE0_TOOLS)llvm-ranlib" $\ + "CFLAGS+=$(PGO_CFLAGS)" $\ + "CXXFLAGS+=$(PGO_CXXFLAGS)" $\ + "LDFLAGS+=$(PGO_LDFLAGS)" + +$(STAGE0_BUILD) $(STAGE1_BUILD) $(STAGE2_BUILD): + $(MAKE) -C $(JULIA_ROOT) O=$@ configure + +stage0: export USE_BINARYBUILDER_LLVM=1 +stage0: | $(STAGE0_BUILD) + # Turn [cd]tors into init/fini_array sections in libclang_rt, since lld + # doesn't do that, and otherwise the profile constructor is not executed + $(MAKE) -C $(STAGE0_BUILD)/deps install-clang install-llvm install-lld install-llvm-tools && \ + find $< -name 'libclang_rt.profile-*.a' -exec $(STAGE0_TOOLS)objcopy --rename-section .ctors=.init_array --rename-section .dtors=.fini_array {} + && \ + touch $@ + +$(STAGE1_BUILD): stage0 +stage1: PGO_CFLAGS:=-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE) +stage1: PGO_CXXFLAGS:=-fprofile-generate=$(PROFILE_DIR) -Xclang -mllvm -Xclang -vp-counters-per-site=$(COUNTERS_PER_SITE) +stage1: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-generate=$(PROFILE_DIR) +stage1: export USE_BINARYBUILDER_LLVM=0 +stage1: | $(STAGE1_BUILD) + $(MAKE) -C $(STAGE1_BUILD) $(TOOLCHAIN_FLAGS) && touch $@ + @echo "$(AFTER_STAGE1_MESSAGE)" + +stage2: PGO_CFLAGS:=-fprofile-use=$(PROFILE_FILE) +stage2: PGO_CXXFLAGS:=-fprofile-use=$(PROFILE_FILE) +stage2: PGO_LDFLAGS:=-fuse-ld=lld -flto=thin -fprofile-use=$(PROFILE_FILE) -Wl,--icf=safe +stage2: export USE_BINARYBUILDER_LLVM=0 +stage2: $(PROFILE_FILE) | $(STAGE2_BUILD) + $(MAKE) -C $(STAGE2_BUILD) $(TOOLCHAIN_FLAGS) && touch $@ + +install: stage2 + $(MAKE) -C $(STAGE2_BUILD) USE_BINARYBUILDER_LLVM=0 install + +$(PROFILE_FILE): stage1 $(PROFRAW_FILES) + $(LLVM_PROFDATA) merge -output=$@ $(PROFRAW_FILES) + +# show top 50 functions +top: $(PROFILE_FILE) + $(LLVM_PROFDATA) show --topn=50 $< | $(CXXFILT) + +clean-profiles: + rm -rf $(PROFILE_DIR) + +clean: + rm -f stage0 stage1 stage2 $(PROFILE_FILE)