diff --git a/.gitignore b/.gitignore index c5c04ee..95a8876 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ compiler/version.sml # /runtime /runtime/Makefile /runtime/Makefile.in +/runtime/ghc/*.a diff --git a/Makefile.in b/Makefile.in index dcebc5f..f6c2c11 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,3 +1,4 @@ -include ihc-makefile.inc +include hrc-makefile.inc -IHC_PREFIX=@prefix@ +HRC_PREFIX=@prefix@ +PLATFORM_CPPFLAGS=@PLATFORM_CPPFLAGS@ diff --git a/compiler/as-to-mil/ghc-prim.sml b/compiler/as-to-mil/ghc-prim.sml index 0f45739..4477428 100644 --- a/compiler/as-to-mil/ghc-prim.sml +++ b/compiler/as-to-mil/ghc-prim.sml @@ -83,7 +83,7 @@ struct type argsAndTyps = (Mil.simple * Mil.typ) list type block = HsToMilUtils.MS.t - val pkgName = "ihc/plsr-prims-ghc" + val pkgName = "hrc/plsr-prims-ghc" (* mil types *) val boolTyp = M.TBoolean val int64ArbTyp = IntArb.T (IntArb.S64, IntArb.Signed) diff --git a/compiler/back-end/back-end.sml b/compiler/back-end/back-end.sml index 0700f6f..d37e749 100644 --- a/compiler/back-end/back-end.sml +++ b/compiler/back-end/back-end.sml @@ -28,11 +28,10 @@ struct val name = "BackEnd" val indent = 0 end) - + (* val runtimeDirectory = fn config => Path.snoc (Config.home config, "runtime") - (* val iflcLibDirectory = fn config => Config.iflcLibDirectory config @@ -591,6 +590,15 @@ struct in [file] end + fun runtimeLibraries (config) = + let + val hrcGhcRtLib = + case Config.targetWordSize config + of Config.Ws32 => "libhrc_ghc_runtime32.a" + | Config.Ws64 => "libhrc_ghc_runtime64.a" + in [hrcGhcRtLib] + end + fun unmanagedLibraries (config) = let val threads = @@ -606,11 +614,12 @@ struct val (prtBegin, prtEnd) = ([ifDebug (config, "pillar2c_crt_begind.obj", "pillar2c_crt_begin.obj")], [ifDebug (config, "pillar2c_crt_endd.obj", "pillar2c_crt_end.obj")]) - val gcLibs = gcLibraries (config) - val futureLibs = futureLibraries (config) - val unmanagedLibs = unmanagedLibraries (config) + val gcLibs = gcLibraries config + val futureLibs = futureLibraries config + val runtimeLibs = runtimeLibraries config + val unmanagedLibs = unmanagedLibraries config val pre = prtBegin - val post = List.concat [futureLibs, prtEnd, gcLibs, unmanagedLibs] + val post = List.concat [futureLibs, prtEnd, gcLibs, runtimeLibs, unmanagedLibs] in (pre, post) end diff --git a/compiler/back-end/mil-to-pil.sml b/compiler/back-end/mil-to-pil.sml index 0af0822..f7b8f5f 100644 --- a/compiler/back-end/mil-to-pil.sml +++ b/compiler/back-end/mil-to-pil.sml @@ -3357,25 +3357,11 @@ struct Pil.D.blank, omDefs, Pil.D.blank, - Pil.D.includeLocalFile "ihc/pil", - Pil.D.includeLocalFile "ihc/plsr", + Pil.D.includeLocalFile "hrc/pil", + Pil.D.includeLocalFile "hrc/plsr", Pil.D.blank, incs, -(* - There are a number of reasons we don't want to print externs in the output file: - - 1. GHC's extern declaration doesn't exactly match C function's type, for example, - CInt is internally represented as Int# in GHC Core, which translates to int64 - on x86_64, and if we have already included a C header, it will become a type - mis-match, and GCC will error (ICC will warn). - - 2. We have already included all header files, which will be inlined by pilicl, - and extern definitions following the header files are just redundant. - - Unless we can fix 1, or the assumption in 2 is no longer valid, we can safety - exclude extern declarations in the output. 
-*) - (* exts, *) + exts, Pil.D.blank, Pil.D.comment "Types", typs, diff --git a/configure.ac b/configure.ac index eb7882c..37016d8 100644 --- a/configure.ac +++ b/configure.ac @@ -2,11 +2,27 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.60]) -AC_INIT([ihc], [0.1], [BUG-REPORT-ADDRESS]) +AC_INIT([hrc], [0.1], [BUG-REPORT-ADDRESS]) AC_CHECK_HEADERS([assert.h float.h immintrin.h limits.h locale.h malloc.h math.h nmmintrin.h signal.h stdarg.h stdio.h stdlib.h string.h time.h sys/time.h sys/timeb.h termios.h]) PKG_CHECK_MODULES([IFLC_LIB], [iflc-lib]) +PLATFORM_CPPFLAGS= +AC_CANONICAL_HOST +case "$host" in + *-mingw*|*-*-cygwin*) + ;; + *-*-linux*) + PLATFORM_CPPFLAGS="-DPLSR_LINUX" + ;; + *) + AC_MSG_ERROR([Unsupported host: $host]) + ;; +esac + +AC_SUBST([PLATFORM_CPPFLAGS]) + +AM_SUBST_NOTMAKE AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/hrc-makefile.inc b/hrc-makefile.inc new file mode 100644 index 0000000..e094a2e --- /dev/null +++ b/hrc-makefile.inc @@ -0,0 +1,510 @@ +# The Haskell Research Compiler +# COPYRIGHT_NOTICE_1 + +all: all2 + +# TODO: Tidy this up +HRC_EXE = $(IHC_EXE) + +INSTALL_BINDIR = $(HRC_PREFIX)/bin +INSTALL_INCLUDEDIR = $(HRC_PREFIX)/include/hrc +INSTALL_LIBDIR = $(HRC_PREFIX)/lib + +BINDIR = bin +RUNDIR = runtime +RUNDIR_INCLUDEDIR = $(RUNDIR)/include + +install: $(HRC_GHC_RUNTIME_LIBS) $(HRC_EXE) + install -d $(INSTALL_BINDIR) $(INSTALL_INCLUDEDIR) $(INSTALL_INCLUDEDIR)/ghc $(INSTALL_LIBDIR) + install $(HRC_GHC_RUNTIMEDIR)/*.a $(INSTALL_LIBDIR) + install $(HRC_EXE) $(INSTALL_BINDIR) + install $(RUNDIR_INCLUDEDIR)/hrc/*.h $(INSTALL_INCLUDEDIR) + install $(RUNDIR_INCLUDEDIR)/hrc/ghc/*.h $(INSTALL_INCLUDEDIR)/ghc + +ifeq (Windows_NT, $(OS)) + EXE := .exe +else + EXE := +endif + +### HRC-GHC runtime +CC = gcc +AR = ar + +ARFLAGS = rvs +CPPFLAGS = -I"$(RUNDIR_INCLUDEDIR)" $(PLATFORM_CPPFLAGS) `pkg-config --cflags iflc-lib` -Wall +CFLAGS = $(CPPFLAGS) + +HRC_GHC_RUNTIMEDIR = $(RUNDIR)/ghc + +HRC_GHC_RUNTIME_LIBS = \ + $(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime32.a \ + $(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime64.a + +HRC_GHC_RUNTIME_OBJECTS32 = \ + $(HRC_GHC_RUNTIMEDIR)/float32.o \ + $(HRC_GHC_RUNTIMEDIR)/Globals32.o \ + $(HRC_GHC_RUNTIMEDIR)/plsr-util32.o \ + $(HRC_GHC_RUNTIMEDIR)/thread32.o \ + $(HRC_GHC_RUNTIMEDIR)/TTY32.o + +HRC_GHC_RUNTIME_OBJECTS64 = \ + $(HRC_GHC_RUNTIMEDIR)/float64.o \ + $(HRC_GHC_RUNTIMEDIR)/Globals64.o \ + $(HRC_GHC_RUNTIMEDIR)/plsr-util64.o \ + $(HRC_GHC_RUNTIMEDIR)/thread64.o \ + $(HRC_GHC_RUNTIMEDIR)/TTY64.o + +$(HRC_GHC_RUNTIMEDIR)/%32.o: $(HRC_GHC_RUNTIMEDIR)/%.c + $(CC) $(CFLAGS) -DP_WORD_SIZE=4 $< -c -o $@ + +$(HRC_GHC_RUNTIMEDIR)/%64.o: $(HRC_GHC_RUNTIMEDIR)/%.c + $(CC) $(CFLAGS) -DP_WORD_SIZE=8 $< -c -o $@ + +$(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime32.a: $(HRC_GHC_RUNTIME_OBJECTS32) + $(AR) $(ARFLAGS) $@ $< + +$(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime64.a: $(HRC_GHC_RUNTIME_OBJECTS64) + $(AR) $(ARFLAGS) $@ $^ + +hrc-ghc-runtime: $(HRC_GHC_RUNTIME_LIBS) + +### Old P Front End + +OFE = frontend-old +OFE_EXE = $(BINDIR)/$(OFE)$(EXE) +OFE_SRCS = frontend-old/p.cpp + +ofe: $(OFE_EXE) + +$(OFE_EXE): build/vs2010/$(OFE)/$(OFE)/$(OFE).vcxproj $(OFE_SRCS) + cd build/vs2010/$(OFE)/$(OFE); devenv $(OFE).vcxproj /rebuild Debug; cp ../Debug/frontend-old.exe ../../../../bin +#$(OFE_EXE): build/vc60/$(OFE)/$(OFE).dsp $(OFE_SRCS) +# cd build/vc60/$(OFE); msdev $(OFE).dsp /MAKE "p - Win32 Debug" /REBUILD + +### New P Front End + +NFE = frontend-new +NFE_EXE = $(BINDIR)/$(NFE)$(EXE) +NFE_SRCS = $(NFE)/p.hs + +nfe: 
$(NFE_EXE) + +$(NFE_EXE): $(NFE_SRCS) + ghc -fno-gen-manifest $(NFE_SRCS) -o $(NFE_EXE) + +RR_EXE = $(BINDIR)/p-runtime$(EXE) +RR_SRCS = $(NFE)/p-runtime.hs + +$(RR_EXE): $(RR_SRCS) + ghc $(RR_SRCS) -o $(RR_EXE) + +### Main part of the compiler + +IFLC_VERSION = v1.0 + +# keep these in alpabetic order +COMPILER_SRCS = \ + compiler/back-end/back-end.mlb \ + compiler/back-end/back-end.sml \ + compiler/back-end/mil-to-pil.sml \ + compiler/back-end/outputter.sml \ + compiler/back-end/pil.sml \ + compiler/back-end/runtime.sml \ + compiler/common/chat.sml \ + compiler/common/common.mlb \ + compiler/common/compare.sml \ + compiler/common/config.sml \ + compiler/common/dataflow.sml \ + compiler/common/dominance.sml \ + compiler/common/effect.sml \ + compiler/common/fail.sml \ + compiler/common/globals.sml \ + compiler/common/graph.sml \ + compiler/common/identifier.sml \ + compiler/common/int-arb.sml \ + compiler/common/intr.sml \ + compiler/common/lub.sml \ + compiler/common/pass.sml \ + compiler/common/path.sml \ + compiler/common/rat.sml \ + compiler/common/rename.sml \ + compiler/common/topo-sort.sml \ + compiler/common/tuple.sml \ + compiler/common/try.sml \ + compiler/common/type-rep.sml \ + compiler/common/utils.sml \ + compiler/common/z-coding.sml \ + compiler/driver.sml \ + compiler/mil/analyse.sml \ + compiler/mil/bound-vars.sml \ + compiler/mil/call-graph.sml \ + compiler/mil/cfg.sml \ + compiler/mil/check.sml \ + compiler/mil/code-copy.sml \ + compiler/mil/compile.mlb \ + compiler/mil/compile.sml \ + compiler/mil/dataflow-analysis.sml \ + compiler/mil/dependence-analysis.sml \ + compiler/mil/extended-layout.sml \ + compiler/mil/fmil.sml \ + compiler/mil/free-vars.sml \ + compiler/mil/imil/block.sml \ + compiler/mil/imil/both-mil.sml \ + compiler/mil/imil/common.sml \ + compiler/mil/imil/def.sml \ + compiler/mil/imil/enumerate.sml \ + compiler/mil/imil/func.sml \ + compiler/mil/imil/global.sml \ + compiler/mil/imil/instr.sml \ + compiler/mil/imil/item.sml \ + compiler/mil/imil/layout.sml \ + compiler/mil/imil/t.sml \ + compiler/mil/imil/types.sml \ + compiler/mil/imil/use.sml \ + compiler/mil/imil/var.sml \ + compiler/mil/imil/workset.sml \ + compiler/mil/imil/imil.mlb \ + compiler/mil/imil/imil.sml \ + compiler/mil/layout.sml \ + compiler/mil/loop.sml \ + compiler/mil/lower/lower.mlb \ + compiler/mil/lower/mil-to-core-mil.sml \ + compiler/mil/lower/vector.sml \ + compiler/mil/mil.mlb \ + compiler/mil/mil.sml \ + compiler/mil/name-small-values.sml \ + compiler/mil/number-instructions.sml \ + compiler/mil/optimise/annotated-cg-printer.sml \ + compiler/mil/optimise/branch-remove.sml \ + compiler/mil/optimise/cfg-simplify.sml \ + compiler/mil/optimise/contify.sml \ + compiler/mil/optimise/cse.sml \ + compiler/mil/optimise/double-diamond.sml \ + compiler/mil/optimise/fun-known.sml \ + compiler/mil/optimise/fx-analysis.sml \ + compiler/mil/optimise/inline-aggressive.sml \ + compiler/mil/optimise/inline-leaves.sml \ + compiler/mil/optimise/inline-profile.sml \ + compiler/mil/optimise/inline-rewrite.sml \ + compiler/mil/optimise/inline-small.sml \ + compiler/mil/optimise/iv-cse.sml \ + compiler/mil/optimise/licm.sml \ + compiler/mil/optimise/loop-invert.sml \ + compiler/mil/optimise/optimise.mlb \ + compiler/mil/optimise/rep/analyze.sml \ + compiler/mil/optimise/rep/base.sml \ + compiler/mil/optimise/rep/dead-code.sml \ + compiler/mil/optimise/rep/driver.sml \ + compiler/mil/optimise/rep/flatten.sml \ + compiler/mil/optimise/rep/flowgraph.sml \ + compiler/mil/optimise/rep/node.sml \ + 
compiler/mil/optimise/rep/object.sml \ + compiler/mil/optimise/rep/optimize.sml \ + compiler/mil/optimise/rep/prep.sml \ + compiler/mil/optimise/rep/reconstruct.sml \ + compiler/mil/optimise/rep/seq.sml \ + compiler/mil/optimise/rep/show.sml \ + compiler/mil/optimise/rep/summary.sml \ + compiler/mil/optimise/rep/rep.mlb \ + compiler/mil/optimise/rep/rep.sml \ + compiler/mil/optimise/simple-escape.sml \ + compiler/mil/optimise/simplify.sml \ + compiler/mil/optimise/thunks.sml \ + compiler/mil/optimise/vectorize.sml \ + compiler/mil/p-object-model.sml \ + compiler/mil/parse.sml \ + compiler/mil/prims.sml \ + compiler/mil/prims-utils.sml \ + compiler/mil/profile.sml \ + compiler/mil/rename.sml \ + compiler/mil/rewrite.sml \ + compiler/mil/shape-analysis.sml \ + compiler/mil/stats.sml \ + compiler/mil/stream.sml \ + compiler/mil/stream2.sml \ + compiler/mil/transform.sml \ + compiler/mil/type.sml \ + compiler/mil/utils.sml \ + compiler/mil/utils2.sml + +COMPILER_STD_OPTS = @MLton -- \ + -verbose 1 -runtime 'use-mmap true' \ + -codegen native \ + -native-split 900000 +# The following don't work on 64-bit +# -ieee-fp true -max-heap 800m + +ifeq (y, $(fast)) + COMPILER_OPTS :=$(COMPILER_STD_OPTS) +else + COMPILER_OPTS :=$(COMPILER_STD_OPTS) -const 'Exn.keepHistory true' +endif + +# keep these in alpabetic order +HASKELL_SRCS = \ + compiler/anorm-strict/anorm-strict-analyze.sml \ + compiler/anorm-strict/anorm-strict-clone.sml \ + compiler/anorm-strict/anorm-strict-closure-convert.sml \ + compiler/anorm-strict/anorm-strict-free-vars.sml \ + compiler/anorm-strict/anorm-strict-layout.sml \ + compiler/anorm-strict/anorm-strict.mlb \ + compiler/anorm-strict/anorm-strict-optimize.sml \ + compiler/anorm-strict/anorm-strict-rewrite.sml \ + compiler/anorm-strict/stats.sml \ + compiler/anorm-strict/anorm-strict.sml \ + compiler/anorm-strict/anorm-strict-utils.sml \ + compiler/as-to-mil/as-to-mil.mlb \ + compiler/as-to-mil/ghc-prim.sml \ + compiler/as-to-mil/to-mil.sml \ + compiler/as-to-mil/utils.sml \ + compiler/anorm-lazy/anorm-lazy.sml \ + compiler/anorm-lazy/anorm-lazy-analyze.sml \ + compiler/anorm-lazy/abs-core.sml \ + compiler/anorm-lazy/abs-eval.sml \ + compiler/anorm-lazy/stats.sml \ + compiler/anorm-lazy/strictness.sml \ + compiler/anorm-lazy/to-abs-core.sml \ + compiler/ch-to-as/ch-to-as.mlb \ + compiler/ch-to-as/to-lazy.sml \ + compiler/ch-to-as/to-strict.sml \ + compiler/core-hs/core-hs.mlb \ + compiler/core-hs/core-hs.sml \ + compiler/core-hs/ghc-prim-op.sml \ + compiler/core-hs/ghc-prim-type.sml \ + compiler/core-hs/layout.sml \ + compiler/core-hs/normalize.sml \ + compiler/core-hs/parse.sml \ + compiler/core-hs/link-option.sml \ + compiler/core-hs/core-hs.lex.sml \ + compiler/core-hs/core-hs.grm.sig \ + compiler/core-hs/core-hs.grm.sml \ + compiler/haskell.sml + +# keep these in alpabetic order +P_SRCS = \ + compiler/core-op/core-op.mlb \ + compiler/core-op/core-p.sml \ + compiler/core-op/fix-up.sml \ + compiler/core-op/layout.sml \ + compiler/core-op/parse.sml \ + compiler/core-op/prims.sml \ + compiler/core-op/stats.sml \ + compiler/core-op/utils.sml \ + compiler/core-p/common.sml \ + compiler/core-p/common-utils.sml \ + compiler/core-p/core-p.mlb \ + compiler/core-p/linear.sml \ + compiler/core-p/linear-analyse.sml \ + compiler/core-p/linear-dce.sml \ + compiler/core-p/linear-free-variables.sml \ + compiler/core-p/linear-layout.sml \ + compiler/core-p/linear-optimise.sml \ + compiler/core-p/linear-simplify.sml \ + compiler/core-p/linear-sink1.sml \ + compiler/core-p/linear-sink2.sml \ + 
compiler/core-p/linear-stats.sml \ + compiler/core-p/linear-type.sml \ + compiler/core-p/linear-utils.sml \ + compiler/core-p/natives.sml \ + compiler/core-p/parse.sml \ + compiler/core-p/prims.sml \ + compiler/core-p/raw-layout.sml \ + compiler/core-p/raw-stats.sml \ + compiler/core-p/raw-to-linear.sml \ + compiler/core-p/raw-utils.sml \ + compiler/core-p/raw.sml \ + compiler/core-p/type.sml \ + compiler/cop-to-old-hil/acp.sml \ + compiler/cop-to-old-hil/analysis.sml \ + compiler/cop-to-old-hil/compute-work.sml \ + compiler/cop-to-old-hil/cop-to-hil.mlb \ + compiler/cop-to-old-hil/cp-to-acp.sml \ + compiler/cop-to-old-hil/cp-to-hil.sml \ + compiler/cop-to-old-hil/direct.sml \ + compiler/cop-to-old-hil/effect-analysis.sml \ + compiler/cop-to-old-hil/evaled.sml \ + compiler/cop-to-old-hil/hil-builder.sml \ + compiler/cop-to-old-hil/hybrid.sml \ + compiler/cop-to-old-hil/thunks.sml \ + compiler/cop-to-old-hil/transform.sml \ + compiler/cop-to-old-hil/two-version-analysis.sml \ + compiler/cop-to-old-hil/two-version-compute-work.sml \ + compiler/cop-to-old-hil/two-version-transform.sml \ + compiler/cop-to-old-hil/two-version-utils.sml \ + compiler/cop-to-old-hil/two-version-version-analysis.sml \ + compiler/cop-to-old-hil/two-version.sml \ + compiler/cop-to-old-hil/type-analyse.sml \ + compiler/cop-to-old-hil/utils.sml \ + compiler/cp-to-hil/analyse.sml \ + compiler/cp-to-hil/check.sml \ + compiler/cp-to-hil/cp-to-hil.mlb \ + compiler/cp-to-hil/cp-to-hil.sml \ + compiler/cp-to-hil/interface.sml \ + compiler/cp-to-hil/transform.sml \ + compiler/cp-to-hil/simple-analyse.sml \ + compiler/hil/analyse.sml \ + compiler/hil/check.sml \ + compiler/hil/dce.sml \ + compiler/hil/free-vars.sml \ + compiler/hil/hil.mlb \ + compiler/hil/hil.sml \ + compiler/hil/layout.sml \ + compiler/hil/optimise.sml \ + compiler/hil/simplifier.sml \ + compiler/hil/utils.sml \ + compiler/hil-to-mil/analyze.sml \ + compiler/hil-to-mil/hil-to-mil.mlb \ + compiler/hil-to-mil/hil-to-mil.sml \ + compiler/hil-to-mil/interface.sml \ + compiler/hil-to-mil/rewrite.sml \ + compiler/hil-to-mil/simple-analyze.sml \ + compiler/old-hil/check.sml \ + compiler/old-hil/duplicate.sml \ + compiler/old-hil/free-vars.sml \ + compiler/old-hil/hil.mlb \ + compiler/old-hil/hil.sml \ + compiler/old-hil/inline.sml \ + compiler/old-hil/layout.sml \ + compiler/old-hil/optimise.sml \ + compiler/old-hil/stats.sml \ + compiler/old-hil/utils.sml \ + compiler/old-hil-to-mil/analyze.sml \ + compiler/old-hil-to-mil/hil-in-mil.sml \ + compiler/old-hil-to-mil/hil-to-mil.mlb \ + compiler/old-hil-to-mil/hil-to-mil.sml \ + compiler/old-hil-to-mil/interface.sml \ + compiler/old-hil-to-mil/rewrite.sml \ + compiler/old-hil-to-mil/simple-analyze.sml \ + compiler/p.sml + +# make-version +make-version: $(BINDIR)/make-version.sh + $(BINDIR)/make-version.sh "$(IFLC_VERSION)" compiler/version.sml $(HRC_PREFIX) + +# iflc + +IFLC = iflc +IFLC_EXE = $(BINDIR)/$(IFLC)$(EXE) + +IFLC_SRCS = $(COMPILER_SRCS) $(HASKELL_SRCS) $(P_SRCS) compiler/iflc.mlb compiler/iflc.sml + +iflc: $(IFLC_EXE) + +$(IFLC_EXE): make-version $(IFLC_SRCS) + cd compiler; \ + rm -f iflc.*.c iflc.*.s; \ + mlton $(COMPILER_OPTS) -stop g iflc.mlb && \ + mlton $(COMPILER_OPTS) -output ../$(IFLC_EXE) iflc.*.c iflc.*.s && \ + rm -f iflc.*.c iflc.*.s; + +### ihc + +IHC = ihc +IHC_EXE = $(BINDIR)/$(IHC)$(EXE) + +IHC_SRCS = $(COMPILER_SRCS) $(HASKELL_SRCS) compiler/ihc.mlb compiler/ihc.sml + +ihc: $(IHC_EXE) + +$(IHC_EXE): make-version $(IHC_SRCS) compiler/ihc.mlb + cd compiler; \ + rm -f ihc.*.c ihc.*.s; \ + 
mlton $(COMPILER_OPTS) -stop g $(IHC).mlb && \ + mlton $(COMPILER_OPTS) -output ../$(IHC_EXE) ihc.*.c ihc.*.s && \ + rm -f ihc.*.c ihc.*.s + +### New P Runtime Support + +NP_RUNTIME = $(RUNDIR)/hil.mil + +npr: $(NP_RUNTIME) + +$(RUNDIR)/hil.mil: compiler/hil-to-mil/hil.mil + cp $< $@ + +### P LSR + +# Keep these in alphabetic order +PLSR_SRCS = \ + $(RUNDIR_INCLUDEDIR)/hrc/pil.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-ap-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-ap-rational.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-gc.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-gmp-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-gmp-integer-gallocate.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-iflc-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-lightweight-thunk.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-main.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-numeric.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-objects.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-params.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-prims.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-runtime.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-vector-avx.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-vector-sse.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-vector.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-ptk-thunk.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-rational.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-tagged-int32.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-thunk.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-util.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-value.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-vector.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr.h + +### Main stuff + +iflc-all: nfe iflc npr + +all2: hrc-ghc-runtime ihc + +partclean: + rm -f runtime/ghc/*.a + rm -f `find bin build compiler runtime benchmarks \ + -name \\*~ -o -name \\*.obj -o -name \\*.o -o -name gc.log -o -name .pilink -o -name .pilicl` + rm -f $(NFE)/*.{hi,o,exe} $(NFE)/*~ + rm -rf build/vc60/$(OFE)/Debug + rm -f build/vc60/$(OFE)/$(OFE).{ncb,opt,plg} + rm -rf build/vc60/pdebug/Debug + rm -rf build/vc60/pdebug/Release + rm -rf build/vc60/pdebug/pdebug.{ncb,opt,plg} + rm -f `find tests -name \\*~ -o -name \\*.exe -o \ + -name \\*.obj -o -name \\*.o -o -name \\*.cmp -o -name \\*.exp -o \ + -name \\*.lib -o -name \\*.pdb -o -name \\*.suo -o -name \\*.ilk -o -name gc.log` + rm -f `find tests -name \\*.c -a -not -name matrix-multiply-c.c` + -make -C benchmarks/barnes-hut clean + -make -C benchmarks/cloth-nobc clean + -make -C benchmarks/matrix-multiply clean + -make -C benchmarks/quicksort clean + -make -C benchmarks/raytracer clean + -make -C benchmarks/smvp clean + -make -C benchmarks/smvp-direct clean + -make -C benchmarks/sudoku clean + -make -C benchmarks/nofib clean + rm -f *~ + rm -f mlmon.out gc.log root_map.txt *.ppm compiler/version.sml + +clean: partclean + rm -f $(OFE_EXE) $(NFE_EXE) $(IFLC_EXE) $(IHC_EXE) $(NP_RUNTIME) regressions.log smvp.res + +### Testing + +test: iflc-all + $(BINDIR)/regress.sh -short -nomake + +regress: iflc-all + $(BINDIR)/regress.sh -nomake + +torture: iflc-all + $(BINDIR)/regress.sh -torture -nomake + +### ML-Yacc and ML-Lex +compiler/core-hs/core-hs.lex.sml: compiler/core-hs/core-hs.lex compiler/core-hs/core-hs.grm.sml + mllex $< +compiler/core-hs/core-hs.grm.sml: compiler/core-hs/core-hs.grm + mlyacc $< +compiler/core-hs/core-hs.grm.sig: compiler/core-hs/core-hs.grm + mlyacc $< diff --git a/runtime/ghc/Globals.c b/runtime/ghc/Globals.c index fb90782..f51c379 100755 --- a/runtime/ghc/Globals.c +++ b/runtime/ghc/Globals.c @@ -1,10 +1,9 @@ /* The Haskell Research Compiler */ /* 
COPYRIGHT_NOTICE_1 */ -typedef enum { - ISK_SystemEventThreadEventManager, ISK_SystemEventThreadIOManager, ISK_GHCConcWindowsPendingDelays, - ISK_GHCConcWindowsIOManagerThread, ISK_GHCConcWindowsProdding, ISK_GHCConcSignalSignalHandler, ISK_Num -} IhrStoreKey; +#include +#include "hrc/plsr-util.h" +#include "hrc/ghc/Globals.h" static void* ihrStore[ISK_Num] = {NULL, }; static struct prtMutex* ihrStoreLock; @@ -74,7 +73,7 @@ void blockUserSignals() {} void unblockUserSignals() {} -void stopTimer() {} +void stopTimer() {} void startTimer() {} diff --git a/runtime/ghc/TTY.c b/runtime/ghc/TTY.c index 7ea41cd..cacb69e 100644 --- a/runtime/ghc/TTY.c +++ b/runtime/ghc/TTY.c @@ -22,6 +22,8 @@ #include #endif +#include "hrc/ghc/TTY.h" + // Here we save the terminal settings on the standard file // descriptors, if we need to change them (eg. to support NoBuffering // input). @@ -52,7 +54,7 @@ resetTerminalSettings (void) // if we changed them. See System.Posix.Internals.tcSetAttr for // more details, including the reason we termporarily disable // SIGTTOU here. - { + { int fd; sigset_t sigset, old_sigset; sigemptyset(&sigset); diff --git a/runtime/ghc/float.c b/runtime/ghc/float.c index b0667f2..c655be5 100644 --- a/runtime/ghc/float.c +++ b/runtime/ghc/float.c @@ -9,55 +9,28 @@ #include #include - -#define IEEE_FLOATING_POINT 1 - -/* - * Encoding and decoding Doubles. Code based on the HBC code - * (lib/fltcode.c). - */ - -#if IEEE_FLOATING_POINT -#define MY_DMINEXP ((DBL_MIN_EXP) - (DBL_MANT_DIG) - 1) -/* DMINEXP is defined in values.h on Linux (for example) */ -#define DHIGHBIT 0x00100000 -#define DMSBIT 0x80000000 - -#define MY_FMINEXP ((FLT_MIN_EXP) - (FLT_MANT_DIG) - 1) -#define FHIGHBIT 0x00800000 -#define FMSBIT 0x80000000 -#endif - -#if defined(WORDS_BIGENDIAN) || defined(FLOAT_WORDS_BIGENDIAN) -#define L 1 -#define H 0 -#else -#define L 0 -#define H 1 -#endif - -#define __abs(a) (( (a) >= 0 ) ? 
(a) : (-(a))) +#include "hrc/ghc/float.h" StgDouble __2Int_encodeDouble (I_ j_high, I_ j_low, I_ e) { StgDouble r; - + /* assuming 32 bit ints */ ASSERT(sizeof(int ) == 4 ); r = (StgDouble)((unsigned int)j_high); r *= 4294967296.0; /* exp2f(32); */ r += (StgDouble)((unsigned int)j_low); - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + /* sign is encoded in the size */ if (j_high < 0) r = -r; - + return r; } @@ -66,13 +39,13 @@ StgDouble __word_encodeDouble (W_ j, I_ e) { StgDouble r; - + r = (StgDouble)j; - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + return r; } @@ -81,17 +54,17 @@ StgDouble __int_encodeDouble (I_ j, I_ e) { StgDouble r; - + r = (StgDouble)__abs(j); - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + /* sign is encoded in the size */ if (j < 0) r = -r; - + return r; } @@ -100,17 +73,17 @@ StgFloat __int_encodeFloat (I_ j, I_ e) { StgFloat r; - + r = (StgFloat)__abs(j); - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + /* sign is encoded in the size */ if (j < 0) r = -r; - + return r; } @@ -119,13 +92,13 @@ StgFloat __word_encodeFloat (W_ j, I_ e) { StgFloat r; - + r = (StgFloat)j; - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + return r; } @@ -217,4 +190,3 @@ __decodeFloat_Int (I_ *man, I_ *exp, StgFloat flt) *man = - *man; } } - diff --git a/runtime/ghc/thread.c b/runtime/ghc/thread.c index c405cfb..eee8e2b 100644 --- a/runtime/ghc/thread.c +++ b/runtime/ghc/thread.c @@ -1,6 +1,8 @@ /* The Haskell Research Compiler */ /* COPYRIGHT_NOTICE_1 */ +#include "hrc/ghc/thread.h" + /* place holders for now */ I_ rts_getThreadId (W_ tid) { diff --git a/runtime/include/hrc/ghc/Globals.h b/runtime/include/hrc/ghc/Globals.h new file mode 100644 index 0000000..5c7c112 --- /dev/null +++ b/runtime/include/hrc/ghc/Globals.h @@ -0,0 +1,33 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _GHC_GLOBALS_H_ +#define _GHC_GLOBALS_H_ + +#include "hrc/ghc/float.h" + +typedef enum { + ISK_SystemEventThreadEventManager, ISK_SystemEventThreadIOManager, ISK_GHCConcWindowsPendingDelays, + ISK_GHCConcWindowsIOManagerThread, ISK_GHCConcWindowsProdding, ISK_GHCConcSignalSignalHandler, ISK_Num +} IhrStoreKey; + +void ihrSetNCapabilities(uint32 n); +void ihrGlobalInit(); +void* getOrSetKey(IhrStoreKey k, void* p); +void* getOrSetSystemEventThreadEventManagerStore(void* p); +void* getOrSetSystemEventThreadIOManagerThreadStore(void* p); +void* getOrSetGHCConcWindowsPendingDelaysStore(void* p); +void* getOrSetGHCConcWindowsIOManagerThreadStore(void* p); +void* getOrSetGHCConcWindowsProddingStore(void* p); +void* getOrSetGHCConcSignalSignalHandlerStore(void* p); +void sysErrorBelch(char* s); +void blockUserSignals(); +void unblockUserSignals(); +void stopTimer(); +void startTimer(); +void stackOverflow(); +// int lockFile(int fd, uint64 dev, uint64 ino, int for_writing); +// int unlockFile(int fd); +uint64 getMonotonicNSec(); + +#endif diff --git a/runtime/include/hrc/ghc/TTY.h b/runtime/include/hrc/ghc/TTY.h new file mode 100644 index 0000000..c9f6221 --- /dev/null +++ b/runtime/include/hrc/ghc/TTY.h @@ -0,0 +1,20 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + 
+#ifndef _GHC_TTY_H_ +#define _GHC_TTY_H_ + +#include "hrc/ghc/float.h" + +#ifdef HAVE_TERMIOS_H +#include +#endif +#ifdef HAVE_SIGNAL_H +#include +#endif + +void* __hscore_get_saved_termios(I_ fd0); +void __hscore_set_saved_termios(I_ fd0, void* ts); +void resetTerminalSettings (void); + +#endif diff --git a/runtime/include/hrc/ghc/float.h b/runtime/include/hrc/ghc/float.h new file mode 100644 index 0000000..8bd2d43 --- /dev/null +++ b/runtime/include/hrc/ghc/float.h @@ -0,0 +1,64 @@ +/* ----------------------------------------------------------------------------- + * + * (c) Lennart Augustsson + * (c) The GHC Team, 1998-2000 + * + * Miscellaneous support for floating-point primitives + * + * ---------------------------------------------------------------------------*/ + +#ifndef _GHC_FLOAT_H_ +#define _GHC_FLOAT_H_ + +#include +#include +#include +#include "hrc/pil.h" + +#define IEEE_FLOATING_POINT 1 + +/* + * Encoding and decoding Doubles. Code based on the HBC code + * (lib/fltcode.c). + */ + +#if IEEE_FLOATING_POINT +#define MY_DMINEXP ((DBL_MIN_EXP) - (DBL_MANT_DIG) - 1) +/* DMINEXP is defined in values.h on Linux (for example) */ +#define DHIGHBIT 0x00100000 +#define DMSBIT 0x80000000 + +#define MY_FMINEXP ((FLT_MIN_EXP) - (FLT_MANT_DIG) - 1) +#define FHIGHBIT 0x00800000 +#define FMSBIT 0x80000000 +#endif + +#if defined(WORDS_BIGENDIAN) || defined(FLOAT_WORDS_BIGENDIAN) +#define L 1 +#define H 0 +#else +#define L 0 +#define H 1 +#endif + +#define __abs(a) (( (a) >= 0 ) ? (a) : (-(a))) + +typedef void* StgStablePtr; +#define ASSERT assert +//typedef uint32 nat; +typedef float StgFloat; +typedef double StgDouble; +typedef sintp I_; +typedef uintp W_; +#define SIZEOF_FLOAT 4 +#define SIZEOF_DOUBLE 8 + +StgDouble __2Int_encodeDouble (I_ j_high, I_ j_low, I_ e); +StgDouble __word_encodeDouble (W_ j, I_ e); +StgDouble __int_encodeDouble (I_ j, I_ e); +StgFloat __int_encodeFloat (I_ j, I_ e); +StgFloat __word_encodeFloat (W_ j, I_ e); +void __decodeDouble_2Int (I_ *man_sign, W_ *man_high, W_ *man_low, I_ *exp, StgDouble dbl); +void __decodeFloat_Int (I_ *man, I_ *exp, StgFloat flt); + +#endif diff --git a/runtime/include/hrc/ghc/thread.h b/runtime/include/hrc/ghc/thread.h new file mode 100644 index 0000000..8e7cbd7 --- /dev/null +++ b/runtime/include/hrc/ghc/thread.h @@ -0,0 +1,19 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _GHC_THREAD_H_ +#define _GHC_THREAD_H_ + +#include "hrc/ghc/float.h" + +I_ rts_getThreadId (W_ tid); +I_ cmp_thread (W_ tidA, W_ tidB); +I_ rtsSupportsBoundThreads(); + +#if defined (__MINGW32__) || defined(WIN32) +HANDLE getIOManagerEvent(); +W_ readIOManagerEvent(); +void sendIOManagerEvent(W_ e); +#endif + +#endif diff --git a/runtime/include/hrc/pil.h b/runtime/include/hrc/pil.h new file mode 100644 index 0000000..98b1c2b --- /dev/null +++ b/runtime/include/hrc/pil.h @@ -0,0 +1,251 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* This file defines the Pil abstractions to make C and Pillar look as close + * as possible. 
+ */ + +#ifndef _PIL_H_ +#define _PIL_H_ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include +#include +#include + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +# include "prt/prt.h" +#endif /* P_USE_PILLAR */ + +/**********************************************************************/ +/* Define definite sized integral types */ + +#ifdef WIN32 /* MS VC++ compiler */ + +typedef __int64 sint64; +typedef __int32 sint32; +typedef __int8 sint8; +typedef __int16 sint16; +typedef unsigned __int64 uint64; +typedef unsigned __int32 uint32; +typedef unsigned __int8 uint8; +typedef unsigned __int16 uint16; +typedef float float32; +typedef double float64; + +#define SINT32_MAX INT_MAX +#define SINT32_MIN INT_MIN +#define UINT32_MAX ULONG_MAX + +#define SINT64_MAX _I64_MAX +#define SINT64_MIN _I64_MIN +#define UINT64_MAX _UI64_MAX + +#define INFINITY32 ((float)(1.0/0.0)) +#define NAN32 ((float)0.0/0.0) +#define INFINITY64 ((double)(1.0/0.0)) +#define NAN64 ((double)0.0/0.0) + +#define SINT64_C(c) c ## L +#define UINT64_C(c) c ## UL + +#else /* !WIN32 */ +#ifdef __INTEL_COMPILER /* Intel C Compiler */ + +typedef long long sint64; +typedef unsigned long long uint64; +typedef int sint32; +typedef unsigned uint32; +typedef short sint16; +typedef unsigned short uint16; +typedef char sint8; +typedef unsigned char uint8; +typedef float float32; +typedef double float64; + +#define SINT32_MAX INT_MAX +#define SINT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX + +#define SINT64_MAX _I64_MAX +#define SINT64_MIN _I64_MIN +#define UINT64_MAX _UI64_MAX + +#define INFINITY32 ((float)(1.0/0.0)) +#define NAN32 ((float)0.0/0.0) +#define INFINITY64 ((double)(1.0/0.0)) +#define NAN64 ((double)0.0/0.0) + +#else /* !WIN32 && !__INTEL_COMPILER */ +#ifdef __GNUC__ /* gcc */ + +#ifndef P_USE_PILLAR +#ifndef __cdecl +#define __cdecl __attribute__((cdecl)) +#endif /* __cdecl */ +#endif /* P_USE_PILLAR */ + +#include + +typedef long long int sint64; +typedef unsigned long long int uint64; +typedef int sint32; +typedef unsigned uint32; +typedef short sint16; +typedef unsigned short uint16; +typedef char sint8; +typedef unsigned char uint8; +typedef float float32; +typedef double float64; + +#define SINT32_MAX INT32_MAX +#define SINT32_MIN INT32_MIN + +#define SINT64_MAX INT64_MAX +#define SINT64_MIN INT64_MIN + +#define INFINITY32 ((float)(1.0/0.0)) +#define NAN32 ((float)0.0/0.0) +#define INFINITY64 ((double)(1.0/0.0)) +#define NAN64 ((double)0.0/0.0) + +#ifdef INTEL64 +#define SINT64_C(c) c ## L +#define UINT64_C(c) c ## UL +#else +#define SINT64_C INT64_C +#endif + +#else + +#error Unknown compiler + +#endif /* !WIN32 && !__INTEL_COMPILER && !__GNUC__ */ +#endif /* !WIN32 && !__INTEL_COMPILER */ +#endif /* !WIN32 */ + +#if P_WORD_SIZE == 4 + +typedef sint32 sintp; +typedef uint32 uintp; +#define SINTP_MAX SINT32_MAX +#define SINTP_MIN SINT32_MIN +#define UINTP_MAX UINT32_MAX + +#elif P_WORD_SIZE == 8 + +typedef sint64 sintp; +typedef uint64 uintp; +#define SINTP_MAX SINT64_MAX +#define SINTP_MIN SINT64_MIN +#define UINTP_MAX UINT64_MAX + +#else /* P_WORD_SIZE notin {4, 8} */ + +#error P_WORD_SIZE not defined or not a supported size + +#endif /* P_WORD_SIZE notin {4, 8} */ + +#ifdef __GNUC__ +#define pil_aligned(a) __attribute__((aligned(a))) +#else +#define pil_aligned(a) __declspec(align(a)) +#endif + +typedef uintp bool; + +static void pilCheck() +{ + if (sizeof(sint32)!=4) { fprintf(stderr, "pil: bad sint32!\n"); exit(-1); } + if (sizeof(uint32)!=4) { 
fprintf(stderr, "pil: bad uint32!\n"); exit(-1); } + if (sizeof(sint64)!=8) { fprintf(stderr, "pil: bad sint64!\n"); exit(-1); } + if (sizeof(uint64)!=8) { fprintf(stderr, "pil: bad uint64!\n"); exit(-1); } + if (sizeof(sintp)!=P_WORD_SIZE) + { fprintf(stderr, "pil: bad sintp!\n"); exit(-1); } + if (sizeof(uintp)!=P_WORD_SIZE) + { fprintf(stderr, "pil: bad uint!\n"); exit(-1); } + if (sizeof(void*)!=P_WORD_SIZE) + { fprintf(stderr, "pil: bad word size!\n"); exit(-1); } + if (sizeof(float32)!=4) { fprintf(stderr, "pil: bad float32!\n"); exit(-1); } + if (sizeof(float64)!=8) { fprintf(stderr, "pil: bad float64!\n"); exit(-1); } +} + +/**********************************************************************/ +/* Control stuff */ + +#ifdef P_USE_PILLAR + +#ifdef __pillar2c__ +# define PilContinuation0 continuation_type +# define PilContinuation(...) continuation_type< __VA_ARGS__ > +#else +# define PilContinuation0 g4 +# define PilContinuation(...) g4 +#endif +#define pilContinuationLocal(cv) +#define pilContinuationMake(v, cl, cv) (v) = (cv) +#define pilContinuation0(cl, cv) continuation cv(): +#define pilContinuation(cl, cv, ...) continuation cv(__VA_ARGS__): +#define pilCutTo0(c) cut to c +#ifdef __pillar2c__ + #define pilCutToA(c, ...) cut to c with (__VA_ARGS__) + #define pilCutToC(...) also cuts to (__VA_ARGS__) +#else /* !__pillar2c__ */ + #define pilCutToA(c, ...) cut to c(__VA_ARGS__) + #define pilCutToC(...) also cuts to __VA_ARGS__ +#endif /* !__pillar2c__ */ + +#if !PLSR_DISABLE_TAILCALL + #define TAILCALL(e) return tailcall e + #define TAILCALLV(e) tailcall e; return +#else + #define TAILCALL(e) return e + #define TAILCALLV(e) e;return +#endif + +#ifdef USE_PTHREADS +#define YIELDCHECK() (*(unsigned volatile*)(((char *)prtGetTaskHandle()) + (3*P_WORD_SIZE))) +#else // USE_PTHREADS +#ifdef NO_PRSCALL +#define YIELDCHECK() (*((*((short volatile**)(((char *)prtGetTaskHandle()) +(2*P_WORD_SIZE)))) + 0x76)) +#else // NO_PRSCALL +#define YIELDCHECK() (*(short volatile*)(**(char volatile* volatile* volatile*)((char *)prtGetTaskHandle()+(2*P_WORD_SIZE))+0x76)) +#endif // NO_PRSCALL +#endif // USE_PTHREADS + +#define pilYieldDec uintp spin=0 +#define pilYield() \ + do { \ + if (++spin&15==0 && YIELDCHECK()) prtYield(); \ + } while (0) + +#else /* !P_USE_PILLAR */ + +#include + +#define PilContinuation0 jmp_buf* +#define PilContinuation(...) jmp_buf* +#define pilContinuationLocal(cv) jmp_buf cv +#define pilContinuationMake(v, cl, cv) \ + do { (v) = &(cv); if (setjmp((cv))) goto cl; } while (0) +#define pilContinuation0(cl, cv) +#define pilContinuation(cl, cv, ...) +#define pilCutTo0(c) longjmp(*c, 1) +#define pilCutToA(c, ...) longjmp(*c, 1) +#define pilCutToC(...) + +#define TAILCALL(e) return (e) +#define TAILCALLV(e) do { e; return; } while (0) +#define noyield +#define pilYieldDec +#define pilYield() + +#endif /* !P_USE_PILLAR */ + +#endif /* !_PIL_H */ diff --git a/runtime/include/hrc/plsr-ap-integer.h b/runtime/include/hrc/plsr-ap-integer.h new file mode 100644 index 0000000..7bcd5b9 --- /dev/null +++ b/runtime/include/hrc/plsr-ap-integer.h @@ -0,0 +1,20 @@ + /* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifdef PLSR_GMP_USE_DEFAULT +#ifdef __pillar2c__ +#define PLSR_GMP_USE_GALLOCATE +#else /* ! __pillar2c__ */ +#define PLSR_GMP_USE_PINNING +#endif /* ! 
__pillar2c__ */ +#endif /* !PLSR_GMP_USE_DEFAULT */ + +#ifdef PLSR_NO_GMP_INTEGERS +#include "hrc/plsr-iflc-integer.h" +#else /* !PLSR_NO_GMP_INTEGERS */ +#ifdef PLSR_GMP_USE_GALLOCATE +#include "hrc/plsr-gmp-integer-gallocate.h" +#else /* !PLSR_GMP_USE_GALLOCATE */ +#include "hrc/plsr-gmp-integer.h" +#endif /* !PLSR_GMP_USE_GALLOCATE */ +#endif /* !PLSR_NO_GMP_INTEGERS */ diff --git a/runtime/include/hrc/plsr-ap-rational.h b/runtime/include/hrc/plsr-ap-rational.h new file mode 100755 index 0000000..bee6740 --- /dev/null +++ b/runtime/include/hrc/plsr-ap-rational.h @@ -0,0 +1,500 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Arbitrary precision numbers */ + +#ifndef _PLSR_AP_RATIONAL_H_ +#define _PLSR_AP_RATIONAL_H_ + +/********************************************************************** + * Arbitrary precision rationals + */ + +/* Represents the rational x such that c0+c1*x=0 + * Invariants: gcd(c0,c1)=1 and c1>0 + */ + +typedef struct PlsrAPRatS_ { + PlsrVTable vtable; + PlsrInteger c0; + PlsrInteger c1; +} PlsrAPRatS; + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrAPRat; +#else /* !P_USE_PILLAR */ +typedef PlsrAPRatS* PlsrAPRat; +#endif /* !P_USE_PILLAR */ + +#define pLsrAPRatRep(l) ((PlsrAPRatS*)(l)) +#define pLsrAPRatC0(l) pLsrAPRatRep(l)->c0 +#define pLsrAPRatC1(l) pLsrAPRatRep(l)->c1 + + +#define pLsrAPRatPadding \ + (sizeof(PlsrAPRatS) - sizeof(PlsrVTable) - sizeof(PlsrInteger) - sizeof(PlsrInteger)) +pLsrVTableStatic(pLsrAPRatVTable_, VNoneTag, "*rational*", pLsrAPRatPadding); +#define pLsrAPRatVTable &pLsrAPRatVTable_ + +/* Note: the parameter names are significant. Using c0 and c1 breaks, + * since apparently the gnu and Intel C compilers confuse the parameter + * with the field name. Uggh. + */ +#define pLsrAPRatStaticUnboxedDef(uvar) \ + static PlsrAPRatS uvar = { .vtable = pLsrAPRatVTable, } + +#define pLsrAPRatStaticInitFromIntegers(var, c00, c10) \ + do { \ + if (pLsrIntegerIsRef) { \ + pLsrWriteBarrierRefBase(var, pLsrAPRatC0(var), c00); \ + pLsrWriteBarrierRefBase(var, pLsrAPRatC1(var), c10); \ + } else { \ + pLsrAPRatC0(var) = c00; \ + pLsrAPRatC1(var) = c10; \ + } \ + } while (0) + +#define pLsrAPRatStaticInit(var, c00, c10) \ + pLsrAPRatStaticInitFromIntegers(var, pLsrIntegerFromCString(c00), pLsrIntegerFromCString(c10)) + +#define pLsrAPRatStaticRef(uvar) \ + ((PlsrAPRat) &uvar) + +/* Constants */ + +static PlsrAPRatS pLsrAPRatZero_ = { .vtable = pLsrAPRatVTable, + .c0 = NULL, + .c1 = NULL }; +#define pLsrAPRatZero ((PlsrAPRat) &pLsrAPRatZero_) + +static PlsrAPRatS pLsrAPRatOne_ = { .vtable = pLsrAPRatVTable, + .c0 = NULL, + .c1 = NULL }; +#define pLsrAPRatOne ((PlsrAPRat) &pLsrAPRatOne_) + +static PlsrAPRatS pLsrAPRatUInt32Max_ = { .vtable = pLsrAPRatVTable, + .c0 = NULL, + .c1 = NULL }; +#define pLsrAPRatUInt32Max ((PlsrAPRat) &pLsrAPRatUInt32Max_) + +/* Constructors */ + +/* Assumes that c0 and c1 are already in canonical form */ +static PlsrAPRat pLsrAPRatMk(PlsrInteger c0, PlsrInteger c1) { + uintp size = sizeof(PlsrAPRatS); + PlsrAPRat a; + noyield { + pLsrAlloc(PlsrAPRat, a, pLsrAPRatVTable, size); + if (pLsrIntegerIsRef) { + pLsrWriteBarrierRefOptBase(a, pLsrAPRatC0(a), c0); + pLsrWriteBarrierRefOptBase(a, pLsrAPRatC1(a), c1); + } else { + pLsrAPRatC0(a) = c0; + pLsrAPRatC1(a) = c1; + } + } + return a; +} + +/* Create a rational number r from two integers c0, c1 such + * that r = -c0/c1, where gcd(c0, c1) may not be 1 */ +static PlsrAPRat pLsrAPRatFromIntegersNorm(PlsrInteger num, PlsrInteger den) +{ + PlsrInteger g = 
pLsrIntegerGcd(num, den); + PlsrBoolean b; + pLsrIntegerLT(b, den, pLsrIntegerZero); + if (b) { + pLsrIntegerNegate(num, num); + pLsrIntegerNegate(den, den); + } + pLsrIntegerDivT(num, num, g); + pLsrIntegerDivT(den, den, g); + return pLsrAPRatMk(num, den); +} + +static PlsrAPRat pLsrAPRatFromSIntps(sintp a, sintp b) { + PlsrInteger i1; + pLsrIntegerFromSIntp(i1, -a); + PlsrInteger i2; + pLsrIntegerFromSIntp(i2, b); + return pLsrAPRatFromIntegersNorm(i1, i2); +} + + +/* Basic queries and destructors */ + +static PlsrBoolean pLsrAPRatIsIntegral(PlsrAPRat r) +{ + PlsrBoolean b; + pLsrIntegerEQ(b, pLsrAPRatC1(r), pLsrIntegerOne); + return b; +} + +static PlsrInteger pLsrAPRatNumerator(PlsrAPRat r) { + PlsrInteger i = pLsrAPRatC0(r); + pLsrIntegerNegate(i, i); + return i; +} + +static PlsrInteger pLsrAPRatDenominator(PlsrAPRat r) { + return pLsrAPRatC1(r); +} + +/* Conversions */ + +static PlsrAPRat pLsrAPRatFromIntegers(PlsrInteger a, PlsrInteger b) { + PlsrInteger n; + pLsrIntegerNegate(n, a); + return pLsrAPRatFromIntegersNorm(n, b); +} + +static void pLsrIntegersFromAPRat(PlsrInteger* numO, PlsrInteger* denO, PlsrAPRat r) { + pLsrIntegerNegate(*numO, pLsrAPRatC0(r)); + *denO = pLsrAPRatC1(r); + return; +} + +static PlsrAPRat pLsrAPRatFromInteger(PlsrInteger i) +{ + return pLsrAPRatFromIntegers(i, pLsrIntegerOne); +} + +static PlsrInteger pLsrIntegerFromAPRat(PlsrAPRat r) +{ + assert(pLsrAPRatIsIntegral(r)); + PlsrInteger n; + pLsrIntegerNegate(n, pLsrAPRatC0(r)); + return n; +} + +static PlsrAPRat pLsrAPRatFromSInt32(sint32 i) +{ + PlsrInteger ai; + pLsrIntegerFromSInt32(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static sint32 pLsrSInt32FromAPRat(PlsrAPRat r) +{ + sint32 i; + pLsrSInt32FromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromSInt64(sint64 i) +{ + PlsrInteger ai; + pLsrIntegerFromSInt64(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static sint32 pLsrSInt64FromAPRat(PlsrAPRat r) +{ + sint64 i; + pLsrSInt64FromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromUInt32(uint32 i) +{ + PlsrInteger ai; + pLsrIntegerFromUInt32(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static uint32 pLsrUInt32FromAPRat(PlsrAPRat r) +{ + uint32 i; + pLsrUInt32FromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromSIntp(sintp i) +{ + PlsrInteger ai; + pLsrIntegerFromSIntp(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static sintp pLsrSIntpFromAPRat(PlsrAPRat r) +{ + sintp i; + pLsrSIntpFromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromUIntp(uintp i) +{ + PlsrInteger ai; + pLsrIntegerFromUIntp(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static uintp pLsrUIntpFromAPRat(PlsrAPRat r) +{ + uintp i; + pLsrUIntpFromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromFloat32(float32 f) { + pLsrRuntimeError("pLsrAPRatFromFloat32 not implemented"); + return 0; +} + +static PlsrAPRat pLsrAPRatFromFloat64(float64 f) { + pLsrRuntimeError("pLsrAPRatFromFloat64 not implemented"); + return 0; +} + +static float32 pLsrFloat32FromAPRat(PlsrAPRat a) { + pLsrRuntimeError("pLsrFloat32FromAPRat not implemented"); + return 0; +} + +static float64 pLsrFloat64FromAPRat(PlsrAPRat a) { + pLsrRuntimeError("pLsrFloat64FromAPRat not implemented"); + return 0; +} + +static PlsrBoolean pLsrAPRatLt(PlsrAPRat a, PlsrAPRat b); +static PlsrBoolean pLsrAPRatGt(PlsrAPRat a, PlsrAPRat b); + +static uintp pLsrAPRatToUInt32Checked(PlsrAPRat r) +{ + 
if (!pLsrAPRatIsIntegral(r) + || pLsrAPRatLt(r, pLsrAPRatZero) + || pLsrAPRatGt(r, pLsrAPRatUInt32Max)) + return UINTP_MAX; + return pLsrUInt32FromAPRat(r); +}; + +static char* pLsrCStringFromAPRat(PlsrAPRat r) { + PlsrInteger numI = pLsrAPRatNumerator(r); + PlsrInteger denI = pLsrAPRatDenominator(r); + char *num = pLsrCStringFromInteger(numI); + char *w; + PlsrBoolean b; + pLsrIntegerEQ(b, denI, pLsrIntegerOne); + if (b) { + w = (char*)pLsrAllocC(sizeof(char) * (strlen(num) + 1)); + sprintf(w, "%s", num); + } else { + char *den = pLsrCStringFromInteger(denI); + w = (char*)pLsrAllocC(sizeof(char) * (strlen(num) + strlen(den) + 1 + 1)); + sprintf(w, "%s/%s", num, den); + pLsrFreeC(den); + } + pLsrFreeC(num); + + return w; +} + +static PlsrAPRat pLsrAPRatFromCString(char* s) { + char *num; + char *den; + PlsrInteger numI; + PlsrInteger denI; + int i; + for(i=0;(s[i] != '/') && (s[i] != '\0');i++); + num = (char*) pLsrAllocC(sizeof(char) * (i + 1)); + strncpy(num, s, i); + numI = pLsrIntegerFromCString(num); + pLsrFreeC(num); + if (s[i] == '\0') { + denI = pLsrIntegerOne; + } else { + den = s + i + 1; + denI = pLsrIntegerFromCString(den); + } + return pLsrAPRatFromIntegers(numI, denI); + } + +/* Arithmetic */ + +static PlsrAPRat pLsrAPRatNeg(PlsrAPRat r) +{ + PlsrInteger n; + pLsrIntegerNegate(n, pLsrAPRatC0(r)); + return pLsrAPRatMk(n, pLsrAPRatC1(r)); + /* Normalisation should not be necessary */ +}; + +static PlsrAPRat pLsrAPRatPlus(PlsrAPRat r1, + PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrInteger p3; + PlsrInteger c0; + PlsrInteger c1; + pLsrIntegerTimes(p1, r1c0, r2c1); + pLsrIntegerTimes(p2, r1c1, r2c0); + pLsrIntegerPlus(c0, p1, p2); + pLsrIntegerTimes(c1, r1c1, r2c1); + return pLsrAPRatFromIntegersNorm(c0, c1); +} + +static PlsrAPRat pLsrAPRatMinus(PlsrAPRat r1, + PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrInteger p3; + PlsrInteger c0; + PlsrInteger c1; + pLsrIntegerTimes(p1, r1c0, r2c1); + pLsrIntegerTimes(p2, r1c1, r2c0); + pLsrIntegerMinus(c0, p1, p2); + pLsrIntegerTimes(c1, r1c1, r2c1); + return pLsrAPRatFromIntegersNorm(c0, c1); +} + +static PlsrAPRat pLsrAPRatMult(PlsrAPRat r1, + PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger c0; + PlsrInteger c1; + pLsrIntegerNegate(r1c0, r1c0); + pLsrIntegerTimes(c0, r1c0, r2c0); + pLsrIntegerTimes(c1, r1c1, r2c1); + return pLsrAPRatFromIntegersNorm(c0, c1); +} + +/* Comparisons */ + +static PlsrBoolean pLsrAPRatEq(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrBoolean b1, b2; + pLsrIntegerEQ(b1, r1c0, r2c0); + if (!b1) return 0; + pLsrIntegerEQ(b2, r1c1, r2c1); + return b2; +} + +static PlsrBoolean pLsrAPRatNe(PlsrAPRat r1, PlsrAPRat r2) +{ + return !pLsrAPRatEq(r1, r2); +} + +static PlsrBoolean pLsrAPRatLt(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; 
+ pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerLT(b, p1, p2); + return b; +} + +static PlsrBoolean pLsrAPRatGt(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; + pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerGT(b, p1, p2); + return b; +} + +static PlsrBoolean pLsrAPRatLe(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; + pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerLE(b, p1, p2); + return b; +} + +static PlsrBoolean pLsrAPRatGe(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; + pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerGE(b, p1, p2); + return b; +} + +static sint32 pLsrAPRatCheckRangeSInt32(PlsrAPRat a, sint32 upper, sint32 lower) { + if (pLsrAPRatIsIntegral(a)) { + PlsrInteger im = pLsrAPRatNumerator(a); + PlsrInteger i; + pLsrIntegerNegate(i, im); + return pLsrIntegerCheckRangeSInt32(i, upper, lower); + } + return SINT32_MIN; +} + +#define pLsrAPRatSize (sizeof(PlsrAPRatS)) +#define pLsrAPRatAlignment 4 + +static void pLsrAPRatRegisterVTables() +{ + static PgcIsRef pLsrAPRatRefs[pLsrAPRatSize/P_WORD_SIZE] = { 0, pLsrIntegerIsRef, pLsrIntegerIsRef }; + + assert(pLsrAPRatSize/P_WORD_SIZE == 3); + + pLsrVTableRegister(pLsrAPRatVTable, pLsrAPRatAlignment, pLsrAPRatSize, pLsrAPRatRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); +} + +#define pLsrAPRatGlobalsCount 3 + +static PlsrObjectB pLsrAPRatGlobals[] = + { + (PlsrObjectB) pLsrAPRatZero, + (PlsrObjectB) pLsrAPRatOne, + (PlsrObjectB) pLsrAPRatUInt32Max, + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrAPRatRegisterGlobals() { + assert(pLsrAPRatGlobals[pLsrAPRatGlobalsCount] == NULL); + pLsrGcRegisterGlobals(pLsrAPRatGlobals, pLsrAPRatGlobalsCount); +} + +static void pLsrAPRatInitialize() { + pLsrAPRatStaticInitFromIntegers(pLsrAPRatZero, pLsrIntegerZero, pLsrIntegerOne); + pLsrAPRatStaticInitFromIntegers(pLsrAPRatOne, pLsrIntegerMinusOne, pLsrIntegerOne); + pLsrAPRatStaticInitFromIntegers(pLsrAPRatUInt32Max, pLsrIntegerMinusUInt32Max, pLsrIntegerOne); +} + +#endif /* !_PLSR_AP_RATIONAL_H_ */ diff --git a/runtime/include/hrc/plsr-finalizer.h b/runtime/include/hrc/plsr-finalizer.h new file mode 100644 index 0000000..0cc9c43 --- /dev/null +++ b/runtime/include/hrc/plsr-finalizer.h @@ -0,0 +1,350 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Finalizers */ + +#ifndef _PLSR_FINALIZER_H_ +#define _PLSR_FINALIZER_H_ + +typedef void (*PlsrFinalizerCode)(PlsrRef); + +#ifdef P_USE_PILLAR + +#define pLsrFinalizerInitialSize 500 +typedef struct PlsrFinalizerS {PlsrFinalizerCode f;volatile PlsrRef object;} PlsrFinalizerU, *PlsrFinalizer; + +typedef struct PlsrFinalizerChunkS { + volatile uintp count; + uintp length; + volatile struct PlsrFinalizerChunkS* next; + volatile PlsrFinalizerU entries[]; +} PlsrFinalizerChunkU, *PlsrFinalizerChunk; + +typedef struct PlsrFinalizerGlobalsS { + uintp busy; + uintp shutdown; 
+ uintp done; + uintp signaled; + struct prtMutex *lock; + struct prtCondition *notify; + volatile PlsrFinalizerChunk inbox; + volatile PlsrFinalizerChunk todo; +} PlsrFinalizerGlobalsU, *PlsrFinalizerGlobals; + +static PlsrFinalizerGlobalsU pLsrFinalizerGlobals; + +/* Finalizer addition */ +#pragma pillar_managed(off) + +/* Notify the finalizer thread of activity +* (shutdown or new work). +*/ +static void pLsrFinalizerNotify() +{ + if (!pLsrFinalizerGlobals.signaled) { + fflush(stdout); + prtMutexLock(pLsrFinalizerGlobals.lock); + fflush(stdout); + pLsrFinalizerGlobals.signaled = 1; + prtConditionSignal(pLsrFinalizerGlobals.notify); + fflush(stdout); + prtMutexUnlock(pLsrFinalizerGlobals.lock); + fflush(stdout); + } +} + +/* Wait for notification of new work */ +static void pLsrFinalizerWaitForNotification() +{ + prtMutexLock(pLsrFinalizerGlobals.lock); + if (!pLsrFinalizerGlobals.signaled) { + prtConditionWait(pLsrFinalizerGlobals.notify, pLsrFinalizerGlobals.lock); + } + pLsrFinalizerGlobals.signaled = 0; + prtMutexUnlock(pLsrFinalizerGlobals.lock); +} + +static PlsrFinalizerChunk pLsrFinalizerTakeInbox() +{ + return (PlsrFinalizerChunk) pLsrSynchAtomicTakeUIntp((volatile uintp*) &pLsrFinalizerGlobals.inbox, 1); +} + +static void pLsrFinalizerPutInbox(PlsrFinalizerChunk inbox) +{ + pLsrSynchAtomicPutUIntp((volatile uintp*) &pLsrFinalizerGlobals.inbox, (uintp) inbox); + pLsrFinalizerNotify(); +} + +static PlsrFinalizerChunk pLsrFinalizerNewChunkUnmanaged(uintp length) +{ + PlsrFinalizerChunk chunk = + (PlsrFinalizerChunk) pLsrAllocCUnmanaged(sizeof(PlsrFinalizerChunkU) + length*sizeof(PlsrFinalizerU)); + chunk->count=0; + chunk->length=length; + chunk->next=NULL; + return chunk; +} + +static void pLsrFinalizerAdd(PlsrFinalizerCode f, volatile PlsrRef object) +{ +#if (defined(PLSR_FINALIZER_TRACE) && (PLSR_FINALIZER_TRACE>0)) + printf("Adding finalizer object %p\n", object); + fflush(stdout); +#endif +#ifdef PLSR_SINGLE_THREADED + printf("Can't use finalizers with the single threaded runtime\n"); + fflush(stdout); + assert(0); + exit(-1); +#else + PlsrFinalizerChunk inbox = pLsrFinalizerTakeInbox(); + if (inbox->count >= inbox->length) { + uintp length = inbox->length; + PlsrFinalizerChunk next = inbox; + inbox = pLsrFinalizerNewChunkUnmanaged(length * 2); + inbox->next = next; + } + uintp count=inbox->count; + inbox->entries[count].f = f; + inbox->entries[count].object = object; + inbox->count = count + 1; + pLsrFinalizerPutInbox(inbox); +#endif +} + +#pragma pillar_managed(on) + +/* Root set enumeration */ +#pragma pillar_managed(off) +/* These run on a pillar task, can yield */ + +static void pLsrFinalizerEnumerateChunk(PrtRseCallback rse, void* env, PlsrFinalizerChunk chunk) +{ + while (chunk != NULL) { + for(int i=0; icount;i++) { +#if (defined(PLSR_FINALIZER_TRACE) && (PLSR_FINALIZER_TRACE>0)) + printf("Enumerating finalizer object %p\n", chunk->entries[i].object); + fflush(stdout); +#endif + + rse(env, (void**) &(chunk->entries[i].object), PrtGcTagDefault, 0); + } + chunk=(PlsrFinalizerChunk) chunk->next; + } +} + +static void pLsrFinalizerReportRoots(PrtRseCallback rse, void* env) +{ +#ifdef PLSR_FINALIZER_TRACE + printf("Finalizer enumeration starting\n"); + fflush(stdout); +#endif + pLsrFinalizerEnumerateChunk(rse, env, pLsrFinalizerGlobals.inbox); + pLsrFinalizerEnumerateChunk(rse, env, pLsrFinalizerGlobals.todo); /* Finalizer thread is paused */ +#ifdef PLSR_FINALIZER_TRACE + printf("Finalizer enumeration finished\n"); + fflush(stdout); +#endif +} +#pragma pillar_managed(on) 
+ +static PlsrFinalizerChunk pLsrFinalizerNewChunk(uintp length) { + PlsrFinalizerChunk chunk = + (PlsrFinalizerChunk) pLsrAllocC(sizeof(PlsrFinalizerChunkU) + length*sizeof(PlsrFinalizerU)); + chunk->count=0; + chunk->length=length; + chunk->next=NULL; + return chunk; +} + +static void pLsrFinalizerDeleteChunk(PlsrFinalizerChunk chunk) { + pLsrFreeC(chunk); +} + + +/* We walk through the todo set, removing an element one + * at a time. All but the last (removed) element, remain + * in the set to be enumerated by the root set enumerator. + * The last (removed) element, is on the stack of this managed thread, + * and hence will be enumerated by pillar + */ +static void pLsrFinalizerFinalizeTodo() +{ +#ifdef PLSR_FINALIZER_TRACE + int processed = 0; + printf("Finalizer finalize loop starting\n"); + fflush(stdout); +#endif + pLsrFinalizerGlobals.busy = 1; + /* Atomically */ + noyield { + pLsrFinalizerGlobals.todo = pLsrFinalizerGlobals.inbox; + int length = + (pLsrFinalizerInitialSize < pLsrFinalizerGlobals.inbox->length/2) + ? pLsrFinalizerGlobals.inbox->length/2 + : pLsrFinalizerInitialSize; + pLsrFinalizerGlobals.inbox = pLsrFinalizerNewChunk(length); + } + + /* We keep a copy here. GC may move the contents + * of the fields, but nothing else modifies todo itself */ + PlsrFinalizerChunk todo = pLsrFinalizerGlobals.todo; + while (todo != NULL) { + while (todo->count > 0) { + PlsrFinalizerCode f; + PlsrRef object; + noyield { + int i = todo->count-1; + f = todo->entries[i].f; + object = todo->entries[i].object; + todo->count = todo->count - 1; + } + +#if (defined(PLSR_FINALIZER_TRACE) && (PLSR_FINALIZER_TRACE>0)) + printf("Finalizer finalizing object %p\n", object); + fflush(stdout); +#endif +#ifdef PLSR_FINALIZER_TRACE + processed++; +#endif + f(object); + } + PlsrFinalizerChunk next = (PlsrFinalizerChunk) todo->next; + pLsrFinalizerGlobals.todo = next; + pLsrFinalizerDeleteChunk(todo); + todo=next; + } + pLsrFinalizerGlobals.busy = 0; +#ifdef PLSR_FINALIZER_TRACE + printf("Finalizer finalize loop finished finalizing %d objects\n", processed); + fflush(stdout); +#endif + +} + +static void pLsrFinalizerRun() +{ + pilYieldDec; + +#ifdef PLSR_FINALIZER_TRACE + printf("Starting finalizer thread\n"); + fflush(stdout); +#endif + + while (!pLsrFinalizerGlobals.shutdown) { + assert(pLsrFinalizerGlobals.todo == NULL); + if (pLsrFinalizerGlobals.inbox->count > 0) { + pLsrFinalizerFinalizeTodo(); + } else { + pLsrFinalizerWaitForNotification(); + } + pilYield(); + } +#ifdef PLSR_FINALIZER_TRACE + printf("Starting finalizer thread shutdown sequence\n"); + fflush(stdout); +#endif + +#ifdef PLSR_FINALIZER_SHUTDOWN_LOOP + pgc_force_gc(); + while (pLsrFinalizerGlobals.inbox->count > 0) { +#ifdef PLSR_FINALIZER_TRACE + printf("Iterating finalizer thread shutdown sequence\n"); + fflush(stdout); +#endif + pLsrFinalizerFinalizeTodo(); + pgc_force_gc(); + pilYield(); + } +#endif + +#ifdef PLSR_FINALIZER_TRACE + printf("Completed finalizer thread shutdown sequence\n"); + fflush(stdout); +#endif + + pLsrFinalizerGlobals.done = 1; +} + +static void pLsrFinalizerStart() +{ + pLsrFinalizerGlobals.shutdown = 0; + pLsrFinalizerGlobals.done = 0; + pLsrFinalizerGlobals.busy = 0; + pLsrFinalizerGlobals.lock = prtMutexCreate(NULL); + pLsrFinalizerGlobals.notify = prtConditionInit(NULL); + pLsrFinalizerGlobals.signaled = 0; + pLsrFinalizerGlobals.inbox = pLsrFinalizerNewChunk(pLsrFinalizerInitialSize); + pLsrFinalizerGlobals.todo = NULL; + +#ifndef PLSR_SINGLE_THREADED + pcall pLsrFinalizerRun(); +#endif +} + +/* Shutdown 
the system. If cleanup, then complete finalization + * before exiting, otherwise just exit. + */ +static void pLsrFinalizerShutdown(PlsrBoolean cleanup) +{ +#ifndef PLSR_SINGLE_THREADED + if (cleanup) { + pLsrFinalizerGlobals.shutdown = 1; + pLsrFinalizerNotify(); + pLsrSynchYieldUntilEqualUIntp(&pLsrFinalizerGlobals.done, 1); + } +#endif +} + +static void pLsrFinalizerYieldToFinalizer() +{ +#ifndef PLSR_SINGLE_THREADED + if (pLsrFinalizerGlobals.signaled) { + pLsrSynchYieldUntilEqualUIntp(&pLsrFinalizerGlobals.signaled, 0); + } + if (pLsrFinalizerGlobals.busy) { + pLsrSynchYieldUntilEqualUIntp(&pLsrFinalizerGlobals.busy, 0); + } +#else + if (pLsrFinalizerGlobals.inbox->count > 0) { + pLsrFinalizerFinalizeTodo(); + } +#endif +} + +static void pLsrFinalizerRegisterVTables() +{ +} + +static void pLsrFinalizerRegisterGlobals() +{ +} + +static void pLsrFinalizerCheckAssertions() +{ +} +#else /* !P_USE_PILLAR */ + +static void pLsrFinalizerStart() +{ +} + +static void pLsrFinalizerShutdown(PlsrBoolean cleanup) +{ +} + +static void pLsrFinalizerRegisterVTables() +{ +} + +static void pLsrFinalizerRegisterGlobals() +{ +} + +static void pLsrFinalizerCheckAssertions() +{ +} + +#endif /* P_USE_PILLAR */ + +#endif /* !_PLSR_FINALIZER_H_ */ diff --git a/runtime/include/hrc/plsr-gc.h b/runtime/include/hrc/plsr-gc.h new file mode 100755 index 0000000..5a056ba --- /dev/null +++ b/runtime/include/hrc/plsr-gc.h @@ -0,0 +1,730 @@ + /* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Memory Allocation and Garbage Collection */ + +#ifndef _PLSR_GC_H_ +#define _PLSR_GC_H_ + +#ifdef PLSR_LINUX +#define min(x,y) ((x)<(y)?(x):(y)) +#endif + +/**********************************************************************/ +/*** Allocation instrumentation ***/ + +static uint64 pLsrNumObjectsAllocated = 0; +static uint64 pLsrNumBytesAllocated = 0; +static uint64 pLsrNumPaddingBytesAllocated = 0; + +static uint64 pLsrNumObjectsAllocatedSinceLast = 0; +static uint64 pLsrNumBytesAllocatedSinceLast = 0; + +static uint64 pLsrNumObjectsAllocatedUnmanaged = 0; +static uint64 pLsrNumBytesAllocatedUnmanaged = 0; + +static uint64 pLsrNumObjectsFreedUnmanaged = 0; + +static uint64 pLsrNumObjectsAllocatedUnmanagedSinceLast = 0; +static uint64 pLsrNumBytesAllocatedUnmanagedSinceLast = 0; + +static uint64 pLsrNumObjectsFreedUnmanagedSinceLast = 0; + +#ifdef PLSR_INSTRUMENT_ALLOCATION +#define pLsrAllocInstrument1(vtable, size) \ + do { \ + pLsrNumObjectsAllocated++; \ + pLsrNumBytesAllocated += (size); \ + pLsrNumPaddingBytesAllocated += (vtable)->padding; \ + } while (0) +#define pLsrAllocCInstrument1(size) \ + do { \ + pLsrNumObjectsAllocatedUnmanaged++; \ + pLsrNumBytesAllocatedUnmanaged += (size); \ + } while (0); +#define pLsrFreeCInstrument1() \ + do { \ + pLsrNumObjectsFreedUnmanaged++; \ + pLsrNumObjectsFreedUnmanagedSinceLast++; \ + } while (0); +#else /* !PLSR_INSTRUMENT_ALLOCATION */ +#define pLsrAllocInstrument1(vtable, size) +#define pLsrAllocCInstrument1(size) +#define pLsrFreeCInstrument1() +#endif /* !PLSR_INSTRUMENT_ALLOCATION */ + +#ifdef PLSR_INSTRUMENT_VTB_ALC +#define pLsrAllocInstrument2(vtable, size) \ + do { \ + (vtable)->numObjects++; \ + (vtable)->numBytes += (size); \ + } while (0) +#else /* !PLSR_INSTRUMENT_VTB_ALC */ +#define pLsrAllocInstrument2(vtable, size) +#endif /* !PLSR_INSTRUMENT_VTB_ALC */ + +#ifdef PLSR_INSTRUMENT_GCS +#define pLsrAllocInstrument3(size) \ + do { \ + pLsrNumObjectsAllocatedSinceLast++; \ + pLsrNumBytesAllocatedSinceLast += (size); \ + } while (0) +#define 
pLsrAllocCInstrument2(size) \ + do { \ + pLsrNumObjectsAllocatedUnmanagedSinceLast++; \ + pLsrNumBytesAllocatedUnmanagedSinceLast += (size); \ + } while (0); +#else /* !PLSR_INSTRUMENT_GCS */ +#define pLsrAllocInstrument3(size) +#define pLsrAllocCInstrument2(size) +#endif /* !PLSR_INSTRUMENT_GCS */ + +#ifdef PLSR_FORCE_REGC +uint32 pLsrAllocREGC_signal = 0; +#define pLsrAllocREGC() do {if (pLsrAllocREGC_signal) {pgc_force_gc(); pLsrAllocREGC_signal = 0;}} while (0) +#define pLsrAllocREGCSet() (pLsrAllocREGC_signal = 1) +#else +#define pLsrAllocREGC() +#define pLsrAllocREGCSet() +#endif + + +/**********************************************************************/ +/*** Heap Allocation ***/ + +#define pLsrDefaultAlignment 4 + +#define pLsrAllocStart() pLsrAllocREGC() + +#define pLsrAllocFinish(r, vt, size) \ + do { \ + pLsrAllocInstrument1(vt, size); \ + pLsrAllocInstrument2(vt, size); \ + pLsrAllocInstrument3(size); \ + ((PlsrObjectU*)(r))->vtable = vt; \ + } while(0) + +#define pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocator) \ + do { \ + pLsrAllocStart(); \ + (dest) = (t) pLsrAllocator(size, vtable); \ + pLsrAllocFinish(dest, vtable, size); \ + } while(0) + + +#ifdef P_USE_AGC +/* Wrap the unmanaged call in a managed function, and inline + * the managed function call. This keeps the code size smaller. + * alignment is based on the vtable properties */ +PlsrObjectB pLsrAllocSlowCapacity(uintp size, PlsrVTable vtable) { +#ifdef __pillar2c__ + PlsrObjectB res; + if (!(res = pgc_allocate_or_null(size, vtable))) { + return pgc_allocate(size, vtable); + } + return res; +#else + return pgc_allocate(size, vtable); +#endif +} +PlsrObjectB pLsrAllocSlow(uintp size, PlsrVTable vtable) { + return pgc_allocate(size, vtable); +} + +PlsrObjectB pLsrAllocSlowOrNull(uintp size, PlsrVTable vtable) { + PlsrObjectB res; + if (!(res = pgc_allocate_or_null(size, vtable))) { + pLsrRuntimeError_ ("Allocate or null returned null"); + } + return res; +} + +#define BITS_PER_BYTE 8 +#define NEXT_TO_HIGH_BIT_SET_MASK (1<<((sizeof(uintp) * BITS_PER_BYTE)-2)) + +/* Wrap the unmanaged call in a managed function, and inline + * the managed function call. This keeps the code size smaller. 
+ * alignment is based on the vtable properties */ +PlsrObjectB pLsrAllocSlowPinned(uintp size, PlsrVTable vtable) { + if ((P_USE_AGC == PlsrAKMf)) + return pgc_allocate(size | NEXT_TO_HIGH_BIT_SET_MASK, vtable); + else + return pgc_allocate(size, vtable); +} + +/* Accurate GC, pinned */ +#define pLsrAllocPinned(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlowPinned) + +#define pLsrAllocAlignedPinned(t, dest, vtable, size, alignment) \ + pLsrAllocPinned(t, dest, vtable, size) + +#define pLsrAllocPinnedFinalizable pLsrAllocPinned + +#define pLsrAllocFinalizable(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlow) + +#ifdef P_USE_FAST_ALLOC + +/* Accurate GC, fast path */ +typedef struct { + char *tls_current_free; + char *tls_current_ceiling; + void *chunk; + void *curr_alloc_block; +} GC_Nursery_Info; + +#ifdef __pillar2c__ + +extern unsigned g_tls_offset_bytes; +#define orp_local_to_gc_local(handle) (void *)((char*)(handle) + g_tls_offset_bytes) +#ifdef TLS0 +#define pLsrGetAllocNursery() (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)prtGetTaskHandle())) +#define pLsrGetAllocNurseryTh(task) (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)task)) +#else // TLS0 +#define pLsrGetAllocNursery() (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)((char*)prtGetTaskHandle() + P_WORD_SIZE))) +#define pLsrGetAllocNurseryTh(task) (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)((char*)task + P_WORD_SIZE))) +#endif + +#else // __pillar2c__ +#define pLsrGetAllocNursery() (GC_Nursery_Info *) prtGetTls() +#endif + +/* alignment is power of two */ +#define pLsrAllocAligned(t, dest, vtable, size, alignment) \ + do { \ + uintp pLsrAllocSize = ((size) + 3) & 0xFFffFFfc; \ + uintp pLsrAllocAlignment = (alignment <=4) ? 
0 : (alignment - 1); \ + GC_Nursery_Info *pLsrAllocNursery = pLsrGetAllocNursery(); \ + pLsrAllocStart(); \ + noyield { \ + char * pLsrAllocFrontier = pLsrAllocNursery->tls_current_free; \ + char * pLsrAllocLimit = pLsrAllocNursery->tls_current_ceiling; \ + pLsrAllocFrontier = (char *) ((((uintp) pLsrAllocFrontier) + pLsrAllocAlignment) & (~pLsrAllocAlignment)); \ + if (pLsrAllocFrontier + pLsrAllocSize <= pLsrAllocLimit) { \ + (dest) = (t)pLsrAllocFrontier; \ + pLsrAllocNursery->tls_current_free = pLsrAllocFrontier + pLsrAllocSize; \ + } else { \ + (dest) = (t)pLsrAllocSlowCapacity(pLsrAllocSize, vtable); \ + } \ + } \ + pLsrAllocFinish(dest, vtable, pLsrAllocSize); \ + } while(0) + +#define pLsrAlloc(t, dest, vtable, size) pLsrAllocAligned(t, dest, vtable, size, 1) + +/* Accurate GC, skip local nursery */ +#define pLsrAlloc_(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlow) + +/* alignment is power of two */ +#define pLsrAllocNoGC(t, dest, vtable, size) \ + do { \ + uintp pLsrAllocSize = ((size) + 3) & 0xFFffFFfc; \ + GC_Nursery_Info *pLsrAllocNursery = pLsrGetAllocNursery(); \ + pLsrAllocStart(); \ + char * pLsrAllocFrontier = pLsrAllocNursery->tls_current_free; \ + char * pLsrAllocLimit = pLsrAllocNursery->tls_current_ceiling; \ + if (pLsrAllocFrontier + pLsrAllocSize <= pLsrAllocLimit) { \ + (dest) = (t)pLsrAllocFrontier; \ + pLsrAllocNursery->tls_current_free = pLsrAllocFrontier + pLsrAllocSize; \ + } else { /* could fall back to allocate or null here */ \ + pLsrRuntimeErrorUnmanaged("Insufficient space for guaranteed allocation"); \ + (dest) = (t)(0); \ + } \ + pLsrAllocFinish(dest, vtable, pLsrAllocSize); \ + } while(0) + +#else /* !P_USE_FAST_ALLOC */ + +/* Accurate GC, slow path */ +#define pLsrAllocAligned(t, dest, vtable, size, alignment) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlow) + +#define pLsrAlloc(t, dest, vtable, size) pLsrAllocAligned(t, dest, vtable, size, 1) + +#define pLsrAlloc_(t, dest, vtable, size) pLsrAlloc(t, dest, vtable, size) + +#define pLsrAllocNoGC(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlowOrNull) + +#endif /* P_USE_FAST_ALLOC */ +#else /* !P_USE_AGC */ + +/* Conservative GC*/ +#ifdef P_USE_CGC +#define pLsrAlloc(t, dest, vtable, size) \ + do { \ + pLsrAllocStart(); \ + (dest) = (t) GC_MALLOC(size); \ + if (!(dest)) pLsrRuntimeError_("Out of memory"); \ + pLsrAllocFinish(dest, vtable, size); \ + } while(0) + +#define pLsrAllocPinned pLsrAlloc +#define pLsrAllocAligned(t, dest, vtable, size, algn) pLsrAlloc(t, dest, vtable, size) +#define pLsrAlloc_ pLsrAlloc +#define pLsrAllocPinnedFinalizable pLsrAlloc +#define pLsrAllocFinalizable pLsrAlloc + +#else /* !P_USE_CGC */ + +/* NO GC */ +#define pLsrAlloc(t, dest, vtable, size) \ + do { \ + pLsrAllocStart(); \ + dest = (t) malloc(size); \ + if (!(dest)) pLsrRuntimeError_("Out of memory"); \ + pLsrAllocFinish(dest, vtable, size); \ + } while(0) + +#define pLsrAllocPinned pLsrAlloc +#define pLsrAllocAligned(t, dest, vtable, size, algn) pLsrAlloc(t, dest, vtable, size) +#define pLsrAlloc_ pLsrAlloc +#define pLsrAllocPinnedFinalizable pLsrAlloc +#define pLsrAllocFinalizable pLsrAlloc + +#endif /*P_USE_CGC */ +#endif /*P_USE_AGC */ + +/**********************************************************************/ +/*** C Allocation ***/ + + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +static void* pLsrAllocCUnmanaged(uintp size) +{ + void* res = malloc(size); + 
pLsrAllocCInstrument1(size); + pLsrAllocCInstrument2(size); + if (res) return res; + printf("Out of memory (C unmanaged alloc)"); + exit(-1); + return 0; +} + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif + + +static void* pLsrAllocC(uintp size) +{ + void* res = malloc(size); + pLsrAllocCInstrument1(size); + pLsrAllocCInstrument2(size); + if (res) return res; + pLsrRuntimeError_("Out of memory (C alloc)"); + return 0; +} + +static void* pLsrReAllocC(void * obj, uintp osize, uintp nsize) +{ + void* res = realloc(obj, nsize); + pLsrFreeCInstrument1(); + pLsrAllocCInstrument1(nsize); + pLsrAllocCInstrument2(nsize); + if (res) return res; + pLsrRuntimeError_("Out of memory (C realloc)"); + return 0; +} + +static void pLsrFreeC(void* obj) +{ + pLsrFreeCInstrument1(); + free(obj); +} + +/**********************************************************************/ +/*** Options and Initialisation ***/ + +#ifdef P_USE_AGC +#pragma pillar_managed(off) +static void pLsrRuntimeReportRoots(PrtRseCallback, void*); +static void pLsrPPilerReportRoots(PrtRseCallback, void*); +static void pLsrGcReportRoots(PrtRseCallback rse, void* env) +{ +#ifdef PLSR_INSTRUMENT_GCS + printf("Total allocation: %I64u bytes (%I64u objects) managed, %I64u bytes (%I64u objects) unmanaged, %I64u objects freed\n", + pLsrNumBytesAllocated, pLsrNumObjectsAllocated, + pLsrNumBytesAllocatedUnmanaged, pLsrNumObjectsAllocatedUnmanaged, + pLsrNumObjectsFreedUnmanaged); + printf("Since last: %I64u bytes (%I64u objects) managed, %I64u bytes (%I64u objects) unmanaged, %I64u objects freed\n", + pLsrNumBytesAllocatedSinceLast, pLsrNumObjectsAllocatedSinceLast, + pLsrNumBytesAllocatedUnmanagedSinceLast, pLsrNumObjectsAllocatedUnmanagedSinceLast, + pLsrNumObjectsFreedUnmanagedSinceLast); + pLsrNumBytesAllocatedSinceLast = 0; + pLsrNumObjectsAllocatedSinceLast = 0; + pLsrNumBytesAllocatedUnmanagedSinceLast = 0; + pLsrNumObjectsAllocatedUnmanagedSinceLast = 0; + pLsrNumObjectsFreedUnmanagedSinceLast = 0; +#endif + pLsrRuntimeReportRoots(rse, env); + pLsrPPilerReportRoots(rse, env); + pLsrAllocREGCSet(); +} +#pragma pillar_managed(on) +#endif /* P_USE_AGC */ + +static void pLsrGcOption(const char* name, const char* arg) +{ +#ifdef P_USE_AGC + pgc_next_command_line_argument(name, arg); +#else /* !P_USE_AGC */ + pLsrRuntimeError("GC option not implemented"); +#endif /* !P_USE_AGC */ +} + +static void pLsrGcInit(sintp initHeap, uintp maxHeap) +{ + char buf[100]; +#ifdef P_USE_CGC + if (initHeap<0) initHeap = 50; + initHeap *= 1024 * 1024; + /* GC Bug. 
The GC tends to run out of memory when initHeap is set to maxHeap */ + initHeap -= initHeap / 10; + maxHeap *= 1024 * 1024; + GC_init(); + GC_expand_hp(initHeap); + GC_set_max_heap_size (maxHeap); +#else /* !P_USE_CGC */ +#ifdef P_USE_AGC + if (P_USE_AGC == PlsrAKCgc || P_USE_AGC == PlsrAKTgc) { + /* If only maxHeap set, prefer maxHeap */ + if (initHeap<=0 && maxHeap>0) {initHeap = maxHeap;} + else if (initHeap>0 && maxHeap==0) {maxHeap = initHeap;} + else if (initHeap >= 0 && maxHeap > 0 && initHeap != maxHeap) + { pLsrRuntimeError("GCInit: initHeap <> maxHeap not supported on this GC");} + } + if (initHeap>0) { + sprintf(buf, "-Xms%dm", initHeap); + pgc_next_command_line_argument("-Xms", buf); + } + if (maxHeap>0) { + sprintf(buf, "-Xmx%dm", maxHeap); + pgc_next_command_line_argument("-Xmx", buf); + } + // lockParm = false for MF, true for V4 + pgc_init(pLsrGcReportRoots, P_AGC_LOCK_PARAM); +#endif /* P_USR_AGC */ +#endif /* !P_USE_CGC */ +} + +/**********************************************************************/ +/*** Vtable registration and Enumeration stuff ***/ + +static PlsrVTable pLsrAllVTables = NULL; + +#ifdef P_USE_AGC + +#define pLsrGcRegisterGlobals pgc_register_global_objects +#define pLsrGcRegisterGlobalRefs pgc_register_global_refs + +/* vtable, object size (bytes), offset of the indirection slot (bytes) */ +static void pLsrIndirectionVTableRegister(PlsrVTable vt, uintp size, uintp offset) +{ +#ifdef PLSR_INSTRUMENT_VTB_ALC + vt->next = pLsrAllVTables; + pLsrAllVTables = vt; +#endif + pgc_new_indirection_object((struct VTable*)vt, size, offset); + }; + +static void pLsrVTableRegisterV(PlsrVTable vt, uintp alignment, + uintp fs, PgcIsRef frefs[], + uintp vs, uintp vlo, + PgcIsRef vrefs[], + enum PGC_MUTABILITY mutability, + uintp pinned, + PgcIsRef wrefs[], + void (* __cdecl finalizer)(Managed_Object_Handle)) +{ + /* alignment is a power of two, so just count shifts, offset by two */ + uintp powerOfTwoBaseFour = 0; + uintp adjusted = alignment >> 2; + while ((adjusted >>=1) > 0) { powerOfTwoBaseFour++; } + /* alignment requirement is 2^(2+powerOfTwoBaseFour).*/ + struct AlignmentInfo ai = {.alignArray = 0, .powerOfTwoBaseFour = powerOfTwoBaseFour}; + +#ifdef PLSR_INSTRUMENT_VTB_ALC + vt->next = pLsrAllVTables; + pLsrAllVTables = vt; +#endif + pgc_new_object_format((struct VTable*)vt, + fs, + frefs, + vs, + vlo, + vrefs, + ai, + mutability, + pinned, + wrefs, + finalizer + ); +} + +/* + * vt is the vtable + * alignment is the required object alignment in bytes (must be power of two) + * fs is the fixed size in bytes + * frefs indicates for each word offset starting at zero (i.e. starting with the vtable) is it a ref + * vs is the element size for the variable portion (in bytes) + * If vs=0, then there is no variable portion, and vlo and vref are ignored. + * vlo is the offset from base of object of the length field (if applicable). 
+ * vref indicates the traceability of the first word of the elements in the variable portion + * mutability indicates the mutability of objects allocated with this vtable + * finalizer is an unmanged code pointer to be run on finalization (NULL if none) + */ +static void pLsrVTableRegisterFinalizable(PlsrVTable vt, uintp alignment, + uintp fs, PgcIsRef frefs[], + uintp vs, uintp vlo, PgcIsRef vref, + enum PGC_MUTABILITY mutability, + uintp pinned, + void (* __cdecl finalizer)(Managed_Object_Handle) + ) +{ + PgcIsRef vrefs[2]; + vrefs[0] = vref; + vrefs[1] = 0; + PgcIsRef wrefs[fs/P_WORD_SIZE]; + for(int i = 0;i < fs/P_WORD_SIZE;i++) { + wrefs[i]=0; + } + pLsrVTableRegisterV(vt, alignment, fs, frefs, vs, vlo, vrefs, mutability, pinned, wrefs, finalizer); +} + +#define pLsrVTableRegister(vt, alignment, fcs, frefs, vs, vlo, vref, mutability, pinned) \ + pLsrVTableRegisterFinalizable(vt, alignment, fcs, frefs, vs, vlo, vref, mutability, pinned, NULL) + +#else /* !P_USE_AGC */ + +enum PGC_MUTABILITY { + PGC_ALWAYS_MUTABLE = 0, + PGC_CREATED_MUTABLE = 1, + PGC_ALWAYS_IMMUTABLE = 2 +}; + +#define pLsrGcRegisterGlobals(gs, num) +#define pLsrGcRegisterGlobalRefs(gs, num) +#define pLsrIndirectionVTableRegister(vt, size, offset) +#define pLsrVTableRegister(vt, alignment, fs, frefs, vs, vlo, vref, m, pinned) +#define pLsrVTableRegisterFinalizable(vt, alignment, fs, frefs, vs, vlo, vref, m, pinned, finalizer) +#define pLsrVTableRegisterV(vt, alignment, fs, frefs, vs, vlo, vrefs, m, pinned, wrefs, finalizer) + +#endif /* !P_USE_AGC */ + +/**********************************************************************/ +/*** Barriers ***/ + +#ifdef P_USE_GC_WRITE_BARRIERS +#ifdef P_USE_AGC +#define pLsrWriteBarrierRef(t, s) \ + (pgc_write_ref_slot((struct Object**)&(t), (struct Object*)(s))) +#define pLsrWriteBarrierRefBase(b, t, s) \ + (pgc_write_ref_slot_with_base((struct Object*)(b), \ + (struct Object**)&(t), \ + (struct Object*)(s))) +#ifdef P_ALL_BARRIERS +#define pLsrWriteBarrierRefOpt(t, s) \ + (pgc_write_ref_slot((struct Object**)&(t), (struct Object*)(s))) +#define pLsrWriteBarrierRefOptBase(b, t, s) \ + (pgc_write_ref_slot_with_base((struct Object*)(b), \ + (struct Object**)&(t), \ + (struct Object*)(s))) +#else /* !P_ALL_BARRIERS */ +#define pLsrWriteBarrierRefOpt(t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefOptBase(b, t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#endif /* !P_ALL_BARRIERS */ +#else /* !P_USE_AGC */ +#error "write barriers only work with agc" +#endif /* !P_USE_AGC */ +#else /* !P_USE_GC_WRITE_BARRIERS */ +#define pLsrWriteBarrierRef(t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefBase(b, t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefOpt(t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefOptBase(b, t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#endif /* !P_USE_GC_WRITE_BARRIERS */ + +/**********************************************************************/ +/*** Changing VTables ***/ + +#ifdef P_USE_AGC + +#define pLsrObjectChangeVTableMandatory(obj, vtb) \ + pgc_modify_object_vtable(((struct Object*)(obj)), ((struct VTable*)(vtb))) + +/* XXX Extract this out to pgc.h -leaf */ +#define pLsrObjectCmpAndSetVTableMandatory(obj, vtbOld, vtbNew) \ + pLsrSynchCmpAndSetUIntp(((volatile uintp*)(obj)), (uintp) vtbOld, (uintp) vtbNew) + +#else /* !P_USE_AGC */ + +#define 
pLsrObjectChangeVTableMandatory(obj, vtb) (obj)->vtable = (vtb) + +#define pLsrObjectCmpAndSetVTableMandatory(obj, vtbOld, vtbNew) \ + pLsrSynchCmpAndSetUIntp(((struct Object*)(obj)), (uintp) vtbOld, (uintp) vtbNew) + +#endif /* !P_USE_AGC */ + +#ifdef P_DO_VTABLE_CHANGE + +#define pLsrObjectChangeVTable(obj, vtb) pLsrObjectChangeVTableMandatory(obj, vtb) + +#else /* !P_DO_VTABLE_CHANGE */ + +#define pLsrObjectChangeVTable(obj, vtb) + +#endif /* !P_DO_VTABLE_CHANGE */ + + +/**********************************************************************/ +/*** Malloc in the GC heap ***/ + +#ifdef P_USE_AGC + +typedef struct PlsrGCHeapMallocObjS_ { + PlsrVTable vtable; + struct PlsrGCHeapMallocObjS_* prev; + struct PlsrGCHeapMallocObjS_* next; + uintp length; + char bytes[]; +} PlsrGCHeapMallocObjS; + +#define pLsrGCHeapMallocObjPadding \ + (sizeof(PlsrGCHeapMallocObjS) - sizeof(PlsrVTable) - 2*sizeof(struct PlsrGCHeapMallocObjS_*) - sizeof(uintp)) +pLsrVTableStatic(pLsrGCHeapMallocObjVTable_, VNoneTag, "*heap malloc*", pLsrGCHeapMallocObjPadding); +#define pLsrGCHeapMallocObjVTable (&pLsrGCHeapMallocObjVTable_) + +#define pLsrGCHeapMallocObjSize (sizeof(PlsrGCHeapMallocObjS)) + +static PlsrGCHeapMallocObjS pLsrGCHeapMallocObjectList_ = + {.vtable = pLsrGCHeapMallocObjVTable, + .prev = NULL, + .next = NULL, + .length = 0}; +static PlsrGCHeapMallocObjS* pLsrGCHeapMallocObjList = &pLsrGCHeapMallocObjectList_; + +static void pLsrGCHeapMallocObjListInsert(PlsrGCHeapMallocObjS* list, PlsrGCHeapMallocObjS* node) +{ + node->prev = list->prev; + node->prev->next = node; + node->next = list; + list->prev = node; +} + +static void pLsrGCHeapMallocObjListRemove(PlsrGCHeapMallocObjS* node) +{ + node->next->prev = node->prev; + node->prev->next = node->next; + node->prev = NULL; + node->next = NULL; +} + +#define pLsrGCHeapPtrToObject(ptr) ((PlsrGCHeapMallocObjS *) (((char*) ptr) - pLsrGCHeapMallocObjSize)) + +static void* pLsrGCHeapMalloc(uintp size) +{ + PlsrGCHeapMallocObjS *node; + assert(pLsrGCHeapMallocObjSize == (uintp) &((PlsrGCHeapMallocObjS *)0)->bytes); + pLsrAllocPinned(PlsrGCHeapMallocObjS*, node, pLsrGCHeapMallocObjVTable, pLsrGCHeapMallocObjSize + size); + PlsrGCHeapMallocObjS *list = (PlsrGCHeapMallocObjS *) pLsrSynchAtomicTakeUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, 1); + pLsrGCHeapMallocObjListInsert(list, node); + pLsrSynchAtomicPutUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, (uintp) list); + node->length=size; + return &node->bytes; +} + +static void pLsrGCHeapFree(void *ptr) +{ + + PlsrGCHeapMallocObjS *node = pLsrGCHeapPtrToObject(ptr); + PlsrGCHeapMallocObjS *list = (PlsrGCHeapMallocObjS *) pLsrSynchAtomicTakeUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, 1); + pLsrGCHeapMallocObjListRemove(node); + pLsrSynchAtomicPutUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, (uintp) list); + return; +} + +static void* pLsrGCHeapReAlloc(void * ptr, uintp osize, uintp nsize) +{ + PlsrGCHeapMallocObjS *old = pLsrGCHeapPtrToObject(ptr); + assert(old->length >= osize); + if (old->length >= nsize) { + return ptr; + } else { + void *newPtr = pLsrGCHeapMalloc(nsize); + uintp csize = min(nsize, osize); + memcpy(newPtr, ptr, csize); + pLsrGCHeapFree(ptr); + return newPtr; + } +} + +static void pLsrGCHeapMallocRegisterVTables() +{ + static PgcIsRef refs[pLsrGCHeapMallocObjSize/P_WORD_SIZE] = { 0, 1, 1, 0}; + uintp pinned = 1; + assert(pLsrGCHeapMallocObjSize/P_WORD_SIZE == 4); + assert((uintp)&(((PlsrGCHeapMallocObjS*)(0))->length) == 3*P_WORD_SIZE); + 
pLsrVTableRegister(pLsrGCHeapMallocObjVTable, pLsrDefaultAlignment, pLsrGCHeapMallocObjSize, + refs, 1, 3*P_WORD_SIZE, 0, PGC_ALWAYS_MUTABLE, pinned); +} + +#define pLsrGCHeapMallocGlobalsCount 1 + +static PlsrObjectB pLsrGCHeapMallocGlobals[] = + { + (PlsrObjectB) (&pLsrGCHeapMallocObjectList_), + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrGCHeapMallocRegisterGlobals() +{ + assert(pLsrGCHeapMallocGlobals[pLsrGCHeapMallocGlobalsCount] == NULL); + pLsrGcRegisterGlobals (pLsrGCHeapMallocGlobals, pLsrGCHeapMallocGlobalsCount); +}; + + +static void pLsrGCHeapMallocInitialize() +{ + pLsrGCHeapMallocObjList->prev=pLsrGCHeapMallocObjList; + pLsrGCHeapMallocObjList->next=pLsrGCHeapMallocObjList; +} + +#else /* !P_USE_AGC */ + +#define pLsrGCHeapMalloc pLsrAllocC +#define pLsrGCHeapReAlloc pLsrReAllocC +#define pLsrGCHeapFree pLsrFreeC + +static void pLsrGCHeapMallocRegisterVTables() {} +static void pLsrGCHeapMallocRegisterGlobals() {} +static void pLsrGCHeapMallocInitialize() {} + +#endif /* P_USE_AGC */ + +static void pLsrGCRegisterVTables() +{ + pLsrGCHeapMallocRegisterVTables(); +} +static void pLsrGCRegisterGlobals() +{ + pLsrGCHeapMallocRegisterGlobals(); +} +static void pLsrGCInitialize() +{ + pLsrGCHeapMallocInitialize(); +} + +#endif /* !_PLSR_GC_H_ */ diff --git a/runtime/include/hrc/plsr-gmp-integer-gallocate.h b/runtime/include/hrc/plsr-gmp-integer-gallocate.h new file mode 100644 index 0000000..3ea116a --- /dev/null +++ b/runtime/include/hrc/plsr-gmp-integer-gallocate.h @@ -0,0 +1,1188 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_AP_INTEGER_H_ +#define _PLSR_AP_INTEGER_H_ + +#define PLSR_GMP_REPLACE_MALLOC +#define PLSR_GMP_USE_PCDECL + +#ifndef __pillar2c__ +#error "GMP integer guaranteed allocation only supported on ipc" + +#endif /* __pillar2c__ */ + +# pragma pillar_push_cc(__pcdecl) +#include +# pragma pillar_pop_cc + +/* Types */ + +typedef struct PlsrAPIntS_ { + PlsrVTable vtable; + int alloc; + int size; + mp_limb_t d[]; +} PlsrAPIntS; + +#define pLsrAPIntPadding (sizeof(PlsrAPIntS) - sizeof(PlsrVTable) - sizeof(int) - sizeof(int)) +pLsrVTableStatic(pLsrAPIntVTable_, VNoneTag, "*ap integer*", pLsrAPIntPadding); +#define pLsrAPIntVTable (&pLsrAPIntVTable_) + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrAPInt; +#else /* !P_USE_PILLAR */ +typedef PlsrAPIntS* PlsrAPInt; +#endif /* !P_USE_PILLAR */ + +#define pLsrAPIntAlignment 4 +#define pLsrAPIntSize (sizeof(PlsrAPIntS)) + +#define pLsrMPZ_mp_alloc(i) ((i)[0]._mp_alloc) +#define pLsrMPZ_mp_size(i) ((i)[0]._mp_size) +#define pLsrMPZ_mp_d(i) ((i)[0]._mp_d) + +#define pLsrAPInt_mp_alloc(i) (((PlsrAPIntS*) (i))->alloc) +#define pLsrAPInt_mp_size(i) (((PlsrAPIntS*) (i))->size) +#define pLsrAPInt_mp_d(i) (&(((PlsrAPIntS*) (i))->d[0])) + +#define pLsrAPInt_mp_d_Offset ((uintp)(pLsrAPIntSize)) +#define pLsrAPIntMPZMemPtrToAPInt(mem) (((char*) mem) - (pLsrAPInt_mp_d_Offset)) + +#define pLsrAPIntToMPZ(dest, src) \ + do { \ + pLsrMPZ_mp_alloc(dest) = pLsrAPInt_mp_alloc(src); \ + pLsrMPZ_mp_size(dest) = pLsrAPInt_mp_size(src); \ + pLsrMPZ_mp_d(dest) = pLsrAPInt_mp_d(src); \ + } while (0) + +#define pLsrAPIntFromMPZ(dest, src) \ + do { \ + (dest) = pLsrAPIntMPZMemPtrToAPInt(pLsrMPZ_mp_d(src)); \ + assert(pLsrAPInt_mp_alloc(dest) == pLsrMPZ_mp_alloc(src)); \ + pLsrAPInt_mp_size(dest) = pLsrMPZ_mp_size(src); \ + } while (0) + +#define pLsrAPIntInitMPZFromAPInt(src) \ + {{pLsrAPInt_mp_alloc(src), pLsrAPInt_mp_size(src), pLsrAPInt_mp_d(src)}}; + + +static uintp pLsrGmpAllocationReserve = 
300; + +// Ask if the GC can definitely allocate size number of bytes without needing a collection. +EXTERN(PgcBool) __pcdecl pgc_can_allocate_without_collection_th(unsigned size, PrtTaskHandle task); +// Do whatever is necessary to make sure that size bytes can be allocated after this call without a collection. +// This call itself may or may not cause a collection. +EXTERN(void) PRT_CDECL pgc_require_allocate_without_collection_th(unsigned size, PrtTaskHandle task); +/* Prepare to allocate up to count gmp integers */ +#define pLsrGmpPrepareForAllocation(count) \ + do { \ + if (!pgc_can_allocate_without_collection_th(count*pLsrGmpAllocationReserve,prtGetTaskHandle())) { \ + pgc_require_allocate_without_collection_th(count*pLsrGmpAllocationReserve,prtGetTaskHandle()); \ + } \ + } while(0) + +/* For documentation purposes only*/ +#define pLsrAPIntBeginNoGC + +#define pLsrAPIntKeepLive1(v) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive1A; \ + pLsrAPIntKeepLive1A = v; \ + } while(0); + +#define pLsrAPIntKeepLive2(v0, v1) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive2A; \ + pLsrAPIntKeepLive2A = v0; \ + pLsrAPIntKeepLive2A = v1; \ + } while(0); + +#ifdef PLSR_AP_INT_TRACE +static void pLsrAPIntShow1(char * s, PlsrAPInt i1) +{ + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(i1); + char* res = pLsrAllocC(mpz_sizeinbase (t1, 10) + 2); + mpz_get_str(res, 10, t1); + printf("APInt: %s, %s\n", s, res); + pLsrFreeC(res); +} + +static void pLsrAPIntShow3(char * s, PlsrAPInt i1, PlsrAPInt i2, PlsrAPInt i3) +{ + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(i1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(i2); + mpz_t t3 = pLsrAPIntInitMPZFromAPInt(i3); + char* res1 = pLsrAllocC(mpz_sizeinbase (t1, 10) + 2); + mpz_get_str(res1, 10, t1); + char* res2 = pLsrAllocC(mpz_sizeinbase (t2, 10) + 2); + mpz_get_str(res2, 10, t2); + char* res3 = pLsrAllocC(mpz_sizeinbase (t3, 10) + 2); + mpz_get_str(res3, 10, t3); + printf("APInt: %s, (%s, %s, %s)\n", s, res1, res2, res3); + pLsrFreeC(res1); + pLsrFreeC(res2); + pLsrFreeC(res3); +} +#define pLsrAPIntTrace0(s) +#define pLsrAPIntTrace1(s, i1) pLsrAPIntShow1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) pLsrAPIntShow3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) +#else +#define pLsrAPIntTrace0(s) +#define pLsrAPIntTrace1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) +#endif + +static PlsrAPInt pLsrAPIntNewFromCString(char *s) +{ + pLsrGmpPrepareForAllocation(1); + pLsrAPIntBeginNoGC { + mpz_t t1; + if (mpz_init_set_str(t1, s, 0) != 0) { + pLsrRuntimeError("Failed to initialize ap integer"); + } + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +/* operation must be an init_set_* */ +#define pLsrAPIntNewFromXBody(operation, src) \ + do { \ + mpz_t t1; \ + pLsrGmpPrepareForAllocation(1); \ + pLsrAPIntBeginNoGC { \ + operation(t1, src); \ + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); \ + return res; \ + } \ + } while (0) + +static PlsrAPInt pLsrAPIntNewFromSignedLong(long int si) +{ + pLsrAPIntNewFromXBody(mpz_init_set_si, si); +} + +static PlsrAPInt pLsrAPIntNewFromUnsignedLong(unsigned long int ui) +{ + pLsrAPIntNewFromXBody(mpz_init_set_ui, ui); +} + +static PlsrAPInt pLsrAPIntNewFromFloat32(float32 f) +{ + pLsrAPIntNewFromXBody(mpz_init_set_d, (float64) f); +} + +static PlsrAPInt pLsrAPIntNewFromFloat64(float64 f) +{ + pLsrAPIntNewFromXBody(mpz_init_set_d, f); +} + + 
+static PlsrAPInt pLsrAPIntNewFromAPInt(PlsrAPInt a) +{ + pLsrGmpPrepareForAllocation(1); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_init_set(t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +/* Constants */ + +/* Globals */ + +#define PLSR_AP_INT_BACKPATCH_GLOBALS 1 + +/* This must match the definition of PlsrAPIntS */ +#define PlsrAPIntSGlobal(bytes) \ + struct { \ + PlsrVTable vtable; \ + int alloc; \ + int size; \ + mp_limb_t d[bytes]; \ + } + +#define pLsrAPIntStaticGlobalNew(dv, bytes) \ + static PlsrAPIntSGlobal(bytes) dv = {.vtable = pLsrAPIntVTable, \ + .alloc = (bytes + (sizeof(mp_limb_t) - 1))/sizeof(mp_limb_t), \ + .size = 0, \ + } + +void pLsrAPIntStaticGlobalInitFromAPInt(PlsrAPInt dest, PlsrAPInt src) +{ + int size = pLsrAbs32(pLsrAPInt_mp_size(src)); + assert(pLsrAPInt_mp_alloc(dest) >= size); + pLsrAPInt_mp_size(dest) = pLsrAPInt_mp_size(src); + memcpy(pLsrAPInt_mp_d(dest), pLsrAPInt_mp_d(src), size * sizeof(mp_limb_t)); +} + +/* These are global, and will never be freed, */ +#define pLsrAPIntStaticGlobalInitFromCString(dest, s) (pLsrAPIntStaticGlobalInitFromAPInt(dest, pLsrAPIntNewFromCString(s))) + +#define pLsrAPIntStaticGlobalInitFromSInt32(dest, si) (pLsrAPIntStaticGlobalInitFromAPInt(dest, pLsrAPIntNewFromSignedLong(si))) + +#define pLsrAPIntStaticGlobalInitFromUInt32(dest, ui) (pLsrAPIntStaticGlobalInitFromAPInt(dest, pLsrAPIntNewFromUnsignedLong(ui))) + +pLsrAPIntStaticGlobalNew(pLsrAPIntZero_, 0); +#define pLsrAPIntZero ((PlsrAPInt) &pLsrAPIntZero_) +pLsrAPIntStaticGlobalNew(pLsrAPIntOne_, 1); +#define pLsrAPIntOne ((PlsrAPInt) &pLsrAPIntOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusOne_, 1); +#define pLsrAPIntMinusOne ((PlsrAPInt) &pLsrAPIntMinusOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusUInt32Max_, sizeof(uint32)); +#define pLsrAPIntMinusUInt32Max ((PlsrAPInt) &pLsrAPIntMinusUInt32Max_) + +/* Conversions */ + +/* Unsigned Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromUInt32(uint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromUnsignedLong(i); + pLsrAPIntTrace1("New from uint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUInt64(uint64 i) +{ + PlsrAPInt res; + if (sizeof(uint64) <= sizeof(unsigned long int)) + res = pLsrAPIntNewFromUnsignedLong(i); + else if (i <= UINT32_MAX) { + res = pLsrAPIntFromUInt32(i); + } else { + uint64 ui = i; + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(z); + mpz_mul_2exp(t1, t1, 32); + mpz_add_ui(t1, t1, lower); + pLsrAPIntFromMPZ(res, t1); + } + } + pLsrAPIntTrace1("New from uint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUIntp(uintp i) +{ + PlsrAPInt res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrAPIntFromUInt32(i); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrAPIntFromUInt64(i); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("New from uintp", res); + return res; +} + +static uint32 pLsrUInt32FromAPInt(PlsrAPInt a) +{ + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 res = (uint32) mpz_get_ui(t1); + pLsrAPIntTrace1("To uint32", a); + return res; +} + +static uint64 pLsrUInt64FromAPInt(PlsrAPInt a) +{ + uint64 res = 0; + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(uint64) <= sizeof(unsigned long int) || mpz_fits_ulong_p(t1)) { + res = (uint64) 
mpz_get_ui(t1); + } else { + mpz_t tmp1; + mpz_init_set(tmp1, t1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (uint64) mpz_get_ui(tmp1); + res = res << 32; + mpz_set(tmp1, t1); + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (uint64) mpz_get_ui(tmp1); + } + } + pLsrAPIntTrace1("To uint64", a); + return res; +} + +static uintp pLsrUIntpFromAPInt(PlsrAPInt a) +{ + uintp res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrUInt32FromAPInt(a); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrUInt64FromAPInt(a); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("To uintp", a); + return res; +} + +/* Signed Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromSInt32(sint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromSignedLong(i); + pLsrAPIntTrace1("From sint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSInt64(sint64 i) +{ + PlsrAPInt res; + if (sizeof(sint64) <= sizeof(long int)) + res = pLsrAPIntNewFromSignedLong(i); + else if (i <= ((sint64) SINT32_MAX) && i >= ((sint64) SINT32_MIN)) { + res = pLsrAPIntFromSInt32((sint32) i); + } else { + uint64 ui = pLsrAbs64(i); + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(z); + mpz_mul_2exp(t1, t1, 32); + mpz_add_ui(t1, t1, lower); + if (i < 0) { + mpz_neg(t1, t1); + } + pLsrAPIntFromMPZ(res, t1); + } + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSIntp(sintp i) +{ + PlsrAPInt res; + if (sizeof(sintp) <= sizeof(sint32)) + res = pLsrAPIntFromSInt32(i); + else if (sizeof(sintp) <= sizeof(sint64)) + res = pLsrAPIntFromSInt64(i); + else { + pLsrRuntimeError("SIntp too large"); + res = 0; + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static sint32 pLsrSInt32FromAPInt(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + sint32 res = (sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("To sint32 %d", res); + return res; + } +} + +static sint64 pLsrSInt64FromAPInt(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("To sint64", a); + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(t1)) { + res = (sint64) mpz_get_si(t1); + } else { + int negate = 0; + mpz_t tmp1; + mpz_init_set(tmp1, t1); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + if (negate) { + mpz_neg(tmp1, t1); + } else { + mpz_set(tmp1, t1); + } + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (sint64) mpz_get_ui(tmp1); + if (negate) { + res = -res; + } + } + } + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static sintp pLsrSIntpFromAPInt(PlsrAPInt a) +{ + pLsrAPIntTrace1("To sintp", a); + if (sizeof(sintp) <= sizeof(sint32)) + return pLsrSInt32FromAPInt(a); + else if (sizeof(sintp) <= sizeof(sint64)) + return pLsrSInt64FromAPInt(a); + else { + pLsrRuntimeError("SIntp too large"); + return 0; + } +} + +/* String Conversions */ +static char* pLsrCStringFromAPInt(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + char* res = pLsrAllocC(mpz_sizeinbase (t1, 10) + 2); + mpz_get_str(res, 10, t1); + pLsrAPIntTrace1("To cstring", a); + return res; + } +} + +static PlsrAPInt pLsrAPIntFromCString(char* s) +{ + PlsrAPInt res = 
pLsrAPIntNewFromCString(s); + pLsrAPIntTrace1("From cstring", res); + return res; +} + +#define pLsrAPIntBinary(a, b, operator) \ + do { \ + pLsrGmpPrepareForAllocation(1); \ + pLsrAPIntBeginNoGC { \ + mpz_t t1; \ + mpz_init(t1); \ + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); \ + mpz_t t3 = pLsrAPIntInitMPZFromAPInt(b); \ + operator(t1, t2, t3); \ + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); \ + pLsrAPIntTrace3(#operator, a, b, res); \ + return res; \ + } \ + } while(0) + +/* Casts */ + +static sint8 pLsrAPIntCastToSInt8(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + sint8 res = (sint8)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to sint8 %d", res); + return res; + } +} + +static sint16 pLsrAPIntCastToSInt16(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + sint16 res = (sint16)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to sint16 %d", res); + return res; + } +} + +static sint64 pLsrAPIntCastToSInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to sint64", a); + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(t1)) { + res = (sint64) mpz_get_si(t1); + } else { + int negate = 0; + mpz_t tmp1; + mpz_init_set(tmp1, t1); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + if (negate) { + mpz_neg(tmp1, t1); + } else { + mpz_set(tmp1, t1); + } + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (sint64) mpz_get_ui(tmp1); + if (negate) { + res = -res; + } + } + } + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static uint8 pLsrAPIntCastToUInt8(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint8 res = (uint8)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to uint8 %d", res); + return res; + } +} + +static uint16 pLsrAPIntCastToUInt16(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint16 res = (uint16)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to uint16 %d", res); + return res; + } +} + +static uint32 pLsrAPIntCastToUInt32(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 res = (uint32) mpz_get_ui(t1); + if (mpz_sgn(t1) == -1) { + res = -res; + } + pLsrAPIntTraceFmt1("Cast to uint32 %d", res); + return res; + } +} + +static sint32 pLsrAPIntCastToSInt32(PlsrAPInt a) +{ + return (sint32)pLsrAPIntCastToUInt32(a); +} + +static uint64 pLsrAPIntCastToUInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to uint64", a); + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(t1)) { + res = (sint64) mpz_get_si(t1); + } else { + int negate = 0; + mpz_t tmp1; + mpz_init_set(tmp1, t1); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + if (negate) { + mpz_neg(tmp1, t1); + } else { + mpz_set(tmp1, t1); + } + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (sint64) mpz_get_ui(tmp1); + if (negate) { + res = -res; + } + } + } + pLsrAPIntTraceFmt1("To uint64 %lld\n", res); + return (uint64)res; +} + +static float32 pLsrAPIntCastToFloat32(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float32 res = (float32) 
mpz_get_d(t1); + pLsrAPIntTraceFmt1("Cast to float32 %f\n", res); + return res; + } +} + +static float64 pLsrAPIntCastToFloat64(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float64 res = (float64) mpz_get_d(t1); + pLsrAPIntTraceFmt1("Cast to float64 %lf\n", res); + return res; + } +} + +/* Bitwise */ + +static PlsrAPInt pLsrAPIntBNot(PlsrAPInt a) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_com(t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + pLsrAPIntTrace2("BNot", a, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntBAnd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_and); +} + +static PlsrAPInt pLsrAPIntBOr(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_ior); +} + +static PlsrAPInt pLsrAPIntShiftLUIntL(PlsrAPInt a, unsigned long b) +{ + pLsrGmpPrepareForAllocation(1); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_mul_2exp(t1, t2, b); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +static PlsrAPInt pLsrAPIntShiftRUIntL(PlsrAPInt a, unsigned long b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_fdiv_q_2exp(t1, t2, b); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +static PlsrAPInt pLsrAPIntBShiftL(PlsrAPInt a, PlsrAPInt b) +{ + long int count; + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(b); + if (mpz_fits_slong_p(t1)) { + count = mpz_get_si(t1); + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + count = 0; + } + } + PlsrAPInt res = 0; + if (count < 0) { + res = pLsrAPIntShiftRUIntL(a, -count); + } else { + res = pLsrAPIntShiftLUIntL(a, count); + } + return res; +} + +static PlsrAPInt pLsrAPIntBShiftR(PlsrAPInt a, PlsrAPInt b) +{ + long int count; + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(b); + if (mpz_fits_slong_p(t1)) { + count = mpz_get_si(t1); + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + count = 0; + } + } + PlsrAPInt res = 0; + if (count < 0) { + res = pLsrAPIntShiftLUIntL(a, -count); + } else { + res = pLsrAPIntShiftRUIntL(a, count); + } + return res; +} + +static PlsrAPInt pLsrAPIntBXor(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_xor); +} + + + +/* Arithmetic */ + +static PlsrAPInt pLsrAPIntNegate(PlsrAPInt a) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_neg(t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + pLsrAPIntTrace2("Negate", a, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntAdd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_add); +} + +static PlsrAPInt pLsrAPIntSub(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_sub); +} + +static PlsrAPInt pLsrAPIntMul(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_mul); +} + + +static void pLsrAPIntDivModE(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_qr(tq, tr, t1, t2); + if (mpz_sgn(tr) < 0) { + if (mpz_sgn(t2) > 0) { + mpz_sub_ui(tq, tq,1); + mpz_add(tr, tr, t2); + } else { + mpz_add_ui(tq, tq, 1); + mpz_sub(tr, tr, t2); + } 
+ } + pLsrAPIntFromMPZ(*quotO, tq); + pLsrAPIntFromMPZ(*remO, tr); + pLsrAPIntTrace4("DivModE", a, b, q, r); + return; + } +} + +static void pLsrAPIntDivModF(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_fdiv_qr(tq, tr, t1, t2); + pLsrAPIntFromMPZ(*quotO, tq); + pLsrAPIntFromMPZ(*remO, tr); + pLsrAPIntTrace4("DivModF", a, b, q, r); + return; + } +} + +static void pLsrAPIntDivModT(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_qr(tq, tr, t1, t2); + pLsrAPIntFromMPZ(*quotO, tq); + pLsrAPIntFromMPZ(*remO, tr); + pLsrAPIntTrace4("DivModT", a, b, q, r); + return; + } +} + +static PlsrAPInt pLsrAPIntDivE(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_qr(tq, tr, t1, t2); + if (mpz_sgn(tr) < 0) { + if (mpz_sgn(t2) > 0) { + mpz_sub_ui(tq, tq, 1); + } else { + mpz_add_ui(tq, tq, 1); + } + } + pLsrAPIntTrace4("DivE", a, b, q, r); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tq); + return res; + } +} + +static PlsrAPInt pLsrAPIntDivF(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t tq; + mpz_init(tq); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_fdiv_q(tq, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tq); + pLsrAPIntTrace3("DivF", a, b, res); + return res; + }; +} + +static PlsrAPInt pLsrAPIntDivT(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t tq; + mpz_init(tq); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_q(tq, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tq); + pLsrAPIntTrace3("DivT", a, b, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntModE(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t tr; + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_r(tr, t1, t2); + if (mpz_sgn(tr) < 0) { + if (mpz_sgn(t2) > 0) { + mpz_add(tr, tr, t2); + } else { + mpz_sub(tr, tr, t2); + } + } + PlsrAPInt res; pLsrAPIntFromMPZ(res, tr); + pLsrAPIntTrace3("ModE", a, b, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntModF(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t tr; + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_fdiv_r(tr, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tr); + pLsrAPIntTrace3("ModF", a, b, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntModT(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t tr; + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_r(tr, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tr); + return res; + } +} + +static PlsrAPInt pLsrAPIntGcd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t tres; + 
mpz_init(tres); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_gcd(tres, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tres); + pLsrAPIntTrace3("Gcd", a, b, res); + return res; + } +} + + +/* Comparisons */ +/* calls below assume that this does not incur a gc */ +static sintp pLsrAPIntCompare(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + sintp res = mpz_cmp(t1, t2); + pLsrAPIntTrace2("Compare", a, b); + return res; + } +} + + +static PlsrBoolean pLsrAPIntLess(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) < 0; +} + +static PlsrBoolean pLsrAPIntGreater(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) > 0; +} + +static PlsrBoolean pLsrAPIntEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) == 0; +} + +static PlsrBoolean pLsrAPIntNotEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) != 0; +} + +static PlsrBoolean pLsrAPIntLessOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) <= 0; +} + +static PlsrBoolean pLsrAPIntGreaterOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) >= 0; +} + + +/* Miscellaneous */ + +static uint32 pLsrAPIntFitsInSInt32(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 res = mpz_fits_slong_p(t1); + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (res) { + long int ai = mpz_get_si(t1); + res = ai >=SINT32_MIN && ai <= SINT32_MAX; + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (!res) { /* gc ok */ + res = (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(SINT32_MIN)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(SINT32_MAX))); + } + } + /* If they're equal, we're good. */ + return res; + } +} + +/* Returns SINT32_MIN if (a >= upper) or (a <= lower) + * Otherwise returns a. + * To be useful, (lower > SINT32_MIN) should be true. 
+ */ +static sint32 pLsrAPIntCheckRangeSInt32(PlsrAPInt a, sint32 upper, sint32 lower) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 fitsInLong = mpz_fits_slong_p(t1); + sint32 res = SINT32_MIN; + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (fitsInLong) { + long int ai = mpz_get_si(t1); + if (ai >= ((long int) lower) && ai <= ((long int) upper)) { + res = (sint32) ai; + } + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(t1); + if (ai >= lower && ai <= upper) { + res = ai; + } + } else if (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(lower)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(upper))) { + res = (sint32) mpz_get_si(t1); + } + } else if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(t1); + if (ai >= lower && ai <= upper) { + res = ai; + } + } + return res; + } +} + +#define pLsrAPIntFromFloat32 pLsrAPIntNewFromFloat32 +#define pLsrAPIntFromFloat64 pLsrAPIntNewFromFloat64 + +static float32 pLsrFloat32FromAPInt(PlsrAPInt a) { + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float32 res = (float32) mpz_get_d(t1); + pLsrAPIntTrace1("To float32", a); + return res; + } +} + +static float64 pLsrFloat64FromAPInt(PlsrAPInt a) { + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float64 res = (float64) mpz_get_d(t1); + pLsrAPIntTrace1("To float64", a); + return res; + } +} + +#define hashPair(h1, h2) ((h1)+((h2)<<5)+(h2)+720) + +/* no gc */ +static uintp pLsrAPIntMPZHash(mpz_t z) +{ + uintp h; + switch mpz_sgn(z) { + case -1: h = 987; break; + case 1: h = 0; break; + case 0: h = 0; break; + } + for(size_t l = mpz_size(z); l > 0; l--) { + h = hashPair(h, mpz_getlimbn(z, l-1)); + } + return h; +} + +static uintp pLsrAPIntHash(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uintp h = pLsrAPIntMPZHash(t1); + return h; + } +} + +/* This function must match the previous one */ +static uintp pLsrSInt32Hash(sint32 i) +{ + pLsrAPIntBeginNoGC { + uintp h; + mpz_t tmp; + mpz_init_set_si(tmp, i); + h = pLsrAPIntMPZHash(tmp); + return h; + } +} + +/* Initialization and Registration */ + +static void pLsrAPIntRegisterVTables() +{ + static PgcIsRef pLsrAPIntRefs[pLsrAPIntSize/P_WORD_SIZE] = { 0, }; + + pLsrVTableRegister(pLsrAPIntVTable, pLsrAPIntAlignment, pLsrAPIntSize, pLsrAPIntRefs, sizeof(mp_limb_t), P_WORD_SIZE, 0, + PGC_ALWAYS_MUTABLE, 0); + +} + +#define pLsrAPIntGlobalsCount 4 + +static PlsrObjectB pLsrAPIntGlobals[] = + { + (PlsrObjectB) &pLsrAPIntZero_, + (PlsrObjectB) &pLsrAPIntOne_, + (PlsrObjectB) &pLsrAPIntMinusOne_, + (PlsrObjectB) &pLsrAPIntMinusUInt32Max_, + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrAPIntRegisterGlobals() { + assert(pLsrAPIntGlobals[pLsrAPIntGlobalsCount] == NULL); + pLsrGcRegisterGlobals (pLsrAPIntGlobals, pLsrAPIntGlobalsCount); +}; + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static void* pLsrGMPMalloc(size_t size) +{ + PlsrAPIntS* mem; + pLsrAllocNoGC(PlsrAPIntS*, mem, pLsrAPIntVTable, sizeof(PlsrAPIntS) + size); + mem->alloc = size/sizeof(mp_limb_t); + assert(size % sizeof(mp_limb_t) == 0); + if (size > pLsrGmpAllocationReserve/2) pLsrGmpAllocationReserve = size*2; + return pLsrAPInt_mp_d(mem); +} + +static void pLsrGMPFree(void* object, size_t size) +{ + PlsrAPInt a; + assert((a = pLsrAPIntMPZMemPtrToAPInt(object), 
pLsrAPInt_mp_alloc(a)*sizeof(mp_limb_t) == size)); +} + +static void* pLsrGMPReAlloc(void* object, size_t osize, size_t nsize) +{ + PlsrAPIntS* mem; + void* s; + pLsrAllocNoGC(PlsrAPIntS*, mem, pLsrAPIntVTable, sizeof(PlsrAPIntS) + nsize); + s = pLsrAPInt_mp_d(mem); + memcpy(s, object, osize); + mem->alloc = nsize/sizeof(mp_limb_t); + assert(nsize % sizeof(mp_limb_t) == 0); + if (nsize > pLsrGmpAllocationReserve/2) pLsrGmpAllocationReserve = nsize*2; + return s; +} + +# undef to +#pragma pillar_managed(on) +#endif + + +static void pLsrAPIntInitialize(uintp memLimit) { + mp_set_memory_functions(pLsrGMPMalloc, pLsrGMPReAlloc, pLsrGMPFree); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntZero, 0); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntOne, 1); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntMinusOne, -1); + PlsrAPInt tmp = pLsrAPIntFromUInt32(UINT32_MAX); + tmp = pLsrAPIntNegate(tmp); + pLsrAPIntStaticGlobalInitFromAPInt(pLsrAPIntMinusUInt32Max, tmp); +}; + + +#endif /*_PLSR_AP_INTEGER_H_ */ diff --git a/runtime/include/hrc/plsr-gmp-integer.h b/runtime/include/hrc/plsr-gmp-integer.h new file mode 100644 index 0000000..ae65c8c --- /dev/null +++ b/runtime/include/hrc/plsr-gmp-integer.h @@ -0,0 +1,1178 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_AP_INTEGER_H_ +#define _PLSR_AP_INTEGER_H_ + +#ifdef PLSR_GMP_USE_MALLOC +#define pLsrAPIntIsPinned 0 +#else /* PLSR_GMP_USE_MALLOC */ + +#ifdef PLSR_GMP_USE_PCDECL + +#define pLsrAPIntIsPinned 0 + +#ifndef __pillar2c__ +#error "pcdecl only supported on pillar2c" +#endif /* __pillar2c__ */ + +#else /* PLSR_GMP_USE_PCDECL*/ + +#ifdef PLSR_GMP_USE_PINNING +#define pLsrAPIntIsPinned 1 + +#else /* !PLSR_GMP_USE_PINNING */ + +#ifdef PLSR_GMP_USE_GCMALLOC + +#define pLsrAPIntIsPinned 1 +#define PLSR_GMP_REPLACE_MALLOC + +#else /* !PLSR_GMP_USE_GCMALLOC */ + +#ifdef PLSR_GMP_USE_FORCE_GC +#define pLsrAPIntIsPinned 0 +#define PLSR_GMP_USE_PCDECL +#define PLSR_GMP_FORCE_GC +#define PLSR_GMP_REPLACE_MALLOC +#else /* !PLSR_GMP_USE_FORCE_GC */ +#error "No GMP implementation selected" +#endif /* !PLSR_GMP_USE_FORCE_GC */ + +#endif /* !PLSR_GMP_USE_GCMALLOC */ + +#endif /* PLSR_GMP_USE_PINNING */ + +#endif /* !PLSR_GMP_USE_PCDECL*/ + +#endif /* !PLSR_GMP_USE_MALLOC*/ + + +#ifdef PLSR_GMP_USE_PCDECL + +# pragma pillar_push_cc(__pcdecl) +#include +# pragma pillar_pop_cc + +#else /* PLSR_GMP_USE_PCDECL */ +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif +#endif /* !PLSR_GMP_USE_PCDECL */ +/* Types */ + +#ifdef PLSR_GMP_USE_MALLOC + +typedef struct PlsrAPIntS_ { + PlsrVTable vtable; + mpz_t *z; +} PlsrAPIntS; + +#define pLsrAPIntPadding (sizeof(PlsrAPIntS) - sizeof(PlsrVTable) - sizeof(mpz_t*)) + +#else /* PLSR_GMP_USE_MALLOC */ + +typedef struct PlsrAPIntS_ { + PlsrVTable vtable; + mpz_t z; +} PlsrAPIntS; + +#define pLsrAPIntPadding (sizeof(PlsrAPIntS) - sizeof(PlsrVTable) - sizeof(mpz_t)) + +#endif /* !PLSR_GMP_USE_MALLOC */ + +/* This must be pinned, and we must add finalizers for it */ +pLsrVTableStatic(pLsrAPIntVTable_, VNoneTag, "*ap integer*", pLsrAPIntPadding); +#define pLsrAPIntVTable (&pLsrAPIntVTable_) + + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrAPInt; +#else /* !P_USE_PILLAR */ +typedef PlsrAPIntS* PlsrAPInt; +#endif /* !P_USE_PILLAR */ + +#define pLsrAPIntAlignment 4 +#define pLsrAPIntSize (sizeof(PlsrAPIntS)) + +#ifdef PLSR_GMP_USE_MALLOC +#define pLsrAPIntGetZPtr(i) 
(((PlsrAPIntS*) i)->z) +#define pLsrAPIntGetZ(i) (*(((PlsrAPIntS*) i)->z)) +#else +#define pLsrAPIntGetZ(i) (((PlsrAPIntS*) i)->z) +#endif + +#define pLsrAPIntKeepLive1(v) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive1A; \ + pLsrAPIntKeepLive1A = v; \ + } while(0); + +#define pLsrAPIntKeepLive2(v0, v1) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive2A; \ + pLsrAPIntKeepLive2A = v0; \ + pLsrAPIntKeepLive2A = v1; \ + } while(0); + +#ifdef PLSR_AP_INT_TRACE +static void pLsrAPIntShow1(char * s, PlsrAPInt i) +{ + char* res = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i), 10) + 2); + mpz_get_str(res, 10, pLsrAPIntGetZ(i)); + printf("APInt: %s, %s\n", s, res); + pLsrFreeC(res); +} +static void pLsrAPIntShow2(char * s, PlsrAPInt i1, PlsrAPInt i2) +{ + char* res1 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i1), 10) + 2); + mpz_get_str(res1, 10, pLsrAPIntGetZ(i1)); + char* res2 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i2), 10) + 2); + mpz_get_str(res2, 10, pLsrAPIntGetZ(i2)); + printf("APInt: %s, (%s, %s)\n", s, res1, res2); + pLsrFreeC(res1); + pLsrFreeC(res2); +} +static void pLsrAPIntShow3(char * s, PlsrAPInt i1, PlsrAPInt i2, PlsrAPInt i3) +{ + char* res1 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i1), 10) + 2); + mpz_get_str(res1, 10, pLsrAPIntGetZ(i1)); + char* res2 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i2), 10) + 2); + mpz_get_str(res2, 10, pLsrAPIntGetZ(i2)); + char* res3 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i3), 10) + 2); + mpz_get_str(res3, 10, pLsrAPIntGetZ(i3)); + printf("APInt: %s, (%s, %s, %s)\n", s, res1, res2, res3); + pLsrFreeC(res1); + pLsrFreeC(res2); + pLsrFreeC(res3); +} +static void pLsrAPIntShow4(char * s, PlsrAPInt i1, PlsrAPInt i2, PlsrAPInt i3, PlsrAPInt i4) +{ + char* res1 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i1), 10) + 2); + mpz_get_str(res1, 10, pLsrAPIntGetZ(i1)); + char* res2 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i2), 10) + 2); + mpz_get_str(res2, 10, pLsrAPIntGetZ(i2)); + char* res3 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i3), 10) + 2); + mpz_get_str(res3, 10, pLsrAPIntGetZ(i3)); + char* res4 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i4), 10) + 2); + mpz_get_str(res4, 10, pLsrAPIntGetZ(i4)); + printf("APInt: %s, (%s, %s, %s, %s)\n", s, res1, res2, res3, res4); + pLsrFreeC(res1); + pLsrFreeC(res2); + pLsrFreeC(res3); + pLsrFreeC(res4); +} + +#define pLsrAPIntTrace0(s) printf("APInt: %s\n", s) +#define pLsrAPIntTrace1(s, i1) pLsrAPIntShow1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) pLsrAPIntShow2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) pLsrAPIntShow3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) pLsrAPIntShow4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) printf(s, i1) +#else +#define pLsrAPIntTrace0(s) +#define pLsrAPIntTrace1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) +#endif + +/* Allocation */ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +static void pLsrAPIntFinalize(PlsrAPInt i) +{ + mpz_clear(pLsrAPIntGetZ(i)); + +#ifdef PLSR_GMP_USE_MALLOC + pLsrFreeC(pLsrAPIntGetZPtr(i)); +#endif + + pLsrAPIntKeepLive1(i); +} + +# undef to +#pragma pillar_managed(on) +#endif + +#ifdef PLSR_GMP_FORCE_GC +/* At some point, this should be made thread safe */ +static int pLsrGmpAllocated=0; +static int pLsrGmpMemLimit=100*1000*1000; +#define pLsrAPIntGCCheck() \ + do { \ + if (pLsrGmpAllocated > pLsrGmpMemLimit) { \ + pgc_force_gc(); \ + } \ + } while (0) +#else /* 
PLSR_GMP_FORCE_GC */ + +#define pLsrAPIntGCCheck() + +#endif /* PLSR_GMP_FORCE_GC */ + +/* NB: This does not initialize the mpz_t structure */ +static PlsrAPInt pLsrAPIntNewUninit() +{ + PlsrAPInt res; + pLsrAPIntGCCheck(); +#ifdef PLSR_GMP_USE_PINNING + pLsrAllocPinnedFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#else + pLsrAllocFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#endif +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(res) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + return res; +} + +static PlsrAPInt pLsrAPIntNew() +{ + PlsrAPInt res; + pLsrAPIntGCCheck(); +#ifdef PLSR_GMP_USE_PINNING + pLsrAllocPinnedFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#else + pLsrAllocFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#endif +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(res) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + mpz_init(pLsrAPIntGetZ(res)); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromCString(char *s) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + if (mpz_init_set_str(pLsrAPIntGetZ(res), s, 0) != 0) { + pLsrRuntimeError("Failed to initialize ap integer"); + } + pLsrAPIntTrace1("New from string", res); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromSignedLong(long int si) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_si(pLsrAPIntGetZ(res), si); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromUnsignedLong(unsigned long int ui) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_ui(pLsrAPIntGetZ(res), ui); + return res; +} + + +static PlsrAPInt pLsrAPIntNewFromFloat32(float32 f) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_d(pLsrAPIntGetZ(res), (float64) f); + pLsrAPIntTrace1("New from float32", res); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromFloat64(float64 f) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_d(pLsrAPIntGetZ(res), f); + pLsrAPIntTrace1("New from float64", res); + return res; +} + + +static PlsrAPInt pLsrAPIntNewFromAPInt(PlsrAPInt a) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + return res; +} + +static void pLsrAPIntDestroy(PlsrAPInt i) +{ + mpz_clear(pLsrAPIntGetZ(i)); +#ifdef PLSR_GMP_USE_MALLOC + pLsrFreeC(pLsrAPIntGetZPtr(i)); +#endif + pLsrAPIntKeepLive1(i); +} + +/* Constants */ + +/* Globals */ + +#define PLSR_AP_INT_BACKPATCH_GLOBALS 1 + +#define pLsrAPIntStaticGlobalNew(dv, bytes) \ + static PlsrAPIntS dv = {.vtable = pLsrAPIntVTable, } + +/* These are global, and will never be freed, */ +static void pLsrAPIntStaticGlobalInitFromCString(PlsrAPInt i, char* s) +{ +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(i) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + /* Use leading characters of string to determine base*/ + if (mpz_init_set_str(pLsrAPIntGetZ(i), s, 0) != 0) { + pLsrRuntimeError("Failed to initialize global ap integer"); + } +} + +static void pLsrAPIntStaticGlobalInitFromSInt32(PlsrAPInt z, sint32 si) +{ +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(z) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + mpz_init_set_si(pLsrAPIntGetZ(z), si); +} + +static void pLsrAPIntStaticGlobalInitFromUInt32(PlsrAPInt z, uint32 ui) +{ +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(z) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + mpz_init_set_ui(pLsrAPIntGetZ(z), ui); +} + +pLsrAPIntStaticGlobalNew(pLsrAPIntZero_, 0); +#define pLsrAPIntZero ((PlsrAPInt) &pLsrAPIntZero_) +pLsrAPIntStaticGlobalNew(pLsrAPIntOne_, 0); +#define 
pLsrAPIntOne ((PlsrAPInt) &pLsrAPIntOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusOne_, 0); +#define pLsrAPIntMinusOne ((PlsrAPInt) &pLsrAPIntMinusOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusUInt32Max_, 0); +#define pLsrAPIntMinusUInt32Max ((PlsrAPInt) &pLsrAPIntMinusUInt32Max_) + +/* Conversions */ + +/* Unsigned Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromUInt32(uint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromUnsignedLong(i); + pLsrAPIntTrace1("New from uint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUInt64(uint64 i) +{ + PlsrAPInt res; + if (sizeof(uint64) <= sizeof(unsigned long int)) + res = pLsrAPIntNewFromUnsignedLong(i); + else if (i <= UINT32_MAX) { + res = pLsrAPIntFromUInt32(i); + } else { + uint64 ui = i; + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + mpz_mul_2exp(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), 32); + mpz_add_ui(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), lower); + res = z; + } + pLsrAPIntTrace1("New from uint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUIntp(uintp i) +{ + PlsrAPInt res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrAPIntFromUInt32(i); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrAPIntFromUInt64(i); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("New from uintp", res); + return res; +} + +static uint32 pLsrUInt32FromAPInt(PlsrAPInt a) +{ + uint32 res = (uint32) mpz_get_ui(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To uint32", a); + return res; +} + +static uint64 pLsrUInt64FromAPInt(PlsrAPInt a) +{ + uint64 res = 0; + if (sizeof(uint64) <= sizeof(unsigned long int) || mpz_fits_ulong_p(pLsrAPIntGetZ(a))) { + res = (uint64) mpz_get_ui(pLsrAPIntGetZ(a)); + } else { + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (uint64) mpz_get_ui(tmp1); + res = res << 32; + res += (uint64) mpz_get_ui(tmp2); + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To uint64", a); + return res; +} + +static uintp pLsrUIntpFromAPInt(PlsrAPInt a) +{ + uintp res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrUInt32FromAPInt(a); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrUInt64FromAPInt(a); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("To uintp", a); + return res; +} + +/* Signed Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromSInt32(sint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromSignedLong(i); + pLsrAPIntTrace1("From sint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSInt64(sint64 i) +{ + PlsrAPInt res; + if (sizeof(sint64) <= sizeof(long int)) + res = pLsrAPIntNewFromSignedLong(i); + else if (i <= ((sint64) SINT32_MAX) && i >= ((sint64) SINT32_MIN)) { + res = pLsrAPIntFromSInt32((sint32) i); + } else { + uint64 ui = pLsrAbs64(i); + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + mpz_mul_2exp(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), 32); + mpz_add_ui(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), lower); + if (i < 0) { + mpz_neg(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z)); + } + res = z; + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSIntp(sintp i) +{ + PlsrAPInt res; + if (sizeof(sintp) <= sizeof(sint32)) + res = pLsrAPIntFromSInt32(i); + else if (sizeof(sintp) <= sizeof(sint64)) + res = 
pLsrAPIntFromSInt64(i); + else { + pLsrRuntimeError("SIntp too large"); + res = 0; + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static sint32 pLsrSInt32FromAPInt(PlsrAPInt a) +{ + sint32 res = (sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("To sint32 %d", res); + return res; +} + +static sint64 pLsrSInt64FromAPInt(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("To sint64", a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(pLsrAPIntGetZ(a))) { + res = (sint64) mpz_get_si(pLsrAPIntGetZ(a)); + } else { + int negate = 0; + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + res += (sint64) mpz_get_ui(tmp2); + if (negate) { + res = -res; + } + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static sintp pLsrSIntpFromAPInt(PlsrAPInt a) +{ + pLsrAPIntTrace1("To sintp", a); + if (sizeof(sintp) <= sizeof(sint32)) + return pLsrSInt32FromAPInt(a); + else if (sizeof(sintp) <= sizeof(sint64)) + return pLsrSInt64FromAPInt(a); + else { + pLsrRuntimeError("SIntp too large"); + return 0; + } +} + +/* String Conversions */ +static char* pLsrCStringFromAPInt(PlsrAPInt a) +{ + char* res = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(a), 10) + 2); + mpz_get_str(res, 10, pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To cstring", a); + return res; +} + +static PlsrAPInt pLsrAPIntFromCString(char* s) +{ + PlsrAPInt res = pLsrAPIntNewFromCString(s); + pLsrAPIntTrace1("From cstring", res); + return res; +} + +#define pLsrAPIntBinary(a, b, operator) \ + do { \ + PlsrAPInt res = pLsrAPIntNew(); \ + operator(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); \ + pLsrAPIntKeepLive2(a, b); \ + pLsrAPIntTrace3(#operator, a, b, res); \ + return res; \ + } while(0) + +/* Casts */ + +static sint8 pLsrAPIntCastToSInt8(PlsrAPInt a) +{ + sint8 res = (sint8)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to sint8 %d", res); + return res; +} + +static sint16 pLsrAPIntCastToSInt16(PlsrAPInt a) +{ + sint16 res = (sint16)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to sint16 %d", res); + return res; +} + +static sint64 pLsrAPIntCastToSInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to sint64", a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(pLsrAPIntGetZ(a))) { + res = (sint64) mpz_get_si(pLsrAPIntGetZ(a)); + } else { + int negate = 0; + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + res += (sint64) mpz_get_ui(tmp2); + if (negate) { + res = -res; + } + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static uint8 pLsrAPIntCastToUInt8(PlsrAPInt a) +{ + uint8 res = (uint8)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to uint8 %d", res); + return res; +} + +static uint16 pLsrAPIntCastToUInt16(PlsrAPInt a) +{ + uint16 res = (uint16)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to uint16 %d", res); + return res; +} + 
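For reference, here is a minimal standalone sketch (plain GMP, compiled outside the runtime; the helper names mpz_set_uint64 and uint64_from_mpz are illustrative only, not part of HRC) of the 32-bit split that pLsrAPIntFromUInt64 and pLsrUInt64FromAPInt above fall back to when unsigned long is narrower than 64 bits:

#include <stdint.h>
#include <stdio.h>
#include <gmp.h>

/* Load a non-negative 64-bit value into an mpz_t using only 32-bit chunks. */
static void mpz_set_uint64(mpz_t z, uint64_t v)
{
    mpz_set_ui(z, (unsigned long) (uint32_t) (v >> 32)); /* upper half */
    mpz_mul_2exp(z, z, 32);                              /* shift it into place */
    mpz_add_ui(z, z, (unsigned long) (uint32_t) v);      /* add the lower half */
}

/* Recover the low 64 bits of a non-negative value with a quotient/remainder split at bit 32. */
static uint64_t uint64_from_mpz(const mpz_t z)
{
    mpz_t q, r;
    mpz_init(q);
    mpz_init(r);
    mpz_fdiv_q_2exp(q, z, 32);
    mpz_fdiv_r_2exp(r, z, 32);
    uint64_t res = ((uint64_t) mpz_get_ui(q) << 32) | (uint64_t) mpz_get_ui(r);
    mpz_clear(q);
    mpz_clear(r);
    return res;
}

int main(void)
{
    mpz_t z;
    mpz_init(z);
    mpz_set_uint64(z, 0x123456789abcdef0ULL);
    printf("%llx\n", (unsigned long long) uint64_from_mpz(z)); /* prints 123456789abcdef0 */
    mpz_clear(z);
    return 0;
}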
+static uint32 pLsrAPIntCastToUInt32(PlsrAPInt a) +{ + uint32 res = (uint32) mpz_get_ui(pLsrAPIntGetZ(a)); + if (mpz_sgn(pLsrAPIntGetZ(a)) == -1) { + res = -res; + } + pLsrAPIntTraceFmt1("Cast to uint32 %d", res); + return res; +} + +static sint32 pLsrAPIntCastToSInt32(PlsrAPInt a) +{ + return (sint32)pLsrAPIntCastToUInt32(a); +} + +static uint64 pLsrAPIntCastToUInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to uint64", a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(pLsrAPIntGetZ(a))) { + res = (sint64) mpz_get_si(pLsrAPIntGetZ(a)); + } else { + int negate = 0; + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + res += (sint64) mpz_get_ui(tmp2); + if (negate) { + res = -res; + } + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("To uint64 %lld\n", res); + return (uint64)res; +} + +static float32 pLsrAPIntCastToFloat32(PlsrAPInt a) +{ + float32 res = (float32) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("Cast to float32 %f\n", res); + return res; +} + +static float64 pLsrAPIntCastToFloat64(PlsrAPInt a) +{ + float64 res = (float64) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("Cast to float64 %lf\n", res); + return res; +} + +/* Bitwise */ + +static PlsrAPInt pLsrAPIntBNot(PlsrAPInt a) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_com(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace2("BNot", a, res); + return res; +} + +static PlsrAPInt pLsrAPIntBAnd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_and); +} + +static PlsrAPInt pLsrAPIntBOr(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_ior); +} + +static PlsrAPInt pLsrAPIntShiftLUIntL(PlsrAPInt a, unsigned long b) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_mul_2exp(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), b); + pLsrAPIntKeepLive1(a); + return res; +} + +static PlsrAPInt pLsrAPIntShiftRUIntL(PlsrAPInt a, unsigned long b) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_fdiv_q_2exp(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), b); + pLsrAPIntKeepLive1(a); + return res; +} + +static PlsrAPInt pLsrAPIntBShiftL(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt res = 0; + if (mpz_fits_slong_p(pLsrAPIntGetZ(b))) { + long int count = mpz_get_si(pLsrAPIntGetZ(b)); + if (count < 0) { + res = pLsrAPIntShiftRUIntL(a, -count); + } else { + res = pLsrAPIntShiftLUIntL(a, count); + } + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + } + return res; +} + +static PlsrAPInt pLsrAPIntBShiftR(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt res = 0; + if (mpz_fits_slong_p(pLsrAPIntGetZ(b))) { + long int count = mpz_get_si(pLsrAPIntGetZ(b)); + if (count < 0) { + res = pLsrAPIntShiftLUIntL(a, -count); + } else { + res = pLsrAPIntShiftRUIntL(a, count); + } + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + } + return res; +} + +static PlsrAPInt pLsrAPIntBXor(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_xor); +} + + + +/* Arithmetic */ + +static PlsrAPInt pLsrAPIntNegate(PlsrAPInt a) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_neg(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace2("Negate", a, res); + return res; +} + +static PlsrAPInt pLsrAPIntAdd(PlsrAPInt a, PlsrAPInt b) +{ + 
pLsrAPIntBinary(a, b, mpz_add); +} + +static PlsrAPInt pLsrAPIntSub(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_sub); +} + +static PlsrAPInt pLsrAPIntMul(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_mul); +} + + +static void pLsrAPIntDivModE(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_qr(pLsrAPIntGetZ(q), pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + if (mpz_sgn(pLsrAPIntGetZ(r)) < 0) { + if (mpz_sgn(pLsrAPIntGetZ(b)) > 0) { + mpz_sub_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q),1); + mpz_add(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } else { + mpz_add_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q), 1); + mpz_sub(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } + } + *quotO = q; + *remO = r; + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace4("DivModE", a, b, q, r); + return; +} + +static void pLsrAPIntDivModF(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + PlsrAPInt r = pLsrAPIntNew(); + mpz_fdiv_qr(pLsrAPIntGetZ(q), pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + *quotO = q; + *remO = r; + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace4("DivModF", a, b, q, r); + return; +} + +static void pLsrAPIntDivModT(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_qr(pLsrAPIntGetZ(q), pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + *quotO = q; + *remO = r; + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace4("DivModT", a, b, q, r); + return; +} + +static PlsrAPInt pLsrAPIntDivE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + mpz_t r; + mpz_init(r); + mpz_tdiv_qr(pLsrAPIntGetZ(q), r, pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + if (mpz_sgn(r) < 0) { + if (mpz_sgn(pLsrAPIntGetZ(b)) > 0) { + mpz_sub_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q), 1); + } else { + mpz_add_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q), 1); + } + } + pLsrAPIntTrace4("DivE", a, b, q, r); + mpz_clear(r); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("DivE", a, b, q); + return q; + +} + +static PlsrAPInt pLsrAPIntDivF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + mpz_fdiv_q(pLsrAPIntGetZ(q), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("DivF", a, b, q); + return q; +} + +static PlsrAPInt pLsrAPIntDivT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + mpz_tdiv_q(pLsrAPIntGetZ(q), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("DivT", a, b, q); + return q; +} + +static PlsrAPInt pLsrAPIntModE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_r(pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + if (mpz_sgn(pLsrAPIntGetZ(r)) < 0) { + if (mpz_sgn(pLsrAPIntGetZ(b)) > 0) { + mpz_add(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } else { + mpz_sub(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } + } + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("ModE", a, b, r); + return r; +} + +static PlsrAPInt pLsrAPIntModF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt r = pLsrAPIntNew(); + mpz_fdiv_r(pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("ModF", a, b, r); + return r; +} + +static PlsrAPInt pLsrAPIntModT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_r(pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + return r; 
+} + +static PlsrAPInt pLsrAPIntGcd(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_gcd(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("Gcd", a, b, res); + return res; +} + + +/* Comparisons */ + +static sintp pLsrAPIntCompare(PlsrAPInt a, PlsrAPInt b) +{ + sintp res = mpz_cmp(pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace2("Compare", a, b); + return res; +} + + +static PlsrBoolean pLsrAPIntLess(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) < 0; +} + +static PlsrBoolean pLsrAPIntGreater(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) > 0; +} + +static PlsrBoolean pLsrAPIntEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) == 0; +} + +static PlsrBoolean pLsrAPIntNotEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) != 0; +} + +static PlsrBoolean pLsrAPIntLessOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) <= 0; +} + +static PlsrBoolean pLsrAPIntGreaterOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) >= 0; +} + + +/* Miscellaneous */ + +static uint32 pLsrAPIntFitsInSInt32(PlsrAPInt a) +{ + uint32 res = mpz_fits_slong_p(pLsrAPIntGetZ(a)); + + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (res) { + long int ai = mpz_get_si(pLsrAPIntGetZ(a)); + res = ai >=SINT32_MIN && ai <= SINT32_MAX; + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (!res) { + res = (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(SINT32_MIN)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(SINT32_MAX))); + } + } + /* If they're equal, we're good. */ + pLsrAPIntKeepLive1(a); + return res; +} + +/* Returns SINT32_MIN if (a >= upper) or (a <= lower) + * Otherwise returns a. + * To be useful, (lower > SINT32_MIN) should be true. 
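 * (More precisely: the value of a is returned whenever lower <= a <= upper, bounds included;
 *  SINT32_MIN signals out of range.)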
+ */ +static sint32 pLsrAPIntCheckRangeSInt32(PlsrAPInt a, sint32 upper, sint32 lower) +{ + uint32 fitsInLong = mpz_fits_slong_p(pLsrAPIntGetZ(a)); + sint32 res = SINT32_MIN; + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (fitsInLong) { + long int ai = mpz_get_si(pLsrAPIntGetZ(a)); + if (ai >= ((long int) lower) && ai <= ((long int) upper)) { + res = (sint32) ai; + } + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(pLsrAPIntGetZ(a)); + if (ai >= lower && ai <= upper) { + res = ai; + } + } else if (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(lower)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(upper))) { + res = pLsrSInt32FromAPInt(a); + } + } else if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(pLsrAPIntGetZ(a)); + if (ai >= lower && ai <= upper) { + res = ai; + } + } + pLsrAPIntKeepLive1(a); + return res; +} + + +#define pLsrAPIntFromFloat32 pLsrAPIntNewFromFloat32 +#define pLsrAPIntFromFloat64 pLsrAPIntNewFromFloat64 + +static float32 pLsrFloat32FromAPInt(PlsrAPInt a) { + float32 res = (float32) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To float32", a); + return res; +} + +static float64 pLsrFloat64FromAPInt(PlsrAPInt a) { + float64 res = (float64) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To float64", a); + return res; +} + +#define hashPair(h1, h2) ((h1)+((h2)<<5)+(h2)+720) + +static uintp pLsrAPIntMPZHash(mpz_t z) +{ + uintp h; + switch mpz_sgn(z) { + case -1: h = 987; break; + case 1: h = 0; break; + case 0: h = 0; break; + } + for(size_t l = mpz_size(z); l > 0; l--) { + h = hashPair(h, mpz_getlimbn(z, l-1)); + } + return h; +} +static uintp pLsrAPIntHash(PlsrAPInt i) +{ + uintp h = pLsrAPIntMPZHash(pLsrAPIntGetZ(i)); + pLsrAPIntKeepLive1(i); + return h; +} + +/* This function must match the previous one */ +static uintp pLsrSInt32Hash(sint32 i) +{ + uintp h; + mpz_t tmp; + mpz_init_set_si(tmp, i); + h = pLsrAPIntMPZHash(tmp); + mpz_clear(tmp); + return h; +} + +/* Initialization and Registration */ + +static void pLsrAPIntRegisterVTables() +{ + static PgcIsRef pLsrAPIntRefs[pLsrAPIntSize/P_WORD_SIZE] = { 0, }; + + pLsrVTableRegisterFinalizable(pLsrAPIntVTable, pLsrAPIntAlignment, pLsrAPIntSize, pLsrAPIntRefs, 0, 0, 0, + PGC_ALWAYS_MUTABLE, pLsrAPIntIsPinned, pLsrAPIntFinalize); + +} + +#define pLsrAPIntGlobalsCount 4 + +static PlsrObjectB pLsrAPIntGlobals[] = + { + (PlsrObjectB) &pLsrAPIntZero_, + (PlsrObjectB) &pLsrAPIntOne_, + (PlsrObjectB) &pLsrAPIntMinusOne_, + (PlsrObjectB) &pLsrAPIntMinusUInt32Max_, + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrAPIntRegisterGlobals() { + assert(pLsrAPIntGlobals[pLsrAPIntGlobalsCount] == NULL); + pLsrGcRegisterGlobals (pLsrAPIntGlobals, pLsrAPIntGlobalsCount); +}; + +#ifdef PLSR_GMP_USE_GCMALLOC +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static void* pLsrGMPMalloc(size_t size) +{ + return pLsrGCHeapMalloc((uintp) size); +} +static void pLsrGMPFree(void* object, size_t size) +{ + pLsrGCHeapFree(object); +} +static void* pLsrGMPReAlloc(void* object, size_t osize, size_t nsize) +{ + return pLsrGCHeapReAlloc(object, (uintp) osize, (uintp) nsize); +} + +# undef to +#pragma pillar_managed(on) +#endif +#else +#ifdef PLSR_GMP_FORCE_GC + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static void* pLsrGMPMalloc(size_t size) +{ + pLsrGmpAllocated += size; 
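    /* Accounting only; the block itself comes from the system malloc below. Once
     * pLsrGmpAllocated exceeds pLsrGmpMemLimit, pLsrAPIntGCCheck forces a collection
     * on the next AP integer allocation. */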
+ return malloc((uintp) size); +} +static void pLsrGMPFree(void* object, size_t size) +{ + pLsrGmpAllocated -= size; + free(object); +} +static void* pLsrGMPReAlloc(void* object, size_t osize, size_t nsize) +{ + if (osize > nsize) { + pLsrGmpAllocated -= (osize-nsize); + } else { + pLsrGmpAllocated += (nsize-osize); + } + return realloc(object, (uintp) nsize); +} + +# undef to +#pragma pillar_managed(on) +#endif + +#endif /* PLSR_GMP_FORCE_GC */ +#endif /* PLSR_GMP_USE_GCMALLOC*/ + +static void pLsrAPIntInitialize(uintp memLimit) { +#ifdef PLSR_GMP_FORCE_GC + if (memLimit != 0) pLsrGmpMemLimit = memLimit; +#endif +#ifdef PLSR_GMP_REPLACE_MALLOC + mp_set_memory_functions(pLsrGMPMalloc, pLsrGMPReAlloc, pLsrGMPFree); +#endif + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntZero, 0); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntOne, 1); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntMinusOne, -1); + pLsrAPIntStaticGlobalInitFromUInt32(pLsrAPIntMinusUInt32Max, UINT32_MAX); + mpz_neg(pLsrAPIntGetZ(pLsrAPIntMinusUInt32Max), pLsrAPIntGetZ(pLsrAPIntMinusUInt32Max)); +}; + +#endif /*_PLSR_AP_INTEGER_H_ */ diff --git a/runtime/include/hrc/plsr-integer.h b/runtime/include/hrc/plsr-integer.h new file mode 100644 index 0000000..13232ee --- /dev/null +++ b/runtime/include/hrc/plsr-integer.h @@ -0,0 +1,651 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_INTEGER_H_ +#define _PLSR_INTEGER_H_ + +/********************************************************************** + * Arbitrary precision integers + */ + +#include "hrc/plsr-ap-integer.h" + +/* GC functions */ + +#define pLsrIntegerIsRef 1 +#define pLsrIntegerRegisterVTables pLsrAPIntRegisterVTables +#define pLsrIntegerRegisterGlobals pLsrAPIntRegisterGlobals + +#ifdef P_USE_TAGGED_INTEGERS + +#include "hrc/plsr-tagged-int32.h" + +/* Types */ + +typedef PlsrAPInt PlsrInteger; +typedef PlsrAPIntS PlsrIntegerU; + +/* Basic constructors */ +#define pLsrIntegerIsTagged(a) (pLsrTaggedInt32TaggedIntIsTagged(a)) + +/* Constants */ + +#define pLsrSmallIntegerMax pLsrTaggedInt32Max +#define pLsrSmallIntegerMin pLsrTaggedInt32Min + +#define pLsrIntegerZero ((PlsrInteger) pLsrTaggedInt32Zero) +#define pLsrIntegerOne ((PlsrInteger) pLsrTaggedInt32One) +#define pLsrIntegerMinusOne ((PlsrInteger) pLsrTaggedInt32MinusOne) +#define pLsrIntegerMinusUInt32Max pLsrAPIntMinusUInt32Max + + +#ifdef PLSR_AP_INT_BACKPATCH_GLOBALS + +#define pLsrIntegerStaticUnboxed pLsrAPIntStaticGlobalNew +#define pLsrIntegerStaticGlobalInit pLsrAPIntStaticGlobalInitFromCString +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & (uvar)) + +#else + +#define pLsrIntegerSignNeg PlsrAPNeg +#define pLsrIntegerSignPos PlsrAPPos +#define pLsrIntegerSignZero PlsrAPZero +#define pLsrIntegerDigitListStaticEmpty pLsrAPIntDigitListStaticEmpty +#define pLsrIntegerDigitListStaticConsUnboxedDef pLsrAPIntDigitListStaticConsUnboxedDef +#define pLsrIntegerDigitListStaticConsRef pLsrAPIntDigitListStaticConsRef +#define pLsrIntegerStaticUnboxedDef pLsrAPIntStaticUnboxedDef +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & uvar) + +#endif + + +/* Conversions */ + +#define pLsrSmallIntegerFromSInt32(a) (pLsrTaggedInt32TaggedIntFromSmallInt32(PlsrInteger, a)) + +#define pLsrIntegerFromSInt8 pLsrIntegerFromSInt32 +#define pLsrSInt8FromInteger pLsrSInt32FromInteger + +#define pLsrIntegerFromSInt16 pLsrIntegerFromSInt32 +#define pLsrSInt16FromInteger pLsrSInt32FromInteger + + +#define pLsrIntegerFromSInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrInteger, sint32, 
pLsrAPIntFromSInt32, dest, a) +#define pLsrSInt32FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint32, PlsrInteger, PlsrAPInt, pLsrSInt32FromAPInt, dest, a) + +#define pLsrIntegerFromSIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrInteger, sintp, pLsrAPIntFromSIntp, dest, a) +#define pLsrSIntpFromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sintp, PlsrInteger, PlsrAPInt, pLsrSIntpFromAPInt, dest, a) + +#define pLsrIntegerFromSInt64(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrInteger, sint64, pLsrAPIntFromSInt64, dest, a) +#define pLsrSInt64FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint64, PlsrInteger, PlsrAPInt, pLsrSInt64FromAPInt, dest, a) + +#define pLsrIntegerFromUInt8 pLsrIntegerFromUInt32 +#define pLsrUInt8FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt16 pLsrIntegerFromUInt32 +#define pLsrUInt16FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrInteger, uint32, pLsrAPIntFromUInt32, dest, a) +#define pLsrUInt32FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uint32, PlsrInteger, PlsrAPInt, pLsrUInt32FromAPInt, dest, a) + +#define pLsrIntegerFromUIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrInteger, uintp, pLsrAPIntFromUIntp, dest, a) +#define pLsrUIntpFromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uintp, PlsrInteger, PlsrAPInt, pLsrUIntpFromAPInt, dest, a) + +#define pLsrIntegerFromUInt64(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrInteger, uint64, pLsrAPIntFromUInt64, dest, a) +#define pLsrUInt64FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uint64, PlsrInteger, PlsrAPInt, pLsrUInt64FromAPInt, dest, a) + +#define pLsrIntegerFromFloat32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromFloat32(PlsrInteger, pLsrAPIntFromFloat32, dest, a) +#define pLsrFloat32FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(float32, PlsrInteger, PlsrAPInt, pLsrFloat32FromAPInt, dest, a) + +#define pLsrIntegerFromFloat64(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromFloat64(PlsrInteger, pLsrAPIntFromFloat64, dest, a) +#define pLsrFloat64FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(float64, PlsrInteger, PlsrAPInt, pLsrFloat64FromAPInt, dest, a) + + +/* pLsrCStringFromInteger */ +pLsrTaggedInt32MkCStringFrom(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrCStringFromAPInt); + +/* pLsrIntegerFromCString */ +pLsrTaggedInt32MkFromCString(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, \ + pLsrSInt32FromAPInt, pLsrAPIntLessOrEqual, pLsrAPIntFromCString); + +/* Casts */ + +/* NG: Note that we are assuming that casting from sint32 to sintX and uintX takes the bottom X bits + * This is not portable. 
+ */ + +#define pLsrIntegerCastToSInt8(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint8, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt8, dest, a) + +#define pLsrIntegerCastToSInt16(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint16, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt16, dest, a) + +#define pLsrIntegerCastToSInt32(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint32, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt32, dest, a) + +#define pLsrIntegerCastToSInt64(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint64, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt64, dest, a) + +#define pLsrIntegerCastToUInt8(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint8, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt8, dest, a) + +#define pLsrIntegerCastToUInt16(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint16, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt16, dest, a) + +#define pLsrIntegerCastToUInt32(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint32, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt32, dest, a) + +#define pLsrIntegerCastToUInt64(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint64, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt64, dest, a) + +#define pLsrIntegerCastToFloat32(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(float32, PlsrInteger, PlsrAPInt, pLsrAPIntCastToFloat32, dest, a) + +#define pLsrIntegerCastToFloat64(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(float64, PlsrInteger, PlsrAPInt, pLsrAPIntCastToFloat64, dest, a) + +/* Arithmetic */ + +#ifdef PLSR_TAGGED_INTEGER_RECOVER + +static PlsrInteger pLsrIntegerAPIntBNot(PlsrAPInt a) +{ + PlsrAPInt c = pLsrAPIntBNot(a); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBAnd(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBAnd(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBOr(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBOr(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBShiftL(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBShiftL(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBShiftR(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBShiftR(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBXor(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBXor(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntAdd(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntAdd(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, 
pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntSub(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntSub(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +/* Multiplication can only make things worse */ +#define pLsrIntegerAPIntMul pLsrAPIntMul + +static void pLsrIntegerAPIntDivModE(PlsrInteger* q, PlsrInteger* r, PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntDivModE(q, r, a, b); + sint32 qi = pLsrAPIntCheckRangeSInt32 (*q, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + sint32 ri = pLsrAPIntCheckRangeSInt32 (*r, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (qi > SINT32_MIN) { + *q = pLsrSmallIntegerFromSInt32(qi); + } + if (ri > SINT32_MIN) { + *r = pLsrSmallIntegerFromSInt32(ri); + } +} + +static void pLsrIntegerAPIntDivModF(PlsrInteger* q, PlsrInteger* r, PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntDivModF(q, r, a, b); + sint32 qi = pLsrAPIntCheckRangeSInt32 (*q, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + sint32 ri = pLsrAPIntCheckRangeSInt32 (*r, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (qi > SINT32_MIN) { + *q = pLsrSmallIntegerFromSInt32(qi); + } + if (ri > SINT32_MIN) { + *r = pLsrSmallIntegerFromSInt32(ri); + } +} + +static void pLsrIntegerAPIntDivModT(PlsrInteger* q, PlsrInteger* r, PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntDivModT(q, r, a, b); + sint32 qi = pLsrAPIntCheckRangeSInt32 (*q, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + sint32 ri = pLsrAPIntCheckRangeSInt32 (*r, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (qi > SINT32_MIN) { + *q = pLsrSmallIntegerFromSInt32(qi); + } + if (ri > SINT32_MIN) { + *r = pLsrSmallIntegerFromSInt32(ri); + } +} + +static PlsrInteger pLsrIntegerAPIntDivE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntDivE(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntDivF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntDivF(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntDivT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntDivT(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntModE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntModE(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntModF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntModF(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntModT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntModT(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + 
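        /* Result is outside the tagged range (signalled by SINT32_MIN), so keep the boxed AP int. */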
return c; + } +} + +#else +#define pLsrIntegerAPIntBNot pLsrAPIntBNot +#define pLsrIntegerAPIntBAnd pLsrAPIntBAnd +#define pLsrIntegerAPIntBOr pLsrAPIntBOr +#define pLsrIntegerAPIntBShiftL pLsrAPIntBShiftL +#define pLsrIntegerAPIntBShiftR pLsrAPIntBShiftR +#define pLsrIntegerAPIntBXor pLsrAPIntBXor +#define pLsrIntegerAPIntAdd pLsrAPIntAdd +#define pLsrIntegerAPIntSub pLsrAPIntSub +#define pLsrIntegerAPIntMul pLsrAPIntMul +#define pLsrIntegerAPIntDivModE pLsrAPIntDivModE +#define pLsrIntegerAPIntDivModF pLsrAPIntDivModF +#define pLsrIntegerAPIntDivModT pLsrAPIntDivModT +#define pLsrIntegerAPIntDivE pLsrAPIntDivE +#define pLsrIntegerAPIntDivF pLsrAPIntDivF +#define pLsrIntegerAPIntDivT pLsrAPIntDivT +#define pLsrIntegerAPIntModE pLsrAPIntModE +#define pLsrIntegerAPIntModF pLsrAPIntModF +#define pLsrIntegerAPIntModT pLsrAPIntModT +#endif + + +/**************************** Bitwise *********************************/ + +/* pLsrTaggedInt32IntegerBNotSlow*/ +#define pLsrIntegerBNot(dest, a) \ + pLsrTaggedInt32TaggedIntBNot(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBNotSlow, dest, a) + +/* pLsrTaggedInt32IntegerBAndSlow*/ +pLsrTaggedInt32MkTaggedIntBAndSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBAnd); +#define pLsrIntegerBAnd(dest, a, b) \ + pLsrTaggedInt32TaggedIntBAnd(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBAndSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBOrSlow*/ +pLsrTaggedInt32MkTaggedIntBOrSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBOr); +#define pLsrIntegerBOr(dest, a, b) \ + pLsrTaggedInt32TaggedIntBOr(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBOrSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBShiftLSlow*/ +pLsrTaggedInt32MkTaggedIntBShiftLSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBShiftL); +#define pLsrIntegerBShiftL(dest, a, b) \ + pLsrTaggedInt32TaggedIntBShiftL(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBShiftLSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBShiftRSlow*/ +pLsrTaggedInt32MkTaggedIntBShiftRSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBShiftR); +#define pLsrIntegerBShiftR(dest, a, b) \ + pLsrTaggedInt32TaggedIntBShiftR(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBShiftRSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBXorSlow*/ +pLsrTaggedInt32MkTaggedIntBXorSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBXor); +#define pLsrIntegerBXor(dest, a, b) \ + pLsrTaggedInt32TaggedIntBXor(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBXorSlow, dest, a, b) + + +/**************************** Arithmetic *********************************/ + +#define pLsrIntegerNegate(dest, r) \ + pLsrTaggedInt32TaggedIntNeg(PlsrInteger, PlsrAPInt, pLsrAPIntNegate, pLsrAPIntFromSInt32, dest, r) + +/* pLsrTaggedInt32IntegerAddSlow*/ +pLsrTaggedInt32MkTaggedIntAddSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntAdd); +#define pLsrIntegerPlus(dest, a, b) \ + pLsrTaggedInt32TaggedIntAdd(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerAddSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerSubSlow*/ +pLsrTaggedInt32MkTaggedIntSubSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntSub); +#define pLsrIntegerMinus(dest, a, b) \ + pLsrTaggedInt32TaggedIntSub(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerSubSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerMulSlow*/ +pLsrTaggedInt32MkTaggedIntMulSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntFromSInt64, + pLsrIntegerAPIntMul); +#define 
pLsrIntegerTimes(dest, a, b) \ + pLsrTaggedInt32TaggedIntMul(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerMulSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerDivModESlow */ +pLsrTaggedInt32MkTaggedIntDivModESlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntDivModE); +#define pLsrIntegerDivModE(q, r, a, b) \ + pLsrTaggedInt32TaggedIntDivModE(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModESlow, q, r, a, b) +#define pLsrIntegerDivE(dest, a, b) \ + pLsrTaggedInt32TaggedIntDivE(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModESlow, dest, a, b) +#define pLsrIntegerModE(dest, a, b) \ + pLsrTaggedInt32TaggedIntModE(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModESlow, dest, a, b) + +/* pLsrTaggedInt32IntegerDivModFSlow */ +pLsrTaggedInt32MkTaggedIntDivModFSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntDivModF); +#define pLsrIntegerDivModF(q, r, a, b) \ + pLsrTaggedInt32TaggedIntDivModF(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModFSlow, q, r, a, b) +#define pLsrIntegerDivF(dest, a, b) \ + pLsrTaggedInt32TaggedIntDivF(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModFSlow, dest, a, b) +#define pLsrIntegerModF(dest, a, b) \ + pLsrTaggedInt32TaggedIntModF(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModFSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerDivModTSlow */ +pLsrTaggedInt32MkTaggedIntDivModTSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntDivModT); +#define pLsrIntegerDivModT(q, r, a, b) \ + pLsrTaggedInt32TaggedIntDivModT(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModTSlow, q, r, a, b) +#define pLsrIntegerDivT(dest, a, b) \ + pLsrTaggedInt32TaggedIntDivT(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModTSlow, dest, a, b) +#define pLsrIntegerModT(dest, a, b) \ + pLsrTaggedInt32TaggedIntModT(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModTSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerGcd */ +pLsrTaggedInt32MkTaggedIntGcd(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, \ + pLsrAPIntLessOrEqual, pLsrSInt32FromAPInt, pLsrAPIntGcd); +#define pLsrIntegerGcd pLsrTaggedInt32IntegerGcd + +/* Comparisons */ +pLsrTaggedInt32MkTaggedIntEqSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntEqual); +pLsrTaggedInt32MkTaggedIntNeSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntNotEqual); +pLsrTaggedInt32MkTaggedIntLtSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntLess); +pLsrTaggedInt32MkTaggedIntGtSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntGreater); +pLsrTaggedInt32MkTaggedIntLeSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntLessOrEqual); +pLsrTaggedInt32MkTaggedIntGeSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntGreaterOrEqual); + +#define pLsrIntegerEQ(dest, a, b) pLsrTaggedInt32TaggedIntEq(pLsrTaggedInt32IntegerEqSlow, dest, a, b) +#define pLsrIntegerNE(dest, a, b) pLsrTaggedInt32TaggedIntNe(pLsrTaggedInt32IntegerNeSlow, dest, a, b) +#define pLsrIntegerLT(dest, a, b) pLsrTaggedInt32TaggedIntLt(pLsrTaggedInt32IntegerLtSlow, dest, a, b) +#define pLsrIntegerGT(dest, a, b) pLsrTaggedInt32TaggedIntGt(pLsrTaggedInt32IntegerGtSlow, dest, a, b) +#define pLsrIntegerLE(dest, a, b) pLsrTaggedInt32TaggedIntLe(pLsrTaggedInt32IntegerLeSlow, dest, a, b) +#define pLsrIntegerGE(dest, a, b) 
pLsrTaggedInt32TaggedIntGe(pLsrTaggedInt32IntegerGeSlow, dest, a, b) + +/* Miscellaneous */ +#define pLsrIntegerFitsInSInt32(a) ((pLsrIntegerIsTagged(a)) || (pLsrAPIntFitsInSInt32(a))) + +static sint32 pLsrIntegerCheckRangeSInt32(PlsrInteger a, sint32 upper, sint32 lower) +{ + if (pLsrIntegerIsTagged(a)) { + sint32 ai; + pLsrSInt32FromInteger(ai, a); + return ((ai <= upper) && (ai >= lower)) ? ai : SINT32_MIN; + } else { + return pLsrAPIntCheckRangeSInt32(a, upper, lower); + } +} + +#define pLsrIntegerHash(i) pLsrTaggedInt32TaggedIntHash(PlsrInteger, PlsrAPInt, pLsrAPIntHash, i) + +#else /* !P_USE_TAGGED_INTEGERS */ + +#define pLsrIntegerIsTagged(a) (0) + +/* Types */ + +typedef PlsrAPInt PlsrInteger; +typedef PlsrAPIntS PlsrIntegerU; + +/* Constants */ + +#define pLsrSmallIntegerMax pLsrTaggedInt32Max +#define pLsrSmallIntegerMin pLsrTaggedInt32Min + +#define pLsrIntegerZero pLsrAPIntZero +#define pLsrIntegerOne pLsrAPIntOne +#define pLsrIntegerMinusOne pLsrAPIntMinusOne +#define pLsrIntegerMinusUInt32Max pLsrAPIntMinusUInt32Max + +#define pLsrIntegerSignNeg PlsrAPNeg +#define pLsrIntegerSignPos PlsrAPPos +#define pLsrIntegerSignZero PlsrAPZero + +#ifdef PLSR_AP_INT_BACKPATCH_GLOBALS + +#define pLsrIntegerStaticUnboxed pLsrAPIntStaticGlobalNew +#define pLsrIntegerStaticGlobalInit pLsrAPIntStaticGlobalInitFromCString +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & (uvar)) + +#else + +#define pLsrIntegerSignNeg PlsrAPNeg +#define pLsrIntegerSignPos PlsrAPPos +#define pLsrIntegerSignZero PlsrAPZero +#define pLsrIntegerDigitListStaticEmpty pLsrAPIntDigitListStaticEmpty +#define pLsrIntegerDigitListStaticConsUnboxedDef pLsrAPIntDigitListStaticConsUnboxedDef +#define pLsrIntegerDigitListStaticConsRef pLsrAPIntDigitListStaticConsRef +#define pLsrIntegerStaticUnboxedDef pLsrAPIntStaticUnboxedDef +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & uvar) + +#endif + + +/* Conversions */ + +#define pLsrSmallIntegerFromSInt32 pLsrAPIntFromSInt32 + +#define pLsrIntegerFromSInt8 pLsrIntegerFromSInt32 +#define pLsrSInt8FromInteger pLsrSInt32FromInteger + +#define pLsrIntegerFromSInt16 pLsrIntegerFromSInt32 +#define pLsrSInt16FromInteger pLsrSInt32FromInteger + +#define pLsrIntegerFromSInt32(dest, i) ((dest) = (pLsrAPIntFromSInt32(i))) +#define pLsrSInt32FromInteger(dest, i) ((dest) = (pLsrSInt32FromAPInt(i))) + +#define pLsrIntegerFromSInt64(dest, i) ((dest) = (pLsrAPIntFromSInt64(i))) +#define pLsrSInt64FromInteger(dest, i) ((dest) = (pLsrSInt64FromAPInt(i))) + +#define pLsrIntegerFromSIntp(dest, i) ((dest) = (pLsrAPIntFromSIntp(i))) +#define pLsrSIntpFromInteger(dest, i) ((dest) = (pLsrSIntpFromAPInt(i))) + + +#define pLsrIntegerFromUInt8 pLsrIntegerFromUInt32 +#define pLsrUInt8FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt16 pLsrIntegerFromUInt32 +#define pLsrUInt16FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt32(dest, i) ((dest) = (pLsrAPIntFromUInt32(i))) +#define pLsrUInt32FromInteger(dest, i) ((dest) = (pLsrUInt32FromAPInt(i))) + +#define pLsrIntegerFromUInt64(dest, i) ((dest) = (pLsrAPIntFromUInt64(i))) +#define pLsrUInt64FromInteger(dest, i) ((dest) = (pLsrUInt64FromAPInt(i))) + +#define pLsrIntegerFromUIntp(dest, i) ((dest) = (pLsrAPIntFromUIntp(i))) +#define pLsrUIntpFromInteger(dest, i) ((dest) = (pLsrUIntpFromAPInt(i))) + +#define pLsrCStringFromInteger pLsrCStringFromAPInt +#define pLsrIntegerFromCString pLsrAPIntFromCString + +/* Bitwise */ + +#define pLsrIntegerBNot(dest, i) ((dest) = (pLsrAPIntBNot(i))) +#define pLsrIntegerBAnd(dest, a, b) ((dest) = 
(pLsrAPIntBAnd(a, b))) +#define pLsrIntegerBOr(dest, a, b) ((dest) = (pLsrAPIntBOr(a, b))) +#define pLsrIntegerBShiftL(dest, a, b) ((dest) = (pLsrAPIntBShiftL(a, b))) +#define pLsrIntegerBShiftR(dest, a, b) ((dest) = (pLsrAPIntBShiftR(a, b))) +#define pLsrIntegerBXor(dest, a, b) ((dest) = (pLsrAPIntBXor(a, b))) + +/* Arithmetic */ + +#define pLsrIntegerNegate(dest, i) ((dest) = (pLsrAPIntNegate(i))) +#define pLsrIntegerPlus(dest, a, b) ((dest) = (pLsrAPIntAdd(a, b))) +#define pLsrIntegerMinus(dest, a, b) ((dest) = (pLsrAPIntSub(a, b))) +#define pLsrIntegerTimes(dest, a, b) ((dest) = (pLsrAPIntMul(a, b))) + +#define pLsrIntegerDivModE(q, r, a, b) \ + do { \ + PlsrAPInt pLsrIntegerDivModE_q = NULL; \ + PlsrAPInt pLsrIntegerDivModE_r = NULL; \ + pLsrAPIntDivModE(&pLsrIntegerDivModE_q, &pLsrIntegerDivModE_r, a, b); \ + q = pLsrIntegerDivModE_q; \ + r = pLsrIntegerDivModE_r; \ + } while (0) +#define pLsrIntegerDivModF(q, r, a, b) \ + do { \ + PlsrAPInt pLsrIntegerDivModF_q = NULL; \ + PlsrAPInt pLsrIntegerDivModF_r = NULL; \ + pLsrAPIntDivModF(&pLsrIntegerDivModF_q, &pLsrIntegerDivModF_r, a, b); \ + q = pLsrIntegerDivModF_q; \ + r = pLsrIntegerDivModF_r; \ + } while (0) +#define pLsrIntegerDivModT(q, r, a, b) \ + do { \ + PlsrAPInt pLsrIntegerDivModT_q = NULL; \ + PlsrAPInt pLsrIntegerDivModT_r = NULL; \ + pLsrAPIntDivModT(&pLsrIntegerDivModT_q, &pLsrIntegerDivModT_r, a, b); \ + q = pLsrIntegerDivModT_q; \ + r = pLsrIntegerDivModT_r; \ + } while (0) + + +#define pLsrIntegerDivE(dest, a, b) ((dest) = (pLsrAPIntDivE(a, b))) +#define pLsrIntegerModE(dest, a, b) ((dest) = (pLsrAPIntModE(a, b))) + +#define pLsrIntegerDivF(dest, a, b) ((dest) = (pLsrAPIntDivF(a, b))) +#define pLsrIntegerModF(dest, a, b) ((dest) = (pLsrAPIntModF(a, b))) + +#define pLsrIntegerDivT(dest, a, b) ((dest) = (pLsrAPIntDivT(a, b))) +#define pLsrIntegerModT(dest, a, b) ((dest) = (pLsrAPIntModT(a, b))) + +#define pLsrIntegerGcd pLsrAPIntGcd + +/* Comparisons */ + +#define pLsrIntegerEQ(dest, a, b) ((dest) = (pLsrAPIntEqual(a, b))) +#define pLsrIntegerNE(dest, a, b) ((dest) = (pLsrAPIntNotEqual(a, b))) +#define pLsrIntegerLT(dest, a, b) ((dest) = (pLsrAPIntLess(a, b))) +#define pLsrIntegerGT(dest, a, b) ((dest) = (pLsrAPIntGreater(a, b))) +#define pLsrIntegerLE(dest, a, b) ((dest) = (pLsrAPIntLessOrEqual(a, b))) +#define pLsrIntegerGE(dest, a, b) ((dest) = (pLsrAPIntGreaterOrEqual(a, b))) + +/* Miscellaneous */ +#define pLsrIntegerFitsInSInt32 pLsrAPIntFitsInSInt32 +#define pLsrIntegerCheckRangeSInt32 pLsrAPIntCheckRangeSInt32 + +#define pLsrIntegerHash(i) (pLsrAPIntHash(i)) + +#endif /* P_USE_TAGGED_INTEGERS */ +#endif /*_PLSR_INTEGER_H_ */ diff --git a/runtime/include/hrc/plsr-lightweight-thunk.h b/runtime/include/hrc/plsr-lightweight-thunk.h new file mode 100644 index 0000000..d106ac8 --- /dev/null +++ b/runtime/include/hrc/plsr-lightweight-thunk.h @@ -0,0 +1,568 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Unified Futures Thunk Implementation */ + +#ifndef _PLSR_THUNK_H_ +#define _PLSR_THUNK_H_ + +#ifndef PLSR_THUNK_NO_SUBSUMPTION +#define PLSR_THUNK_SUBSUMPTION +#endif + +#ifdef P_USE_PARALLEL_FUTURES +#error "Parallel futures not supported with lightweight thunks" +#endif + +#ifdef PLSR_THUNK_INTERCEPT_CUTS +#error "Cut interception not supported on lightweight thunks" + #ifndef P_USE_PILLAR + #error "Cut interception only supported on Pillar" + #endif +#endif + +/*** Types ***/ + +#define PlsrTypesMk(name, retType) \ + typedef retType PlsrThunkReturnType##name; \ + typedef
PlsrThunkReturnType##name (*PlsrThunkCode##name)(PlsrThunkB##name); \ + /* PlsrObjectU must be a prefix of this structure */ \ + typedef struct PlsrThunkSVS##name { \ + PlsrVTable vtable; \ + retType result; \ + } PlsrThunkSV##name; \ + typedef struct PlsrThunkSDS##name { \ + PlsrVTable vtable; \ + char fvs[]; \ + } PlsrThunkSD##name; \ + typedef struct PlsrThunkSES##name { \ + PlsrVTable vtable; \ + PlsrRef exn; \ + } PlsrThunkSE##name; \ + typedef union PlsrThunkUU##name { \ + PlsrThunkSD##name delay; \ + PlsrThunkSV##name value; \ + PlsrThunkSE##name exn; \ + } PlsrThunkU##name + +/* + * PlsrThunkCodeRef, pLsrThunkSRef, pLsrThunkURef + * PlsrThunkCode32, pLsrThunkS32, pLsrThunkU32 + * PlsrThunkCode64, pLsrThunkS64, pLsrThunkU64 + * PlsrThunkCodeFloat, pLsrThunkSFloat, pLsrThunkUFloat + * PlsrThunkCodeDouble, pLsrThunkSDouble, pLsrThunkUDouble + */ +PlsrTypesMk(Ref, PlsrRef); +PlsrTypesMk(32, uint32); +PlsrTypesMk(64, uint64); +PlsrTypesMk(Float, float32); +PlsrTypesMk(Double, float64); + +#define pLsrThunkRef(t) ((PlsrThunkURef*)(t)) +#define pLsrThunk32(t) ((PlsrThunkU32*)(t)) +#define pLsrThunk64(t) ((PlsrThunkU64*)(t)) +#define pLsrThunkFloat(t) ((PlsrThunkUFloat*)(t)) +#define pLsrThunkDouble(t) ((PlsrThunkUDouble*)(t)) + +#define pLsrThunkResultFieldRef(thunk) (pLsrThunkRef(thunk)->value.result) +#define pLsrThunkResultField32(thunk) (pLsrThunk32(thunk)->value.result) +#define pLsrThunkResultField64(thunk) (pLsrThunk64(thunk)->value.result) +#define pLsrThunkResultFieldFloat(thunk) (pLsrThunkFloat(thunk)->value.result) +#define pLsrThunkResultFieldDouble(thunk) (pLsrThunkDouble(thunk)->value.result) + +#define pLsrThunkExnFieldRef(thunk) (pLsrThunkRef(thunk)->exn.exn) +#define pLsrThunkExnField32(thunk) (pLsrThunk32(thunk)->exn.exn) +#define pLsrThunkExnField64(thunk) (pLsrThunk64(thunk)->exn.exn) +#define pLsrThunkExnFieldFloat(thunk) (pLsrThunkFloat(thunk)->exn.exn) +#define pLsrThunkExnFieldDouble(thunk) (pLsrThunkDouble(thunk)->exn.exn) + +#define pLsrThunkVTableFieldRef(thunk) (pLsrThunkRef(thunk)->value.vtable) +#define pLsrThunkVTableField32(thunk) (pLsrThunk32(thunk)->value.vtable) +#define pLsrThunkVTableField64(thunk) (pLsrThunk64(thunk)->value.vtable) +#define pLsrThunkVTableFieldFloat(thunk) (pLsrThunkFloat(thunk)->value.vtable) +#define pLsrThunkVTableFieldDouble(thunk) (pLsrThunkDouble(thunk)->value.vtable) + +#define pLsrThunkGetCodeRef(thunk) ((PlsrThunkCodeRef)pLsrVTableGetCustom(pLsrThunkRef(thunk)->delay.vtable)) +#define pLsrThunkGetCode32(thunk) ((PlsrThunkCode32)pLsrVTableGetCustom(pLsrThunk32(thunk)->delay.vtable)) +#define pLsrThunkGetCode64(thunk) ((PlsrThunkCode64)pLsrVTableGetCustom(pLsrThunk64(thunk)->delay.vtable)) +#define pLsrThunkGetCodeFloat(thunk) ((PlsrThunkCodeFloat)pLsrVTableGetCustom(pLsrThunkFloat(thunk)->delay.vtable)) +#define pLsrThunkGetCodeDouble(thunk) ((PlsrThunkCodeDouble)pLsrVTableGetCustom(pLsrThunkDouble(thunk)->delay.vtable)) + +#ifdef PLSR_THUNK_SUBSUMPTION +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) thunk) +#else /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) 0) +#endif /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObject32(thunk) (assert(0), (uint32) 0) +#define pLsrThunkCastToObject64(thunk) (assert(0), (uint64) 0) +#define pLsrThunkCastToObjectFloat(thunk) (assert(0), (float32) 0) +#define pLsrThunkCastToObjectDouble(thunk) (assert(0), (float64) 0) + +/* Global Thunks */ + +#define pLsrThunkStaticValueMk(name, v, vt, retType, val) \ + static PlsrThunkU##name v = \ + { .value = 
{.vtable = (vt), \ + .result = (retType) (val) } \ + } + +#ifdef PLSR_THUNK_SUBSUMPTION +/* For now leave globals boxed, since there are some mil-to-pil complications +* with doing otherwise -leaf */ + #define pLsrThunkSubsumptiveStaticValueMk pLsrThunkStaticValueMk +#else /* ! PLSR_THUNK_SUBSUMPTION */ + #define pLsrThunkSubsumptiveStaticValueMk pLsrThunkStaticValueMk +#endif /* ! PLSR_THUNK_SUBSUMPTION */ + +/* These are marked ALWAYS_IMMUTABLE, so must be initialized atomically wrt the gc */ +#define pLsrThunkStaticValueRef(v, val) \ + pLsrThunkSubsumptiveStaticValueMk(Ref, v, pLsrThunkValVTableRef, PlsrRef, val) +#define pLsrThunkStaticValue32(v, val) pLsrThunkStaticValueMk(32, v, pLsrThunkValVTable32, uint32, val) +#define pLsrThunkStaticValue64(v, val) pLsrThunkStaticValueMk(64, v, pLsrThunkValVTable64, uint64, val) +#define pLsrThunkStaticValueFloat(v, val) pLsrThunkStaticValueMk(Float, v, pLsrThunkValVTableFloat, float32, val) +#define pLsrThunkStaticValueDouble(v, val) pLsrThunkStaticValueMk(Double, v, pLsrThunkValVTableDouble, float64, val) + +/* Creation */ +#define pLsrThunkUninitCodeMk(name) \ + static PlsrThunkReturnType##name pLsrThunkUninitCode##name(PlsrThunkB##name thunk) \ + { \ + pLsrRuntimeError_("Tried to evaluate uninitialized thunk"); \ + return (PlsrThunkReturnType##name) 0; \ + } +/* pLsrThunkUninitCodeRef, pLsrThunkUninitCode32, pLsrThunkUninitCode64 + * pLsrThunkUninitCodeFloat, pLsrThunkUninitCodeDouble + */ +pLsrThunkUninitCodeMk(Ref); +pLsrThunkUninitCodeMk(32); +pLsrThunkUninitCodeMk(64); +pLsrThunkUninitCodeMk(Float); +pLsrThunkUninitCodeMk(Double); + +/* This cannot engender a yield */ +#define pLsrThunkNewMk(name, dv, vt, sz, algn) \ + do { \ + pLsrAllocAligned(PlsrThunkB##name, (dv), (vt), (sz), (algn)); \ + assert((vt)->tag==VThunkTag); \ + } while (0) + +#ifdef PLSR_ZERO_REFS +#define pLsrThunkNewRef(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Ref, dv, vt, sz, algn); \ + pLsrThunkResultFieldRef(dv) = NULL; \ + pLsrThunkExnFieldRef(dv) = NULL; \ + } while (0) +#define pLsrThunkNew32(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(32, dv, vt, sz, algn); \ + pLsrThunkExnField32(dv) = NULL; \ + } while (0) +#define pLsrThunkNew64(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(64, dv, vt, sz, algn); \ + pLsrThunkExnField64(dv) = NULL; \ + } while (0) +#define pLsrThunkNewFloat(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Float, dv, vt, sz, algn); \ + pLsrThunkExnFieldFloat(dv) = NULL; \ + } while (0) +#define pLsrThunkNewDouble(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Double, dv, vt, sz, algn); \ + pLsrThunkExnFieldDouble(dv) = NULL; \ + } while (0) +#else +#define pLsrThunkNewRef(dv, vt, sz, algn) pLsrThunkNewMk(Ref, dv, vt, sz, algn) +#define pLsrThunkNew32(dv, vt, sz, algn) pLsrThunkNewMk(32, dv, vt, sz, algn) +#define pLsrThunkNew64(dv, vt, sz, algn) pLsrThunkNewMk(64, dv, vt, sz, algn) +#define pLsrThunkNewFloat(dv, vt, sz, algn) pLsrThunkNewMk(Float, dv, vt, sz, algn) +#define pLsrThunkNewDouble(dv, vt, sz, algn) pLsrThunkNewMk(Double, dv, vt, sz, algn) +#endif + +/* Initialisation */ + +#define pLsrThunkSetInitMk(name, thunk, vtable) \ + pLsrObjectChangeVTableMandatory(thunk, vtable) + +#define pLsrThunkSetInitRef(thunk, vtable) pLsrThunkSetInitMk(Ref, thunk, vtable) +#define pLsrThunkSetInit32(thunk, vtable) pLsrThunkSetInitMk(32, thunk, vtable) +#define pLsrThunkSetInit64(thunk, vtable) pLsrThunkSetInitMk(64, thunk, vtable) +#define pLsrThunkSetInitFloat(thunk, vtable) pLsrThunkSetInitMk(Float, thunk, vtable) +#define 
pLsrThunkSetInitDouble(thunk, vtable) pLsrThunkSetInitMk(Double, thunk, vtable) + +#define pLsrThunkSetValueRef(thunk, v) \ + (pLsrWriteBarrierRefBase((PlsrThunkBRef)(thunk), pLsrThunkResultFieldRef(thunk), (v))) +/* This should not engender a yield */ +#define pLsrThunkSetValueNonWbMk(name, thunk, v) \ + (pLsrThunkResultField##name(thunk) = (v)) + +#define pLsrThunkSetValue32(thunk, v) pLsrThunkSetValueNonWbMk(32, thunk, v) +#define pLsrThunkSetValue64(thunk, v) pLsrThunkSetValueNonWbMk(64, thunk, v) +#define pLsrThunkSetValueFloat(thunk, v) pLsrThunkSetValueNonWbMk(Float, thunk, v) +#define pLsrThunkSetValueDouble(thunk, v) pLsrThunkSetValueNonWbMk(Double, thunk, v) + +#define pLsrThunkValueInitRef(thunk, v) \ + do { \ + pLsrThunkSetValueRef(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTableRef); \ + } while (0) + +#define pLsrThunkValueInit32(thunk, v) \ + do { \ + pLsrThunkSetValue32(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTable32); \ + } while (0) + +#define pLsrThunkValueInit64(thunk, v) \ + do { \ + pLsrThunkSetValue64(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTable64); \ + } while (0) + +#define pLsrThunkValueInitFloat(thunk, v) \ + do { \ + pLsrThunkSetValueFloat(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTableFloat); \ + } while (0) + +#define pLsrThunkValueInitDouble(thunk, v) \ + do { \ + pLsrThunkSetValueDouble(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTableDouble); \ + } while (0) + +#define pLsrThunkNewValueMk(name, dv, vt, sz, algn, v) \ + do { \ + noyield { \ + pLsrThunkNew##name(dv, vt, sz, algn); \ + pLsrThunkSetValueNonWbMk(name, dv, v); \ + } \ + } while(0) + +#ifdef PLSR_THUNK_SUBSUMPTION + +#define pLsrThunkSubsumptiveNewValueMk(name, dv, vt, sz, algn, v) ((dv) = ((PlsrThunkBRef) (v))) + +#else /* ! PLSR_THUNK_SUBSUMPTION */ + +#define pLsrThunkSubsumptiveNewValueMk pLsrThunkNewValueMk + +#endif /* ! 
PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkNewValueRef(dv, vt, sz, algn, v) pLsrThunkSubsumptiveNewValueMk(Ref, dv, vt, sz, algn, ((PlsrRef) v)) +#define pLsrThunkNewValue32(dv, vt, sz, algn, v) pLsrThunkNewValueMk(32, dv, vt, sz, algn, v) +#define pLsrThunkNewValue64(dv, vt, sz, algn, v) pLsrThunkNewValueMk(64, dv, vt, sz, algn, v) +#define pLsrThunkNewValueFloat(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Float, dv, vt, sz, algn, v) +#define pLsrThunkNewValueDouble(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Double, dv, vt, sz, algn, v) + +static PlsrThunkBRef pLsrThunkNewValRef(PlsrRef v) +{ +#ifdef PLSR_THUNK_SUBSUMPTION + return (PlsrThunkBRef) v; +#else + noyield { + PlsrThunkBRef res; + pLsrAlloc(PlsrThunkBRef, res, pLsrThunkValVTableRef, sizeof(PlsrThunkURef)); + pLsrThunkSetValueRef(res, v); + return res; + } +#endif +} + +/* Projection */ +#ifdef PLSR_THUNK_SUBSUMPTION + #define pLsrThunkIsUnboxedRef(thunk) \ + (pLsrIntegerIsTagged(thunk) || (pLsrThunkVTableFieldRef(thunk)->tag != VThunkTag)) +#else + #define pLsrThunkIsUnboxedRef(thunk) (assert(pLsrThunkVTableFieldRef(thunk)->tag == VThunkTag), 0) +#endif +#define pLsrThunkIsUnboxed32(thunk) (assert(pLsrThunkVTableField32(thunk)->tag == VThunkTag), 0) +#define pLsrThunkIsUnboxed64(thunk) (assert(pLsrThunkVTableField64(thunk)->tag == VThunkTag), 0) +#define pLsrThunkIsUnboxedFloat(thunk) (assert(pLsrThunkVTableFieldFloat(thunk)->tag == VThunkTag), 0) +#define pLsrThunkIsUnboxedDouble(thunk) (assert(pLsrThunkVTableFieldDouble(thunk)->tag == VThunkTag), 0) + +#define pLsrThunkIsEvaledRef(thunk) (pLsrThunkVTableFieldRef(thunk) == pLsrThunkValVTableRef) +#define pLsrThunkIsEvaled32(thunk) (pLsrThunkVTableField32(thunk) == pLsrThunkValVTable32) +#define pLsrThunkIsEvaled64(thunk) (pLsrThunkVTableField64(thunk) == pLsrThunkValVTable64) +#define pLsrThunkIsEvaledFloat(thunk) (pLsrThunkVTableFieldFloat(thunk) == pLsrThunkValVTableFloat) +#define pLsrThunkIsEvaledDouble(thunk) (pLsrThunkVTableFieldDouble(thunk) == pLsrThunkValVTableDouble) + +#define pLsrThunkGetValRef(thunk) pLsrThunkResultFieldRef(thunk) +#define pLsrThunkGetVal32(thunk) pLsrThunkResultField32(thunk) +#define pLsrThunkGetVal64(thunk) pLsrThunkResultField64(thunk) +#define pLsrThunkGetValFloat(thunk) pLsrThunkResultFieldFloat(thunk) +#define pLsrThunkGetValDouble(thunk) pLsrThunkResultFieldDouble(thunk) + + +/* Evaluation */ + +#define pLsrThunkCallMk(name, thunk) pLsrThunkGetCode##name(thunk)(thunk) +#define pLsrThunkTailCallMk(name, thunk) TAILCALL((pLsrThunkGetCode##name(thunk))(thunk)) + +#define pLsrThunkCallDirectMk(name, code, thunk) code(thunk) +#define pLsrThunkTailCallDirectMk(name, code, thunk) TAILCALL(code(thunk)) + +#define pLsrThunkCallDirectNoCutsMk(name, code, thunk) code(thunk) +#define pLsrThunkTailCallDirectNoCutsMk(name, code, thunk) TAILCALL(code(thunk)) + +#define pLsrThunkEvalMk(name, thunk) \ + (pLsrThunkIsEvaled##name(thunk)) ? pLsrThunkGetVal##name(thunk) : pLsrThunkCall##name(thunk) +#define pLsrThunkEvalDirectMk(name, code, thunk) \ + (pLsrThunkIsEvaled##name(thunk)) ? 
pLsrThunkGetVal##name(thunk) : pLsrThunkCallDirect##name(code, thunk)
+#define pLsrThunkTailEvalMk(name, thunk)                                \
+    do {                                                                \
+        if (pLsrThunkIsEvaled##name(thunk)) {                           \
+            return pLsrThunkGetVal##name(thunk);                        \
+        } else {                                                        \
+            pLsrThunkTailCall##name(thunk);                             \
+        }                                                               \
+    } while (0)
+#define pLsrThunkTailEvalDirectMk(name, code, thunk)                    \
+    do {                                                                \
+        if (pLsrThunkIsEvaled##name(thunk)) {                           \
+            return pLsrThunkGetVal##name(thunk);                        \
+        } else {                                                        \
+            pLsrThunkTailCallDirect##name(code, thunk);                 \
+        }                                                               \
+    } while (0)
+
+#ifdef PLSR_THUNK_SUBSUMPTION
+#define pLsrThunkSubsumptiveEvalMk(name, thunk)                         \
+    (pLsrThunkIsUnboxed##name(thunk)) ? pLsrThunkCastToObject##name(thunk) : pLsrThunkEvalMk(name, thunk)
+#define pLsrThunkSubsumptiveEvalDirectMk(name, code, thunk)             \
+    (pLsrThunkIsUnboxed##name(thunk)) ? pLsrThunkCastToObject##name(thunk) : pLsrThunkEvalDirectMk(name, code, thunk)
+#define pLsrThunkSubsumptiveTailEvalMk(name, thunk)                     \
+    do {                                                                \
+        if (pLsrThunkIsUnboxed##name(thunk)) {                          \
+            return pLsrThunkCastToObject##name(thunk);                  \
+        } else {                                                        \
+            pLsrThunkTailEvalMk(name, thunk);                           \
+        }                                                               \
+    } while (0)
+#define pLsrThunkSubsumptiveTailEvalDirectMk(name, code, thunk)         \
+    do {                                                                \
+        if (pLsrThunkIsUnboxed##name(thunk)) {                          \
+            return pLsrThunkCastToObject##name(thunk);                  \
+        } else {                                                        \
+            pLsrThunkTailEvalDirectMk(name, code, thunk);               \
+        }                                                               \
+    } while (0)
+#else /* ! PLSR_THUNK_SUBSUMPTION */
+
+#define pLsrThunkSubsumptiveEvalMk pLsrThunkEvalMk
+#define pLsrThunkSubsumptiveEvalDirectMk pLsrThunkEvalDirectMk
+#define pLsrThunkSubsumptiveTailEvalMk pLsrThunkTailEvalMk
+#define pLsrThunkSubsumptiveTailEvalDirectMk pLsrThunkTailEvalDirectMk
+
+#endif /* ! PLSR_THUNK_SUBSUMPTION */
+
+#define pLsrThunkCallRef(thunk)    pLsrThunkCallMk (Ref, thunk)
+#define pLsrThunkCall32(thunk)     pLsrThunkCallMk (32, thunk)
+#define pLsrThunkCall64(thunk)     pLsrThunkCallMk (64, thunk)
+#define pLsrThunkCallFloat(thunk)  pLsrThunkCallMk (Float, thunk)
+#define pLsrThunkCallDouble(thunk) pLsrThunkCallMk (Double, thunk)
+
+#define pLsrThunkCallDirectRef(code, thunk)    pLsrThunkCallDirectMk (Ref, code, thunk)
+#define pLsrThunkCallDirect32(code, thunk)     pLsrThunkCallDirectMk (32, code, thunk)
+#define pLsrThunkCallDirect64(code, thunk)     pLsrThunkCallDirectMk (64, code, thunk)
+#define pLsrThunkCallDirectFloat(code, thunk)  pLsrThunkCallDirectMk (Float, code, thunk)
+#define pLsrThunkCallDirectDouble(code, thunk) pLsrThunkCallDirectMk (Double, code, thunk)
+
+#define pLsrThunkTailCallRef(thunk)    pLsrThunkTailCallMk (Ref, thunk)
+#define pLsrThunkTailCall32(thunk)     pLsrThunkTailCallMk (32, thunk)
+#define pLsrThunkTailCall64(thunk)     pLsrThunkTailCallMk (64, thunk)
+#define pLsrThunkTailCallFloat(thunk)  pLsrThunkTailCallMk (Float, thunk)
+#define pLsrThunkTailCallDouble(thunk) pLsrThunkTailCallMk (Double, thunk)
+
+#define pLsrThunkTailCallDirectRef(code, thunk)    pLsrThunkTailCallDirectMk (Ref, code, thunk)
+#define pLsrThunkTailCallDirect32(code, thunk)     pLsrThunkTailCallDirectMk (32, code, thunk)
+#define pLsrThunkTailCallDirect64(code, thunk)     pLsrThunkTailCallDirectMk (64, code, thunk)
+#define pLsrThunkTailCallDirectFloat(code, thunk)  pLsrThunkTailCallDirectMk (Float, code, thunk)
+#define pLsrThunkTailCallDirectDouble(code, thunk) pLsrThunkTailCallDirectMk (Double, code, thunk)
+
+#define pLsrThunkEvalRef(thunk)    pLsrThunkSubsumptiveEvalMk (Ref, thunk)
+#define pLsrThunkEval32(thunk)     pLsrThunkEvalMk (32, thunk)
+#define pLsrThunkEval64(thunk)     pLsrThunkEvalMk (64, thunk)
+#define pLsrThunkEvalFloat(thunk)  pLsrThunkEvalMk (Float, thunk)
+#define 
pLsrThunkEvalDouble(thunk) pLsrThunkEvalMk (Double, thunk) + +#define pLsrThunkEvalDirectRef(code, thunk) pLsrThunkSubsumptiveEvalDirectMk (Ref, code, thunk) +#define pLsrThunkEvalDirect32(code, thunk) pLsrThunkEvalDirectMk (32, code, thunk) +#define pLsrThunkEvalDirect64(code, thunk) pLsrThunkEvalDirectMk (64, code, thunk) +#define pLsrThunkEvalDirectFloat(code, thunk) pLsrThunkEvalDirectMk (Float, code, thunk) +#define pLsrThunkEvalDirectDouble(code, thunk) pLsrThunkEvalDirectMk (Double, code, thunk) + +#define pLsrThunkTailEvalRef(thunk) pLsrThunkSubsumptiveTailEvalMk (Ref, thunk) +#define pLsrThunkTailEval32(thunk) pLsrThunkTailEvalMk (32, thunk) +#define pLsrThunkTailEval64(thunk) pLsrThunkTailEvalMk (64, thunk) +#define pLsrThunkTailEvalFloat(thunk) pLsrThunkTailEvalMk (Float, thunk) +#define pLsrThunkTailEvalDouble(thunk) pLsrThunkTailEvalMk (Double, thunk) + +#define pLsrThunkTailEvalDirectRef(code, thunk) pLsrThunkSubsumptiveTailEvalDirectMk (Ref, code, thunk) +#define pLsrThunkTailEvalDirect32(code, thunk) pLsrThunkTailEvalDirectMk (32, code, thunk) +#define pLsrThunkTailEvalDirect64(code, thunk) pLsrThunkTailEvalDirectMk (64, code, thunk) +#define pLsrThunkTailEvalDirectFloat(code, thunk) pLsrThunkTailEvalDirectMk (Float, code, thunk) +#define pLsrThunkTailEvalDirectDouble(code, thunk) pLsrThunkTailEvalDirectMk (Double, code, thunk) + +#define pLsrThunkReturnRef(thnk, val) \ + do { \ + pLsrThunkSetValueRef(thnk, val); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableRef); \ + return val; \ + } while (0) + +#define pLsrThunkReturnNonRefMk(name, thnk, val) \ + do { \ + pLsrThunkSetValue##name(thnk, val); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTable##name); \ + return val; \ + } while (0) + +#define pLsrThunkReturn32(thunk, val) pLsrThunkReturnNonRefMk(32, thunk, val) +#define pLsrThunkReturn64(thunk, val) pLsrThunkReturnNonRefMk(64, thunk, val) +#define pLsrThunkReturnFloat(thunk, val) pLsrThunkReturnNonRefMk(Float, thunk, val) +#define pLsrThunkReturnDouble(thunk, val) pLsrThunkReturnNonRefMk(Double, thunk, val) + +PlsrThunkReturnTypeRef pLsrThunkDoCut(PlsrThunkBRef t) { + pLsrRuntimeError_("Thunk recuts not supported with lightweight thunks"); + return 0; +#if 0 + PilContinuation0 c = (PilContinuation0) pLsrThunkRef(t)->result; + pilCutTo0(c); + return 0; +#endif +} + +pLsrVTableStaticWithCustom(pLsrThunkCutVTable_, VThunkTag, "*thunk value (cut)*", 0, pLsrThunkDoCut); +#define pLsrThunkCutVTable (&pLsrThunkCutVTable_) + +#define pLsrThunkCutMk(name, thunk, cont) \ + do { \ + assert(sizeof(PlsrThunkU##name) >= sizeof(PlsrThunkURef)); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkCutVTable); \ + /* pLsrThunkRef(thunk)->exn = (PlsrRef) (exn);*/ \ + cut to cont; \ + } while (0) + +#define pLsrThunkCutRef(thunk, cont) pLsrThunkCutMk(Ref, thunk, cont) +#define pLsrThunkCut32(thunk, cont) pLsrThunkCutMk(32, thunk, cont) +#define pLsrThunkCut64(thunk, cont) pLsrThunkCutMk(64, thunk, cont) +#define pLsrThunkCutFloat(thunk, cont) pLsrThunkCutMk(Float, thunk, cont) +#define pLsrThunkCutDouble(thunk, cont) pLsrThunkCutMk(Double, thunk, cont) + +/* Printing */ + +static void pLsrThunkPrintRef(PlsrThunkBRef t) +{ + pLsrValuePrint((PlsrObjectB)pLsrThunkEvalRef(t)); +} + +/* VTables for black holed thunks */ + +pLsrVTableStatic(pLsrThunkEvalVTableRef_, VThunkTag, "*evaled thunk (ref)*", pLsrThunkPaddingRef); +#define pLsrThunkEvalVTableRef (&pLsrThunkEvalVTableRef_) + +pLsrVTableStatic(pLsrThunkEvalVTable32_, VThunkTag, "*evaled thunk (32)*", 
pLsrThunkPadding32); +#define pLsrThunkEvalVTable32 (&pLsrThunkEvalVTable32_) + +pLsrVTableStatic(pLsrThunkEvalVTable64_, VThunkTag, "*evaled thunk (64)*", pLsrThunkPadding64); +#define pLsrThunkEvalVTable64 (&pLsrThunkEvalVTable64_) + +pLsrVTableStatic(pLsrThunkEvalVTableFloat_, VThunkTag, "*evaled thunk (float)*", pLsrThunkPaddingFloat); +#define pLsrThunkEvalVTableFloat (&pLsrThunkEvalVTableFloat_) + +pLsrVTableStatic(pLsrThunkEvalVTableDouble_, VThunkTag, "*evaled thunk (double)*", pLsrThunkPaddingDouble); +#define pLsrThunkEvalVTableDouble (&pLsrThunkEvalVTableDouble_) + +#ifdef PLSR_THUNK_SYNCHRONIZE +#define pLsrThunkClaimDo(name, thunk) \ + do { \ + PlsrVTable pLsrThunkClaimDoVT = pLsrThunkVTableField##name(thunk); \ + if (!(pLsrVTableGetCustom(pLsrThunkClaimDoVT) && \ + pLsrObjectCmpAndSetVTableMandatory(thunk, pLsrThunkClaimDoVT, pLsrThunkEvalVTable##name))) { \ + iFlcSynchWaitEqualVoidS(thunk, 0, (void*) pLsrThunkValVTable##name); \ + return pLsrThunkGetVal##name(thunk); \ + } \ + } while (0) +#else +#define pLsrThunkClaimDo(name, thunk) \ + do { \ + if (pLsrThunkVTableField##name(thunk) == pLsrThunkEvalVTable##name) { \ + pLsrRuntimeError_("Black Hole"); \ + } \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkEvalVTable##name); \ + } while (0) +#endif + +#define pLsrThunkClaimRef(thunk) pLsrThunkClaimDo(Ref, thunk) +#define pLsrThunkClaim32(thunk) pLsrThunkClaimDo(32, thunk) +#define pLsrThunkClaim64(thunk) pLsrThunkClaimDo(64, thunk) +#define pLsrThunkClaimFloat(thunk) pLsrThunkClaimDo(Float, thunk) +#define pLsrThunkClaimDouble(thunk) pLsrThunkClaimDo(Double, thunk) + +#ifdef PLSR_THUNK_SYNCHRONIZE +#define pLsrThunkBlackHoleDo(name, thunk) \ +do { \ + iFlcSynchWaitEqualUIntp(thunk, 0, pLsrThunkValVTable##name); \ + return pLsrThunkGetVal##name(thunk); \ + } while (0) +#else +#define pLsrThunkBlackHoleDo(name, thunk) \ + do { \ + pLsrRuntimeError_("Black Hole"); \ + } while (0) +#endif +#define pLsrThunkBlackHoleRef(thunk) pLsrThunkBlackHoleDo(Ref, thunk) +#define pLsrThunkBlackHole32(thunk) pLsrThunkBlackHoleDo(32, thunk) +#define pLsrThunkBlackHole64(thunk) pLsrThunkBlackHoleDo(64, thunk) +#define pLsrThunkBlackHoleFloat(thunk) pLsrThunkBlackHoleDo(Float, thunk) +#define pLsrThunkBlackHoleDouble(thunk) pLsrThunkBlackHoleDo(Double, thunk) + +#define pLsrThunkZeroFV(zero) + + +/*** Check Object Model ***/ + +/* Generated code defines: + * pLsrThunkFixedSizeRef + * pLsrThunkFixedSize64 + * pLsrThunkFixedSize32 + * pLsrThunkFixedSizeFloat + * pLsrThunkFixedSizeDouble + * pLsrThunkResultOffsetRef + * pLsrThunkResultOffset64 + * pLsrThunkResultOffset32 + * pLsrThunkResultOffsetFloat + * pLsrThunkResultOffsetDouble + */ + +static void pLsrThunkCheck() +{ + /*printf("Thunk check: %d/%d, %d/%d, %d/%d\n", pLsrThunkFixedSizeRef, + sizeof(PlsrThunkURef), pLsrThunkFixedSize32, sizeof(PlsrThunkU32), + pLsrThunkFixedSize64, sizeof(PlsrThunkU64));*/ + if (pLsrThunkFixedSizeRef != sizeof(PlsrThunkURef)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize32 != sizeof(PlsrThunkU32)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize64 != sizeof(PlsrThunkU64)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeFloat != sizeof(PlsrThunkUFloat)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeDouble != sizeof(PlsrThunkUDouble)) + pLsrRuntimeError("Bad thunk object model!\n"); + + if (pLsrThunkResultOffsetRef != ((unsigned)(&pLsrThunkResultFieldRef(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + 
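+    /* The pLsrThunkFixedSize* and pLsrThunkResultOffset* constants checked here come from
+     * the generated code noted above; each test compares that layout against the C unions
+     * declared in this header.  Note that &pLsrThunkResultFieldRef(0) takes the address of
+     * the result field at a null thunk base, i.e. it is in effect
+     * offsetof(PlsrThunkURef, value.result), and likewise for the other instantiations. */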
if (pLsrThunkResultOffset32 != ((unsigned)(&pLsrThunkResultField32(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffset64 != ((unsigned)(&pLsrThunkResultField64(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetFloat != ((unsigned)(&pLsrThunkResultFieldFloat(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetDouble != ((unsigned)(&pLsrThunkResultFieldDouble(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + +} + +#endif /* !_PLSR_THUNK_H_ */ diff --git a/runtime/include/hrc/plsr-main.h b/runtime/include/hrc/plsr-main.h new file mode 100755 index 0000000..336236e --- /dev/null +++ b/runtime/include/hrc/plsr-main.h @@ -0,0 +1,164 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_MAIN_H_ +#define _PLSR_MAIN_H_ + +#ifdef P_USE_PILLAR +# define MAIN pillar_main +#else +# define MAIN main +#endif + +/* C functions used for Runtime */ +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif /* P_USE_PILLAR */ + + + +static void pLsrRuntimeInitialize() +{ + pLsrGCInitialize(); + pLsrNumericInitialize(pLsrGmpMemLimitParam); +} + +static void pLsrRegisterRuntimeGlobals() +{ + pLsrGCRegisterGlobals(); + pLsrFinalizerRegisterGlobals(); + pLsrWpoRegisterGlobals(); + pLsrNumericRegisterGlobals(); + pLsrValueRegisterGlobals(); +} + +static void pLsrRegisterRuntimeVTables() +{ + pLsrGCRegisterVTables(); + pLsrFinalizerRegisterVTables(); + pLsrWpoRegisterVTables(); + pLsrNumericRegisterVTables(); + pLsrValueRegisterVTables(); +} + +static void pLsrCheckRuntimeAssertions() +{ + pLsrFinalizerCheckAssertions(); + pLsrWpoCheckAssertions(); + pLsrNumericCheckAssertions(); + pLsrObjectCheckModel(); + pLsrThunkCheck(); + pLsrValueCheck(); +} + +#ifdef P_USE_PILLAR +#pragma pillar_managed(off) +static void pLsrRuntimeReportRoots(PrtRseCallback rse, void* env) +{ + pLsrFinalizerReportRoots(rse, env); +} +#pragma pillar_managed(on) +#endif + +static void pLsrFuturesStart () +{ + uintp sizeInBytes = pLsrStackSizeWorker * 1024 * 1024; + uintp digitsB10 = (uintp) log10(sizeInBytes); + char *fmt = "stacksize=%u"; + char *buf = pLsrAllocC(strlen(fmt) + digitsB10 + 1); + if (!sprintf(buf, fmt, sizeInBytes)) { + pLsrRuntimeError("Unable to set stack size"); + } +#ifdef P_USE_PARALLEL_FUTURES + ptkFutureSystemSetOption(buf); + ptkFutureSystemStart(0); +#endif + pLsrFreeC(buf); +} + +static void pLsrIHRInitialize() { + if (pLsrIHRThreadCountParam > 0) ihrSetNCapabilities(pLsrIHRThreadCountParam); +} + +static void __pmain(); + + +static void pLsrRun() +{ + pLsrEventsTransition("Enter", "Main"); + __pmain(); + pLsrFinalizerShutdown (1); + pLsrEventsTransition("Exit", "Main"); + fflush(stdout); + +#ifdef PLSR_INSTRUMENT_ALLOCATION + printf("plsr: Number objects allocated: %I64u\n", pLsrNumObjectsAllocated); + printf("plsr: Number bytes allocated: %I64u\n", pLsrNumBytesAllocated); + printf("plsr: Number padding bytes allocated: %I64u\n", pLsrNumPaddingBytesAllocated); + printf("plsr: Number unmanaged objects allocated: %I64u\n", pLsrNumObjectsAllocatedUnmanaged); + printf("plsr: Number unmanaged bytes allocated: %I64u\n", pLsrNumBytesAllocatedUnmanaged); + printf("plsr: Number unmanaged objects freed: %I64u\n", pLsrNumObjectsFreedUnmanaged); +#endif /* PLSR_INSTRUMENT_ALLOCATION */ +#ifdef PLSR_INSTRUMENT_VTB_ALC + { + PlsrVTable cur = pLsrAllVTables; + printf("plsr: vtable 
allocation stats:\n"); + while(cur) { + printf(" %s (%p): Number objects allocated: %I64u\n", + cur->name, cur, cur->numObjects); + printf(" %s (%p): Number bytes allocated: %I64u\n", + cur->name, cur, cur->numBytes); + printf(" %s (%p): Number padding bytes allocated: %I64u\n", + cur->name, cur, cur->padding*cur->numObjects); + cur = cur->next; + } + } +#endif /* PLSR_INSTRUMENT_VTB_ALC */ + + pLsrEventsShutdown(); + pLsrExit(0); + + return; +} + +int MAIN(int _argc, const char** _argv) +{ + int argc; + const char** argv; + +#ifdef P_USE_MCRT + mcrtStart(main, _argc, _argv); +#endif + setlocale(LC_CTYPE, ""); + pLsrEventsInit(); + pLsrEventsTransition("Enter", "Startup"); + pLsrDisableErrorBox(); + pilCheck(); + pLsrCheckRuntimeAssertions(); + pLsrParseOptions(_argc, _argv, &argc, &argv); + pLsrGcInit(pLsrInitHeapParam, pLsrMaxHeapParam); + pLsrRegisterRuntimeVTables(); + pLsrRegisterRuntimeGlobals(); + pLsrRuntimeInitialize(); + pLsrIHRInitialize(); + pLsrFuturesStart (); + pLsrFinalizerStart (); + pLsrEventsTransition("Exit", "Startup"); +#ifdef __pillar2c__ + prtSetPcallStackSize(pLsrStackSizeMain * 1024 * 1024); + pcall pLsrRun(); +#else + pLsrRun(); +#endif + return 0; +} + +#endif /* !_PLSR_MAIN_H_ */ diff --git a/runtime/include/hrc/plsr-numeric.h b/runtime/include/hrc/plsr-numeric.h new file mode 100755 index 0000000..e490ec0 --- /dev/null +++ b/runtime/include/hrc/plsr-numeric.h @@ -0,0 +1,65 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Arbitrary precision numbers */ + +#ifndef _PLSR_NUMERIC_H_ +#define _PLSR_NUMERIC_H_ + +/********************************************************************** + * Arbitrary precision integers + */ + +#include "hrc/plsr-integer.h" + +/********************************************************************** + * Arbitrary precision rationals + */ + +#include "hrc/plsr-rational.h" + +/********************************************************************** + * Some miscellaneous floating point stuff + */ + +static char* pLsrCStringFromFloat32(float32 flt) +{ + char* str = pLsrAllocC(30); + sprintf(str, "%f", flt); + return str; +} + +static float32 pLsrFloat32FromCString(char* str) +{ + float32 res = 0.0f; + sscanf(str, "%f", &res); + return res; +} + +/********************************************************************** + * GC registration functions + */ + +static void pLsrNumericRegisterVTables() +{ + pLsrIntegerRegisterVTables(); + pLsrRationalRegisterVTables(); +} + +static void pLsrNumericRegisterGlobals() +{ + pLsrIntegerRegisterGlobals(); + pLsrRationalRegisterGlobals(); +} + +static void pLsrNumericCheckAssertions() +{ + pLsrRationalCheckAssertions(); +} + +static void pLsrNumericInitialize(uintp memLimit) +{ + pLsrAPIntInitialize(memLimit); + pLsrAPRatInitialize(); +} +#endif /* !_PLSR_NUMERIC_H_ */ diff --git a/runtime/include/hrc/plsr-objects.h b/runtime/include/hrc/plsr-objects.h new file mode 100755 index 0000000..8da395b --- /dev/null +++ b/runtime/include/hrc/plsr-objects.h @@ -0,0 +1,189 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Heap Objects and the Object Model */ + +#ifndef _PLSR_OBJECTS_H_ +#define _PLSR_OBJECTS_H_ + +/********************************************************************** + * Refs + */ + +#ifdef P_USE_PILLAR +typedef ref PlsrRef; +#else /* !P_USE_PILLAR */ +typedef void* PlsrRef; +#endif /* !P_USE_PILLAR */ + +/********************************************************************** + * VTables + */ + +enum PlsrValueTagE { + VNoneTag, + VRatTag, + VNameTag, + 
VFloatTag, + VDoubleTag, + VArrayTag, + VArrayIdxTag, + VSumTag, + VFunctionTag, + VSetTag, + VTypeTag, + VPtrTag, + VThunkTag +}; + +struct PlsrVTableS { + char reserve[P_VTABLE_RESERVE]; + const enum PlsrValueTagE tag; + uintp custom; +#if (defined(PLSR_INSTRUMENT_VTB_ALC) || defined(DEBUG)) + char* name; +#endif +#ifdef PLSR_INSTRUMENT_VTB_ALC + struct PlsrVTableS* next; + uint64 numObjects; + uint64 numBytes; +#endif /* PLSR_INSTRUMENT_VTB_ALC */ +#if (defined(PLSR_INSTRUMENT_ALLOCATION) || defined(PLSR_INSTRUMENT_VTB_ALC)) + uint64 padding; /* How much padding does this object contain? */ +#endif + +}; + +typedef struct PlsrVTableS* PlsrVTable; + +#define pLsrVTableStaticTagField(tg) .tag = (tg), +#define pLsrVTableStaticCustomField(c) .custom = ((uintp) (c)), +#if (defined(PLSR_INSTRUMENT_VTB_ALC) || defined(DEBUG)) +#define pLsrVTableStaticNameField(nm) .name = (nm), +#else +#define pLsrVTableStaticNameField(nm) +#endif +#ifdef PLSR_INSTRUMENT_VTB_ALC +#define pLsrVTableStaticNextField(n) .next = (n), +#define pLsrVTableStaticNumObjectsField(n) .numObjects = (n), +#define pLsrVTableStaticNumBytesField(n) .numBytes = (n), +#else +#define pLsrVTableStaticNextField(n) +#define pLsrVTableStaticNumObjectsField(n) +#define pLsrVTableStaticNumBytesField(n) +#endif +#if (defined(PLSR_INSTRUMENT_ALLOCATION) || defined(PLSR_INSTRUMENT_VTB_ALC)) +#define pLsrVTableStaticPaddingField(n) .padding = (n), +#else +#define pLsrVTableStaticPaddingField(n) +#endif + +#define pLsrVTableStaticWithCustom(vt, tg, nm, p, c) \ + pil_aligned(16) static struct PlsrVTableS vt = \ + { pLsrVTableStaticTagField(tg) \ + pLsrVTableStaticCustomField(c) \ + pLsrVTableStaticNameField(nm) \ + pLsrVTableStaticNextField(0) \ + pLsrVTableStaticNumObjectsField(0) \ + pLsrVTableStaticNumBytesField(0) \ + pLsrVTableStaticPaddingField(p)} + +#define pLsrVTableStatic(vt, tg, nm, p) \ + pLsrVTableStaticWithCustom(vt, tg, nm, p, 0) + +#define pLsrVTableGetTag(vt) ((vt)->tag) + +#define pLsrVTableGetCustom(vt) ((vt)->custom) + +/* This vtable should not be used with accurate GC, as it does not + * determine its GC info unambiguously. + */ +pLsrVTableStatic(pLsrVTableNone_, VNoneTag, "*none*", 0); +#define pLsrVTableNone (&pLsrVTableNone_) + +/* VTables for thunks */ + +/* When using accurate GC, these vtable is only for thunks with no + * free variables, initialized atomically at allocation time. 
+ */ +pLsrVTableStatic(pLsrThunkValVTableRef_, VThunkTag, "*thunk value (ref)*", pLsrThunkPaddingRef); +#define pLsrThunkValVTableRef (&pLsrThunkValVTableRef_) + +pLsrVTableStatic(pLsrThunkValVTable32_, VThunkTag, "*thunk value (32)*", pLsrThunkPadding32); +#define pLsrThunkValVTable32 (&pLsrThunkValVTable32_) + +pLsrVTableStatic(pLsrThunkValVTable64_, VThunkTag, "*thunk value (64)*", pLsrThunkPadding64); +#define pLsrThunkValVTable64 (&pLsrThunkValVTable64_) + +pLsrVTableStatic(pLsrThunkValVTableFloat_, VThunkTag, "*thunk value (float)*", pLsrThunkPaddingFloat); +#define pLsrThunkValVTableFloat (&pLsrThunkValVTableFloat_) + +pLsrVTableStatic(pLsrThunkValVTableDouble_, VThunkTag, "*thunk value (double)*", pLsrThunkPaddingDouble); +#define pLsrThunkValVTableDouble (&pLsrThunkValVTableDouble_) + + + +/********************************************************************** + * Objects + */ + +typedef struct { + PlsrVTable vtable; +} PlsrObjectU; + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrObjectB; +#else /* !P_USE_PILLAR */ +typedef PlsrObjectU* PlsrObjectB; +#endif /* !P_USE_PILLAR */ + +#define pLsrObjectGetVTable(obj) \ + ((PlsrVTable)((long)(((PlsrObjectU*)(obj))->vtable) & (0xFFffFFfc))) + +/* Generated code defines: + * pLsrObjectFieldsBase + */ + +#define pLsrObjectGetKind(obj) (pLsrVTableGetTag(pLsrObjectGetVTable(obj))) +/* Note that both of these need the pointer type for t */ +#define pLsrObjectField(obj, off, t) (*(t)((char*)(obj) + (off))) +#define pLsrObjectExtra(obj, off, t, es, i) (*(t)((char*)(obj) + (off) + (es * i))) + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrThunkBRef; +typedef PlsrRef PlsrThunkB32; +typedef PlsrRef PlsrThunkB64; +typedef PlsrRef PlsrThunkBFloat; +typedef PlsrRef PlsrThunkBDouble; +typedef PlsrRef PlsrPAny; +#else /* !P_USE_PILLAR */ +typedef struct PlsrThunkSRef* PlsrThunkBRef; +typedef struct PlsrThunkS32* PlsrThunkB32; +typedef struct PlsrThunkS64* PlsrThunkB64; +typedef struct PlsrThunkSFloat* PlsrThunkBFloat; +typedef struct PlsrThunkSDouble* PlsrThunkBDouble; +typedef PlsrObjectU* PlsrPAny; +#endif /* !P_USE_PILLAR */ + +static void pLsrValuePrint(PlsrObjectB); +static PlsrRef pLsrThunkEvalRef(PlsrThunkBRef); + +#define pLsrObjectEval(dest, obj) \ + do { \ + if (pLsrVTableGetTag(pLsrObjectGetVTable(obj)) == VThunkTag) { \ + (dest) = (PlsrPAny) pLsrThunkEvalRef((PlsrThunkBRef)obj); \ + } else { \ + (dest) = (PlsrPAny)obj; \ + } \ + } while (0) + +static void pLsrObjectCheckModel() +{ + if (pLsrObjectFieldsBase < sizeof(PlsrObjectU)) + pLsrRuntimeError("Bad object model!\n"); +} + +#define pLsrMemzeroArray(dest, fsz, esz, c) \ + memset(((char*)dest) + (fsz),0,(esz)*(c)) + +#endif /* !_PLSR_OBJECTS_H_ */ diff --git a/runtime/include/hrc/plsr-params.h b/runtime/include/hrc/plsr-params.h new file mode 100755 index 0000000..5f9890f --- /dev/null +++ b/runtime/include/hrc/plsr-params.h @@ -0,0 +1,236 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Parse Commandline Parameters */ + +#ifndef _PLSR_PARAMS_H_ +#define _PLSR_PARAMS_H_ + +typedef enum { + PPBool, + PPInt, + PPGc, + PPRT, + PFutures, +#ifdef P_USE_PILLAR + PPPGc, +#endif // P_USE_PILLAR + PPEnd, +} PlsrPParamType; + +typedef struct pLsrPParamEntryS { + PlsrPParamType t; + char* name; + int len; + int* loc; + char* usage; +} PlsrPParamEntry; + +static char pLsrPPStart[] = "@PPiler"; +static uintp pLsrPPStartLen = 7; +static char pLsrPPEnd[] = "--"; +static uintp pLsrPPEndLen = 2; + +static uintp pLsrIHRThreadCountParam = 0; +static sintp pLsrInitHeapParam = -1; /* MB */ 
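+/* The runtime options below are consumed from the front of argv, bracketed by the
+ * pLsrPPStart ("@PPiler") and pLsrPPEnd ("--") markers above; everything after "--"
+ * is left for the program itself.  A purely illustrative invocation (program name and
+ * values are made up) using the option names from pLsroptions might look like:
+ *
+ *     ./a.out @PPiler initHeap 64 maxHeap 512 gc verbose -- <program arguments>
+ *
+ * where initHeap/maxHeap are in megabytes and "gc verbose" is forwarded to the
+ * collector as -verbosegc by pLsrdoPParamGc. */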
+static uintp pLsrMaxHeapParam = 0; /* MB, 0 => unlimited */ +static uintp pLsrStackSizeMain = PLSR_STACK_SIZE_MAIN; +static uintp pLsrStackSizeWorker = PLSR_STACK_SIZE_WORKER; +static uintp pLsrGmpMemLimitParam = 0; + +static PlsrPParamEntry pLsroptions[] = { + {PPInt, "gmpMaxMem", 0, &pLsrGmpMemLimitParam, "Memory limit for gmp integer allocation"}, + {PPInt, "ihrThreads", 0, &pLsrIHRThreadCountParam, "How many haskell thread capabilities?"}, + {PPInt, "initHeap", 0, &pLsrInitHeapParam, "Initial heap size (megabytes)"}, + {PPInt, "maxHeap", 0, &pLsrMaxHeapParam, "Max heap size (megabytes), 0 => unlimited"}, + {PPInt, "stackMain", 0, &pLsrStackSizeMain, "Stack size for main threads (megabytes)"}, + {PPInt, "stackWorker", 0, &pLsrStackSizeWorker, "Stack size for worker threads (megabytes)"}, + {PPGc, "gc", 0, NULL, "Pass option to gc"}, + {PPRT, "prt", 0, NULL, "Pass option to prt"}, + {PFutures, "futures", 0, NULL, "Pass option to the futures package"}, +#ifdef P_USE_PILLAR + {PPPGc, "pgc", 0, NULL, "Pass option to pgc"}, +#endif // P_USR_PILLAR + {PPEnd, NULL, 0, NULL, NULL} /*Signal the end */ +}; + +static void pLsrinitPParamTable() +{ + uintp i = 0; + while(pLsroptions[i].name != NULL) { + pLsroptions[i].len = strlen(pLsroptions[i].name); + i++; + } + return; +} + +static void pLsrdoPParamUsage(const char* argv[]) +{ + uintp i = 0; + fprintf(stderr, "usage: %s @PPiler [opt]* -- \n", argv[0]); + fprintf(stderr, " where opt is one of\n"); + + while(pLsroptions[i].t != PPEnd) { + switch(pLsroptions[i].t) { + case PPInt: + fprintf(stderr, + " %s n (default=%d)\n", + pLsroptions[i].name, + *(pLsroptions[i].loc)); + break; + case PPBool: + fprintf(stderr, + " %s\n", + pLsroptions[i].name); + break; + case PPGc: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; + case PPRT: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; + case PFutures: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; +#ifdef P_USE_PILLAR + case PPPGc: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; +#endif // P_USR_PILLAR + case PPEnd: + assert(0); + break; + } + i++; + } + pLsrRuntimeError("Bad @PPiler option usage"); +} + +static int pLsrdoPParamInt(int* res, int idx, int argc, const char* argv[]) +{ + if (idx >= argc) + pLsrdoPParamUsage(argv); + if (sscanf(argv[idx], "%i", res) <= 0) + pLsrdoPParamUsage(argv); + return idx+1; +} + +static int pLsrdoPParamGc(int idx, int argc, const char* argv[]) +{ + if (idx >= argc) + pLsrdoPParamUsage(argv); + if (strcmp(argv[idx], "verbose") == 0) + pLsrGcOption("-verbosegc", ""); + else + pLsrGcOption("-gc", argv[idx]); + return idx+1; +} + +static int pLsrdoPParam(int idx, int argc, const char* argv[]) +{ + void __cdecl prtSetOption(const char *optionString); + uintp j; + for(j=0; pLsroptions[j].t != PPEnd; j++) { + /* XXX NG: this is not strictly correct */ + if (0 == strncmp(argv[idx], pLsroptions[j].name, pLsroptions[j].len)) { + idx++; + switch (pLsroptions[j].t) { + case PPInt: + idx = pLsrdoPParamInt(pLsroptions[j].loc, idx, argc, argv); + break; + case PPBool: + *(pLsroptions[j].loc) = 1; + break; + case PPGc: + idx = pLsrdoPParamGc(idx, argc, argv); + break; + case PPRT: + if (idx >= argc) + pLsrdoPParamUsage(argv); +#ifdef P_USE_PILLAR + prtSetOption(argv[idx]); +#else /* !P_USE_PILLAR */ + fprintf(stderr, + "prt option: %s ignored (not using pillar)\n", + argv[idx]); +#endif + idx++; + break; + case PFutures: + if (idx >= argc) + pLsrdoPParamUsage(argv); + ptkFutureSystemSetOption(argv[idx]); + idx++; + break; 
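+            /* As with the PPRT and PFutures cases above, the PPPGc case below consumes exactly
+             * one following argument and passes it to the collector unchanged via pgcSetOption. */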
+#ifdef P_USE_PILLAR + case PPPGc: + if (idx >= argc) + pLsrdoPParamUsage(argv); + pgcSetOption(argv[idx]); + idx++; + break; +#endif // P_USR_PILLAR + case PPEnd: + assert(0); + /* Impossible */ + break; + } + return idx; + } + } + /* If we get here, it was an unknown param */ + pLsrdoPParamUsage(argv); + return 0; +} + +static int pLsrdoPParamList(int idx, int argc, const char* argv[]) +{ + while(idx < argc) { + if (0 == strncmp(argv[idx], pLsrPPEnd, pLsrPPEndLen)) { + return idx+1; + } + idx = pLsrdoPParam(idx, argc, argv); + } + /* If we get here, then there was no PPEnd symbol */ + pLsrdoPParamUsage(argv); + return 0; +} + +static int pLsrdoPParamSegment(int idx, int argc, const char* argv[]) +{ + if (idx < argc) { + if (0 == strncmp(argv[idx], pLsrPPStart, pLsrPPStartLen)) { + return pLsrdoPParamList(idx+1, argc, argv); + } + else { + return idx; + } + } + return idx; +} + +static void pLsrParseOptions(int _argc, const char* _argv[], + int* argc, const char*** argv) +{ + /* For now, assume that all the PPiler opts are first. + * Later we can generalize + */ + int idx; + int count; + pLsrinitPParamTable(); + idx = pLsrdoPParamSegment(1, _argc, _argv); + count = idx -1; + *argc = pargc = _argc - count; /* consumed args */ + *argv = pargv = _argv + count; /* Advance to last consumed arg */ + *argv[0] = _argv[0]; /* Keep command name around in place of last consumed */ +} + +#endif /* !_PLSR_PARAMS_H_ */ diff --git a/runtime/include/hrc/plsr-prims-ghc-longlong.h b/runtime/include/hrc/plsr-prims-ghc-longlong.h new file mode 100644 index 0000000..eabdc7c --- /dev/null +++ b/runtime/include/hrc/plsr-prims-ghc-longlong.h @@ -0,0 +1,51 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_GHC_LONGLONG_H_ +#define _PLSR_PRIMS_GHC_LONGLONG_H_ + +/* get type definitions from pil.h */ +#include "hrc/pil.h" + +#define hs_gtWord64(a,b) ((a)> (b)) +#define hs_geWord64(a,b) ((a)>=(b)) +#define hs_eqWord64(a,b) ((a)==(b)) +#define hs_neWord64(a,b) ((a)!=(b)) +#define hs_ltWord64(a,b) ((a)< (b)) +#define hs_leWord64(a,b) ((a)<=(b)) +#define hs_gtInt64(a,b) ((a)> (b)) +#define hs_geInt64(a,b) ((a)>=(b)) +#define hs_eqInt64(a,b) ((a)==(b)) +#define hs_neInt64(a,b) ((a)!=(b)) +#define hs_ltInt64(a,b) ((a)< (b)) +#define hs_leInt64(a,b) ((a)<=(b)) + +#define hs_remWord64(a,b) ((a)% (b)) +#define hs_quotWord64(a,b) ((a)/ (b)) + +#define hs_remInt64(a,b) ((a)% (b)) +#define hs_quotInt64(a,b) ((a)/ (b)) +#define hs_negateInt64(a) (-(a)) +#define hs_plusInt64(a,b) ((a)+ (b)) +#define hs_minusInt64(a,b) ((a)- (b)) +#define hs_timesInt64(a,b) ((a)* (b)) + +#define hs_and64(a,b) ((a)& (b)) +#define hs_or64(a,b) ((a)| (b)) +#define hs_xor64(a,b) ((a)^ (b)) +#define hs_not64(a) ( ~(a)) + +#define hs_uncheckedShiftL64(a,b) ((a)<< (b)) +#define hs_uncheckedShiftRL64(a,b) ((a)>> (b)) +#define hs_uncheckedIShiftL64(a,b) ((a)<< (b)) +#define hs_uncheckedIShiftRA64(a,b) ((a)>> (b)) +#define hs_uncheckedIShiftRL64(a,b) ((sint64)((uint64)(a)>>(b))) + +#define hs_intToInt64(i) ((sint64) (i)) +#define hs_int64ToInt(i) ((sintp) (i)) +#define hs_int64ToWord64(i) ((uint64) (i)) +#define hs_wordToWord64(w) ((uint64) (w)) +#define hs_word64ToWord(w) ((uintp) (w)) +#define hs_word64ToInt64(w) ((sint64) (w)) + +#endif diff --git a/runtime/include/hrc/plsr-prims-ghc.h b/runtime/include/hrc/plsr-prims-ghc.h new file mode 100644 index 0000000..5713740 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-ghc.h @@ -0,0 +1,554 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef 
_PLSR_PRIMS_GHC_H_ +#define _PLSR_PRIMS_GHC_H_ + +/* XXX NG: This stuff doesn't work under C */ +#ifdef P_USE_PILLAR + +/* get type definitions from pil.h */ +#include "hrc/pil.h" +#include "hrc/plsr-value.h" +#include "hrc/plsr-gc.h" +#include "hrc/plsr-util.h" + +/* C functions used for Runtime */ +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#ifndef PLSR_LINUX +#include +#include +#endif // !PLSR_LINUX + +#include "hrc/ghc/float.h" +#include "hrc/ghc/Globals.h" +#include "hrc/ghc/TTY.h" +#include "hrc/ghc/thread.h" + +void pLsrGetProgArgv (void* argc, void* argv) +{ + if (argc) { *((sintp*)argc) = pargc; } + if (argv) { *((void**)argv) = pargv; } +} + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif /* P_USE_PILLAR */ + +/*** Address Stuff ***/ + +/* pointer read */ +#define pLsrPrimGHCIndexOffAddrzh(p,i,r,t) ((r)*(((t*)(p))+(i))) +#define pLsrPrimGHCIndexFloatOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,float,float) +#define pLsrPrimGHCIndexDoubleOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,double,double) +#define pLsrPrimGHCIndexAddrOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,void*,void*) +#define pLsrPrimGHCIndexIntOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sintp) +#define pLsrPrimGHCIndexUIntOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uintp) +#define pLsrPrimGHCIndexUInt8OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uint8) +#define pLsrPrimGHCIndexUInt16OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uint16) +#define pLsrPrimGHCIndexUInt32OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uint32) +#define pLsrPrimGHCIndexUInt64OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uint64,uint64) +#define pLsrPrimGHCIndexInt8OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sint8) +#define pLsrPrimGHCIndexInt16OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sint16) +#define pLsrPrimGHCIndexInt32OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sint32) +#define pLsrPrimGHCIndexInt64OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sint64,sint64) +/* pointer write */ +#define pLsrPrimGHCWriteOffAddrzh(p,i,v,t) { *(((t*)(p))+(i))=(t)(v); } +#define pLsrPrimGHCWriteFloatOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,float) +#define pLsrPrimGHCWriteDoubleOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,double) +#define pLsrPrimGHCWriteAddrOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,void*) +#define pLsrPrimGHCWriteIntOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sintp) +#define pLsrPrimGHCWriteUIntOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uintp) +#define pLsrPrimGHCWriteUInt8OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint8) +#define pLsrPrimGHCWriteUInt16OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint16) +#define pLsrPrimGHCWriteUInt32OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint32) +#define pLsrPrimGHCWriteUInt64OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint64) +#define pLsrPrimGHCWriteInt8OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint8) +#define pLsrPrimGHCWriteInt16OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint16) +#define pLsrPrimGHCWriteInt32OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint32) +#define pLsrPrimGHCWriteInt64OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint64) +/* pointer arithmetic */ +#define pLsrPrimGHCNullAddrzh() ((void*)0) +#define pLsrPrimGHCPlusAddrzh(p,i) ((void*)((char*)(p)+(i))) +#define pLsrPrimGHCMinusAddrzh(p,q) ((sintp)((char*)(p)-(char*)(q))) +#define pLsrPrimGHCRemAddrzh(p,i) (((sintp)(p)%(i))) 
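+/* Illustrative expansions of the accessors above (assuming sintp/uintp are the native
+ * word-sized integer types from pil.h):
+ *
+ *   pLsrPrimGHCIndexUInt16OffAddrzh(p,i)   =>  ((uintp)*(((uint16*)(p))+(i)))
+ *       read the i'th 16-bit element at p and widen it to a word
+ *   pLsrPrimGHCWriteInt32OffAddrzh(p,i,v)  =>  { *(((sint32*)(p))+(i)) = (sint32)(v); }
+ *       store v as the i'th 32-bit element at p
+ *   pLsrPrimGHCPlusAddrzh(p,i)             =>  ((void*)((char*)(p)+(i)))
+ *       address arithmetic is in bytes, not elements
+ */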
+#define pLsrPrimGHCAddr2Intzh(p) ((sintp)(p)) +#define pLsrPrimGHCInt2Addrzh(i) ((void*)(i)) +#define pLsrPrimGHCGtAddrzh(p,q) ((p)>(q)) +#define pLsrPrimGHCGeAddrzh(p,q) ((p)>=(q)) +#define pLsrPrimGHCLtAddrzh(p,q) ((p)<(q)) +#define pLsrPrimGHCLeAddrzh(p,q) ((p)<=(q)) +#define pLsrPrimCastToAddrzh(p) ((void*)(p)) +#define pLsrPrimCastFromAddrzh(p) ((PlsrPAny*)(p)) +/* bit operations */ +#define pLsrPrimGHCPopCntzh(w) ((uintp)(__builtin_popcount((unsigned)(w)))) +#define pLsrPrimGHCPopCnt64zh(w) ((uintp)(__builtin_popcountll((uint64)(w)))) +/* byte array */ +#define pLsrPrimGHCByteArrayContentszh(p) ((void*)(&pLsrPArrayOElt((PlsrObjectB)(p), 0))) + +#ifdef P_USE_PILLAR +static void performMajorGC () { pgc_force_gc(); } +#endif // P_USE_PILLAR + +/*** Thread stuff ***/ + +typedef enum {IhrTsRunning, IhrTsCompleted, IhrTsKilled} IhrThreadStatus; + +typedef struct IhrThreadS { + PlsrVTable vtable; + PrtTaskHandle prtThread; + IhrThreadStatus status; + bool maskAsyncExn, uninteruptable; +} IhrThreadU; + +#define ihrThreadSize sizeof(IhrThreadU) +#define ihrThreadPadding \ + (sizeof(IhrThreadU) - sizeof(PlsrVTable) - sizeof(PrtTaskHandle) - sizeof(IhrThreadStatus) - 2*sizeof(bool)) +pLsrVTableStatic (ihrVTableThread, VNoneTag, "*haskell thread*", ihrThreadPadding); + +#ifdef P_USE_PILLAR +typedef ref IhrThreadId; +#define ihrThreadB(t) ((IhrThreadU*)(t)) +#else +typedef IhrThreadU* IhrThreadId; +#define ihrThreadB(t) (t) +#endif + +typedef struct IhrTlsS { + IhrThreadId thread; + //PilContinuation(PlsrObjectB) exnHandler; + PilContinuation0 exnHandler; + PlsrObjectB exn; +} IhrTlsU; + +typedef IhrTlsU* IhrTlsP; + +unsigned ihrTlsOffset; + +#define ihrPrtTlsToTls(tls) (*((IhrTlsP*)(((char*)(tls))+ihrTlsOffset))) + +static void __cdecl ihrTlsRse(PrtProvidedTlsHandle tls_, struct PrtRseInfo* rse) +{ + IhrTlsP tls = ihrPrtTlsToTls(tls_); + if (tls) { + rse->callback(rse->env, &tls->thread, PrtGcTagDefault, NULL); + rse->callback(rse->env, &tls->exn, PrtGcTagDefault, NULL); + } +} + +static void ihrTlsInit() +{ + ihrTlsOffset = ptkGetNextTlsOffset(sizeof(IhrTlsP)); + prtRegisterTlsEnumerator(ihrTlsRse); +} + +static IhrTlsP ihrTlsGet() +{ + IhrTlsP tls = ihrPrtTlsToTls(prtGetTls()); + return tls; +} + +static void ihrTlsSet(IhrTlsP tls) +{ + ihrPrtTlsToTls(prtGetTls()) = tls; +} + +/*** Exceptions ***/ + +//static inline PilContinuation(PlsrObjectB) ihrExceptionHandlerGet() +static inline PilContinuation0 ihrExceptionHandlerGet() +{ + return ihrTlsGet()->exnHandler; +} + +static inline PlsrObjectB ihrExceptionExnGet() +{ + PlsrObjectB exn = ihrTlsGet()->exn; + assert(exn); + return exn; +} + +//static inline void ihrExceptionHandlerSet(PilContinuation(PlsrObjectB) c) +static inline void ihrExceptionHandlerSet(PilContinuation0 c) +{ + ihrTlsGet()->exnHandler = c; +} + +static inline void ihrExceptionExnSet(PlsrObjectB c) +{ + ihrTlsGet()->exn = c; +} + +/*** Delay/wait operations ***/ + +static void ihrDelay(sintp d) +{ + uintp s; + if (d<0) return; + s = (d+999)/1000; + prtSleep(s); +} + +static void ihrWaitRead(sintp f) +{ + /* NG: The GHC runtime blocks the given thread until the given file descriptor + * is ready to be read from. Ultimately it uses select to test this. + * Since we don't multiplex Haskell level threads onto OS threads, it seems + * fine to just do nothing. The thread will then go ahead and try to read. + * XXX: If we start multiplexing Haskell threads, then we need to implement this + * properly. 
+ */ +} + +static void ihrWaitWrite(sintp f) +{ + /* NG: The GHC runtime blocks the given thread until the given file descriptor + * is ready to be written to. Ultimately it uses select to test this. + * Since we don't multiplex Haskell level threads onto OS threads, it seems + * fine to just do nothing. The thread will then go ahead and try to write. + * XXX: If we start multiplexing Haskell threads, then we need to implement this + * properly. + */ +} + +typedef struct IhrPairS { + PlsrVTable vtable; + sintp fst; + sintp snd; +}* IhrPair; + +#define ihrPairPadding \ + (sizeof(struct IhrPairS) - sizeof(PlsrVTable) - 2*sizeof(sintp)) +pLsrVTableStatic (ihrVTablePair, VNoneTag, "*IHR pair*", ihrPairPadding); +#define ihrPairSize (sizeof(struct IhrPairS)) + +#ifndef PLSR_LINUX + +/* NG: The GHC runtime blocks the given thread until the request read/write/action + * is done. Since we don't multiplex Haskell level threads onto OS threads, it + * seems fine to just do the operation and the return. + * XXX: If we start multiplexing Haskell threads, then we need to implement this + * properly. + */ + +/* XXX NG: These are non-threaded implementations. Threaded ones should barf. */ + +static IhrPair ihrAsyncRead(sintp f, sintp isSock, sintp num, void* buf) +{ + IhrPair res; + int len; + DWORD ec = 0; + + if (isSock) { + len = recv(f, buf, num, 0); + if (len==SOCKET_ERROR) + ec = WSAGetLastError(); + } else { + len = read(f, buf, num); + /* XXX NG: GHC does some special processing in certain circumstances, + * but I'm not sure we can do the same, so I'm ignoring that. + */ + if (len==-1) ec = errno; + } + + pLsrAlloc(IhrPair, res, &ihrVTablePair, ihrPairSize); + res->fst = len; + res->snd = ec; + return res; +} + +static IhrPair ihrAsyncWrite(sintp f, sintp isSock, sintp num, void* buf) +{ + IhrPair res; + int len; + DWORD ec = 0; + + if (isSock) { + len = send(f, buf, num,0); + if (len==SOCKET_ERROR) + ec = WSAGetLastError(); + } else { + len = write(f, buf, num); + /* XXX NG: GHC does some special processing in certain circumstances, + * but I'm not sure we can do the same, so I'm ignoring that. 
+ */ + if (len==-1) { + ec = errno; + if (ec==EINVAL && GetLastError()==ERROR_NO_DATA) ec = EPIPE; + } + } + + pLsrAlloc(IhrPair, res, &ihrVTablePair, ihrPairSize); + res->fst = len; + res->snd = ec; + return res; +} + +typedef sintp (*IhrProc)(void*); + +static sintp ihrAsyncDoProc(void* f_, void* a) +{ + IhrProc f = (IhrProc)f_; + if (f) + return f(a); + else + return 1; +} + +#endif // !PLSR_LINUX + +/*** Concurency ***/ + +#define ihrThreadMask(t) (ihrThreadB(t)->maskAsyncExn=1, ihrThreadB(t)->uninteruptable=1) +#define ihrThreadUnmask(t) (ihrThreadB(t)->maskAsyncExn=0, ihrThreadB(t)->uninteruptable=0) + +#define ihrThreadAllocInit(t, pt) \ + do { \ + pLsrAlloc(IhrThreadId, (t), &ihrVTableThread, ihrThreadSize); \ + ihrThreadB(t)->prtThread = (pt); \ + ihrThreadB(t)->status = IhrTsRunning; \ + ihrThreadMask(t); \ + } while(0) + +static void ihrThreadWrapper0(void (*f)()) +{ + IhrTlsU tls; + PlsrObjectB exn; + PilContinuation0 cv; + //PilContinuation(PlsrObjectB) cv; + pilContinuationLocal(cvl); + ihrThreadAllocInit(tls.thread, prtGetTaskHandle()); + pilContinuationMake(cv, cvl, c); + tls.exnHandler = c; + tls.exn = NULL; + ihrTlsSet(&tls); + ihrThreadUnmask(tls.thread); + f() pilCutToC(c); + ihrThreadMask(tls.thread); + ihrThreadB(tls.thread)->status = IhrTsCompleted; + return; + pilContinuation0(cvl, c) + //pilContinuation(cvl, c, exn) + ihrThreadB(tls.thread)->status = IhrTsKilled; + return; +} + +static void ihrThreadWrapper1(void (*f)(PlsrObjectB), PlsrObjectB a) +{ + IhrTlsU tls; + PlsrObjectB exn; + PilContinuation0 cv; + //PilContinuation(PlsrObjectB) cv; + pilContinuationLocal(cvl); + ihrThreadAllocInit(tls.thread, prtGetTaskHandle()); + pilContinuationMake(cv, cvl, c); + tls.exnHandler = c; + tls.exn = NULL; + ihrTlsSet(&tls); + ihrThreadUnmask(tls.thread); + f(a) pilCutToC(c); + ihrThreadMask(tls.thread); + ihrThreadB(tls.thread)->status = IhrTsCompleted; + return; + pilContinuation0(cvl, c) + //pilContinuation(cvl, c, exn) + ihrThreadB(tls.thread)->status = IhrTsKilled; + return; +} + +/* We make this a separate function because some of the thunk evals are macros, + * and we need to annotate the cuts below in ihrForkedThreadMain + */ +static void ihrForkedThreadRun(PlsrObjectB thnk) +{ + pLsrThunkEvalRef(thnk); +} + +static void ihrForkedThreadMain(IhrThreadId thrd, PlsrObjectB thnk) +{ + IhrTlsU tls; + PlsrObjectB exn; + PilContinuation0 cv; + //PilContinuation(PlsrObjectB) cv; + pilContinuationLocal(cvl); + ihrThreadB(thrd)->prtThread = prtGetTaskHandle(); + tls.thread = thrd; + pilContinuationMake(cv, cvl, c); + tls.exnHandler = c; + tls.exn = NULL; + ihrTlsSet(&tls); + ihrThreadUnmask(tls.thread); + ihrForkedThreadRun(thnk) pilCutToC(c); + ihrThreadMask(tls.thread); + ihrThreadB(tls.thread)->status = IhrTsCompleted; + return; + pilContinuation0(cvl, c) + //pilContinuation(cvl, c, exn) + ihrThreadB(tls.thread)->status = IhrTsKilled; + return; +} + +static void __cdecl ihrForkOnRse(PrtCodeAddress func, + void *arguments, + struct PrtRseInfo *rse, + PrtCimSpecificDataType opaque) { + rse->callback(rse->env,((void**)arguments)+1,PrtGcTagDefault,0); +} + +static IhrThreadId ihrForkOn(sintp p, PlsrObjectB thnk) +{ + IhrThreadId thrd; + ihrThreadAllocInit(thrd, NULL); + noyield { + void* args[2] = {thrd, NULL}; + // This write barrier promotes thnk to the public space. + pLsrWriteBarrierRef(args[1], thnk); + // This statement keeps thnk alive past the write barrier so that + // any GC caused by the write barrier doesn't eliminate it. 
+ args[1] = thnk; + prtPcall((PrtCodeAddress)ihrForkedThreadMain, args, 2, p, ihrForkOnRse); + } + return thrd; +} + +static IhrThreadId ihrFork(PlsrObjectB t) +{ + return ihrForkOn(PRT_NO_PROC_AFFINITY, t); +} + +static void ihrKillThread(IhrThreadId t, PlsrObjectB exn) +{ + pLsrRuntimeError("ihrKillThread unimplemented"); +} + +static void ihrYield() +{ + prtYield(); +} + +static IhrThreadId ihrMyThreadId() +{ + return ihrTlsGet()->thread; +} + +static void ihrLabelThread(IhrThreadId t, void* l) +{ + pLsrRuntimeError("ihrLabelThread unimplemented"); +} + +static sintp ihrIsCurrentThreadBound() +{ + return 0; +} + +static void ihrNoDuplicate() +{ + /* This GHC primitive ensures that no two threads are evaluating the same thunk at the same time. + * Since we currently ensure this all the time, this is a no operation. + * If this changes, then we need to implement this operation. + */ +} + +static sintp ihrThreadStatus(IhrThreadId t) +{ + //pLsrRuntimeError("ihrThreadStatus unimplemented"); + return 0; +} + +/*** Weak Pointers ***/ + +/* a no-op just to keep object alive */ +#define pLsrPrimGHCTouchzh(p) do {volatile ref pLsrPrimGHCTouchzh_tmp = (ref)(uintp)(p);} while (0) + +/* Run the Haskell finaliser - needs to wrap the thread for exceptions, etc. */ +void ihrRunHaskellFinaliser(PlsrObjectB f) +{ + ihrThreadWrapper1(pLsrWpoRunFinalizer, f); +} + +/*** Float and Double Encoding/Decoding ***/ + +typedef struct { + PlsrVTable vtable; + uintp tag; + sintp man_sign; + uintp man_high; + uintp man_low; + sintp exp; +} PlsrDecodedDoubleU; +typedef PlsrDecodedDoubleU *PlsrDecodedDoubleB; +#define pLsrDecodedDoublePadding \ + (sizeof(PlsrDecodedDoubleU) - sizeof(PlsrVTable) - sizeof(uintp) - sizeof(sintp) - 2*sizeof(uintp) - sizeof(sintp)) +pLsrVTableStatic (pLsrVTableDecodedDouble, VNoneTag, "*decoded double*", pLsrDecodedDoublePadding); +PlsrRef pLsrPrimGHCDecodeDouble2Intzh (double v) +{ + PlsrDecodedDoubleB d; + uintp h, l; + sintp s, e; + __decodeDouble_2Int (&s, &h, &l, &e, v); + pLsrAlloc (PlsrDecodedDoubleB, d, &pLsrVTableDecodedDouble, sizeof (PlsrDecodedDoubleU)); + d->tag = 0; + d->man_sign = s; + d->man_high = h; + d->man_low = l; + d->exp = e; + return (PlsrRef)d; +} + + +typedef struct { + PlsrVTable vtable; + uintp tag; + sintp man; + sintp exp; +} PlsrDecodedFloatU; +#define pLsrDecodedFloatPadding \ + (sizeof(PlsrDecodedFloatU) - sizeof(PlsrVTable) - sizeof(uintp) - 2*sizeof(sintp)) +typedef PlsrDecodedFloatU *PlsrDecodedFloatB; +pLsrVTableStatic (pLsrVTableDecodedFloat, VNoneTag, "*decoded float*", pLsrDecodedFloatPadding); +PlsrRef pLsrPrimGHCDecodeFloatzh (float v) +{ + PlsrDecodedFloatB f; + sintp m, e; + __decodeFloat_Int (&m, &e, v); + pLsrAlloc (PlsrDecodedFloatB, f, &pLsrVTableDecodedFloat, sizeof (PlsrDecodedFloatU)); + f->tag = 0; + f->man = m; + f->exp = e; + return (PlsrRef)f; +} +#define pLsrDecodedDoubleSize (sizeof(PlsrDecodedDoubleU)) +#define pLsrDecodedFloatSize (sizeof(PlsrDecodedFloatU)) + +/*** Initialisation ***/ + +static void pLsrPrimGHCRegisterVTables() +{ + static PgcIsRef ihrThreadRefs[ihrThreadSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef ihrPairRefs[ihrPairSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrDecodedDoubleRefs[pLsrDecodedDoubleSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrDecodedFloatRefs[pLsrDecodedFloatSize/P_WORD_SIZE] = { 0, }; + + pLsrVTableRegister(&ihrVTableThread, 4, ihrThreadSize, ihrThreadRefs, 0, 0, 0, PGC_ALWAYS_MUTABLE, 0); + pLsrVTableRegister(&ihrVTablePair, 4, ihrPairSize, ihrPairRefs, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 
0); + pLsrVTableRegister(&pLsrVTableDecodedDouble, 4, pLsrDecodedDoubleSize, pLsrDecodedDoubleRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(&pLsrVTableDecodedFloat, 4, pLsrDecodedFloatSize, pLsrDecodedFloatRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); +} + +static void ihrInit() +{ + pLsrPrimGHCRegisterVTables(); + ihrTlsInit(); + ihrGlobalInit(); +} + +#endif // P_USE_PILLAR + +#endif // !_PLSR_PRIMS_GHC_H_ diff --git a/runtime/include/hrc/plsr-prims-prims.h b/runtime/include/hrc/plsr-prims-prims.h new file mode 100644 index 0000000..71b3c51 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-prims.h @@ -0,0 +1,665 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_PRIMS_H_ +#define _PLSR_PRIMS_PRIMS_H_ + +/********************************************************************** + * Unboxed Machine Integers and Floating-Point + */ + +#define binArith(t, oper, dest, a, b) \ + ((dest) = ((t) ((t) a) oper ((t) b))) + +#define binArithPrefix(t, oper, dest, a, b) \ + ((dest) = ((t) (oper ((t) a, (t) b)))) + +#define unArith(t, oper, dest, a) \ + ((dest) = ((t) (oper ((t) a)))) + +#define binPred(t, oper, dest, a, b) \ + ((dest) = ((PlsrBoolean) (((t) a) oper ((t) b)))) + +#define pLsrPrimNumericConv(dest, tTo, a) \ + ((dest) = (tTo) (a)) + +/* SInt8 ops */ +#define pLsrPrimPSInt8Plus(dest, a, b) binArith(sint8, +, dest, a, b) +#define pLsrPrimPSInt8Minus(dest, a, b) binArith(sint8, -, dest, a, b) +#define pLsrPrimPSInt8Times(dest, a, b) binArith(sint8, *, dest, a, b) +#define pLsrPrimPSInt8DivT(dest, a, b) binArith(sint8, /, dest, a, b) +#define pLsrPrimPSInt8ModT(dest, a, b) binArith(sint8, %, dest, a, b) +#define pLsrPrimPSInt8Negate(dest, a) unArith(sint8, -, dest, a) + +#define pLsrPrimPSInt8EQ(dest, a, b) binPred(sint8, ==, dest, a, b) +#define pLsrPrimPSInt8NE(dest, a, b) binPred(sint8, !=, dest, a, b) +#define pLsrPrimPSInt8LT(dest, a, b) binPred(sint8, <, dest, a, b) +#define pLsrPrimPSInt8LE(dest, a, b) binPred(sint8, <=, dest, a, b) + +#define pLsrPrimPSInt8FromSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromRational(dest, a) pLsrSInt8FromRational(dest, a) +#define pLsrPrimPSInt8FromInteger(dest, a) pLsrSInt8FromInteger(dest, a) + +#define pLsrPrimPSInt8CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt16CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt32CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt64CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPUInt8CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPUInt16CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPUInt32CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define 
pLsrPrimPUInt64CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a)
+#define pLsrPrimPFloat32CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a)
+#define pLsrPrimPFloat64CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a)
+#define pLsrPrimPIntegerCastSInt8(dest, a) pLsrIntegerCastToSInt8(dest, a)
+
+/* SInt16 ops */
+#define pLsrPrimPSInt16Plus(dest, a, b) binArith(sint16, +, dest, a, b)
+#define pLsrPrimPSInt16Minus(dest, a, b) binArith(sint16, -, dest, a, b)
+#define pLsrPrimPSInt16Times(dest, a, b) binArith(sint16, *, dest, a, b)
+#define pLsrPrimPSInt16DivT(dest, a, b) binArith(sint16, /, dest, a, b)
+#define pLsrPrimPSInt16ModT(dest, a, b) binArith(sint16, %, dest, a, b)
+#define pLsrPrimPSInt16Negate(dest, a) unArith(sint16, -, dest, a)
+
+#define pLsrPrimPSInt16EQ(dest, a, b) binPred(sint16, ==, dest, a, b)
+#define pLsrPrimPSInt16NE(dest, a, b) binPred(sint16, !=, dest, a, b)
+#define pLsrPrimPSInt16LT(dest, a, b) binPred(sint16, <, dest, a, b)
+#define pLsrPrimPSInt16LE(dest, a, b) binPred(sint16, <=, dest, a, b)
+
+#define pLsrPrimPSInt16FromSInt8(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromRational(dest, a) pLsrSInt16FromRational(dest, a)
+#define pLsrPrimPSInt16FromInteger(dest, a) pLsrSInt16FromInteger(dest, a)
+
+#define pLsrPrimPSInt8CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt32CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt64CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt8CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt16CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt32CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt64CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPFloat32CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPFloat64CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPIntegerCastSInt16(dest, a) pLsrIntegerCastToSInt16(dest, a)
+
+/* SInt32 ops */
+#define pLsrPrimPSInt32Plus(dest, a, b) binArith(sint32, +, dest, a, b)
+#define pLsrPrimPSInt32Minus(dest, a, b) binArith(sint32, -, dest, a, b)
+#define pLsrPrimPSInt32Times(dest, a, b) binArith(sint32, *, dest, a, b)
+#define pLsrPrimPSInt32DivT(dest, a, b) binArith(sint32, /, dest, a, b)
+#define pLsrPrimPSInt32ModT(dest, a, b) binArith(sint32, %, dest, a, b)
+#define pLsrPrimPSInt32DivModT(dest1, dest2, a, b) pLsrSInt32DivModT(dest1, dest2, a, b)
+#define pLsrPrimPSInt32DivModE(dest1, dest2, a, b) pLsrSInt32DivModE(dest1, dest2, a, b)
+#define pLsrPrimPSInt32DivModF(dest1, dest2, a, 
b) +#define pLsrPrimPSInt32Negate(dest, a) unArith(sint32, -, dest, a) + +#define pLsrSInt32DivModT(dest1, dest2, a, b) \ + do { \ + div_t q = div (a, b); \ + dest1 = q.quot; \ + dest2 = q.rem; \ + } while (0) + +#define pLsrPrimPSInt32EQ(dest, a, b) binPred(sint32, ==, dest, a, b) +#define pLsrPrimPSInt32NE(dest, a, b) binPred(sint32, !=, dest, a, b) +#define pLsrPrimPSInt32LT(dest, a, b) binPred(sint32, <, dest, a, b) +#define pLsrPrimPSInt32LE(dest, a, b) binPred(sint32, <=, dest, a, b) + +#define pLsrPrimPSInt32FromSInt8(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromRational(dest, a) pLsrSInt32FromRational(dest, a) +#define pLsrPrimPSInt32FromInteger(dest, a) pLsrSInt32FromInteger(dest, a) + +#define pLsrPrimPSInt8CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt16CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt64CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt8CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt16CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt32CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt64CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPFloat32CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPFloat64CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPIntegerCastSInt32(dest, a) pLsrIntegerCastToSInt32(dest, a) + +/* SInt64 ops */ +#define pLsrPrimPSInt64Plus(dest, a, b) binArith(sint64, +, dest, a, b) +#define pLsrPrimPSInt64Minus(dest, a, b) binArith(sint64, -, dest, a, b) +#define pLsrPrimPSInt64Times(dest, a, b) binArith(sint64, *, dest, a, b) +#define pLsrPrimPSInt64DivT(dest, a, b) binArith(sint64, /, dest, a, b) +#define pLsrPrimPSInt64ModT(dest, a, b) binArith(sint64, %, dest, a, b) +#define pLsrPrimPSInt64DivModT(dest1, dest2, a, b) pLsrSInt64DivModT(dest1, dest2, a, b) +#define pLsrPrimPSInt64DivModE(dest1, dest2, a, b) pLsrSInt64DivModE(dest1, dest2, a, b) +#define pLsrPrimPSInt64DivModF(dest1, dest2, a, b) pLsrSInt64DivModF(dest1, dest2, a, b) +#define pLsrPrimPSInt64Negate(dest, a) unArith(sint64, -, dest, a) + +#define pLsrSInt64DivModT(dest1, dest2, a, b) \ + do { \ + lldiv_t q = lldiv (a, b); \ + dest1 = q.quot; \ + dest2 = q.rem; \ + } while (0) + +#define pLsrPrimPSInt64EQ(dest, a, b) binPred(sint64, ==, dest, a, b) +#define pLsrPrimPSInt64NE(dest, a, b) binPred(sint64, !=, dest, a, b) +#define pLsrPrimPSInt64LT(dest, a, b) binPred(sint64, <, dest, a, b) +#define pLsrPrimPSInt64LE(dest, a, b) binPred(sint64, <=, dest, a, b) + +#define pLsrPrimPSInt64FromSInt8(dest, a) 
pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromRational(dest, a) pLsrSInt64FromRational(dest, a) +#define pLsrPrimPSInt64FromInteger(dest, a) pLsrSInt64FromInteger(dest, a) + +#define pLsrPrimPSInt8CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt16CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt32CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt8CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt16CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt32CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt64CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPFloat32CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPFloat64CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPIntegerCastSInt64(dest, a) pLsrIntegerCastToSInt64(dest, a) + +/* UInt8 ops */ +#define pLsrPrimPUInt8Plus(dest, a, b) binArith(uint8, +, dest, a, b) +#define pLsrPrimPUInt8Minus(dest, a, b) binArith(uint8, -, dest, a, b) +#define pLsrPrimPUInt8Times(dest, a, b) binArith(uint8, *, dest, a, b) +#define pLsrPrimPUInt8DivT(dest, a, b) binArith(uint8, /, dest, a, b) +#define pLsrPrimPUInt8ModT(dest, a, b) binArith(uint8, %, dest, a, b) +#define pLsrPrimPUInt8Negate(dest, a) unArith(uint8, -, dest, a) + +#define pLsrPrimPUInt8EQ(dest, a, b) binPred(uint8, ==, dest, a, b) +#define pLsrPrimPUInt8NE(dest, a, b) binPred(uint8, !=, dest, a, b) +#define pLsrPrimPUInt8LT(dest, a, b) binPred(uint8, <, dest, a, b) +#define pLsrPrimPUInt8LE(dest, a, b) binPred(uint8, <=, dest, a, b) + +#define pLsrPrimPUInt8FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromRational(dest, a) pLsrUInt8FromRational(dest, a) +#define pLsrPrimPUInt8FromInteger(dest, a) pLsrUInt8FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) 
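+/* Illustrative expansion (d and x are hypothetical variables): the FromT and
+ * CastT conversion wrappers in this file reduce to pLsrPrimNumericConv, i.e.
+ * a plain (possibly truncating) C cast, e.g.
+ *   pLsrPrimPUInt8FromSInt32(d, x)  =>  ((d) = (uint8) (x))
+ * Only the Rational and Integer variants dispatch to separate pLsrRational /
+ * pLsrInteger helpers rather than a C cast. */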
+#define pLsrPrimPSInt16CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPSInt32CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPSInt64CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt16CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt32CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt64CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPFloat32CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPFloat64CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPIntegerCastUInt8(dest, a) pLsrIntegerCastToUInt8(dest, a) + +/* UInt16 ops */ +#define pLsrPrimPUInt16Plus(dest, a, b) binArith(uint16, +, dest, a, b) +#define pLsrPrimPUInt16Minus(dest, a, b) binArith(uint16, -, dest, a, b) +#define pLsrPrimPUInt16Times(dest, a, b) binArith(uint16, *, dest, a, b) +#define pLsrPrimPUInt16DivT(dest, a, b) binArith(uint16, /, dest, a, b) +#define pLsrPrimPUInt16ModT(dest, a, b) binArith(uint16, %, dest, a, b) +#define pLsrPrimPUInt16Negate(dest, a) unArith(uint16, -, dest, a) + +#define pLsrPrimPUInt16EQ(dest, a, b) binPred(uint16, ==, dest, a, b) +#define pLsrPrimPUInt16NE(dest, a, b) binPred(uint16, !=, dest, a, b) +#define pLsrPrimPUInt16LT(dest, a, b) binPred(uint16, <, dest, a, b) +#define pLsrPrimPUInt16LE(dest, a, b) binPred(uint16, <=, dest, a, b) + +#define pLsrPrimPUInt16FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromRational(dest, a) pLsrUInt16FromRational(dest, a) +#define pLsrPrimPUInt16FromInteger(dest, a) pLsrUInt16FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPSInt16CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPSInt32CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPSInt64CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt8CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt32CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt64CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPFloat32CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPFloat64CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPIntegerCastUInt16(dest, a) pLsrIntegerCastToUInt16(dest, a) + +/* UInt32 ops */ +#define pLsrPrimPUInt32Plus(dest, a, b) binArith(uint32, +, dest, a, b) +#define pLsrPrimPUInt32Minus(dest, a, b) binArith(uint32, -, dest, a, b) 
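+/* Illustrative expansion (d, x and y are hypothetical variables): the
+ * arithmetic and comparison wrappers are thin macros over C operators,
+ * via binArith/binPred from the top of this header, e.g.
+ *   pLsrPrimPUInt16Plus(d, x, y) => ((d) = ((uint16) ((uint16) x) + ((uint16) y)))
+ *   pLsrPrimPUInt16LE(d, x, y)   => ((d) = ((PlsrBoolean) (((uint16) x) <= ((uint16) y)))) */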
+#define pLsrPrimPUInt32Times(dest, a, b) binArith(uint32, *, dest, a, b) +#define pLsrPrimPUInt32DivT(dest, a, b) binArith(uint32, /, dest, a, b) +#define pLsrPrimPUInt32ModT(dest, a, b) binArith(uint32, %, dest, a, b) +#define pLsrPrimPUInt32DivModT(dest1, dest2, a, b) pLsrUInt32DivModT(dest1, dest2, a, b) +#define pLsrPrimPUInt32DivModE(dest1, dest2, a, b) pLsrUInt32DivModE(dest1, dest2, a, b) +#define pLsrPrimPUInt32DivModF(dest1, dest2, a, b) pLsrUInt32DivModF(dest1, dest2, a, b) +#define pLsrPrimPUInt32Negate(dest, a) unArith(uint32, -, dest, a) +#define pLsrPrimPUInt32Max(dest, a, b) binArithPrefix(uint32, max, dest, a, b) +#define pLsrPrimPUInt32Min(dest, a, b) binArithPrefix(uint32, min, dest, a, b) + +#define pLsrUInt32DivModT(dest1, dest2, a, b) \ + do { \ + dest1 = (a)/(b); \ + dest2 = (a)%(b); \ + } while (0) + +#define pLsrPrimPUInt32EQ(dest, a, b) binPred(uint32, ==, dest, a, b) +#define pLsrPrimPUInt32NE(dest, a, b) binPred(uint32, !=, dest, a, b) +#define pLsrPrimPUInt32LT(dest, a, b) binPred(uint32, <, dest, a, b) +#define pLsrPrimPUInt32LE(dest, a, b) binPred(uint32, <=, dest, a, b) + +#define pLsrPrimPUInt32FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromRational(dest, a) pLsrUInt32FromRational(dest, a) +#define pLsrPrimPUInt32FromInteger(dest, a) pLsrUInt32FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPSInt16CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPSInt32CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPSInt64CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt8CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt16CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt64CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPFloat32CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPFloat64CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPIntegerCastUInt32(dest, a) pLsrIntegerCastToUInt32(dest, a) + +#define pLsrPrimPUInt32BNot(dest, b) ((dest) = ~(b)) +#define pLsrPrimPUInt32BOr(dest, a, b) binArith(uint32, |, dest, a, b) +#define pLsrPrimPSInt32BOr(dest, a, b) binArith(sint32, |, dest, a, b) +#define pLsrPrimPUInt32BAnd(dest, a, b) binArith(uint32, &, dest, a, b) +#define pLsrPrimPSInt32BAnd(dest, a, b) binArith(sint32, &, dest, a, b) +#define pLsrPrimPUInt32BXor(dest, a, b) binArith(uint32, ^, dest, a, b) +#define pLsrPrimPSInt32BXor(dest, a, b) binArith(sint32, ^, dest, a, b) +#define pLsrPrimPSInt32BShiftL(dest, a, b) binArith(sint32, <<, dest, a, b) +/*Note: 
implementation defined on negative values */ +#define pLsrPrimPSInt32BShiftR(dest, a, b) binArith(sint32, >>, dest, a, b) +#define pLsrPrimPUInt32BShiftL(dest, a, b) binArith(uint32, <<, dest, a, b) +#define pLsrPrimPSInt32BShiftR(dest, a, b) binArith(sint32, >>, dest, a, b) +#define pLsrPrimPUInt32BShiftR(dest, a, b) binArith(uint32, >>, dest, a, b) + +/* UInt64 ops */ +#define pLsrPrimPUInt64Plus(dest, a, b) binArith(uint64, +, dest, a, b) +#define pLsrPrimPUInt64Minus(dest, a, b) binArith(uint64, -, dest, a, b) +#define pLsrPrimPUInt64Times(dest, a, b) binArith(uint64, *, dest, a, b) +#define pLsrPrimPUInt64DivT(dest, a, b) binArith(uint64, /, dest, a, b) +#define pLsrPrimPUInt64ModT(dest, a, b) binArith(uint64, %, dest, a, b) +#define pLsrPrimPUInt64Negate(dest, a) unArith(uint64, -, dest, a) + +#define pLsrPrimPUInt64EQ(dest, a, b) binPred(uint64, ==, dest, a, b) +#define pLsrPrimPUInt64NE(dest, a, b) binPred(uint64, !=, dest, a, b) +#define pLsrPrimPUInt64LT(dest, a, b) binPred(uint64, <, dest, a, b) +#define pLsrPrimPUInt64LE(dest, a, b) binPred(uint64, <=, dest, a, b) + +#define pLsrPrimPUInt64FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromRational(dest, a) pLsrUInt64FromRational(dest, a) +#define pLsrPrimPUInt64FromInteger(dest, a) pLsrUInt64FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPSInt16CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPSInt32CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPSInt64CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt8CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt16CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt32CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPFloat32CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPFloat64CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPIntegerCastUInt64(dest, a) pLsrIntegerCastToUInt64(dest, a) + +#define pLsrPrimPUInt64BNot(dest, b) ((dest) = ~(b)) +#define pLsrPrimPUInt64BOr(dest, a, b) binArith(uint64, |, dest, a, b) +#define pLsrPrimPSInt64BOr(dest, a, b) binArith(sint64, |, dest, a, b) +#define pLsrPrimPUInt64BAnd(dest, a, b) binArith(uint64, &, dest, a, b) +#define pLsrPrimPSInt64BAnd(dest, a, b) binArith(sint64, &, dest, a, b) +#define pLsrPrimPUInt64BXor(dest, a, b) binArith(uint64, ^, dest, a, b) +#define pLsrPrimPSInt64BXor(dest, a, b) binArith(sint64, ^, dest, a, b) +#define pLsrPrimPSInt64BShiftL(dest, a, b) binArith(sint64, <<, dest, a, b) +/*Note: implementation defined on 
negative values */ +#define pLsrPrimPSInt64BShiftR(dest, a, b) binArith(sint64, >>, dest, a, b) +#define pLsrPrimPUInt64BShiftL(dest, a, b) binArith(uint64, <<, dest, a, b) +#define pLsrPrimPSInt64BShiftR(dest, a, b) binArith(sint64, >>, dest, a, b) +#define pLsrPrimPUInt64BShiftR(dest, a, b) binArith(uint64, >>, dest, a, b) + +/* Float32 ops */ +#define pLsrPrimPFloat32Plus(dest, a, b) binArith(float32, +, dest, a, b) +#define pLsrPrimPFloat32Minus(dest, a, b) binArith(float32, -, dest, a, b) +#define pLsrPrimPFloat32Times(dest, a, b) binArith(float32, *, dest, a, b) +#define pLsrPrimPFloat32Divide(dest, a, b) binArith(float32, /, dest, a, b) +#if (defined(__pillar__) && !defined(__pillar2c__)) +#warning "Using double precision math for floats to work around pillar bug" +#define pLsrPrimPFloat32Max(dest, a, b) binArithPrefix(float32, fmax, dest, a, b) +#define pLsrPrimPFloat32Min(dest, a, b) binArithPrefix(float32, fmin, dest, a, b) +#define pLsrPrimPFloat32ModT(dest, a, b) binArithPrefix(float32, fmod, dest, a, b) +#else +#define pLsrPrimPFloat32Max(dest, a, b) binArithPrefix(float32, fmaxf, dest, a, b) +#define pLsrPrimPFloat32Min(dest, a, b) binArithPrefix(float32, fminf, dest, a, b) +#define pLsrPrimPFloat32ModT(dest, a, b) binArithPrefix(float32, fmodf, dest, a, b) +#endif +#define pLsrPrimPFloat32Negate(dest, a) unArith(float32, -, dest, a) + +#define pLsrPrimPFloat32EQ(dest, a, b) binPred(float32, ==, dest, a, b) +#define pLsrPrimPFloat32NE(dest, a, b) binPred(float32, !=, dest, a, b) +#define pLsrPrimPFloat32LT(dest, a, b) binPred(float32, <, dest, a, b) +#define pLsrPrimPFloat32LE(dest, a, b) binPred(float32, <=, dest, a, b) + +#define pLsrPrimPFloat32FromSInt8(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromSInt16(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromSInt32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromSInt64(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt8(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt16(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt64(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromFloat64(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromRational(dest, a) pLsrFloat32FromRational(dest, a) +#define pLsrPrimPFloat32FromInteger(dest, a) pLsrFloat32FromInteger(dest, a) + +#define pLsrPrimPSInt8CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPSInt16CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPSInt32CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPSInt64CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt8CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt16CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt32CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt64CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat64CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPIntegerCastFloat32(dest, a) 
pLsrIntegerCastToFloat32(dest, a) + +/* XXX NG: need to find a header file for this */ +float64 __cdecl trunc(float64); + +#if (defined(__pillar__) && !defined(__pillar2c__)) +#warning "Using double precision math for floats to work around pillar bug" + +#define pLsrPrimPFloat32ACos(dest, a) ((dest) = (float32)acos((float32)(a))) +#define pLsrPrimPFloat32ASin(dest, a) ((dest) = (float32)asin((float32)(a))) +#define pLsrPrimPFloat32ATan(dest, a) ((dest) = (float32)atan((float32)(a))) +#define pLsrPrimPFloat32TanH(dest, a) ((dest) = (float32)tanh((float32)(a))) +#define pLsrPrimPFloat32CosH(dest, a) ((dest) = (float32)cosh((float32)(a))) +#define pLsrPrimPFloat32SinH(dest, a) ((dest) = (float32)sinh((float32)(a))) +#define pLsrPrimPFloat32Ceil(dest, a) ((dest) = (float32)ceil((float32)(a))) +#define pLsrPrimPFloat32Cos(dest, a) ((dest) = (float32)cos((float32)(a))) +#define pLsrPrimPFloat32Exp(dest, a) ((dest) = (float32)exp((float32)(a))) +#define pLsrPrimPFloat32Floor(dest, a) ((dest) = (float32)floor((float32)(a))) +#define pLsrPrimPFloat32Ln(dest, a) ((dest) = (float32)log((float32)(a))) +#define pLsrPrimPFloat32Rcp(dest, a) ((dest) = (((float32)1.0)/((float32) a))) +#define pLsrPrimPFloat32Sin(dest, a) ((dest) = (float32)sin((float32)(a))) +#define pLsrPrimPFloat32Sqrt(dest, a) ((dest) = (float32)sqrt((float32)(a))) +#define pLsrPrimPFloat32Tan(dest, a) ((dest) = (float32)tan((float32)(a))) +#define pLsrPrimPFloat32Trunc(dest, a) ((dest) = (float32)trunc((float32)(a))) +#define pLsrPrimPFloat32Pow(dest, a, b) ((dest) = (float32)pow((float32)(a), (float32)(b))) +#else +#define pLsrPrimPFloat32ACos(dest, a) ((dest) = acosf((float32)(a))) +#define pLsrPrimPFloat32ATan(dest, a) ((dest) = atanf((float32)(a))) +#define pLsrPrimPFloat32ASin(dest, a) ((dest) = asinf((float32)(a))) +#define pLsrPrimPFloat32TanH(dest, a) ((dest) = tanhf((float32)(a))) +#define pLsrPrimPFloat32CosH(dest, a) ((dest) = coshf((float32)(a))) +#define pLsrPrimPFloat32SinH(dest, a) ((dest) = sinhf((float32)(a))) +#define pLsrPrimPFloat32Ceil(dest, a) ((dest) = ceilf((float32)(a))) +#define pLsrPrimPFloat32Cos(dest, a) ((dest) = cosf((float32)(a))) +#define pLsrPrimPFloat32Exp(dest, a) ((dest) = expf((float32)(a))) +#define pLsrPrimPFloat32Floor(dest, a) ((dest) = floorf((float32)(a))) +#define pLsrPrimPFloat32Ln(dest, a) ((dest) = logf((float32)(a))) +#define pLsrPrimPFloat32Rcp(dest, a) ((dest) = (((float32)1.0)/((float32) a))) +#define pLsrPrimPFloat32Sin(dest, a) ((dest) = sinf((float32)(a))) +#define pLsrPrimPFloat32Sqrt(dest, a) ((dest) = sqrtf((float32)(a))) +#define pLsrPrimPFloat32Tan(dest, a) ((dest) = tanf((float32)(a))) +#define pLsrPrimPFloat32Trunc(dest, a) ((dest) = truncf((float32)(a))) +#define pLsrPrimPFloat32Pow(dest, a, b) ((dest) = powf((float32)(a), (float32)(b))) +#endif + +/* Float64 ops */ +#define pLsrPrimPFloat64Plus(dest, a, b) binArith(float64, +, dest, a, b) +#define pLsrPrimPFloat64Minus(dest, a, b) binArith(float64, -, dest, a, b) +#define pLsrPrimPFloat64Times(dest, a, b) binArith(float64, *, dest, a, b) +#define pLsrPrimPFloat64Divide(dest, a, b) binArith(float64, /, dest, a, b) +#define pLsrPrimPFloat64Max(dest, a, b) binArithPrefix(float64, fmax, dest, a, b) +#define pLsrPrimPFloat64Min(dest, a, b) binArithPrefix(float64, fmin, dest, a, b) +#define pLsrPrimPFloat64ModT(dest, a, b) binArithPrefix(float64, fmod, dest, a, b) +#define pLsrPrimPFloat64Negate(dest, a) unArith(float64, -, dest, a) + +#define pLsrPrimPFloat64EQ(dest, a, b) binPred(float64, ==, dest, a, b) +#define 
pLsrPrimPFloat64NE(dest, a, b) binPred(float64, !=, dest, a, b) +#define pLsrPrimPFloat64LT(dest, a, b) binPred(float64, <, dest, a, b) +#define pLsrPrimPFloat64LE(dest, a, b) binPred(float64, <=, dest, a, b) + +#define pLsrPrimPFloat64FromSInt8(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromSInt16(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromSInt32(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromSInt64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt8(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt16(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt32(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromFloat32(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromRational(dest, a) pLsrFloat64FromRational(dest, a) +#define pLsrPrimPFloat64FromInteger(dest, a) pLsrFloat64FromInteger(dest, a) + +#define pLsrPrimPSInt8CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPSInt16CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPSInt32CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPSInt64CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt8CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt16CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt32CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt64CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat32CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPIntegerCastFloat64(dest, a) pLsrIntegerCastToFloat64(dest, a) + +#define pLsrPrimPFloat64ACos(dest, a) ((dest) = acos((float64)(a))) +#define pLsrPrimPFloat64ATan(dest, a) ((dest) = atan((float64)(a))) +#define pLsrPrimPFloat64ASin(dest, a) ((dest) = asin((float64)(a))) +#define pLsrPrimPFloat64TanH(dest, a) ((dest) = tanh((float64)(a))) +#define pLsrPrimPFloat64CosH(dest, a) ((dest) = cosh((float64)(a))) +#define pLsrPrimPFloat64SinH(dest, a) ((dest) = sinh((float64)(a))) +#define pLsrPrimPFloat64Ceil(dest, a) ((dest) = ceil((float64)(a))) +#define pLsrPrimPFloat64Cos(dest, a) ((dest) = cos((float64)(a))) +#define pLsrPrimPFloat64Floor(dest, a) ((dest) = floor((float64)(a))) +#define pLsrPrimPFloat64Exp(dest, a) ((dest) = exp((float64)(a))) +#define pLsrPrimPFloat64Ln(dest, a) ((dest) = log((float64)(a))) +#define pLsrPrimPFloat64Rcp(dest, a) ((dest) = (((float64)1.0)/((float64) a))) +#define pLsrPrimPFloat64Sin(dest, a) ((dest) = sin((float64)(a))) +#define pLsrPrimPFloat64Sqrt(dest, a) ((dest) = sqrt((float64)(a))) +#define pLsrPrimPFloat64Tan(dest, a) ((dest) = tan((float64)(a))) +#define pLsrPrimPFloat64Trunc(dest, a) ((dest) = trunc((float64)(a))) +#define pLsrPrimPFloat64Pow(dest, a, b) ((dest) = pow((float64)(a), (float64)(b))) + +/* Arbitrary precision integer ops */ +#define pLsrPrimPIntegerPlus(dest, a, b) pLsrIntegerPlus(dest, a, b) +#define pLsrPrimPIntegerMinus(dest, a, b) pLsrIntegerMinus(dest, a, b) +#define pLsrPrimPIntegerTimes(dest, a, b) pLsrIntegerTimes(dest, 
a, b) +#define pLsrPrimPIntegerDivT(dest, a, b) pLsrIntegerDivT(dest, a, b) +#define pLsrPrimPIntegerDivE(dest, a, b) pLsrIntegerDivE(dest, a, b) +#define pLsrPrimPIntegerDivF(dest, a, b) pLsrIntegerDivF(dest, a, b) +#define pLsrPrimPIntegerModT(dest, a, b) pLsrIntegerModT(dest, a, b) +#define pLsrPrimPIntegerModE(dest, a, b) pLsrIntegerModE(dest, a, b) +#define pLsrPrimPIntegerModF(dest, a, b) pLsrIntegerModF(dest, a, b) +#define pLsrPrimPIntegerDivModT(dest1, dest2, a, b) pLsrIntegerDivModT(dest1, dest2, a, b) +#define pLsrPrimPIntegerDivModE(dest1, dest2, a, b) pLsrIntegerDivModE(dest1, dest2, a, b) +#define pLsrPrimPIntegerDivModF(dest1, dest2, a, b) pLsrIntegerDivModF(dest1, dest2, a, b) +#define pLsrPrimPIntegerNegate(dest, a) pLsrIntegerNegate(dest, a) + +#define pLsrPrimPIntegerEQ(dest, a, b) pLsrIntegerEQ(dest, a, b) +#define pLsrPrimPIntegerNE(dest, a, b) pLsrIntegerNE(dest, a, b) +#define pLsrPrimPIntegerLT(dest, a, b) pLsrIntegerLT(dest, a, b) +#define pLsrPrimPIntegerLE(dest, a, b) pLsrIntegerLE(dest, a, b) + +#define pLsrPrimPIntegerFromSInt8(dest, a) pLsrIntegerFromSInt8(dest, a) +#define pLsrPrimPIntegerFromSInt16(dest, a) pLsrIntegerFromSInt16(dest, a) +#define pLsrPrimPIntegerFromSInt32(dest, a) pLsrIntegerFromSInt32(dest, a) +#define pLsrPrimPIntegerFromSInt64(dest, a) pLsrIntegerFromSInt64(dest, a) +#define pLsrPrimPIntegerFromUInt8(dest, a) pLsrIntegerFromUInt8(dest, a) +#define pLsrPrimPIntegerFromUInt16(dest, a) pLsrIntegerFromUInt16(dest, a) +#define pLsrPrimPIntegerFromUInt32(dest, a) pLsrIntegerFromUInt32(dest, a) +#define pLsrPrimPIntegerFromUInt64(dest, a) pLsrIntegerFromUInt64(dest, a) +#define pLsrPrimPIntegerFromFloat32(dest, a) pLsrIntegerFromFloat32(dest, a) +#define pLsrPrimPIntegerFromFloat64(dest, a) pLsrIntegerFromFloat64(dest, a) +#define pLsrPrimPIntegerFromRational(dest, a) pLsrIntegerFromRational(dest, a) +#define pLsrPrimPIntegerFromInteger(dest, a) pLsrIntegerFromInteger(dest, a) + +#define pLsrPrimPSInt8CastInteger(dest, a) pLsrIntegerFromSInt8(dest, a) +#define pLsrPrimPSInt16CastInteger(dest, a) pLsrIntegerFromSInt16(dest, a) +#define pLsrPrimPSInt32CastInteger(dest, a) pLsrIntegerFromSInt32(dest, a) +#define pLsrPrimPSInt64CastInteger(dest, a) pLsrIntegerFromSInt64(dest, a) +#define pLsrPrimPUInt8CastInteger(dest, a) pLsrIntegerFromUInt8(dest, a) +#define pLsrPrimPUInt16CastInteger(dest, a) pLsrIntegerFromUInt16(dest, a) +#define pLsrPrimPUInt32CastInteger(dest, a) pLsrIntegerFromUInt32(dest, a) +#define pLsrPrimPUInt64CastInteger(dest, a) pLsrIntegerFromUInt64(dest, a) +#define pLsrPrimPIntegerCastInteger(dest, a) pLsrIntegerFromInteger(dest, a) + +#define pLsrPrimPIntegerBNot(dest, a) pLsrIntegerBNot(dest, a) +#define pLsrPrimPIntegerBAnd(dest, a, b) pLsrIntegerBAnd(dest, a, b) +#define pLsrPrimPIntegerBOr(dest, a, b) pLsrIntegerBOr(dest, a, b) +#define pLsrPrimPIntegerBShiftL(dest, a, b) pLsrIntegerBShiftL(dest, a, b) +#define pLsrPrimPIntegerBShiftR(dest, a, b) pLsrIntegerBShiftR(dest, a, b) +#define pLsrPrimPIntegerBXor(dest, a, b) pLsrIntegerBXor(dest, a, b) + +/* Rational ops */ +#define pLsrPrimPRationalPlus(dest, a, b) pLsrRationalPlus(dest, a, b) +#define pLsrPrimPRationalMinus(dest, a, b) pLsrRationalMinus(dest, a, b) +#define pLsrPrimPRationalTimes(dest, a, b) pLsrRationalTimes(dest, a, b) +#define pLsrPrimPRationalDivT(dest, a, b) pLsrRationalDivT(dest, a, b) +#define pLsrPrimPRationalDivE(dest, a, b) pLsrRationalDivE(dest, a, b) +#define pLsrPrimPRationalDivF(dest, a, b) pLsrRationalDivF(dest, a, b) +#define 
pLsrPrimPRationalModT(dest, a, b) pLsrRationalModT(dest, a, b) +#define pLsrPrimPRationalModE(dest, a, b) pLsrRationalModE(dest, a, b) +#define pLsrPrimPRationalModF(dest, a, b) pLsrRationalModF(dest, a, b) +#define pLsrPrimPRationalDivModT(dest1, dest2, a, b) pLsrRationalDivModT(dest1, dest2, a, b) +#define pLsrPrimPRationalDivModE(dest1, dest2, a, b) pLsrRationalDivModE(dest1, dest2, a, b) +#define pLsrPrimPRationalDivModF(dest1, dest2, a, b) pLsrRationalDivModF(dest1, dest2, a, b) +#define pLsrPrimPRationalDivide(dest, a) pLsrRationalDivide(dest, a) +#define pLsrPrimPRationalNegate(dest, a) pLsrRationalNegate(dest, a) + +#define pLsrPrimPRationalEQ(dest, a, b) pLsrRationalEQ(dest, a, b) +#define pLsrPrimPRationalNE(dest, a, b) pLsrRationalNE(dest, a, b) +#define pLsrPrimPRationalLT(dest, a, b) pLsrRationalLT(dest, a, b) +#define pLsrPrimPRationalLE(dest, a, b) pLsrRationalLE(dest, a, b) + +#define pLsrPrimPRationalFromSInt8(dest, a) pLsrRationalFromSInt8(dest, a) +#define pLsrPrimPRationalFromSInt16(dest, a) pLsrRationalFromSInt16(dest, a) +#define pLsrPrimPRationalFromSInt32(dest, a) pLsrRationalFromSInt32(dest, a) +#define pLsrPrimPRationalFromSInt64(dest, a) pLsrRationalFromSInt64(dest, a) +#define pLsrPrimPRationalFromUInt8(dest, a) pLsrRationalFromUInt8(dest, a) +#define pLsrPrimPRationalFromUInt16(dest, a) pLsrRationalFromUInt16(dest, a) +#define pLsrPrimPRationalFromUInt32(dest, a) pLsrRationalFromUInt32(dest, a) +#define pLsrPrimPRationalFromUInt64(dest, a) pLsrRationalFromUInt64(dest, a) +#define pLsrPrimPRationalFromFloat32(dest, a) pLsrRationalFromFloat32(dest, a) +#define pLsrPrimPRationalFromFloat64(dest, a) pLsrRationalFromFloat64(dest, a) +#define pLsrPrimPRationalFromRational(dest, a) pLsrRationalFromRational(dest, a) +#define pLsrPrimPRationalFromInteger(dest, a) pLsrRationalFromInteger(dest, a) + +/********************************************************************** + * Names + */ + +#define pLsrPrimPNameGetString(dest, n) (dest = pLsrPNameGetString(n)) +#define pLsrPrimPNameGetHash(dest, n) (dest = pLsrPNameGetHash(n)) + +/********************************************************************** + * CString + */ + +#define pLsrPrimPCStringAllocate(dest, len) \ + do { \ + dest = (char*) pLsrAllocC(len+1); \ + dest[len] = '\0'; \ + } while(0) + +#define pLsrPrimPCStringDeallocate(str) (pLsrFreeC(str)) +#define pLsrPrimPCStringGetLen(dest, str) ((dest) = strlen(str)) +#define pLsrPrimPCStringGetChar(dest, str, idx) ((dest) = (str)[idx]) +#define pLsrPrimPCStringSetChar(str, idx, c) ((str)[idx] = (c)) + +/********************************************************************** + * Pointer Equality + */ + +#define pLsrPrimPPtrEq(dest, p1, p2) (dest = ((void*)p1)==((void*)p2)) + +/********************************************************************** + * Conditional MOV + */ +#define pLsrPrimPCondMov(dest, b, u, v) ((dest) = ((b) ? 
(u) : (v))) + +/********************************************************************** + * Booleans + */ + +#define pLsrPrimPBooleanNot(dest, b0) ((dest) = !(b0)) +#define pLsrPrimPBooleanAnd(dest, b0, b1) ((dest) = (b0) && (b1)) +#define pLsrPrimPBooleanOr(dest, b0, b1) ((dest) = (b0) || (b1)) +#define pLsrPrimPBooleanXOr(dest, b0, b1) ((dest) = (b0) ^ (b1)) +#define pLsrPrimPBooleanEq(dest, b0, b1) ((dest) = (b0) == (b1)) + + +#endif /* _PLSR_PRIMS_PRIMS_H_ */ diff --git a/runtime/include/hrc/plsr-prims-runtime.h b/runtime/include/hrc/plsr-prims-runtime.h new file mode 100644 index 0000000..bc145ad --- /dev/null +++ b/runtime/include/hrc/plsr-prims-runtime.h @@ -0,0 +1,855 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_RUNTIME_H_ +#define _PLSR_PRIMS_RUNTIME_H_ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#ifdef PLSR_LINUX +#include +#endif + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif + +/********************************************************************** + * IO + */ + +/* IO Hacks */ + +#define BUF_SZ (1<<16) + +struct PlsrBufList { + char buf[BUF_SZ]; + struct PlsrBufList* next; +}; + +static char* pLsrReadLine() +{ + struct PlsrBufList* l = 0; + char* res; + uintp fullBufs = -1, len, cur, lastAmt, cont; + do { + struct PlsrBufList* p = l; + char* lastRead; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fgets(l->buf, BUF_SZ, stdin); + assert(lastRead); + lastAmt = strlen(l->buf); + cont = l->buf[lastAmt-1]!='\n'; + } while (cont); + len = fullBufs * (BUF_SZ-1) + lastAmt; + res = (char*)pLsrAllocC(len+1); + cur = len; + while(l) { + struct PlsrBufList* n = l->next; + lastAmt = strlen(l->buf); + cur -= lastAmt; + strncpy(res+cur, l->buf, lastAmt); + pLsrFreeC(l); + l = n; + } + assert(cur==0); + return res; +} + +static char* pLsrReadFile(char* fn) +{ + FILE* f = fopen(fn, "r"); + struct PlsrBufList* l = 0; + size_t lastRead; + uintp fullBufs = -1, len, cur; + char* res; + assert(f); + do { + struct PlsrBufList* p = l; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fread(l->buf, 1, BUF_SZ, f); + } while (lastRead == BUF_SZ); + cur = fullBufs * BUF_SZ; + len = cur + lastRead; + res = (char*)pLsrAllocC(len+1); + res[len] = '\0'; + while(l) { + struct PlsrBufList* n = l->next; + strncpy(res+cur, l->buf, lastRead); + pLsrFreeC(l); + l = n; + cur -= BUF_SZ; + lastRead = BUF_SZ; + } + fclose(f); + assert(cur==0); + return res; +} + +static void pLsrWriteFile(char* fn, uintp n, char* s) +{ + FILE* f = fopen(fn, "wb"); + assert(f); + fwrite(s, 1, n, f); + fclose(f); +} + +static char* pLsrPrimRReadAllStdin() +{ + struct PlsrBufList* l = 0; + size_t lastRead; + uintp fullBufs = -1, len, cur; + char* res; + FILE* f; + f = stdin; + assert(f); + do { + struct PlsrBufList* p = l; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fread(l->buf, 1, BUF_SZ, f); + } while (lastRead == BUF_SZ); + cur = fullBufs * BUF_SZ; + len = cur + lastRead; + res = (char*)pLsrAllocC(len+1); + res[len] = '\0'; + while(l) { + struct PlsrBufList* n = l->next; + strncpy(res+cur, l->buf, lastRead); + pLsrFreeC(l); + l = n; + cur -= BUF_SZ; + lastRead = BUF_SZ; + } + return res; +} + +static uintp pLsrPrimNativeOpenOut(char* fn) +{ + FILE* f = fopen(fn, "wb"); + assert(f); + return (uintp)f; +} + +static void 
pLsrPrimNativeOutputByte(uintp fi, char b) +{ + FILE* f = (FILE*)fi; + fwrite(&b, 1, 1, f); +} + +static void pLsrPrimNativeCloseOut(uintp fi) +{ + FILE* f = (FILE*)fi; + fclose(f); +} + +static uintp pLsrPrimNativeOpenIn(char* fn) +{ + FILE* f = fopen(fn, "r"); + assert(f); + return (uintp)f; +} + +static uintp pLsrPrimNativeInputByte(uintp fi) +{ + FILE* f = (FILE*)fi; + int c = fgetc(f); + return (uintp)c; +} + +static int isDelimiter (char c, char *delimiters) +{ + int i = 0; + for (; delimiters[i]; i++) { + if (c == delimiters[i]) { + return 1; + } + } + return 0; +} + +static char* pLsrPrimNativeInputString(uintp fi, char *delimiters) +{ + // TODO WL: add max_len argument to the function + char* res = pLsrAllocC(256); + FILE* f = (FILE *)fi; + assert(f); + + char c; + // filter out all spaces and returns + while (!feof(f)) { + c = fgetc(f); + if (c == 0x0a || c == 0x0d || isDelimiter(c, delimiters)) + continue; + else + break; + } + + int i = 0; + while (!feof(f)) { + if (c == 0x0a || c == 0x0d || isDelimiter(c, delimiters)) { + res[i] = 0; + return res; + } else { + res[i++] = c; + } + assert(i<256); + c = fgetc(f); + } + + res[i] = 0; + return res; +} + +static char* pLsrPrimNativeInputAll(uintp fi) +{ + struct PlsrBufList* l = 0; + size_t lastRead; + uintp fullBufs = -1, len, cur; + char* res; + FILE* f = (FILE *)fi; + assert(f); + do { + struct PlsrBufList* p = l; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fread(l->buf, 1, BUF_SZ, f); + } while (lastRead == BUF_SZ); + cur = fullBufs * BUF_SZ; + len = cur + lastRead; + res = (char*)pLsrAllocC(len+1); + res[len] = '\0'; + while(l) { + struct PlsrBufList* n = l->next; + strncpy(res+cur, l->buf, lastRead); + pLsrFreeC(l); + l = n; + cur -= BUF_SZ; + lastRead = BUF_SZ; + } + return res; +} + +static uintp pLsrPrimNativeIsEOF(uintp fi) +{ + FILE* f = (FILE*)fi; + return (uintp)feof(f); +} + +static void pLsrPrimNativeCloseIn(uintp fi) +{ + FILE* f = (FILE*)fi; + fclose(f); +} + +static char* pLsrPrimNativeInputLine(uintp fi) +{ + struct PlsrBufList* l = 0; + FILE* f = (FILE*)fi; + char* res; + uintp fullBufs = -1, len, cur, lastAmt, cont; + assert(f); + do { + struct PlsrBufList* p = l; + char* lastRead; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fgets(l->buf, BUF_SZ, f); + assert(lastRead); + lastAmt = strlen(l->buf); + cont = l->buf[lastAmt-1]!='\n'; + } while (cont); + len = fullBufs * (BUF_SZ-1) + lastAmt; + res = (char*)pLsrAllocC(len+1); + cur = len; + while(l) { + struct PlsrBufList* n = l->next; + lastAmt = strlen(l->buf); + cur -= lastAmt; + strncpy(res+cur, l->buf, lastAmt); + pLsrFreeC(l); + l = n; + } + assert(cur==0); + return res; +} + +/* The natives */ + +static PlsrRational pLsrPrimROpenOut(PlsrPAny a) +{ + char* filename = pLsrPStringToCString(a); + uintp f = pLsrPrimNativeOpenOut(filename); + PlsrRational r; + pLsrFreeC(filename); + pLsrRationalFromUIntp(r, f); + return r; +} + +#define pLsrPrimROpenOutT pLsrPrimROpenOut + +static PlsrRational pLsrPrimRGetStdout() +{ + PlsrRational r; + pLsrRationalFromUIntp(r, (uintp)stdout); + return r; +} + +#define pLsrPrimRGetStdoutT pLsrPrimRGetStdout + +static void pLsrPrimROutputByte(PlsrRational a, PlsrRational b) +{ + uintp ai; + uintp bi; + + pLsrUIntpFromRational(ai, a); + pLsrUIntpFromRational(bi, b); + pLsrPrimNativeOutputByte(ai, (char)bi); +} + +#define pLsrPrimROutputByteT pLsrPrimROutputByte + +static void pLsrPrimRCloseOut(PlsrRational 
a) +{ + uintp ai; + pLsrUIntpFromRational(ai, a); + pLsrPrimNativeCloseOut(ai); +} + +#define pLsrPrimRCloseOutT pLsrPrimRCloseOut + +static PlsrRational pLsrPrimROpenIn(PlsrPAny a) +{ + char* filename = pLsrPStringToCString(a); + uintp f = pLsrPrimNativeOpenIn(filename); + PlsrRational r; + pLsrFreeC(filename); + pLsrRationalFromUIntp(r, f); + return r; +} + +#define pLsrPrimROpenInT pLsrPrimROpenIn + +static PlsrRational pLsrPrimRGetStdin() +{ + PlsrRational r; + pLsrRationalFromUIntp(r, (uintp)stdin); + return r; +} + +#define pLsrPrimRGetStdinT pLsrPrimRGetStdin + +static PlsrRational pLsrPrimRInputByte(PlsrRational a) +{ + uintp f; + uintp c; + PlsrRational r; + pLsrUIntpFromRational(f, a); + c = pLsrPrimNativeInputByte(f); + pLsrRationalFromUIntp(r, c); + return r; +} + +#define pLsrPrimRInputByteT pLsrPrimRInputByte + +static PlsrPAny pLsrPrimRInputString(PlsrRational a, PlsrPAny b) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *delimiters = pLsrPStringToCString(b); + char *s = pLsrPrimNativeInputString(f, delimiters); + pLsrFreeC(delimiters); + PlsrPAny res = pLsrCStringToPString(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRInputStringT(PlsrRational a, PlsrPAny b) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *delimiters = pLsrPStringToCString(b); + char *s = pLsrPrimNativeInputString(f, delimiters); + pLsrFreeC(delimiters); + PlsrPAny res = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRInputAll(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *s = pLsrPrimNativeInputAll(f); + PlsrPAny res = pLsrCStringToPString(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRInputAllT(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *s = pLsrPrimNativeInputAll(f); + PlsrPAny res = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return res; +} + +static PlsrBoolean pLsrPrimRIsEOF(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + uintp c = pLsrPrimNativeIsEOF(f); + return toPlsrBoolean(c); +} + +#define pLsrPrimRIsEOFT pLsrPrimRIsEOF + +static void pLsrPrimRCloseIn(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + pLsrPrimNativeCloseIn(f); +} + +#define pLsrPrimRCloseInT pLsrPrimRCloseIn + +/********************************************************************** + * Core Natives + */ + +static float pLsrPrimRFloatMk(PlsrRational a, PlsrRational b) +{ + sint32 m; + sint32 e; + double r, p; + pLsrSInt32FromRational(m, a); + pLsrSInt32FromRational(e, b); + r = m; + p = 10; + if (e<0) e=-e, p=0.1; + while (e-->0) r*=p; + return r; +} + +#define pLsrPrimRFloatMkT pLsrPrimRFloatMk + +static void pLsrPrimRWriteln(PlsrPAny a) +{ + pLsrValuePrint(a); + printf("\n"); +} + +#define pLsrPrimRWritelnT pLsrPrimRWriteln + +static PlsrPAny pLsrPrimRReadln() +{ + char* s = pLsrPrimNativeInputLine((uintp)stdin); + PlsrPAny res = pLsrCStringToPString(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRReadlnT() +{ + char* s = pLsrPrimNativeInputLine((uintp)stdin); + PlsrPAny res = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRAssert(PlsrPAny a) +{ + pLsrRuntimeError("PAssert unimplemented"); + return 0; +} + +#define pLsrPrimRAssertT pLsrPrimRAssert + +static PlsrPAny pLsrPrimRError(PlsrPAny a) +{ + printf("Fatal P error: "); + pLsrValuePrint(a); + printf("\nExiting...\n"); + pLsrExit(-1); + return 0; +} + +#define pLsrPrimRErrorT pLsrPrimRError + +static void pLsrPrimRDebug(PlsrPAny a) +{ + printf("P Debug: "); + 
pLsrValuePrint(a); + printf("\n"); +} + +#define pLsrPrimRDebugT pLsrPrimRDebug + +/********************************************************************** + * Command Line support + */ + +uintp pargc; +const char **pargv; + +static PlsrPAny pLsrPrimRCommandLineT() +{ + PlsrPAny res = pLsrPArrayONew(pargc); + for (uintp i = 0; i < pargc; i++) { + PlsrPAny ps = pLsrCStringToPStringT(pargv[i]); + PlsrThunkBRef pst = pLsrThunkNewValRef((PlsrRef) ps); + pLsrWriteBarrierRefBase(res, pLsrPArrayOElt(res, i), (PlsrObjectB)pst); + } + return res; +} + +static PlsrPAny pLsrPrimRCommandLine() +{ + PlsrPAny res = pLsrPArrayONew (pargc); + for (uintp i = 0; i < pargc; i++) { + PlsrPAny ps = pLsrCStringToPString(pargv[i]); + pLsrWriteBarrierRefBase(res, pLsrPArrayOElt(res, i), ps); + } + return res; +} + +#define pLsrCommandLineCount() (pargc) +#define pLsrCommandLineGet(i) ((char*)pargv[i]) + +/********************************************************************** + * Conversions + */ + +static uintp pLsrString2Nat(char *str) +{ + return (uintp)atoi(str); +} + +static PlsrRational pLsrStringToRat(char *str) +{ + return pLsrRationalFromCString(str); +} + +static float pLsrStringToFloat(char *str) +{ + return atof(str); +} + + +static PlsrRational pLsrPrimRStringToNat(PlsrPAny a) +{ + char* str = pLsrPStringToCString(a); + uintp i = pLsrString2Nat (str); + pLsrFreeC(str); + PlsrRational r; + pLsrRationalFromUIntp(r, i); + return r; +} + +#define pLsrPrimRStringToNatT pLsrPrimRStringToNat + +static PlsrRational pLsrPrimRStringToRat(PlsrPAny a) +{ + char* str = pLsrPStringToCString(a); + PlsrRational r = pLsrStringToRat (str); + pLsrFreeC(str); + return r; +} + +#define pLsrPrimRStringToRatT pLsrPrimRStringToRat + +static PlsrPAny pLsrPrimRRatToString(PlsrRational r) +{ + char* s = pLsrCStringFromRational(r); + PlsrPAny o = pLsrCStringToPString(s); + pLsrFreeC(s); + return o; +} + +static PlsrPAny pLsrPrimRRatToStringT(PlsrRational r) +{ + char* s = pLsrCStringFromRational(r); + PlsrPAny o = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return o; +} + +static float pLsrPrimRStringToFloat(PlsrPAny a) +{ + char* str = pLsrPStringToCString(a); + float f = pLsrStringToFloat (str); + pLsrFreeC(str); + return f; +} + +#define pLsrPrimRStringToFloatT pLsrPrimRStringToFloat + +static PlsrPAny pLsrPrimRFloatToString(float f, PlsrRational p) +{ + uintp pi; + pLsrUIntpFromRational(pi, p); + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPString(str); +} + +static PlsrPAny pLsrPrimRFloatToStringT(float f, PlsrRational p) +{ + uintp pi; + pLsrUIntpFromRational(pi, p); + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPStringT(str); +} + +/* These are for the rats as unsafe integers hack */ +static PlsrPAny pLsrPrimRFloat2StringI(float f, sintp p) +{ + uintp pi = (uintp) p; + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPString(str); +} + +static PlsrPAny pLsrPrimRFloat2StringIT(float f, sintp p) +{ + uintp pi = (uintp) p; + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPStringT(str); +} + +#define pLsrPrimRRatNumerator pLsrRatNumerator +#define pLsrPrimRRatNumeratorT pLsrRatNumeratorT + +#define pLsrPrimRRatDenominator pLsrRatDenominator +#define pLsrPrimRRatDenominatorT pLsrRatDenominatorT + + +/********************************************************************** + * Equality and unification + */ + +static PlsrBoolean pLsrPrimREqualSlow(PlsrPAny v1, PlsrPAny v2) +{ + PlsrVTable vt1; + PlsrVTable vt2; + uintp i; + vt1 = 
pLsrObjectGetVTable(v1); + vt2 = pLsrObjectGetVTable(v2); + if (pLsrVTableGetTag(vt1) != pLsrVTableGetTag(vt2)) return 0; + switch(pLsrVTableGetTag(vt1)) { + case VRatTag: { + PlsrBoolean b; + PlsrRational r1, r2; + pLsrRationalFromPRat(r1, v1); + pLsrRationalFromPRat(r2, v2); + pLsrRationalEQ(b, r1, r2); + return b; + } + case VNameTag: + return pLsrPNameGetTag(v1) == pLsrPNameGetTag(v2); + case VFloatTag: + return pLsrPFloatGet(v1) == pLsrPFloatGet(v2); + case VArrayTag: + if (pLsrPArrayOGetLen(v1) != pLsrPArrayOGetLen(v2)) return 0; + for(i=0; i + +typedef __m256 PlsrVector256F32; +typedef __m256d PlsrVector256F64; +typedef __m256i PlsrVector256B32; +typedef __m256i PlsrVector256Ref; + +typedef __m256 PlsrVectorMask256Fs32; +typedef __m256 PlsrVectorMask256Fs64; + +#ifdef TWOVEC64 +typedef struct two__m128i { + __m128i low,high; +} two__m128i; + +typedef two__m128i PlsrVector256B64; +#elif defined ONEVEC64 +typedef __m256i PlsrVector256B64; +#else +typedef struct PlsrVector256B64 { + __int64 values[4]; +} PlsrVector256B64; +#endif + +#if 0 +#define pLsrPrimV256PointwiseFloat32LT(dest, a, b) ((dest) = _mm256_cmp_ps((a),(b),_CMP_LT_OS)) +#else +#define pLsrPrimV256CompareFloat32LT(dest, a, b) ((dest) = _mm256_cmp_ps((a),(b),_CMP_LT_OS)) +#define pLsrPrimV256CompareFloat64LT(dest, a, b) ((dest) = _mm256_cmp_pd((a),(b),_CMP_LT_OS)) +#define pLsrPrimV256CompareFloat32LE(dest, a, b) ((dest) = _mm256_cmp_ps((a),(b),_CMP_LE_OS)) +#define pLsrPrimV256CompareFloat64LE(dest, a, b) ((dest) = _mm256_cmp_pd((a),(b),_CMP_LE_OS)) +#endif +#define pLsrPrimV256PointwiseFloat32Exp(dest, a) ((dest) = _mm256_exp_ps((a))) +#define pLsrPrimV256PointwiseFloat32Ln(dest, a) ((dest) = _mm256_log_ps((a))) +#define pLsrPrimV256PointwiseFloat64Pow(dest, a, b) ((dest) = _mm256_pow_pd((a), (b))) +#define pLsrPrimV256DataBlendF32(dest, mask, a, b) ((dest) = _mm256_blendv_ps((b),(a),(mask))) +#define pLsrPrimV256DataBlendB32(dest, mask, a, b) ((dest) = _mm256_blendv_ps((b),(a),(mask))) +#define pLsrPrimV256DataBlendF64(dest, mask, a, b) ((dest) = _mm256_blendv_pd((b),(a),(mask))) +#define pLsrPrimV256DataBlendB64(dest, mask, a, b) ((dest) = _mm256_blendv_pd((b),(a),(mask))) + + +// ***************************************************************************** +// Conversions +// ***************************************************************************** + +#define pLsrPrimV256256ConvertFloat32FromUInt32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) +#define pLsrPrimV256256ConvertUInt32FromFloat32(dest, a) ((dest) = _mm256_cvtps_epi32(a)) + +#define pLsrPrimV256128ConvertFloat64FromFloat32(dest, a) ((dest) = _mm256_cvtps_pd(a)) + +// vec128 -> vec256 +#define pLsrPrimV256U32ToF64(dest, a) ((dest) = _mm256__cvtepi32_pd (_mm_movpi64_epi64(a))) +#define pLsrPrimV256F64ToU32(dest, a) ((dest) = _mm256__movepi64_pi64 (_mm256_cvtpd_epi32(a))) + +#define pLsrPrimV256256CastSInt32ToFloat32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) +#define pLsrPrimV256128CastSInt32ToFloat64(dest, a) ((dest) = _mm256_cvtepi32_pd(a)) +#define pLsrPrimV256256CastFloat32ToSInt32(dest, a) ((dest) = _mm256_cvttps_epi32(a)) +#define pLsrPrimV256256CastSInt32ToUInt32(dest, a) ((dest) = a) +#define pLsrPrimV256256CastUInt32ToSInt32(dest, a) ((dest) = a) + +#define pLsrPrimV256256ConvertUInt32FromSInt32(dest, a) ((dest) = a) +#define pLsrPrimV256256ConvertSInt32FromUInt32(dest, a) ((dest) = a) + + + +#ifdef TWOVEC64 +#elif defined ONEVEC64 +#else +#define pLsrPrimV128256CastSInt64ToFloat32(dest, a) (dest) = _mm_set_ps((a).values[3], (a).values[2], 
(a).values[1], (a).values[0]) +#define pLsrPrimV256128CastFloat32ToSInt64(dest, a) \ + do { \ + float in[4]; \ + _mm_store_ps(in,a); \ + dest.values[0] = in[0]; \ + dest.values[1] = in[1]; \ + dest.values[2] = in[2]; \ + dest.values[3] = in[3]; \ + } while(0) + +#endif + +// ***************************************************************************** +// Data operations on the more abstract B32/F32/F64 types +// ***************************************************************************** + +#define pLsrPrimV256DataVectorB32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV256DataVectorB64(dest, c0, c1, c2, c3) \ + pLsrPrimV256UInt64Const(dest, c0, c1, c2, c3) + +#define pLsrPrimV256DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV256DataVectorF64(dest, c0, c1, c2, c3) \ + pLsrPrimV256Float64Const(dest, c0, c1, c2, c3) + +#define pLsrPrimV256DataBroadcastB32(dest, c0) pLsrPrimV256UInt32Lift(dest, c0) +#define pLsrPrimV256DataBroadcastB64(dest, c0) pLsrPrimV256UInt64Lift(dest, c0) +#define pLsrPrimV256DataBroadcastF32(dest, c0) pLsrPrimV256Float32Lift(dest, c0) +#define pLsrPrimV256DataBroadcastF64(dest, c0) pLsrPrimV256Float64Lift(dest, c0) + + // Subscripting operations (chose between the lower or higher part) + + +#define pLsrPrimV256DataSub0B32(dest, a) pLsrPrimV256DataSubB32L(dest, 0, a) +#define pLsrPrimV256DataSub1B32(dest, a) pLsrPrimV256DataSubB32L(dest, 1, a) +#define pLsrPrimV256DataSub2B32(dest, a) pLsrPrimV256DataSubB32L(dest, 2, a) +#define pLsrPrimV256DataSub3B32(dest, a) pLsrPrimV256DataSubB32L(dest, 3, a) +#define pLsrPrimV256DataSub4B32(dest, a) pLsrPrimV256DataSubB32H(dest, 0, a) +#define pLsrPrimV256DataSub5B32(dest, a) pLsrPrimV256DataSubB32H(dest, 1, a) +#define pLsrPrimV256DataSub6B32(dest, a) pLsrPrimV256DataSubB32H(dest, 2, a) +#define pLsrPrimV256DataSub7B32(dest, a) pLsrPrimV256DataSubB32H(dest, 3, a) + +#define pLsrPrimV256DataSub0F32(dest, a) pLsrPrimV256DataSubF32L(dest, 0, a) +#define pLsrPrimV256DataSub1F32(dest, a) pLsrPrimV256DataSubF32L(dest, 1, a) +#define pLsrPrimV256DataSub2F32(dest, a) pLsrPrimV256DataSubF32L(dest, 2, a) +#define pLsrPrimV256DataSub3F32(dest, a) pLsrPrimV256DataSubF32L(dest, 3, a) +#define pLsrPrimV256DataSub4F32(dest, a) pLsrPrimV256DataSubF32H(dest, 0, a) +#define pLsrPrimV256DataSub5F32(dest, a) pLsrPrimV256DataSubF32H(dest, 1, a) +#define pLsrPrimV256DataSub6F32(dest, a) pLsrPrimV256DataSubF32H(dest, 2, a) +#define pLsrPrimV256DataSub7F32(dest, a) pLsrPrimV256DataSubF32H(dest, 3, a) + +#define pLsrPrimV256DataSubB32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32L_help(sub, a)) +static inline int pLsrPrimV256DataSubB32L_help(int sub, __m256i a) { + __m128i loA = _mm256_extractf128_si256(a, 0); + return (_mm_extract_epi32(loA, sub)); +} +#define pLsrPrimV256DataSubB32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32H_help(sub, a)) +static inline int pLsrPrimV256DataSubB32H_help(int sub, __m256i a) { + __m128i hiA = _mm256_extractf128_si256(a, 1); + return (_mm_extract_epi32(hiA, sub)); +} + +#define pLsrPrimV256DataSubF32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32H_help(sub, a)) +static inline float pLsrPrimV256DataSubF32H_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 1); + //__m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + // return *out; + float out; + 
_MM_EXTRACT_FLOAT(out, hiA, sub); + return out; + +} + +#define pLsrPrimV256DataSubF32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32L_help(sub, a)) +static inline float pLsrPrimV256DataSubF32L_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 0); + // __m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + float out; + _MM_EXTRACT_FLOAT(out, hiA, sub); + return out; +} + +#define pLsrPrimV256DataSub0F64(dest, a) pLsrPrimV256DataSubF64(dest, 0, a) +#define pLsrPrimV256DataSub1F64(dest, a) pLsrPrimV256DataSubF64(dest, 1, a) +#define pLsrPrimV256DataSub2F64(dest, a) pLsrPrimV256DataSubF64(dest, 2, a) +#define pLsrPrimV256DataSub3F64(dest, a) pLsrPrimV256DataSubF64(dest, 3, a) + +#define pLsrPrimV256DataSubF64(dest, sub, a) ((dest) = ((double*)&a)[sub]) + +// loads and stores + +#define pLsrPrimV256B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + + +#define pLsrPrimV256B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV128Float32LoadIndexed(dest, arr, off, vi) \ + do { \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx2; 
\ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(vi, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(vi, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(vi, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(vi, 3); \ + (dest) = _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + } while (0) + +#define pLsrPrimV128Float32LoadIndexed64(dest, arr, offset, vi) \ + do { \ + (dest) = _mm_set_ps( *((float32*)((char*)arr + offset) + vi.values[3]), \ + *((float32*)((char*)arr + offset) + vi.values[2]), \ + *((float32*)((char*)arr + offset) + vi.values[1]), \ + *((float32*)((char*)arr + offset) + vi.values[0])); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F32LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_castps128_ps256(pLsrPrimV256Float32LoadVectorHelp_lo); \ + pLsrPrimV256Float32LoadVectorHelp_res = 
_mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F64LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float64LoadVectorHelp_idxV; \ + __m256d pLsrPrimV256Float64LoadVectorHelp_res; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float64LoadVectorHelp_idxV = vi; \ + pLsrPrimV256Float64LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float64LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float64LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float64LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float64LoadVectorHelp_res = \ + _mm256_set_pd(*((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx3), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx2), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx1), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx0)); \ + (dest) = pLsrPrimV256Float64LoadVectorHelp_res; \ + } while (0) + + +/* scalar array, scalar offset, vector of indices */ +/* For some insane reason, the only way to load out a float from an __m128 register + * is as an integer containing the binary representation of the float. -leaf */ +#define pLsrPrimV256F32StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float32StoreVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float32StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + 
pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F64StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float64StoreVectorHelp_idxV; \ + __m128d pLsrPrimV256Float64StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float64StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float64StoreVectorHelp_idxV = vi; \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 0); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 1); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 1); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 2); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 3); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + } while (0) + + +#define pLsrPrimV256B32LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 0) { \ + pLsrPrimV256Float32LoadVector(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadVectorStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) \ + +#define pLsrPrimV256B32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F64LoadVectorStrided(dest, arr, offset, idx, stride) +#define 
pLsrPrimV256RefLoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256RefLoadVectorStrided(dest, arr, offset, idx, stride) + + +// ***************************************************************************** +// Type-specific arithemetic and memory operations +// ***************************************************************************** + +// ================ +// UInt32 - 256-bit +// ================ + +// ***************************************************************************** +// Arithmetic +// ***************************************************************************** + +#define pLsrPrimV256PointwiseUInt32Plus(dest, a, b) vector256PointwiseUInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Minus(dest, a, b) vector256PointwiseUInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Times(dest, a, b) vector256PointwiseUInt32Times(dest, a, b) + +#define pLsrPrimV256PointwiseUInt64Plus(dest, a, b) vector256PointwiseUInt64Plus(dest, a, b) +#define pLsrPrimV256PointwiseUInt64Minus(dest, a, b) vector256PointwiseUInt64Minus(dest, a, b) +#define pLsrPrimV256PointwiseUInt64Times(dest, a, b) vector256PointwiseUInt64Times(dest, a, b) + +#define pLsrPrimV256PointwiseSInt32Plus(dest, a, b) vector256PointwiseSInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Minus(dest, a, b) vector256PointwiseSInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Times(dest, a, b) vector256PointwiseSInt32Times(dest, a, b) + +#define pLsrPrimV256PointwiseSInt64Plus(dest, a, b) vector256PointwiseSInt64Plus(dest, a, b) +#define pLsrPrimV256PointwiseSInt64Minus(dest, a, b) vector256PointwiseSInt64Minus(dest, a, b) +#define pLsrPrimV256PointwiseSInt64Times(dest, a, b) vector256PointwiseSInt64Times(dest, a, b) + +#define integer256via128(dest, a, b, op) \ + do { \ + __m256i integer256via128_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i integer256via128_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i integer256via128_loA = _mm256_extractf128_si256(a, 0); \ + __m128i integer256via128_hiB = _mm256_extractf128_si256(b, 1); \ + __m128i integer256via128_loB = _mm256_extractf128_si256(b, 0); \ + __m128i integer256via128_loTemp; \ + __m128i integer256via128_hiTemp; \ + op(integer256via128_loTemp, integer256via128_loA, integer256via128_loB); \ + op(integer256via128_hiTemp, integer256via128_hiA, integer256via128_hiB); \ + integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_hiTemp, 1); \ + integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_loTemp, 0); \ + (dest) = integer256via128_dest; \ +} while (0) + +#define vector256PointwiseUInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Plus) +#define vector256PointwiseUInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Minus) +#define vector256PointwiseUInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Times) + +#ifdef TWOVEC64 + +#define vector256PointwiseSInt64Times(dest, a, b) \ + do { \ + __declspec(align(16)) __int64 valsa[4], valsb[8]; \ + _mm_store_si128((void *)&valsa[0],(a).low); \ + _mm_store_si128((void *)&valsa[2],(a).high); \ + _mm_store_si128((void *)&valsb[0],(b).low); \ + _mm_store_si128((void *)&valsb[2],(b).high); \ + unsigned i; \ + for(i = 0; i < 4; ++i) { \ + valsa[i] *= valsb[i]; \ + } \ + (dest).low = _mm_load_si128((void *)&valsa[0]); \ + (dest).high = _mm_load_si128((void *)&valsa[2]); \ + } while (0) + +#define vector256PointwiseUInt64Plus(dest, a, b) \ + do { \ + 
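+        /* TWOVEC64: a 256-bit integer vector is represented as two __m128i halves, so the low and high halves are added independently. */ \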
(dest).low = _mm_add_epi64((a).low, (b).low); \ + (dest).high = _mm_add_epi64((a).high, (b).high); \ + } while (0) + +#define vector256PointwiseUInt64Minus(dest, a, b) \ + do { \ + (dest).low = _mm_sub_epi64((a).low, (b).low); \ + (dest).high = _mm_sub_epi64((a).high, (b).high); \ + } while (0) + +#define vector256PointwiseSInt64Plus(dest, a, b) \ + do { \ + (dest).low = _mm_add_epi64((a).low, (b).low); \ + (dest).high = _mm_add_epi64((a).high, (b).high); \ + } while (0) + +#define vector256PointwiseSInt64Minus(dest, a, b) \ + do { \ + (dest).low = _mm_sub_epi64((a).low, (b).low); \ + (dest).high = _mm_sub_epi64((a).high, (b).high); \ + } while (0) +#elif defined ONEVEC64 +#else +#define vector256PointwiseSInt64Times(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] * b.values[0]; \ + dest.values[1] = a.values[1] * b.values[1]; \ + dest.values[2] = a.values[2] * b.values[2]; \ + dest.values[3] = a.values[3] * b.values[3]; \ + } while (0) + +#define vector256PointwiseSInt64Plus(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] + b.values[0]; \ + dest.values[1] = a.values[1] + b.values[1]; \ + dest.values[2] = a.values[2] + b.values[2]; \ + dest.values[3] = a.values[3] + b.values[3]; \ + } while (0) + +#define vector256PointwiseSInt64Minus(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] - b.values[0]; \ + dest.values[1] = a.values[1] - b.values[1]; \ + dest.values[2] = a.values[2] - b.values[2]; \ + dest.values[3] = a.values[3] - b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt64DivT(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] / b.values[0]; \ + dest.values[1] = a.values[1] / b.values[1]; \ + dest.values[2] = a.values[2] / b.values[2]; \ + dest.values[3] = a.values[3] / b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt64ModT(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] % b.values[0]; \ + dest.values[1] = a.values[1] % b.values[1]; \ + dest.values[2] = a.values[2] % b.values[2]; \ + dest.values[3] = a.values[3] % b.values[3]; \ + } while (0) + + +#define vector256PointwiseUInt64Times(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] * (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] * (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] * (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] * (uint64_t)b.values[3]; \ + } while (0) + +#define vectorV256PointwiseUInt64Plus(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] + (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] + (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] + (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] + (uint64_t)b.values[3]; \ + } while (0) + +#define vectorV256PointwiseUInt64Minus(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] - (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] - (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] - (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] - (uint64_t)b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseUInt64DivT(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] / (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] / (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] / (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] / (uint64_t)b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseUInt64ModT(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] % 
(uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] % (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] % (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] % (uint64_t)b.values[3]; \ + } while (0) + +#endif + +#define vector256PointwiseSInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Plus) +#define vector256PointwiseSInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Minus) +#define vector256PointwiseSInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Times) + + +// ***************************************************************************** +// Comparison +// ***************************************************************************** + +#if 0 +#define pLsrPrimV256PointwiseSInt32LT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32LT) +#define pLsrPrimV256PointwiseSInt32EQ(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32EQ) +#define pLsrPrimV256PointwiseSInt32GT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32GT) +#else +#define pLsrPrimV256CompareSInt32LT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128CompareSInt32LT) +#define pLsrPrimV256CompareSInt32EQ(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128CompareSInt32EQ) +#define pLsrPrimV256CompareSInt32GT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128CompareSInt32GT) +#endif + +// ***************************************************************************** +// Reductions (horizontals) +// ***************************************************************************** + +#define pLsrPrimV256ReduceAUInt32Plus(dest, init, a) vector256reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Times(dest, init, a) vector256reduceAUInt32Times(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Max(dest, init, a) vector256reduceAUInt32Max(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Min(dest, init, a) vector256reduceAUInt32Min(dest, init, a) + +#define pLsrPrimV256ReduceASInt32Plus(dest, init, a) vector256reduceASInt32Plus(dest, init, a) +//#define pLsrPrimV256ReduceASInt32Times(dest, init, a) vector256reduceAUInt32Times(dest, init, a) +//#define pLsrPrimV256ReduceASInt32Max(dest, init, a) vector256reduceAUInt32Max(dest, init, a) +//#define pLsrPrimV256ReduceASInt32Min(dest, init, a) vector256reduceAUInt32Min(dest, init, a) + +#define vector256reduceAUInt32Plus(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Plus_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Plus_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Plus_aRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_aRes, init, vector256reduceAUInt32Plus_hiA); \ + uint32 vector256reduceAUInt32Plus_bRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_bRes, init, vector256reduceAUInt32Plus_loA); \ + (dest) = (vector256reduceAUInt32Plus_aRes + vector256reduceAUInt32Plus_bRes); \ + } while (0) + +#define vector256reduceAUInt32Times(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Times_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Times_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Times_aRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_aRes, init, vector256reduceAUInt32Times_hiA); \ + uint32 vector256reduceAUInt32Times_bRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_bRes, init, 
vector256reduceAUInt32Times_loA); \ + (dest) = (vector256reduceAUInt32Times_aRes * vector256reduceAUInt32Times_bRes); \ + } while (0) + +#define vector256reduceAUInt32Max(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Max_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Max_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Max_aRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_aRes, init, vector256reduceAUInt32Max_hiA); \ + uint32 vector256reduceAUInt32Max_bRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_bRes, init, vector256reduceAUInt32Max_loA); \ + (dest) = (max(vector256reduceAUInt32Max_aRes, vector256reduceAUInt32Max_bRes)); \ + } while (0) + +#define vector256reduceAUInt32Min(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Min_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Min_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Min_aRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_aRes, init, vector256reduceAUInt32Min_hiA); \ + uint32 vector256reduceAUInt32Min_bRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_bRes, init, vector256reduceAUInt32Min_loA); \ + (dest) = (min(vector256reduceAUInt32Min_aRes, vector256reduceAUInt32Min_bRes)); \ + } while (0) + +#define vector256reduceASInt32Plus(dest, init, a) \ + do { \ + __m128i vector256reduceASInt32Plus_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceASInt32Plus_loA = _mm256_extractf128_si256(a, 0); \ + sint32 vector256reduceASInt32Plus_aRes; \ + pLsrPrimV128ReduceASInt32Plus(vector256reduceASInt32Plus_aRes, init, vector256reduceASInt32Plus_hiA); \ + sint32 vector256reduceASInt32Plus_bRes; \ + pLsrPrimV128ReduceASInt32Plus(vector256reduceASInt32Plus_bRes, init, vector256reduceASInt32Plus_loA); \ + (dest) = (vector256reduceASInt32Plus_aRes + vector256reduceASInt32Plus_bRes); \ + } while (0) + + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256UInt32Lift(dest, a) vector256UInt32Lift(dest, a) +#define pLsrPrimV256UInt64Lift(dest, a) vector256UInt64Lift(dest, a) +#define pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) +#define pLsrPrimV256UInt64Const(dest, c0, c1, c2, c3) \ + vector256UInt64Const(dest, c0, c1, c2, c3) + +#define vector256UInt32Lift(dest, a) \ + do { \ + __m256i vector256UInt32Lift_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Lift_lo = _mm_set1_epi32(a); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 1); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 0); \ + (dest) = vector256UInt32Lift_dest; \ + } while (0) + +#define pLsrPrimV256PointwiseUInt32BAnd(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32BAnd) +#define pLsrPrimV256PointwiseUInt32BShiftL(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32BShiftL) +#define pLsrPrimV256PointwiseUInt32BShiftR(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32BShiftR) +//#define pLsrPrimV256PointwiseUInt32BShiftR(dest, a, b) (dest) = _mm256_srlv_epi32((a),(b)) + +#ifdef TWOVEC64 + +#if 0 +#define intToM64(a) __m64{a} +#if 0 +inline __m64 intToM64(__int64 a) { + __m64 ret; + ret.__m = a; + 
return ret; +} +#endif +#else +#define intToM64 _mm_cvtsi64_m64 +//#define intToM64 _m_from_int64 +#endif + +#if 0 +#define vector128UInt64Const(dest, c0, c1) \ + do { \ + dest.m128i_u64[0] = c0; \ + dest.m128i_u64[1] = c1; \ + } while (0) +#endif + +#if 0 +#define vector256UInt64Lift(dest, a) vector256UInt64Const(dest, a, a, a, a) +#else +#define vector256UInt64Lift(dest, a) \ + do { \ + (dest).low = _mm_set1_epi64(intToM64(a)); \ + (dest).high = _mm_set1_epi64(intToM64(a)); \ + } while (0) +#endif + +#if 0 +#define vector256UInt64Const(dest, c0, c1, c2, c3) \ + do { \ + vector128UInt64Const((dest).low, c0, c1); \ + vector128UInt64Const((dest).high, c2, c3); \ + } while (0) +#else +#define vector256UInt64Const(dest, c0, c1, c2, c3) \ + do { \ + (dest).low = _mm_set_epi64(intToM64(c0), intToM64(c1)); \ + (dest).high = _mm_set_epi64(intToM64(c2), intToM64(c3)); \ + } while (0) +#endif + +#elif defined ONEVEC64 +#else +#define vector256UInt64Lift(dest, a) \ + do { \ + (dest).values[0] = a; \ + (dest).values[1] = a; \ + (dest).values[2] = a; \ + (dest).values[3] = a; \ + } while (0) + +#define vector256UInt64Const(dest, c0, c1, c2, c3) \ + do { \ + (dest).values[0] = c0; \ + (dest).values[1] = c1; \ + (dest).values[2] = c2; \ + (dest).values[3] = c3; \ + } while (0) +#endif + +#if 1 +#define vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + (dest) = _mm256_set_epi32(c7, c6, c5, c4, c3, c2, c1, c0) +#else +#define vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + do { \ + __m256i vector256UInt32Const_out = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Const_lo = _mm_set_epi32(c3, c2, c1, c0); \ + __m128i vector256UInt32Const_hi = _mm_set_epi32(c7, c6, c5, c4); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_hi, 1); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_lo, 0); \ + (dest) = vector256UInt32Const_out; \ + } while (0) +#endif + +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) pLsrPrimV256UInt32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV256UInt32Store(arr, off, idx, v) pLsrPrimV256UInt32StoreHelp (arr, off, idx, v) +#define pLsrPrimV256RefStore(arr, off, idx, v) pLsrPrimV256UInt32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV256UInt32LoadHelp(dest, arr, off, idx) \ + do { \ + __m256i* pLsrPrimV256UInt32LoadHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + (dest) = _mm256_loadu_si256(pLsrPrimV256UInt32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV256UInt32StoreHelp(arr, off, idx, v) \ + do { \ + __m256i* pLsrPrimV256UInt32StoreHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + _mm256_storeu_si256(pLsrPrimV256UInt32StoreHelp_marr, v); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256B32LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256B32LoadVectorHelp_idxV; \ + __m128i pLsrPrimV256B32LoadVectorHelp_lo; \ + __m128i pLsrPrimV256B32LoadVectorHelp_hi; \ + __m256i pLsrPrimV256B32LoadVectorHelp_res; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256B32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256B32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256B32LoadVectorHelp_idx1 = 
(uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256B32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256B32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256B32LoadVectorHelp_lo = \ + _mm_set_epi32(*((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx3), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx2), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx1), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx0)); \ + pLsrPrimV256B32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256B32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256B32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256B32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256B32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256B32LoadVectorHelp_hi = \ + _mm_set_epi32(*((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx3), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx2), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx1), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx0)); \ + pLsrPrimV256B32LoadVectorHelp_res =_mm256_insertf128_si256(pLsrPrimV256B32LoadVectorHelp_res, pLsrPrimV256B32LoadVectorHelp_lo, 0); \ + pLsrPrimV256B32LoadVectorHelp_res =_mm256_insertf128_si256(pLsrPrimV256B32LoadVectorHelp_res, pLsrPrimV256B32LoadVectorHelp_hi, 1); \ + (dest) = pLsrPrimV256B32LoadVectorHelp_res; \ + } while (0) + +// ===== +// Float +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256 pLsrPrimV256Float32Zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat32Plus(dest, a, b) ((dest) = _mm256_add_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Minus(dest, a, b) ((dest) = _mm256_sub_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Times(dest, a, b) ((dest) = _mm256_mul_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Divide(dest, a, b) ((dest) = _mm256_div_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Negate(dest, a) ((dest) = _mm256_sub_ps(pLsrPrimV256Float32Zero, a)) +#define pLsrPrimV256PointwiseFloat32Max(dest, a, b) ((dest) = _mm256_max_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Sqrt(dest, a) ((dest) = _mm256_sqrt_ps(a)) + +// ***************************************************************************** +// Reductions +// ***************************************************************************** + +#define pLsrPrimV256ReduceAFloat32Plus(dest, init, a) reduce256AFloat32Plus(dest, init, a) + +#define reduce256AFloat32Plus(dest, init, a) \ + do { \ + __m256 reduceAFloat32Plus_a_ = a; \ + __m128 rf32_hi = _mm256_extractf128_ps(reduceAFloat32Plus_a_, 1); \ + __m256 rf32_tmp = _mm256_insertf128_ps(reduceAFloat32Plus_a_, rf32_hi, 0); \ + __m256 reduceAFloat32Plus_tmp = _mm256_hadd_ps(reduceAFloat32Plus_a_, rf32_tmp); \ + __m256 reduceAFloat32Plus_tmp2 = _mm256_hadd_ps(reduceAFloat32Plus_tmp, reduceAFloat32Plus_tmp); \ + __m256 reduceAFloat32Plus_tmp3 = _mm256_hadd_ps(reduceAFloat32Plus_tmp2, reduceAFloat32Plus_tmp2); \ + float 
reduceAFloat32Plus_dest_; \ + pLsrPrimV256DataSubF32L(reduceAFloat32Plus_dest_, 0, reduceAFloat32Plus_tmp3); \ + (dest) = (reduceAFloat32Plus_dest_ + (init)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + do { \ + float* pLsrPrimV256Float32Load_marr = ((float*)((char*)arr+off) + idx); \ + __m256 float32load = _mm256_loadu_ps(pLsrPrimV256Float32Load_marr); \ + (dest) = float32load; \ + } while (0) + +#define pLsrPrimV256Float32LoadVector(dest, arr, off, idx) \ + pLsrPrimV256Float32LoadVectorHelp (dest, arr, off, idx) + +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + do { \ + float* pLsrPrimV256Float32Store_marr = ((float*)((char*)arr+off) + idx); \ + _mm256_storeu_ps(pLsrPrimV256Float32Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm256_set_ps(*((float32*)((char*)arr + off) + idx + 7*stride), \ + *((float32*)((char*)arr + off) + idx + 6*stride), \ + *((float32*)((char*)arr + off) + idx + 5*stride), \ + *((float32*)((char*)arr + off) + idx + 4*stride), \ + *((float32*)((char*)arr + off) + idx + 3*stride), \ + *((float32*)((char*)arr + off) + idx + 2*stride), \ + *((float32*)((char*)arr + off) + idx + stride), \ + *((float32*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128 pLsrPrimV256Float32StoreStrided_srcV; \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 0); \ + *((uint32*)((char*)arr + off) + idx + 0) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 2*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 3*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 1); \ + *((uint32*)((char*)arr + off) + idx + 4*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + 5*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 6*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 7*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + } while (0) + +#define pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + ((dest) = _mm256_set_ps(c7, c6, c5, c4, c3, c2, c1, c0)) + +#define pLsrPrimV256Float32Lift(dest, a) \ + do { \ + float temp = a; \ + (dest) = _mm256_broadcast_ss((float *)(&temp)); \ + } while(0) + + +#define pLsrPrimV256Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_ptrV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr0; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr1; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr2; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr3; \ + \ + 
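+        /* Note: 'arr' here is itself a vector of eight 32-bit pointers; each lane is extracted and dereferenced at byte offset off + idx to gather the eight floats. */ \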
pLsrPrimV256Float32LoadVectorHelp_ptrV = _mm256_extractf128_si256(arr, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_ptrV =_mm256_extractf128_si256(arr, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_lo, \ + 0); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +// ===== +// Double +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256d pLsrPrimV256Float64Zero = {0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat64Plus(dest, a, b) ((dest) = _mm256_add_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Minus(dest, a, b) ((dest) = _mm256_sub_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Times(dest, a, b) ((dest) = _mm256_mul_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Divide(dest, a, b) ((dest) = _mm256_div_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Negate(dest, a) ((dest) = _mm256_sub_pd(pLsrPrimV256Float64Zero, a)) +#define pLsrPrimV256PointwiseFloat64Max(dest, a, b) ((dest) = _mm256_max_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Sqrt(dest, a) ((dest) = _mm256_sqrt_pd(a)) + +#define pLsrPrimV256PointwiseSInt32DivT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32DivT_a; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_b; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32DivT_res; \ + \ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 0); \ + 
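+        /* Divide each 128-bit half with the V128 primitive, then recombine the two halves below. */ \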
pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rlo, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b);\ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rhi, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32DivT_rlo); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32DivT_res, \ + pLsrPrimV256PointwiseSInt32DivT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32DivT_res; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt32ModT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32ModT_a; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_b; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32ModT_res; \ + \ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 0); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rlo, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b);\ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rhi, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32ModT_rlo); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32ModT_res, \ + pLsrPrimV256PointwiseSInt32ModT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32ModT_res; \ + } while (0) + +// ***************************************************************************** +// Reductions +// ***************************************************************************** + +#define pLsrPrimV256ReduceAFloat64Plus(dest, init, a) \ + do { \ + __m256d reduceAFloat64Plus_p_ = a; \ + __m128d reduceAFloat64Plus_p_lo = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 0); \ + __m128d reduceAFloat64Plus_p_hi = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 1); \ + __m128d reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_lo, reduceAFloat64Plus_p_hi); \ + reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_tmp, reduceAFloat64Plus_p_tmp); \ + double reduceAFloat64Plus_dest_; \ + pLsrPrimV256DataSubF64(reduceAFloat64Plus_dest_, 0, reduceAFloat64Plus_p_tmp); \ + (dest) = (reduceAFloat64Plus_dest_ + (init)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float64Load(dest, arr, off, idx) \ + do { \ + double* pLsrPrimV256Float64Load_marr = ((double*)((char*)arr+off) + idx); \ + __m256d float64load = _mm256_loadu_pd(pLsrPrimV256Float64Load_marr); \ + (dest) = float64load; \ + } while (0) + +#define pLsrPrimV256Float64Store(arr, off, idx, v) \ + do { \ + double* pLsrPrimV256Float64Store_marr = ((double*)((char*)arr+off) + idx); \ + _mm256_storeu_pd(pLsrPrimV256Float64Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = 
_mm256_set_pd(*((float64*)((char*)arr + off) + idx + 3*stride), \ + *((float64*)((char*)arr + off) + idx + 2*stride), \ + *((float64*)((char*)arr + off) + idx + stride), \ + *((float64*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128d pLsrPrimV256Float64StoreStrided_srcV; \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 0), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + stride), pLsrPrimV256Float64StoreStrided_srcV); \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 1); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 2*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + 3*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + } while (0) + +#define pLsrPrimV256Float64Const(dest, c0, c1, c2, c3) \ + ((dest) = _mm256_set_pd(c3, c2, c1, c0)) + +#define pLsrPrimV256Float64Lift(dest, a) \ + do { \ + double temp = a; \ + (dest) = _mm256_broadcast_sd((double *)(&temp)); \ + } while(0) + +// ================ +// Ref - 256-bit +// ================ + +#define pLsrPrimV256RefLoad(dest, arr, off, idx) pLsrPrimV256UInt32Load(dest, arr, off, idx) + +#endif _PLSR_PRIMS_VECTOR_AVX_H_ diff --git a/runtime/include/hrc/plsr-prims-vector-mic.h b/runtime/include/hrc/plsr-prims-vector-mic.h new file mode 100644 index 0000000..8add559 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-vector-mic.h @@ -0,0 +1,1633 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_VECTOR_MIC_H_ +#define _PLSR_PRIMS_VECTOR_MIC_H_ + +#ifdef P_USE_PILLAR +#ifdef LINUX +# pragma pillar_push_cc(__pcdecl) +# define to __to__ +#else +# pragma pillar_managed(off) +# define to __to__ +#endif +#endif /* P_USE_PILLAR */ + +#ifdef P_USE_PILLAR +#define __ICL_INTRINCC __pcdecl +#endif /* P_USE_PILLAR */ +#include <immintrin.h> +#undef __cdecl // deep in the include for immintrin.h, there is a "#define __cdecl". This undef gets rid of the effects of this.
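+/* Conventions below (matching the AVX header above): 'dest' is an lvalue out-parameter, element addresses are formed as (char*)arr + off plus an element index idx, the *LoadVS / *StoreVS variants dispatch at run time between contiguous (stride == 1) and strided accesses, and HAS_ALIGNMENT selects aligned loads/stores in place of the loadunpack/packstore pairs. */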
+ +#ifdef P_USE_PILLAR +#ifdef LINUX +# undef to +# pragma pillar_pop_cc +#else +# undef to +# pragma pillar_managed(on) +#endif +#endif /* P_USE_PILLAR */ + +//#define __pillar2c__ 1 + +typedef __m512 PlsrVector512F32; +typedef __m512i PlsrVector512B64; +typedef __m512d PlsrVector512F64; +typedef __m512i PlsrVector512Ref; + +typedef __mmask16 PlsrVectorMask512Fs32; +typedef __mmask8 PlsrVectorMask512Fs64; + +#define pLsrPrimV512F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV512F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV512F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512UInt64StoreStrided(arr, offset, idx, stride, v ); \ + } \ + } while (0) + +#define pLsrPrimV512RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV512RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512512ConvertFloat32FromUInt32(dest, a) ((dest) = _mm512_castsi512_ps(a)) +#define pLsrPrimV512512ConvertUInt32FromFloat32(dest, a) ((dest) = _mm512_castps_si512(a)) +#define pLsrPrimV512256ConvertFloat64FromFloat32(dest, a) ((dest) = _mm512_cvtpslo_pd(a)) +#define pLsrPrimV512512CastSInt32ToFloat32(dest, a) ((dest) = _mm512_castsi512_ps(a)) +#define pLsrPrimV512512ConvertUInt32FromSInt32(dest, a) ((dest) = a) +#define pLsrPrimV512512ConvertSInt32FromUInt32(dest, a) ((dest) = a) +#define pLsrPrimV512512ConvertUInt64FromSInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512ConvertSInt64FromUInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512CastUInt64ToSInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512CastSInt64ToUInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512CastSInt64ToFloat64(dest, a) ((dest) = _mm512_castsi512_pd(a)) + +#define pLsrPrimV512DataVectorB64(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV512UInt64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV512Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) \ + ((dest) = _mm512_setr_ps(c0, c1, c2, c3, c4, 
c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15)) + +#define pLsrPrimV512DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) \ + pLsrPrimV512Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) + +#define pLsrPrimV512DataVectorF64(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV512Float64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#if 1 +#define pLsrPrimV512Float32Lift(dest, a) ((dest) = _mm512_set1_ps(a)) +#else +#define pLsrPrimV512Float32Lift(dest, a) \ + do { \ + float ftemp = a; \ + (dest) = _mm512_extload_ps((void const *)(&ftemp),_MM_UPCONV_PS_NONE,_MM_BROADCAST_1X16,0); \ + } while (0) +#endif + +#define pLsrPrimV512DataBroadcastB64(dest, c0) pLsrPrimV512UInt64Lift(dest, c0) +//#define pLsrPrimV512DataBroadcastB32(dest, c0) pLsrPrimV512UInt32Lift(dest, c0) +#define pLsrPrimV512DataBroadcastF32(dest, c0) pLsrPrimV512Float32Lift(dest, c0) +#define pLsrPrimV512DataBroadcastF64(dest, c0) pLsrPrimV512Float64Lift(dest, c0) + +#define pLsrPrimV512F64LoadVI(dest, arr, off, vi) \ + (dest) = _mm512_i32logather_pd(vi, ((char*)arr + off), sizeof(float64)) + +#if 0 + do { \ + __int64 indices[8]; \ + _mm512_extstore_epi64(&indices, vi, _MM_DOWNCONV_PD_NONE, 0); \ + (dest) = _mm512_set_pd(*((float64*)((char*)arr + off) + indices[7]), \ + *((float64*)((char*)arr + off) + indices[6]), \ + *((float64*)((char*)arr + off) + indices[5]), \ + *((float64*)((char*)arr + off) + indices[4]), \ + *((float64*)((char*)arr + off) + indices[3]), \ + *((float64*)((char*)arr + off) + indices[2]), \ + *((float64*)((char*)arr + off) + indices[1]), \ + *((float64*)((char*)arr + off) + indices[0])); \ + } while (0) +#endif + +#define pLsrPrimV512PointwiseUInt64BAnd(dest, a, b) (dest) = _mm512_and_epi64((a),(b)) +#define pLsrPrimV512PointwiseUInt64BOr(dest, a, b) (dest) = _mm512_or_epi64((a),(b)) + +//#define pLsrPrimV512PointwiseUInt64BShiftR(dest, a, b) (dest) = _mm512_srlv_epi64((a),(b)) +#define pLsrPrimV512PointwiseUInt64BShiftR(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 valsa[8], valsb[8]; \ + _mm512_store_epi64(valsa,(a)); \ + _mm512_store_epi64(valsb, (b)); \ + unsigned i; \ + for(i = 0; i < 8; ++i) { \ + valsa[i] >>= valsb[i]; \ + } \ + (dest) = _mm512_load_epi64(valsa); \ + } while (0) + +#define pLsrPrimV512PointwiseUInt64BShiftL(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 valsa[8], valsb[8]; \ + _mm512_store_epi64(valsa,(a)); \ + _mm512_store_epi64(valsb, (b)); \ + unsigned i; \ + for(i = 0; i < 8; ++i) { \ + valsa[i] <<= valsb[i]; \ + } \ + (dest) = _mm512_load_epi64(valsa); \ + } while (0) + +// This is brain dead. Unpack the vector. Multiply each one and repack. 
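+// The same store / scalar-loop / reload fallback appears in the 64-bit shift macros above, presumably because no packed 64-bit multiply or variable shift is available on this target.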
+#define pLsrPrimV512PointwiseSInt64Times(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 valsa[8], valsb[8]; \ + _mm512_store_epi64(valsa,(a)); \ + _mm512_store_epi64(valsb,(b)); \ + unsigned i; \ + for(i = 0; i < 8; ++i) { \ + valsa[i] *= valsb[i]; \ + } \ + (dest) = _mm512_load_epi64(valsa); \ + } while (0) + +#define pLsrPrimV512PointwiseSInt64Plus(dest, a, b) vector512PointwiseSInt64Plus(dest, a, b) +#define vector512PointwiseSInt64Plus(dest, a, b) (dest) = _mm512_add_epi64((a),(b)) + +#define pLsrPrimV512PointwiseSInt64Minus(dest, a, b) vector512PointwiseSInt64Minus(dest, a, b) +#define vector512PointwiseSInt64Minus(dest, a, b) (dest) = _mm512_sub_epi64((a),(b)) + +#if 0 +#define pLsrPrimV512PointwiseFloat32LT(dest, a, b) \ + do { \ + __declspec(align(64)) float32 avec[16], bvec[16]; \ + __declspec(align(64)) uint32_t resvec[16]; \ + _mm512_store_ps(avec,(a)); \ + _mm512_store_ps(bvec,(b)); \ + unsigned i; \ + for(i=0;i<16;++i) { \ + if(avec[i] < bvec[i]) resvec[i] = 1; else resvec[i] = 0; \ + } \ + (dest) = _mm512_load_epi32(resvec); \ + } while(0) + +#define pLsrPrimV512PointwiseSInt64LT(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(avec[i] < bvec[i]) avec[i] = 1; else avec[i] = 0; \ + } \ + (dest) = _mm512_load_epi64(avec); \ + } while(0) + +#define pLsrPrimV512PointwiseSInt64EQ(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(avec[i] == bvec[i]) avec[i] = 1; else avec[i] = 0; \ + } \ + (dest) = _mm512_load_epi64(avec); \ + } while(0) + +#define pLsrPrimV512PointwiseCondMov(dest, mask, a, b) \ + do { \ + __mmask8 realmask=0; \ + __declspec(align(64)) __int64 mvec[8]; \ + _mm512_store_epi64(mvec,(mask)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(mvec[i]) realmask |= (1 << i); \ + } \ + (dest) = _mm512_mask_blend_epi64((realmask),(a),(b)); \ + } while (0) +#else +#define pLsrPrimV512CompareFloat32LT(dest, a, b) (dest) = _mm512_cmplt_ps_mask((a),(b)) +#define pLsrPrimV512DataBlendB32(dest, mask, a, b) (dest) = _mm512_mask_blend_epi32((mask),(a),(b)) +#define pLsrPrimV512DataBlendB64(dest, mask, a, b) (dest) = _mm512_mask_blend_epi64((mask),(a),(b)) +#define pLsrPrimV512DataBlendF32(dest, mask, a, b) (dest) = _mm512_mask_blend_ps((mask),(a),(b)) +#define pLsrPrimV512DataBlendF64(dest, mask, a, b) (dest) = _mm512_mask_blend_pd((mask),(a),(b)) + +//#define pLsrPrimV512CompareSInt64LT(dest, a, b) (dest) = _mm512_cmplt_epi64_mask((a),(b)) +//#define pLsrPrimV512CompareSInt64EQ(dest, a, b) (dest) = _mm512_cmpeq_epi64_mask((a),(b)) +#define pLsrPrimV512CompareSInt64LT(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + (dest) = 0; \ + for(i=0;i<8;++i) { \ + if(avec[i] < bvec[i]) (dest) |= (1 << i); \ + } \ + } while(0) + +#define pLsrPrimV512CompareSInt64EQ(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + (dest) = 0; \ + for(i=0;i<8;++i) { \ + if(avec[i] == bvec[i]) (dest) |= (1 << i); \ + } \ + } while(0) + +#endif + +#define pLsrPrimV512UInt64Lift(dest, a) vector512UInt64Lift(dest, a) +#define pLsrPrimV512UInt64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + vector512UInt64Const(dest, 
c0, c1, c2, c3, c4, c5, c6, c7) + +#define vector512UInt64Lift(dest, a) \ + do { \ + uint64_t tmp = a; \ + (dest) = _mm512_extload_epi64((void const *)(&tmp),_MM_UPCONV_EPI64_NONE,_MM_BROADCAST_1X8,0); \ + } while(0) + +#define vector512UInt64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + (dest) = _mm512_setr_epi64(c0, c1, c2, c3, c4, c5, c6, c7) + + + +#define pLsrPrimV512PointwiseFloat64Plus(dest, a, b) ((dest) = _mm512_add_pd(a, b)) +#define pLsrPrimV512PointwiseFloat64Minus(dest, a, b) ((dest) = _mm512_sub_pd(a, b)) +#define pLsrPrimV512PointwiseFloat64Times(dest, a, b) ((dest) = _mm512_mul_pd(a, b)) + +#define pLsrPrimV512PointwiseFloat32Plus(dest, a, b) ((dest) = _mm512_add_ps(a, b)) +#define pLsrPrimV512PointwiseFloat32Minus(dest, a, b) ((dest) = _mm512_sub_ps(a, b)) +#define pLsrPrimV512PointwiseFloat32Times(dest, a, b) ((dest) = _mm512_mul_ps(a, b)) +#define pLsrPrimV512PointwiseFloat32Sqrt(dest, a) ((dest) = _mm512_sqrt_ps(a)) +#define pLsrPrimV512PointwiseFloat32Divide(dest, a, b) ((dest) = _mm512_div_ps(a, b)) + +#define pLsrPrimV512PointwiseSInt64DivT(dest, a, b) \ + (dest) = _mm512_div_epi64((a),(b)) + +#define pLsrPrimV512PointwiseSInt64ModT(dest, a, b) \ + (dest) = _mm512_rem_epi64((a),(b)) + +#define pLsrPrimV512ReduceAFloat64Plus(dest, init, a) \ + (dest) = (init) + _mm512_reduce_add_pd(a) + +#define pLsrPrimV512ReduceAFloat32Plus(dest, init, a) \ + (dest) = (init) + _mm512_reduce_add_ps(a) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512Float64Load(dest, arr, off, idx) \ + (dest) = _mm512_load_pd((void const *)((double*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV512Float64Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_pd((dest),(void *)((double*)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_pd((dest),(char *)((double*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512Float32Load(dest, arr, off, idx) \ + (dest) = _mm512_load_ps((void const *)((float*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV512Float32Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_ps((dest),(void *)((float*)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_ps((dest),(char *)((float*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512Float64Store(arr, off, idx, v) \ + _mm512_store_pd((void const *)((double*)((char*)arr+off) + idx), v) +#define pLsrPrimV512Float32Store(arr, off, idx, v) \ + _mm512_store_ps((void const *)((float*)((char*)arr+off) + idx), v) +#define pLsrPrimV512UInt64Store(arr, off, idx, v) \ + _mm512_store_epi64((void const *)((uint64_t*)((char*)arr+off) + idx), v) +#else +#define pLsrPrimV512Float64Store(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_pd((void *)((double*)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_pd((char *)((double*)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#define pLsrPrimV512Float32Store(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_ps((void *)((float*)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_ps((char *)((float*)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#define pLsrPrimV512UInt64Store(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_epi64((void *)((uint64_t*)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_epi64((char *)((uint64_t*)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512UInt64Load(dest, arr, off, idx) \ + (dest) = _mm512_load_epi64((void const *)((uint64_t*)((char*)arr+off) + 
idx)) +#else +#define pLsrPrimV512UInt64Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_epi64((dest),(void *)((uint64_t*)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_epi64((dest),(char *)((uint64_t*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512RefLoad(dest, arr, off, idx) \ + (dest) = _mm512_load_epi64((void const *)((void**)((char*)arr+off) + idx)) +#else +#define pLsrPrimV512RefLoad(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_epi64((dest),(void *)((void**)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_epi64((dest),(char *)((void**)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512RefStore(arr, off, idx, v) \ + _mm512_store_epi64((void const *)((void**)((char*)arr+off) + idx), v) +#else +#define pLsrPrimV512RefStore(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_epi64((void *)((void**)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_epi64((char *)((void**)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#endif + + + +#define pLsrPrimV512Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_pd(*((float64*)((char*)arr + off) + idx + 7*stride), \ + *((float64*)((char*)arr + off) + idx + 6*stride), \ + *((float64*)((char*)arr + off) + idx + 5*stride), \ + *((float64*)((char*)arr + off) + idx + 4*stride), \ + *((float64*)((char*)arr + off) + idx + 3*stride), \ + *((float64*)((char*)arr + off) + idx + 2*stride), \ + *((float64*)((char*)arr + off) + idx + 1*stride), \ + *((float64*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_ps(*((float32*)((char*)arr + off) + idx + 15*stride), \ + *((float32*)((char*)arr + off) + idx + 14*stride), \ + *((float32*)((char*)arr + off) + idx + 13*stride), \ + *((float32*)((char*)arr + off) + idx + 12*stride), \ + *((float32*)((char*)arr + off) + idx + 11*stride), \ + *((float32*)((char*)arr + off) + idx + 10*stride), \ + *((float32*)((char*)arr + off) + idx + 9*stride), \ + *((float32*)((char*)arr + off) + idx + 8*stride), \ + *((float32*)((char*)arr + off) + idx + 7*stride), \ + *((float32*)((char*)arr + off) + idx + 6*stride), \ + *((float32*)((char*)arr + off) + idx + 5*stride), \ + *((float32*)((char*)arr + off) + idx + 4*stride), \ + *((float32*)((char*)arr + off) + idx + 3*stride), \ + *((float32*)((char*)arr + off) + idx + 2*stride), \ + *((float32*)((char*)arr + off) + idx + 1*stride), \ + *((float32*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512Float64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) double ftemp[8]; \ + _mm512_store_pd(ftemp, src); \ + *((float64*)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((float64*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((float64*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((float64*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((float64*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((float64*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((float64*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((float64*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#define pLsrPrimV512UInt64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) uint64_t ftemp[8]; \ + _mm512_store_epi64(ftemp, src); \ + *((uint64_t*)((char*)arr + off) + idx + 
0*stride) = ftemp[0]; \ + *((uint64_t*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((uint64_t*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((uint64_t*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((uint64_t*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((uint64_t*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((uint64_t*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((uint64_t*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#define pLsrPrimV512Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) float ftemp[16]; \ + _mm512_store_ps(ftemp, src); \ + *((float32*)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((float32*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((float32*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((float32*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((float32*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((float32*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((float32*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((float32*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + *((float32*)((char*)arr + off) + idx + 8*stride) = ftemp[8]; \ + *((float32*)((char*)arr + off) + idx + 9*stride) = ftemp[9]; \ + *((float32*)((char*)arr + off) + idx + 10*stride) = ftemp[10]; \ + *((float32*)((char*)arr + off) + idx + 11*stride) = ftemp[11]; \ + *((float32*)((char*)arr + off) + idx + 12*stride) = ftemp[12]; \ + *((float32*)((char*)arr + off) + idx + 13*stride) = ftemp[13]; \ + *((float32*)((char*)arr + off) + idx + 14*stride) = ftemp[14]; \ + *((float32*)((char*)arr + off) + idx + 15*stride) = ftemp[15]; \ + } while (0) + +#define pLsrPrimV512UInt64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_epi64(*((uint64_t*)((char*)arr + off) + idx + 7*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 6*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 5*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 4*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 3*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 2*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 1*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512RefLoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_epi64(*((void**)((char*)arr + off) + idx + 7*stride), \ + *((void**)((char*)arr + off) + idx + 6*stride), \ + *((void**)((char*)arr + off) + idx + 5*stride), \ + *((void**)((char*)arr + off) + idx + 4*stride), \ + *((void**)((char*)arr + off) + idx + 3*stride), \ + *((void**)((char*)arr + off) + idx + 2*stride), \ + *((void**)((char*)arr + off) + idx + 1*stride), \ + *((void**)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512ReftoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) void * ftemp[8]; \ + _mm512_store_epi64(ftemp, src); \ + *((void**)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((void**)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((void**)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((void**)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((void**)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((void**)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((void**)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((void**)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#define pLsrPrimV512Float64Const(dest, c0, c1, c2, c3, 
c4, c5, c6, c7) \ + ((dest) = _mm512_set_pd(c7, c6, c5, c4, c3, c2, c1, c0)) + + +#define pLsrPrimV256512CastSInt64ToFloat32(dest, a) \ + do { \ + __declspec(align(64)) __int64 the_ints[8]; \ + _mm512_store_epi64(the_ints, a); \ + (dest) = _mm512_set_ps(0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + (float)the_ints[7], \ + (float)the_ints[6], \ + (float)the_ints[5], \ + (float)the_ints[4], \ + (float)the_ints[3], \ + (float)the_ints[2], \ + (float)the_ints[1], \ + (float)the_ints[0]); \ + } while (0) + +#define pLsrPrimV512ReduceASInt64Plus(dest, init, a) \ + do { \ + __declspec(align(64)) __int64 vi64[8]; \ + _mm512_store_epi64(vi64, a); \ + (dest) = (init); \ + unsigned i; \ + for(i=0;i<8;++i) (dest) += vi64[i]; \ + } while(0) + +// ============================================================================================== + + + +#ifndef HAS_256 + +typedef __m512 PlsrVector256F32; +typedef __m512i PlsrVector256B32; +typedef __m512d PlsrVector256F64; +typedef __m512i PlsrVector256Ref; + +#define lower256of512 ((__mmask16)0xff) +#define higher256of512 ((__mmask16)0xff00) +#define lower256of512double ((__mmask8)0xf) +#define higher256of512double ((__mmask8)0xf0) + +#define pLsrPrimV256PointwiseFloat32Ln(dest, a) ((dest) = _mm512_mask_log_ps(dest, lower256of512, a)) +#define pLsrPrimV256PointwiseFloat32Exp(dest, a) ((dest) = _mm512_mask_exp_ps(dest, lower256of512, a)) +#define pLsrPrimV256PointwiseFloat32Negate(dest, a) ((dest) = _mm512_mask_mul_ps(dest, lower256of512, a, _mm512_set1_ps(-1.0))) + +#define pLsrPrimV512256CastFloat32ToSInt64(dest, a) \ + do { \ + __declspec(align(64)) float32 vi32[16]; \ + __declspec(align(64)) __int64 vi64[8]; \ + _mm512_store_ps(vi32, a); \ + unsigned i; \ + for(i=0;i<8;i++) vi64[i] = vi32[i]; \ + (dest) = _mm512_load_epi64(vi64); \ + } while (0) + +//#define pLsrPrimV512256ConvertSInt64FromSInt32(dest, a) ((dest) = _mm512_cvtepi32_epi64(a)) +#define pLsrPrimV512256ConvertSInt64FromSInt32(dest, a) \ + do { \ + __declspec(align(64)) __int32 vi32[16]; \ + __declspec(align(64)) __int64 vi64[8]; \ + _mm512_store_epi32(vi32, a); \ + unsigned i; \ + for(i=0;i<8;i++) vi64[i] = vi32[i]; \ + (dest) = _mm512_load_epi64(vi64); \ + } while (0) + +#define pLsrPrimV256DataConcatF32(dest, lower, upper) \ + do { \ + __m512 res = lower; \ + res = _mm512_mask_permute4f128_ps(res, higher256of512, upper, _MM_PERM_BABA);\ + (dest) = res; \ + } while (0) + +/* +#define pLsrPrimV256DataConcatF32(dest, lower, upper) \ + do { \ + __m512 res = lower; \ + res = _mm512_mask_alignr_epi32(res, higher256of512, upper, res, 8);\ + (dest) = res; \ + } while (0) +#define pLsrPrimV256DataConcatF32(dest, lower, upper) \ + do { \ + __m512 res = lower; \ + __declspec(align(64)) double ftemp[8]; \ + _mm512_mask_store_pd(&ftemp,lower256of512double,upper); \ + _mm512_mask_extload_pd(res,higher256of512double,&ftemp,_MM_UPCONV_PD_NONE,_MM_BROADCAST_4X8,0); \ + (dest) = res; \ + } while (0) +*/ + +#if 0 +#define pLsrPrimV256PointwiseCondMov(dest, mask, a, b) \ + do { \ + __mmask16 realmask=0; \ + __declspec(align(64)) __int64 mvec[8]; \ + _mm512_store_epi64(mvec,(mask)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(mvec[i]) realmask |= (1 << i); \ + } \ + (dest) = _mm512_mask_blend_ps((realmask),(a),(b)); \ + } while (0) +#else +#define pLsrPrimV256DataBlendB32(dest, mask, a, b) \ + do { \ + __mmask16 realmask=mask; \ + (dest) = _mm512_mask_blend_epi32((realmask),(a),(b)); \ + } while (0) +#define pLsrPrimV256DataBlendF32(dest, mask, a, b) \ + do { \ + __mmask16 
realmask=mask; \ + (dest) = _mm512_mask_blend_ps((realmask),(a),(b)); \ + } while (0) +#endif + +#define pLsrPrimV256DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV512Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + +#define pLsrPrimV256B32LoadVI64(dest, arr, off, vi) \ + do { \ + __declspec(align(64)) __int64 vi64[8]; \ + __declspec(align(64)) __int32 vi32[16]; \ + _mm512_store_epi64(vi64, vi); \ + unsigned i; \ + for(i=0;i<8;++i) vi32[i] = vi64[i]; \ + __m512i indices32bit; \ + indices32bit = _mm512_load_epi32(vi32); \ + _mm512_mask_i32gather_epi32(dest,lower256of512,indices32bit,((char*)arr + off),1); \ + } while (0) + +#define pLsrPrimV256F32LoadVI64(dest, arr, off, vi) \ + do { \ + __declspec(align(64)) __int64 vi64[8]; \ + __declspec(align(64)) __int32 vi32[16]; \ + _mm512_store_epi64(vi64, vi); \ + unsigned i; \ + for(i=0;i<8;++i) vi32[i] = vi64[i]; \ + __m512i indices32bit; \ + indices32bit = _mm512_load_epi32(vi32); \ + _mm512_mask_i32gather_ps(dest,lower256of512,indices32bit,((char*)arr + off),1); \ + } while (0) + +#define pLsrPrimV256F32LoadVI(dest, arr, off, vi) _mm512_mask_i32gather_ps(dest,lower256of512,vi,((char*)arr + off),1); + +#define pLsrPrimV256PointwiseFloat32Plus(dest, a, b) ((dest) = _mm512_mask_add_ps( (dest), lower256of512, (a), (b))) +#define pLsrPrimV256PointwiseFloat32Minus(dest, a, b) ((dest) = _mm512_mask_sub_ps( (dest), lower256of512, (a), (b))) +#define pLsrPrimV256PointwiseFloat32Times(dest, a, b) ((dest) = _mm512_mask_mul_ps( (dest), lower256of512, (a), (b))) +#define pLsrPrimV256PointwiseFloat32Divide(dest, a, b) ((dest) = _mm512_mask_mul_ps( (dest), lower256of512, (a), _mm512_mask_rcp23_ps( (b), lower256of512, (b)))) +#define pLsrPrimV256PointwiseFloat32Sqrt(dest, a) ((dest) = _mm512_mask_sqrt_ps( (dest), lower256of512, (a))) + +#define pLsrPrimV256ReduceAFloat32Plus(dest, init, a) reduce256AFloat32Plus(dest, init, a) + +#define reduce256AFloat32Plus(dest, init, a) \ + (dest) = (_mm512_mask_reduce_add_ps(lower256of512,a) + (init)); + +#define pLsrPrimV256Float32Lift(dest, a) \ + do { \ + float ftemp = a; \ + (dest) = _mm512_mask_extload_ps((dest), lower256of512, (void const *)(&ftemp),_MM_UPCONV_PS_NONE,_MM_BROADCAST_1X16,0); \ + } while (0) + +#define pLsrPrimV256DataBroadcastF32(dest, c0) pLsrPrimV512Float32Lift(dest, c0) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + (dest) = _mm512_mask_load_ps((dest), lower256of512, (void const *)((float*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_mask_loadunpacklo_ps((dest),lower256of512,(void *)((float*)((char*)arr+off) + idx)); \ + (dest) = _mm512_mask_loadunpackhi_ps((dest),lower256of512,(char *)((float*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#define STRIDE_VIA_GATHER + +#ifdef STRIDE_VIA_GATHER +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m512i indices, vstride; \ + indices = _mm512_set_epi32(0,0,0,0,0,0,0,0,7,6,5,4,3,2,1,0); \ + vstride = _mm512_set1_epi32(stride); \ + indices = _mm512_mask_mullo_epi32(indices,lower256of512,indices,vstride); \ + _mm512_mask_i32scatter_ps((float*)((char*)arr + off) + idx,lower256of512,indices,(src),_MM_SCALE_1); \ + } while (0) +#else +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) float ftemp[16]; \ + _mm512_mask_store_ps(ftemp, lower256of512, src); \ + *((float*)((char*)arr + off) + idx + 
0*stride) = ftemp[0]; \ + *((float*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((float*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((float*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((float*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((float*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((float*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((float*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + _mm512_mask_store_ps((void const *)((float*)((char*)arr+off) + idx), lower256of512, v) +#else +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + do { \ + _mm512_mask_packstorelo_ps((void *)((float*)((char*)arr+off) + idx),lower256of512,v); \ + _mm512_mask_packstorehi_ps((char *)((float*)((char*)arr+off) + idx) + 64,lower256of512,v); \ + } while (0) +#endif + +#if 0 + if(stride <= 8 && stride/2%==0) { \ + (dest) = _mm512_mask_i32gather_ps((dest),lower256of512,,(float*)((char*)arr + off) + idx,stride); \ + } else { \ + (dest) = _mm512_mask_i32gather_ps((dest),lower256of512,,(float*)((char*)arr + off) + idx,_MM_SCALE_1); \ + } +#endif + +#ifdef STRIDE_VIA_GATHER +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + __m512i indices, vstride; \ + indices = _mm512_set_epi32(0,0,0,0,0,0,0,0,7,6,5,4,3,2,1,0); \ + vstride = _mm512_set1_epi32(stride); \ + indices = _mm512_mask_mullo_epi32(indices,lower256of512,indices,vstride); \ + (dest) = _mm512_mask_i32gather_ps((dest),lower256of512,indices,(float*)((char*)arr + off) + idx,_MM_SCALE_1); \ + } while (0) +#else +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_ps(0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + *((float*)((char*)arr + off) + idx + 7*stride), \ + *((float*)((char*)arr + off) + idx + 6*stride), \ + *((float*)((char*)arr + off) + idx + 5*stride), \ + *((float*)((char*)arr + off) + idx + 4*stride), \ + *((float*)((char*)arr + off) + idx + 3*stride), \ + *((float*)((char*)arr + off) + idx + 2*stride), \ + *((float*)((char*)arr + off) + idx + 1*stride), \ + *((float*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) +#endif + +#define pLsrPrimV256F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV256F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) \ + (dest) = _mm512_mask_load_epi32((dest), lower256of512, (void const *)((uint32_t*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_mask_loadunpacklo_epi32((dest),lower256of512,(void *)((uint32_t*)((char*)arr+off) + idx)); \ + (dest) = _mm512_mask_loadunpackhi_epi32((dest),lower256of512,(char *)((uint32_t*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#define pLsrPrimV256UInt32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_epi32(0, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0, \ + *((uint32_t*)((char*)arr + off) + idx + 7*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 
6*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 5*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 4*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 3*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 2*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 1*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV256B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV256UInt32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) uint32_t ftemp[16]; \ + _mm512_mask_store_epi32(ftemp, lower256of512, src); \ + *((uint32_t*)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((uint32_t*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((uint32_t*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((uint32_t*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((uint32_t*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((uint32_t*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((uint32_t*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((uint32_t*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256UInt32Store(arr, off, idx, v) \ + _mm512_mask_store_epi32((void const *)((uint32_t*)((char*)arr+off) + idx), lower256of512, v) +#else +#define pLsrPrimV256UInt32Store(arr, off, idx, v) \ + do { \ + _mm512_mask_packstorelo_epi32((void *)((uint32_t*)((char*)arr+off) + idx),lower256of512,v); \ + _mm512_mask_packstorehi_epi32((char *)((uint32_t*)((char*)arr+off) + idx) + 64,lower256of512,v); \ + } while (0) +#endif + +#define pLsrPrimV256B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#else // HAS_256 + +typedef __m256 PlsrVector256F32; +typedef __m256i PlsrVector256B32; +typedef __m256d PlsrVector256F64; +typedef __m256i PlsrVector256Ref; + +// ***************************************************************************** +// Conversions +// ***************************************************************************** + +#define pLsrPrimV256256ConvertFloat32FromUInt32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) +#define pLsrPrimV256256ConvertUInt32FromFloat32(dest, a) ((dest) = _mm256_cvtps_epi32(a)) +#define pLsrPrimV256128ConvertFloat64FromFloat32(dest, a) ((dest) = _mm256_cvtps_pd(a)) + +// vec128 -> vec256 +#define pLsrPrimV256U32ToF64(dest, a) ((dest) = _mm256__cvtepi32_pd (_mm_movpi64_epi64(a))) +#define pLsrPrimV256F64ToU32(dest, a) ((dest) = _mm256__movepi64_pi64 (_mm256_cvtpd_epi32(a))) +#define pLsrPrimV256256CastSInt32ToFloat32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) + +// ***************************************************************************** +// Data operations on the more abstract B32/F32/F64 types +// ***************************************************************************** + +#define pLsrPrimV256DataVectorB32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV256DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + + // Subscripting operations (chose between the lower or higher part) + + +#define 
pLsrPrimV256DataSub0B32(dest, a) pLsrPrimV256DataSubB32L(dest, 0, a) +#define pLsrPrimV256DataSub1B32(dest, a) pLsrPrimV256DataSubB32L(dest, 1, a) +#define pLsrPrimV256DataSub2B32(dest, a) pLsrPrimV256DataSubB32L(dest, 2, a) +#define pLsrPrimV256DataSub3B32(dest, a) pLsrPrimV256DataSubB32L(dest, 3, a) +#define pLsrPrimV256DataSub4B32(dest, a) pLsrPrimV256DataSubB32H(dest, 0, a) +#define pLsrPrimV256DataSub5B32(dest, a) pLsrPrimV256DataSubB32H(dest, 1, a) +#define pLsrPrimV256DataSub6B32(dest, a) pLsrPrimV256DataSubB32H(dest, 2, a) +#define pLsrPrimV256DataSub7B32(dest, a) pLsrPrimV256DataSubB32H(dest, 3, a) + +#define pLsrPrimV256DataSub0F32(dest, a) pLsrPrimV256DataSubF32L(dest, 0, a) +#define pLsrPrimV256DataSub1F32(dest, a) pLsrPrimV256DataSubF32L(dest, 1, a) +#define pLsrPrimV256DataSub2F32(dest, a) pLsrPrimV256DataSubF32L(dest, 2, a) +#define pLsrPrimV256DataSub3F32(dest, a) pLsrPrimV256DataSubF32L(dest, 3, a) +#define pLsrPrimV256DataSub4F32(dest, a) pLsrPrimV256DataSubF32H(dest, 0, a) +#define pLsrPrimV256DataSub5F32(dest, a) pLsrPrimV256DataSubF32H(dest, 1, a) +#define pLsrPrimV256DataSub6F32(dest, a) pLsrPrimV256DataSubF32H(dest, 2, a) +#define pLsrPrimV256DataSub7F32(dest, a) pLsrPrimV256DataSubF32H(dest, 3, a) + +#define pLsrPrimV256DataSubB32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32L_help(sub, a)) +static inline int pLsrPrimV256DataSubB32L_help(int sub, __m256i a) { + __m128i loA = _mm256_extractf128_si256(a, 0); + return (_mm_extract_epi32(loA, sub)); +} +#define pLsrPrimV256DataSubB32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32H_help(sub, a)) +static inline int pLsrPrimV256DataSubB32H_help(int sub, __m256i a) { + __m128i hiA = _mm256_extractf128_si256(a, 1); + return (_mm_extract_epi32(hiA, sub)); +} + +#define pLsrPrimV256DataSubF32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32H_help(sub, a)) +static inline float pLsrPrimV256DataSubF32H_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 1); + //__m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + // return *out; + float out; + _MM_EXTRACT_FLOAT(out, hiA, sub); + return out; + +} + +#define pLsrPrimV256DataSubF32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32L_help(sub, a)) +static inline float pLsrPrimV256DataSubF32L_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 0); + // __m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + float out; + _MM_EXTRACT_FLOAT(out, hiA, sub); + return out; +} + +#define pLsrPrimV256DataSub0F64(dest, a) pLsrPrimV256DataSubF64(dest, 0, a) +#define pLsrPrimV256DataSub1F64(dest, a) pLsrPrimV256DataSubF64(dest, 1, a) +#define pLsrPrimV256DataSub2F64(dest, a) pLsrPrimV256DataSubF64(dest, 2, a) +#define pLsrPrimV256DataSub3F64(dest, a) pLsrPrimV256DataSubF64(dest, 3, a) + +#define pLsrPrimV256DataSubF64(dest, sub, a) ((dest) = ((double*)&a)[sub]) + +// loads and stores + +#define pLsrPrimV256B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F32StoreVS(arr, offset, idx, stride, v) \ + do { \ 
+ if (stride == 1) { \ + pLsrPrimV256Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + + +#define pLsrPrimV256B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV256F64LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float64LoadVectorHelp_idxV; \ + __m256d pLsrPrimV256Float64LoadVectorHelp_res; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float64LoadVectorHelp_idxV = vi; \ + pLsrPrimV256Float64LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float64LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float64LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float64LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float64LoadVectorHelp_res = \ + _mm256_set_pd(*((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx3), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx2), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx1), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx0)); \ + (dest) = pLsrPrimV256Float64LoadVectorHelp_res; \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F32LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + uint32 
pLsrPrimV256Float32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_castps128_ps256(pLsrPrimV256Float32LoadVectorHelp_lo); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ + +/* scalar array, scalar offset, vector of indices */ +/* For some insane reason, the only way to load out a float from an __m128 register + * is as an integer containing the binary representation of the float. 
-leaf */ +#define pLsrPrimV256F32StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float32StoreVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float32StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F64StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float64StoreVectorHelp_idxV; \ + __m128d pLsrPrimV256Float64StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float64StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float64StoreVectorHelp_idxV = vi; \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 0); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 1); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), 
pLsrPrimV256Float64StoreVectorHelp_srcV); \ + \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 1); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 2); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 3); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + } while (0) + + +#define pLsrPrimV256B32LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 0) { \ + pLsrPrimV256Float32LoadVector(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadVectorStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) \ + +#define pLsrPrimV256B32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256RefLoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256RefLoadVectorStrided(dest, arr, offset, idx, stride) + + +// ***************************************************************************** +// Type-specific arithemetic and memory operations +// ***************************************************************************** + +// ================ +// UInt32 - 256-bit +// ================ + +// ***************************************************************************** +// Arithmetic +// ***************************************************************************** + +#define pLsrPrimV256PointwiseUInt32Plus(dest, a, b) vector256PointwiseUInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Minus(dest, a, b) vector256PointwiseUInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Times(dest, a, b) vector256PointwiseUInt32Times(dest, a, b) + +#define pLsrPrimV256PointwiseSInt32Plus(dest, a, b) vector256PointwiseSInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Minus(dest, a, b) vector256PointwiseSInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Times(dest, a, b) vector256PointwiseSInt32Times(dest, a, b) + +#define integer256via128(dest, a, b, op) \ + do { \ + __m256i integer256via128_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i integer256via128_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i integer256via128_loA = _mm256_extractf128_si256(a, 0); \ + __m128i integer256via128_hiB = _mm256_extractf128_si256(b, 1); \ + __m128i integer256via128_loB = _mm256_extractf128_si256(b, 0); \ + __m128i integer256via128_loTemp; \ + __m128i integer256via128_hiTemp; \ + op(integer256via128_loTemp, integer256via128_loA, integer256via128_loB); \ + op(integer256via128_hiTemp, integer256via128_hiA, integer256via128_hiB); \ + 
integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_hiTemp, 1); \ + integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_loTemp, 0); \ + (dest) = integer256via128_dest; \ +} while (0) + +#define vector256PointwiseUInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Plus) +#define vector256PointwiseUInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Minus) +#define vector256PointwiseUInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Times) + +#define vector256PointwiseSInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Plus) +#define vector256PointwiseSInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Minus) +#define vector256PointwiseSInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Times) + +// ***************************************************************************** +// Reductions (horizontals) +// ***************************************************************************** + +#define pLsrPrimV256ReduceAUInt32Plus(dest, init, a) vector256reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Times(dest, init, a) vector256reduceAUInt32Times(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Max(dest, init, a) vector256reduceAUInt32Max(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Min(dest, init, a) vector256reduceAUInt32Min(dest, init, a) + +#define vector256reduceAUInt32Plus(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Plus_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Plus_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Plus_aRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_aRes, init, vector256reduceAUInt32Plus_hiA); \ + uint32 vector256reduceAUInt32Plus_bRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_bRes, init, vector256reduceAUInt32Plus_loA); \ + (dest) = (vector256reduceAUInt32Plus_aRes + vector256reduceAUInt32Plus_bRes); \ + } while (0) + +#define vector256reduceAUInt32Times(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Times_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Times_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Times_aRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_aRes, init, vector256reduceAUInt32Times_hiA); \ + uint32 vector256reduceAUInt32Times_bRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_bRes, init, vector256reduceAUInt32Times_loA); \ + (dest) = (vector256reduceAUInt32Times_aRes * vector256reduceAUInt32Times_bRes); \ + } while (0) + +#define vector256reduceAUInt32Max(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Max_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Max_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Max_aRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_aRes, init, vector256reduceAUInt32Max_hiA); \ + uint32 vector256reduceAUInt32Max_bRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_bRes, init, vector256reduceAUInt32Max_loA); \ + (dest) = (max(vector256reduceAUInt32Max_aRes, vector256reduceAUInt32Max_bRes)); \ + } while (0) + +#define vector256reduceAUInt32Min(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Min_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Min_loA = 
_mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Min_aRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_aRes, init, vector256reduceAUInt32Min_hiA); \ + uint32 vector256reduceAUInt32Min_bRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_bRes, init, vector256reduceAUInt32Min_loA); \ + (dest) = (min(vector256reduceAUInt32Min_aRes, vector256reduceAUInt32Min_bRes)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256UInt32Lift(dest, a) vector256UInt32Lift(dest, a) +#define pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define vector256UInt32Lift(dest, a) \ + do { \ + __m256i vector256UInt32Lift_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Lift_lo = _mm_set1_epi32(a); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 1); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 0); \ + (dest) = vector256UInt32Lift_dest; \ + } while (0) + +#define vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + do { \ + __m256i vector256UInt32Const_out = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Const_lo = _mm_set_epi32(c3, c2, c1, c0); \ + __m128i vector256UInt32Const_hi = _mm_set_epi32(c7, c6, c5, c4); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_hi, 1); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_lo, 0); \ + (dest) = vector256UInt32Const_out; \ + } while (0) + +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) pLsrPrimV256UInt32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV256UInt32Store(arr, off, idx, v) pLsrPrimV256UInt32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV256UInt32LoadHelp(dest, arr, off, idx) \ + do { \ + __m256i* pLsrPrimV256UInt32LoadHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + (dest) = _mm256_loadu_si256(pLsrPrimV256UInt32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV256UInt32StoreHelp(arr, off, idx, v) \ + do { \ + __m256i* pLsrPrimV256UInt32StoreHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + _mm256_storeu_si256(pLsrPrimV256UInt32StoreHelp_marr, v); \ + } while (0) + +// ===== +// Float +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256 pLsrPrimV256Float32Zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat32Plus(dest, a, b) ((dest) = _mm256_add_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Minus(dest, a, b) ((dest) = _mm256_sub_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Times(dest, a, b) ((dest) = _mm256_mul_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Divide(dest, a, b) ((dest) = _mm256_div_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Negate(dest, a) ((dest) = _mm256_sub_ps(pLsrPrimV256Float32Zero, a)) +#define pLsrPrimV256PointwiseFloat32Max(dest, a, b) ((dest) = _mm256_max_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Sqrt(dest, a) ((dest) = _mm256_sqrt_ps(a)) + +// ***************************************************************************** +// Reductions +// 
***************************************************************************** + +#define pLsrPrimV256ReduceAFloat32Plus(dest, init, a) reduce256AFloat32Plus(dest, init, a) + +#define reduce256AFloat32Plus(dest, init, a) \ + do { \ + __m256 reduceAFloat32Plus_a_ = a; \ + __m128 rf32_hi = _mm256_extractf128_ps(reduceAFloat32Plus_a_, 1); \ + __m256 rf32_tmp = _mm256_insertf128_ps(reduceAFloat32Plus_a_, rf32_hi, 0); \ + __m256 reduceAFloat32Plus_tmp = _mm256_hadd_ps(reduceAFloat32Plus_a_, rf32_tmp); \ + __m256 reduceAFloat32Plus_tmp2 = _mm256_hadd_ps(reduceAFloat32Plus_tmp, reduceAFloat32Plus_tmp); \ + __m256 reduceAFloat32Plus_tmp3 = _mm256_hadd_ps(reduceAFloat32Plus_tmp2, reduceAFloat32Plus_tmp2); \ + float reduceAFloat32Plus_dest_; \ + pLsrPrimV256DataSubF32L(reduceAFloat32Plus_dest_, 0, reduceAFloat32Plus_tmp3); \ + (dest) = (reduceAFloat32Plus_dest_ + (init)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + do { \ + float* pLsrPrimV256Float32Load_marr = ((float*)((char*)arr+off) + idx); \ + __m256 float32load = _mm256_loadu_ps(pLsrPrimV256Float32Load_marr); \ + (dest) = float32load; \ + } while (0) + +#define pLsrPrimV256Float32LoadVector(dest, arr, off, idx) \ + pLsrPrimV256Float32LoadVectorHelp (dest, arr, off, idx) + +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + do { \ + float* pLsrPrimV256Float32Store_marr = ((float*)((char*)arr+off) + idx); \ + _mm256_storeu_ps(pLsrPrimV256Float32Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm256_set_ps(*((float32*)((char*)arr + off) + idx + 7*stride), \ + *((float32*)((char*)arr + off) + idx + 6*stride), \ + *((float32*)((char*)arr + off) + idx + 5*stride), \ + *((float32*)((char*)arr + off) + idx + 4*stride), \ + *((float32*)((char*)arr + off) + idx + 3*stride), \ + *((float32*)((char*)arr + off) + idx + 2*stride), \ + *((float32*)((char*)arr + off) + idx + stride), \ + *((float32*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128 pLsrPrimV256Float32StoreStrided_srcV; \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 0); \ + *((uint32*)((char*)arr + off) + idx + 0) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 2*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 3*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 1); \ + *((uint32*)((char*)arr + off) + idx + 4*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + 5*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 6*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 7*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + } while (0) + +#define pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, 
c5, c6, c7) \ + ((dest) = _mm256_set_ps(c7, c6, c5, c4, c3, c2, c1, c0)) + +#define pLsrPrimV256Float32Lift(dest, a) \ + do { \ + float temp = a; \ + (dest) = _mm256_broadcast_ss((float *)(&temp)); \ + } while(0) + + +#define pLsrPrimV256Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_ptrV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr0; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr1; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr2; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_ptrV = _mm256_extractf128_si256(arr, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_ptrV =_mm256_extractf128_si256(arr, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_lo, \ + 0); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +// ===== +// Double +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256d pLsrPrimV256Float64Zero = {0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat64Plus(dest, a, b) ((dest) = _mm256_add_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Minus(dest, a, b) ((dest) = _mm256_sub_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Times(dest, a, b) ((dest) = _mm256_mul_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Divide(dest, a, b) ((dest) = _mm256_div_pd(a, b)) +#define 
pLsrPrimV256PointwiseFloat64Negate(dest, a) ((dest) = _mm256_sub_pd(pLsrPrimV256Float64Zero, a)) +#define pLsrPrimV256PointwiseFloat64Max(dest, a, b) ((dest) = _mm256_max_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Sqrt(dest, a) ((dest) = _mm256_sqrt_pd(a)) + +#define pLsrPrimV256PointwiseSInt32DivT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32DivT_a; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_b; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32DivT_res; \ + \ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 0); \ + pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rlo, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b);\ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rhi, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32DivT_rlo); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32DivT_res, \ + pLsrPrimV256PointwiseSInt32DivT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32DivT_res; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt32ModT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32ModT_a; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_b; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32ModT_res; \ + \ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 0); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rlo, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b);\ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rhi, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32ModT_rlo); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32ModT_res, \ + pLsrPrimV256PointwiseSInt32ModT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32ModT_res; \ + } while (0) + +// ***************************************************************************** +// Reductions +// ***************************************************************************** + +#define pLsrPrimV256ReduceAFloat64Plus(dest, init, a) \ + do { \ + __m256d reduceAFloat64Plus_p_ = a; \ + __m128d reduceAFloat64Plus_p_lo = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 0); \ + __m128d reduceAFloat64Plus_p_hi = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 1); \ + __m128d reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_lo, reduceAFloat64Plus_p_hi); \ + reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_tmp, reduceAFloat64Plus_p_tmp); \ + double reduceAFloat64Plus_dest_; \ + pLsrPrimV256DataSubF64(reduceAFloat64Plus_dest_, 0, reduceAFloat64Plus_p_tmp); \ + (dest) = (reduceAFloat64Plus_dest_ + (init)); \ + } while (0) + +// 
***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float64Load(dest, arr, off, idx) \ + do { \ + double* pLsrPrimV256Float64Load_marr = ((double*)((char*)arr+off) + idx); \ + __m256d float64load = _mm256_loadu_pd(pLsrPrimV256Float64Load_marr); \ + (dest) = float64load; \ + } while (0) + +#define pLsrPrimV256Float64Store(arr, off, idx, v) \ + do { \ + double* pLsrPrimV256Float64Store_marr = ((double*)((char*)arr+off) + idx); \ + _mm256_storeu_pd(pLsrPrimV256Float64Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm256_set_pd(*((float64*)((char*)arr + off) + idx + 3*stride), \ + *((float64*)((char*)arr + off) + idx + 2*stride), \ + *((float64*)((char*)arr + off) + idx + stride), \ + *((float64*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128d pLsrPrimV256Float64StoreStrided_srcV; \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 0), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + stride), pLsrPrimV256Float64StoreStrided_srcV); \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 1); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 2*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + 3*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + } while (0) + +#define pLsrPrimV256Float64Lift(dest, a) \ + do { \ + double temp = a; \ + (dest) = _mm256_broadcast_sd((double *)(&temp)); \ + } while(0) + +// ================ +// Ref - 256-bit +// ================ + +#define pLsrPrimV256RefLoad(dest, arr, off, idx) pLsrPrimV256UInt32Load(dest, arr, off, idx) + +#endif + +#endif _PLSR_PRIMS_VECTOR_MIC_H_ diff --git a/runtime/include/hrc/plsr-prims-vector-sse.h b/runtime/include/hrc/plsr-prims-vector-sse.h new file mode 100644 index 0000000..2263dd1 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-vector-sse.h @@ -0,0 +1,1224 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_VECTOR_SSE_H_ +#define _PLSR_PRIMS_VECTOR_SSE_H_ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif /* P_USE_PILLAR */ + +#ifdef PLSR_VECTOR_USE_ASM +// define the vector types +/* typedef float __m64 __attribute__ ((__vector_size__ (8))); +typedef int __m128i __attribute__ ((__vector_size__ (16))); +typedef double __m128d __attribute__ ((__vector_size__ (16))); +typedef float __m128 __attribute__ ((__vector_size__ (16))); */ + +/* +struct floatVec { + float elems[4]; +}; +typedef struct floatVec __m128; + +struct floatVecShort { + float elems[2]; +}; +typedef struct floatVecShort __m64; + +struct doubleVec { + double elems[2]; +}; +typedef struct doubleVec __m128d; */ + +typedef union __m128i { + __int8 m128i_i8[16]; + __int16 m128i_i16[8]; + __int32 m128i_i32[4]; + __int64 m128i_i64[2]; + unsigned __int8 m128i_u8[16]; + unsigned __int16 m128i_u16[8]; + unsigned __int32 m128i_u32[4]; + unsigned __int64 m128i_u64[2]; +} __m128i; + +typedef struct __m128r { + ref m128i_u32[4]; +} __m128r; + +typedef struct __m128d { + double m128d_f64[2]; +} __m128d; + 
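+/* When PLSR_VECTOR_USE_ASM is defined the compiler's intrinsic headers are not
+ * included, so __m128i/__m128d/__m128/__m64 are declared by hand as unions and
+ * structs over their lane arrays (MSVC-style member names).  Lanes can then be
+ * read directly; a small sketch, assuming a __m128i value v:
+ *
+ *     uint32 lane0 = v.m128i_u32[0];   // lowest 32-bit lane of v
+ */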
+typedef union __m128 { + float m128_f32[4]; + unsigned __int64 m128_u64[2]; + __int8 m128_i8[16]; + __int16 m128_i16[8]; + __int32 m128_i32[4]; + __int64 m128_i64[2]; + unsigned __int8 m128_u8[16]; + unsigned __int16 m128_u16[8]; + unsigned __int32 m128_u32[4]; + } __m128; + +typedef union __m64 +{ + unsigned __int64 m64_u64; + float m64_f32[2]; + __int8 m64_i8[8]; + __int16 m64_i16[4]; + __int32 m64_i32[2]; + __int64 m64_i64; + unsigned __int8 m64_u8[8]; + unsigned __int16 m64_u16[4]; + unsigned __int32 m64_u32[2]; + } __m64; + +#else +#ifdef P_USE_PILLAR +#define __ICL_INTRINCC __pcdecl +#endif /* P_USE_PILLAR */ +#include +#endif + +typedef __m64 PlsrVector64F32; + +typedef __m128 PlsrVector128F32; +typedef __m128i PlsrVector128B32; +typedef __m128d PlsrVector128F64; +typedef __m128i PlsrVector128Ref; + +typedef __m128 PlsrVectorMask128Fs32; +typedef __m128 PlsrVectorMask128Fs64; + +// ***************************************************************************** +// Comparison +// ***************************************************************************** +#if 0 +#define pLsrPrimV128PointwiseSInt32LT(dest, a, b) ((dest) = _mm_cmplt_epi32((a),(b))) +#define pLsrPrimV128PointwiseSInt32EQ(dest, a, b) ((dest) = _mm_cmpeq_epi32((a),(b))) +#define pLsrPrimV128PointwiseSInt32GT(dest, a, b) ((dest) = _mm_cmpgt_epi32((a),(b))) +#else +#define pLsrPrimV128CompareSInt32LT(dest, a, b) ((dest) = _mm_cmplt_epi32((a),(b))) +#define pLsrPrimV128CompareSInt32EQ(dest, a, b) ((dest) = _mm_cmpeq_epi32((a),(b))) +#define pLsrPrimV128CompareSInt32GT(dest, a, b) ((dest) = _mm_cmpgt_epi32((a),(b))) +#endif + +// ***************************************************************************** +// Conversions & Casts +// ***************************************************************************** + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128128ConvertFloat32FromUInt32(dest, a) vector128128ConvertFloat32FromUInt32(dest, a) +#define pLsrPrimV128128ConvertUInt32FromFloat32(dest, a) vector128128ConvertUInt32FromFloat32(dest, a) + +#define vector128128ConvertFloat32FromUInt32(dest, a) \ + do { \ + __asm__ ("cvtdq2ps %1, %0;" \ + : "=x"(dest) \ + : "xm"(a)); \ + } while (0) + +#define vector128128ConvertUInt32FromFloat32(dest, a) \ + do { \ + __asm__ ("cvtps2dq %1, %0;" \ + : "=x"(dest) \ + : "xm"(a)); \ + } while (0) + +#else + +#define pLsrPrimV128128ConvertFloat32FromUInt32(dest, a) ((dest) = _mm_cvtepi32_ps(a)) +#define pLsrPrimV128128ConvertUInt32FromFloat32(dest, a) ((dest) = _mm_cvtps_epi32(a)) +// vec64 -> vec128 +#define pLsrPrimV128128ConvertFloat64FromUInt32(dest, a) ((dest) = _mm_cvtepi32_pd (_mm_movpi64_epi64(a))) +#define pLsrPrimV128128ConvertUInt32FromFloat64(dest, a) ((dest) = _mm_movepi64_pi64 (_mm_cvtpd_epi32(a))) + +#define pLsrPrimV128128CastSInt32ToFloat32(dest, a) ((dest) = _mm_cvtepi32_ps(a)) + +#endif + +#define pLsrPrimV128128ConvertUInt32FromSInt32(dest, a) ((dest) = a) +#define pLsrPrimV128128ConvertSInt32FromUInt32(dest, a) ((dest) = a) + +// ***************************************************************************** +// Data operations on the more abstract B32/F32/F64 types +// ***************************************************************************** + +#define pLsrPrimV128DataVectorB32(dest, c0, c1, c2, c3) pLsrPrimV128UInt32Const(dest, c0, c1, c2, c3) +#define pLsrPrimV128DataVectorF32(dest, c0, c1, c2, c3) pLsrPrimV128Float32Const(dest, c0, c1, c2, c3) +#define pLsrPrimV128DataVectorF64(dest, c0, c1) pLsrPrimV128Float64Const(dest, c0, c1) + +#define 
pLsrPrimV128DataBroadcastB32(dest, c0) pLsrPrimV128UInt32Lift(dest, c0) +#define pLsrPrimV128DataBroadcastF32(dest, c0) pLsrPrimV128Float32Lift(dest, c0) +#define pLsrPrimV128DataBroadcastF64(dest, c0) pLsrPrimV128Float64Lift(dest, c0) + +// Subscripting operations + +#define pLsrPrimV128DataSub0B32(dest, a) pLsrPrimV128DataSubB32(dest, 0, a) +#define pLsrPrimV128DataSub1B32(dest, a) pLsrPrimV128DataSubB32(dest, 1, a) +#define pLsrPrimV128DataSub2B32(dest, a) pLsrPrimV128DataSubB32(dest, 2, a) +#define pLsrPrimV128DataSub3B32(dest, a) pLsrPrimV128DataSubB32(dest, 3, a) + +/* +static inline int pLsrPrimV128DataSubB32(int index, __m128i a) { + // _mm_extract_epi32 (SSE >= 4.1) +} */ + +#define pLsrPrimV128DataSubB32(dest, n, a) \ + do { \ + int pLsrPrimV128DataSubB32_x; \ + __asm__ ("pshufd %1, %2, %2;\n\t" \ + "movd %2, %0;\n\t" \ + : "=r"(pLsrPrimV128DataSubB32_x) \ + : "i"(n), "x"(a)); \ + (dest) = pLsrPrimV128DataSubB32_x; \ + } while (0) + + +#define pLsrPrimV128DataSub0F32(dest, a) pLsrPrimV128DataSubF32(dest, 0, a) +#define pLsrPrimV128DataSub1F32(dest, a) pLsrPrimV128DataSubF32(dest, 1, a) +#define pLsrPrimV128DataSub2F32(dest, a) pLsrPrimV128DataSubF32(dest, 2, a) +#define pLsrPrimV128DataSub3F32(dest, a) pLsrPrimV128DataSubF32(dest, 3, a) + +#define pLsrPrimV128DataSubF32(dest, sub, a) ((dest) = ((float*)&a)[sub]) + +#define pLsrPrimV128DataSub0F64(dest, a) pLsrPrimV128DataSubF64(dest, 0, a) +#define pLsrPrimV128DataSub1F64(dest, a) pLsrPrimV128DataSubF64(dest, 1, a) + +#define pLsrPrimV128DataSubF64(dest, sub, a) ((dest) = ((double*)&a)[sub]) + +// loads and stores + +#define pLsrPrimV128B32StoreF(arr, offset, v) pLsrPrimV128UInt32StoreF(arr, offset, v) +#define pLsrPrimV128B64StoreF(arr, offset, v) pLsrPrimV128UInt64StoreF(arr, offset, v) +#define pLsrPrimV128F32StoreF(arr, offset, v) pLsrPrimV128Float32StoreF(arr, offset, v) +#define pLsrPrimV128F64StoreF(arr, offset, v) pLsrPrimV128Float64StoreF(arr, offset, v) +#define pLsrPrimV128RefStoreF(arr, offset, v) pLsrPrimV128RefStoreF(arr, offset, v) + +#define pLsrPrimV128B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV128B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128UInt64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV128F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV128F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV128RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV128B32StoreVI(arr, offset, vi, v) pLsrPrimV128UInt32StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128B64StoreVI(arr, offset, vi, v) pLsrPrimV128UInt64StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128F32StoreVI(arr, offset, vi, v) 
pLsrPrimV128Float32StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128F64StoreVI(arr, offset, vi, v) pLsrPrimV128Float64StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128RefStoreVI(arr, offset, vi, v) pLsrPrimV128RefStoreIndexed(arr, offset, vi, v) + +#define pLsrPrimV128B32StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128UInt32StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128B64StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128UInt64StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128F32StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128Float32StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128F64StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128Float64StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128RefStoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128RefStoreVectorStrided(arr, offset, idx, stride, v) + +#define pLsrPrimV128B32StoreVVI(arr, offset, vi, v) pLsrPrimV128UInt32StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128B64StoreVVI(arr, offset, vi, v) pLsrPrimV128UInt64StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128F32StoreVVI(arr, offset, vi, v) pLsrPrimV128Float32StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128F64StoreVVI(arr, offset, vi, v) pLsrPrimV128Float64StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128RefStoreVVI(arr, offset, vi, v) pLsrPrimV128RefStoreVectorIndexed(arr, offset, vi, v) + +#define pLsrPrimV128B32LoadF(dest, arr, offset) pLsrPrimV128UInt32LoadF(dest, arr, offset) +#define pLsrPrimV128B64LoadF(dest, arr, offset) pLsrPrimV128UInt64LoadF(dest, arr, offset) +#define pLsrPrimV128F32LoadF(dest, arr, offset) pLsrPrimV128Float32LoadF(dest, arr, offset) +#define pLsrPrimV128F64LoadF(dest, arr, offset) pLsrPrimV128Float64LoadF(dest, arr, offset) +#define pLsrPrimV128RefLoadF(dest, arr, offset) pLsrPrimV128RefLoadF(dest, arr, offset) + +#define pLsrPrimV128B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV128B32LoadVI(dest, arr, offset, vi) pLsrPrimV128UInt32LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128B64LoadVI(dest, arr, offset, vi) pLsrPrimV128UInt64LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F32LoadVI(dest, arr, offset, vi) pLsrPrimV128Float32LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F32LoadVI64(dest, arr, offset, vi) 
pLsrPrimV128Float32LoadIndexed64(dest, arr, offset, vi) +#define pLsrPrimV128F64LoadVI(dest, arr, offset, vi) pLsrPrimV128Float64LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128RefLoadVI(dest, arr, offset, vi) pLsrPrimV128RefLoadIndexed(dest, arr, offset, vi) + +#define pLsrPrimV128B32LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128B64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128F32LoadVectorStrided(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 0) { \ + pLsrPrimV128Float32LoadVector(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128Float32LoadVectorStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) \ + +#define pLsrPrimV128F64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128Float64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128RefLoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128RefLoadVectorStrided(dest, arr, offset, idx, stride) + +#define pLsrPrimV128B32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128B64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128F32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128F32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128F64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128F64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128RefLoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128RefLoadVectorStrided(dest, arr, offset, idx, stride) + +#define pLsrPrimV128B32LoadVVI(dest, arr, offset, vi) pLsrPrimV128UInt32LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128B64LoadVVI(dest, arr, offset, vi) pLsrPrimV128UInt64LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F32LoadVVI(dest, arr, offset, vi) pLsrPrimV128Float32LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F64LoadVVI(dest, arr, offset, vi) pLsrPrimV128Float64LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128RefLoadVVI(dest, arr, offset, vi) pLsrPrimV128RefLoadVectorIndexed(dest, arr, offset, vi) + +// ***************************************************************************** +// Type-specific arithemetic and memory operations +// ***************************************************************************** + +// ================ +// UInt32 - 128-bit +// ================ + +// Arithmetic + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128PointwiseUInt32Plus(dest, a, b) vector128PointwiseUInt32Plus(dest, a, b) +#define pLsrPrimV128PointwiseUInt32Minus(dest, a, b) vector128PointwiseUInt32Minus(dest, a, b) +#define pLsrPrimV128PointwiseUInt32Times(dest, a, b) vector128PointwiseUInt32Times(dest, a, b) + +#define vector128PointwiseUInt32Plus(dest, a, b) \ + do { \ + __m128i vector128PointwiseUInt32Plus_c = b; \ + __asm__ ("paddd %2, %0;\n\t" \ + : "=x"(vector128PointwiseUInt32Plus_c) \ + : "0"(vector128PointwiseUInt32Plus_c), "x"(a)); \ + (dest) = vector128PointwiseUInt32Plus_c; \ + } while (0) + +#define vector128PointwiseUInt32Minus(dest, a, b) \ + do { \ + __m128i vector128PointwiseUInt32Minus_c = b; \ + __asm__ ("psubd %2, %0;\n\t" \ + : "=x"(vector128PointwiseUInt32Minus_c) \ + : "0"(vector128PointwiseUInt32Minus_c), "x"(a)); \ + (dest) = 
vector128PointwiseUInt32Minus_c; \ + } while (0) + +#define vector128PointwiseUInt32Times(dest, a, b) \ + do { \ + __m128i vector128PointwiseUInt32Times_c; \ + __asm__ ("movdqu %1, %%xmm0;\n\t" \ + "movdqu %2, %%xmm1;\n\t" \ + "psrldq $0x04, %%xmm0;\n\t" /* c */ \ + "psrldq $0x04, %%xmm1;\n\t" /* d */ \ + "pmuludq %%xmm0, %%xmm1;\n\t" /* xmm1 = c*d */ \ + "pmuludq %1, %2;\n\t" /* v = a*b */ \ + "pshufd $0x08, %2, %0;\n\t" \ + "pshufd $0x08, %%xmm1, %%xmm1;\n\t" \ + "punpckldq %%xmm1, %0;\n\t" \ + : "=x"(vector128PointwiseUInt32Times_c) \ + : "x"(a), "x"(b) \ + : "xmm0", "xmm1"); \ + (dest) = vector128PointwiseUInt32Times_c; \ + } while (0) + +#else + +#define pLsrPrimV128PointwiseUInt32Plus(dest, a, b) ((dest) = _mm_add_epi32(a, b)) +#define pLsrPrimV128PointwiseUInt32Minus(dest, a, b) ((dest) = _mm_sub_epi32(a, b)) +#define pLsrPrimV128PointwiseUInt32Times(dest, a, b) u32_mul(dest, a, b) // _mm_mullo_epi32(a, b)) <- SSE 4.1 +#define pLsrPrimV128PointwiseUInt32Max(a, b) (_mm_max_epi32(a, b)) //not supported <- SSE 4.1 + +#define pLsrPrimV128PointwiseSInt32Plus(dest, a, b) ((dest) = _mm_add_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Minus(dest, a, b) ((dest) = _mm_sub_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Times(dest, a, b) ((dest) = _mm_mullo_epi32(a, b)) //<- SSE 4.1 + +#define pLsrPrimV128PointwiseUInt32BAnd(dest, a, b) ((dest) = _mm_and_si128((a), (b))) + +/* +#define pLsrPrimV128PointwiseUInt32BShiftR(dest, a, b) \ + do { \ + int pLsrPrimV128PointwiseUInt32BShiftR_count = _mm_extract_epi32(b, 0); \ + (dest) = _mm_srli_epi32((a), pLsrPrimV128PointwiseUInt32BShiftR_count); \ + } while (0) + +#define pLsrPrimV128PointwiseUInt32BShiftL(dest, a, b) \ + do { \ + int pLsrPrimV128PointwiseUInt32BShiftL_count = _mm_extract_epi32(b, 0); \ + (dest) = _mm_slli_epi32((a), pLsrPrimV128PointwiseUInt32BShiftL_count); \ + } while (0) +*/ + +#define pLsrPrimV128PointwiseUInt32BShiftR(dest, a, b) \ + do { \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R0 = _mm_extract_epi32(a, 0) >> _mm_extract_epi32(b, 0) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R1 = _mm_extract_epi32(a, 1) >> _mm_extract_epi32(b, 1) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R2 = _mm_extract_epi32(a, 2) >> _mm_extract_epi32(b, 2) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R3 = _mm_extract_epi32(a, 3) >> _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseUInt32BShiftR_R3, \ + pLsrPrimV128PointwiseUInt32BShiftR_R2, \ + pLsrPrimV128PointwiseUInt32BShiftR_R1, \ + pLsrPrimV128PointwiseUInt32BShiftR_R0); \ + } while (0) + +#define pLsrPrimV128PointwiseUInt32BShiftL(dest, a, b) \ + do { \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R0 = _mm_extract_epi32(a, 0) << _mm_extract_epi32(b, 0) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R1 = _mm_extract_epi32(a, 1) << _mm_extract_epi32(b, 1) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R2 = _mm_extract_epi32(a, 2) << _mm_extract_epi32(b, 2) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R3 = _mm_extract_epi32(a, 3) << _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseUInt32BShiftL_R3, \ + pLsrPrimV128PointwiseUInt32BShiftL_R2, \ + pLsrPrimV128PointwiseUInt32BShiftL_R1, \ + pLsrPrimV128PointwiseUInt32BShiftL_R0); \ + } while (0) + +#define pLsrPrimV128PointwiseSInt32DivT(dest, a, b) \ + do { \ + sint32 pLsrPrimV128PointwiseSInt32DivT_R0 = _mm_extract_epi32(a, 0) / _mm_extract_epi32(b, 0) ; \ + sint32 pLsrPrimV128PointwiseSInt32DivT_R1 = _mm_extract_epi32(a, 1) / _mm_extract_epi32(b, 1) ; \ + sint32 
pLsrPrimV128PointwiseSInt32DivT_R2 = _mm_extract_epi32(a, 2) / _mm_extract_epi32(b, 2) ; \ + sint32 pLsrPrimV128PointwiseSInt32DivT_R3 = _mm_extract_epi32(a, 3) / _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseSInt32DivT_R3, \ + pLsrPrimV128PointwiseSInt32DivT_R2, \ + pLsrPrimV128PointwiseSInt32DivT_R1, \ + pLsrPrimV128PointwiseSInt32DivT_R0); \ + } while (0) + +#define pLsrPrimV128PointwiseSInt32ModT(dest, a, b) \ + do { \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R0 = _mm_extract_epi32(a, 0) % _mm_extract_epi32(b, 0) ; \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R1 = _mm_extract_epi32(a, 1) % _mm_extract_epi32(b, 1) ; \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R2 = _mm_extract_epi32(a, 2) % _mm_extract_epi32(b, 2) ; \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R3 = _mm_extract_epi32(a, 3) % _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseSInt32ModT_R3, \ + pLsrPrimV128PointwiseSInt32ModT_R2, \ + pLsrPrimV128PointwiseSInt32ModT_R1, \ + pLsrPrimV128PointwiseSInt32ModT_R0); \ + } while (0) + + +// Multiplication routine for SSE < 4.1 +#define u32_mul(u32_mul_dest, u32_mul_a, u32_mul_b) \ + do { \ + __m128i u32_mul_c = _mm_srli_si128(u32_mul_a, 4); \ + __m128i u32_mul_d = _mm_srli_si128(u32_mul_b, 4); \ + __m128i u32_mul_u = _mm_mul_epu32(u32_mul_a, u32_mul_b); \ + __m128i u32_mul_v = _mm_mul_epu32(u32_mul_c, u32_mul_d); \ + __m128i u32_mul_r1 = _mm_shuffle_epi32(u32_mul_u, 8); \ + __m128i u32_mul_r2 = _mm_shuffle_epi32(u32_mul_v, 8); \ + (u32_mul_dest) = _mm_unpacklo_epi32(u32_mul_r1, u32_mul_r2); \ + } while (0) + +#endif + +// Reductions (horizontals) + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128ReduceAUInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV128ReduceASInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) + +#define reduceAUInt32Plus(dest, init, a) \ + do { \ + uint32 reduceAUInt32Plus_c; \ + __asm__("phaddd %1, %1;\n\t" \ + "phaddd %1, %1;\n\t" \ + "movd %1, %0;\n\t" \ + : "=r"(reduceAUInt32Plus_c) \ + : "x"(a)); \ + (dest) = (reduceAUInt32Plus_c + init); \ + } while (0) + +#else + +#define pLsrPrimV128ReduceAUInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV128ReduceAUInt32Times(dest, init, a) reduceAUInt32Times(dest, init, a) +#define pLsrPrimV128ReduceAUInt32Max(dest, init, a) reduceAUInt32Max(dest, init, a) +#define pLsrPrimV128ReduceAUInt32Min(dest, init, a) reduceAUInt32Min(dest, init, a) + +#define pLsrPrimV128ReduceASInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) + +#define reduceAUInt32Plus(reduceAUInt32Plus_dest, reduceAUInt32Plus_init, reduceAUInt32Plus_a) \ + do { \ + /* probably not a great implementation */ \ + __m128i reduceAUInt32Plus_tmp = _mm_hadd_epi32(reduceAUInt32Plus_a, reduceAUInt32Plus_a); \ + __m128i reduceAUInt32Plus_tmp2 = _mm_hadd_epi32(reduceAUInt32Plus_tmp, reduceAUInt32Plus_tmp); \ + uint32 reduceAUInt32Plus_tmp3; \ + pLsrPrimV128DataSub0B32(reduceAUInt32Plus_tmp3, reduceAUInt32Plus_tmp2); \ + (reduceAUInt32Plus_dest) = (reduceAUInt32Plus_tmp3 + reduceAUInt32Plus_init); \ + } while (0) + +#define reduceAUInt32Times(dest, init, a) \ + do { \ + __m128i reduceAUInt32Times_atmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)); \ + __m128i reduceAUInt32Times_b; \ + pLsrPrimV128PointwiseUInt32Times(reduceAUInt32Times_b, reduceAUInt32Times_atmp, a); \ + __m128i reduceAUInt32Times_btmp = _mm_shuffle_epi32(reduceAUInt32Times_b, _MM_SHUFFLE(0, 0, 0, 1)); \ + __m128i reduceAUInt32Times_c; \ + 
pLsrPrimV128PointwiseUInt32Times(reduceAUInt32Times_c, reduceAUInt32Times_btmp, reduceAUInt32Times_b); \
+    uint32 reduceAUInt32Times_r; \
+    pLsrPrimV128DataSub0B32(reduceAUInt32Times_r, reduceAUInt32Times_c); \
+    (dest) = (reduceAUInt32Times_r * (init)); \
+  } while (0)
+
+//inline int max(int a, int b) {
+//    return a > b ? a : b;
+//}
+
+#define reduceAUInt32Max(dest, init, a) \
+  do { \
+    __m128i reduceAUInt32Max_atmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128i reduceAUInt32Max_b = _mm_max_epi32(reduceAUInt32Max_atmp, a); /* note is SSE >= 4.1 */ \
+    __m128i reduceAUInt32Max_btmp = _mm_shuffle_epi32(reduceAUInt32Max_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128i reduceAUInt32Max_c = _mm_max_epi32(reduceAUInt32Max_btmp, reduceAUInt32Max_b); \
+    uint32 reduceAUInt32Max_r; \
+    pLsrPrimV128DataSub0B32(reduceAUInt32Max_r, reduceAUInt32Max_c); \
+    (dest) = (max(reduceAUInt32Max_r, (init))); \
+  } while (0) \
+
+#define reduceAUInt32Min(dest, init, a) \
+  do { \
+    __m128i reduceAUInt32Min_atmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128i reduceAUInt32Min_b = _mm_min_epi32(reduceAUInt32Min_atmp, a); /* note is SSE >= 4.1 */ \
+    __m128i reduceAUInt32Min_btmp = _mm_shuffle_epi32(reduceAUInt32Min_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128i reduceAUInt32Min_c = _mm_min_epi32(reduceAUInt32Min_btmp, reduceAUInt32Min_b); \
+    uint32 reduceAUInt32Min_r; \
+    pLsrPrimV128DataSub0B32(reduceAUInt32Min_r, reduceAUInt32Min_c); \
+    (dest) = (min(reduceAUInt32Min_r, (init))); \
+  } while (0)
+
+#endif
+
+// Data operations
+
+#ifdef PLSR_VECTOR_USE_ASM
+
+#define pLsrPrimV128UInt32Lift(dest, a) vector128UInt32Lift(dest, a)
+#define pLsrPrimV128UInt32Const(dest, c0, c1, c2, c3) vector128UInt32Const(dest, c0, c1, c2, c3)
+#define pLsrPrimV128UInt32Load(dest, arr, off, idx) vector128UInt32LoadHelp (dest, arr, off, idx)
+#define pLsrPrimV128UInt32Store(arr, off, idx, v) vector128UInt32StoreHelp (arr, off, idx, v)
+
+#define vector128UInt32Lift(dest, a) \
+  do { \
+    __m128i vector128UInt32Lift_b; \
+    __asm__ ("movd %1, %0;\n\t" \
+             "pshufd $0x0, %0, %0;\n\t" \
+             : "=x"(vector128UInt32Lift_b) \
+             : "r"(a)); \
+    (dest) = vector128UInt32Lift_b; \
+  } while (0)
+
+  // SSE 4.1
+  //__m128i dest
+  // __asm__ ("pinsrd $0x0, %1, %0;\n\t"
+  //          "pinsrd $0x1, %2, %0;\n\t"
+  //          "pinsrd $0x2, %3, %0;\n\t"
+  //          "pinsrd $0x3, %4, %0;\n\t"
+  //          : "=x"(dest)
+  //          : "rm"(c0), "rm"(c1), "rm"(c2), "rm"(c3));
+
+#define vector128UInt32Const(dest, c0, c1, c2, c3) \
+  do { \
+    int vector128UInt32Const_c[4]; \
+    __m128i vector128UInt32Const_r; \
+    vector128UInt32Const_c[0] = c0; \
+    vector128UInt32Const_c[1] = c1; \
+    vector128UInt32Const_c[2] = c2; \
+    vector128UInt32Const_c[3] = c3; \
+    __asm__("movdqu (%1), %0;" \
+            : "=x"(vector128UInt32Const_r) \
+            : "r"(vector128UInt32Const_c)); \
+    dest = vector128UInt32Const_r; \
+  } while (0)
+
+#define vector128UInt32LoadHelp(dest, arr, off, idx) \
+  do { \
+    __m128i* vector128UInt32LoadHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \
+    __m128i vector128UInt32LoadHelp_r; \
+    __asm__("movdqu (%1), %0;\n" : "=x"(vector128UInt32LoadHelp_r) : "r"(vector128UInt32LoadHelp_marr)); \
+    (dest) = vector128UInt32LoadHelp_r; \
+  } while (0)
+
+#define vector128UInt32StoreHelp(arr, off, idx, v) \
+  do { \
+    __m128i* vector128UInt32StoreHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \
+    __asm__("movdqu %1, (%0);\n" : : "r"(vector128UInt32StoreHelp_marr), "x"(v)); \
+  } while (0)
+
+#define pLsrPrimV128UInt32LoadStrided(dest, arr, off, idx, stride) \
+  do { \
+    pLsrRuntimeError("Strided operations 
unimplemented"); \ + } while (0) + +#define pLsrPrimV128UInt32StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#else + +#define pLsrPrimV128UInt32Lift(dest, a) ((dest) = _mm_set1_epi32(a)) +#define pLsrPrimV128UInt32Const(dest, c0, c1, c2, c3) ((dest) = _mm_set_epi32(c3, c2, c1, c0)) + +#define pLsrPrimV128UInt32Load(dest, arr, off, idx) pLsrPrimV128UInt32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV128UInt32Store(arr, off, idx, v) pLsrPrimV128UInt32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV128UInt32LoadHelp(dest, arr, off, idx) \ + do { \ + __m128i* pLsrPrimV128UInt32LoadHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \ + (dest) = _mm_loadu_si128(pLsrPrimV128UInt32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV128UInt32StoreHelp(arr, off, idx, v) \ + do { \ + __m128i* pLsrPrimV128UInt32StoreHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \ + _mm_storeu_si128(pLsrPrimV128UInt32StoreHelp_marr, v); \ + } while (0) + +#define pLsrPrimV128UInt32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128UInt32StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#endif + +// ================ +// UInt32 - 64-bit +// ================ + +// Arihmetic operations + +#ifdef PLSR_VECTOR_USE_ASM + + +#else + +#define pLsrPrimV64PointwiseUInt32Plus(dest, a, b) ((dest) = _mm_add_pi32(a, b)) +#define pLsrPrimV64PointwiseUInt32Minus(dest, a, b) ((dest) = _mm_sub_pi32(a, b)) +#define pLsrPrimV64PointwiseUInt32Times(dest, a, b) (u32_mul_64(dest, a, b)) //_mm_mullo_pi32(a, b)) + + +// Multiplication route for SSE < 4.1 +#define u32_mul_64(dest, a, b) \ + do { \ + __m64 u32_mul_64_c = _mm_srli_si64(a, 4); \ + __m64 u32_mul_64_d = _mm_srli_si64(b, 4); \ + __m64 u32_mul_64_u = _mm_mul_su32(a, b); \ + __m64 u32_mul_64_v = _mm_mul_su32(u32_mul_64_c, u32_mul_64_d); \ + (dest) = _mm_unpacklo_pi32(u32_mul_64_u, u32_mul_64_v); \ + } while (0) + +#endif + +// Data operations + +#ifdef PLSR_VECTOR_USE_ASM + + + +#else + +#define pLsrPrimV64UInt32Lift(dest, a) ((dest) = _mm_set1_pi32(a)) +#define pLsrPrimV64UInt32Const(dest, c0, c1) ((dest) = _mm_set_pi32(c1, c0)) + +#endif + +// ================ +// SInt32 - 128-bit +// ================ + +#ifdef PLSR_VECTOR_USE_ASM + + + +#else + +#define pLsrPrimV128PointwiseSInt32Plus(dest, a, b) ((dest) = _mm_add_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Minus(dest, a, b) ((dest) = _mm_sub_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Times(dest, a, b) ((dest) = _mm_mullo_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Lift(dest, a) ((dest) = _mm_set1_epi32(a)) +#define pLsrPrimV128PointwiseSInt32Const(dest, c0, c1, c2, c3) ((dest) = _mm_set_epi32(c3, c2, c1, c0)) + +#define pLsrPrimV64PointwiseSInt32Plus(dest, a, b) ((dest) = _mm_add_pi32(a, b)) +#define pLsrPrimV64PointwiseSInt32Minus(dest, a, b) ((dest) = _mm_sub_pi32(a, b)) +#define pLsrPrimV64PointwiseSInt32Times(dest, a, b) ((dest) = _mm_mullo_pi32(a, b)) +#define pLsrPrimV64SInt32Lift(dest, a) ((dest) = _mm_set1_pi32(a)) +#define pLsrPrimV64SInt32Const(dest, c0, c1) ((dest) = _mm_set_pi32(c1, c0)) + +#endif + +// ===== +// Float +// ===== + +#ifdef PLSR_VECTOR_USE_ASM + +static __m128 pLsrPrimV128Float32Zero = {0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV128PointwiseFloat32Plus(dest, a, b) vector128PointwiseFloat32Plus(dest, a, b) +#define 
pLsrPrimV128PointwiseFloat32Minus(dest, a, b) vector128PointwiseFloat32Minus(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Times(dest, a, b) vector128PointwiseFloat32Times(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Divide(dest, a, b) vector128PointwiseFloat32Divide(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Negate(dest, a) vector128PointwiseFloat32Minus(dest, pLsrPrimV128Float32Zero, a)
+#define pLsrPrimV128PointwiseFloat32Max(dest, a, b) vector128PointwiseFloat32Max(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Min(dest, a, b) vector128PointwiseFloat32Min(dest, a, b)
+
+#define vector128PointwiseFloat32Plus(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Plus_a = a; \
+    __m128 vector128PointwiseFloat32Plus_b = b; \
+    __asm__ ("addps %1, %0;" : "+x"(vector128PointwiseFloat32Plus_b) : "x"(vector128PointwiseFloat32Plus_a)); \
+    (dest) = vector128PointwiseFloat32Plus_b; \
+  } while (0)
+
+/* subps %1, %0 computes %0 = %0 - %1, so the minuend a must be the read-write operand */
+#define vector128PointwiseFloat32Minus(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Minus_a = a; \
+    __m128 vector128PointwiseFloat32Minus_b = b; \
+    __asm__ ("subps %1, %0;" : "+x"(vector128PointwiseFloat32Minus_a) : "x"(vector128PointwiseFloat32Minus_b)); \
+    (dest) = vector128PointwiseFloat32Minus_a; \
+  } while (0)
+
+#define vector128PointwiseFloat32Times(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Times_a = a; \
+    __m128 vector128PointwiseFloat32Times_b = b; \
+    __asm__ ("mulps %1, %0;" : "+x"(vector128PointwiseFloat32Times_b) : "x"(vector128PointwiseFloat32Times_a)); \
+    (dest) = vector128PointwiseFloat32Times_b; \
+  } while (0)
+
+/* divps %1, %0 computes %0 = %0 / %1, so the dividend a must be the read-write operand */
+#define vector128PointwiseFloat32Divide(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Div_a = a; \
+    __m128 vector128PointwiseFloat32Div_b = b; \
+    __asm__ ("divps %1, %0;" : "+x"(vector128PointwiseFloat32Div_a) : "x"(vector128PointwiseFloat32Div_b)); \
+    (dest) = vector128PointwiseFloat32Div_a; \
+  } while (0)
+
+#define vector128PointwiseFloat32Max(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Max_a = a; \
+    __m128 vector128PointwiseFloat32Max_b = b; \
+    __asm__ ("maxps %1, %0;" : "+x"(vector128PointwiseFloat32Max_b) : "x"(vector128PointwiseFloat32Max_a)); \
+    (dest) = vector128PointwiseFloat32Max_b; \
+  } while (0)
+
+#define vector128PointwiseFloat32Min(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Min_a = a; \
+    __m128 vector128PointwiseFloat32Min_b = b; \
+    __asm__ ("minps %1, %0;" : "+x"(vector128PointwiseFloat32Min_b) : "x"(vector128PointwiseFloat32Min_a)); \
+    (dest) = vector128PointwiseFloat32Min_b; \
+  } while (0)
+
+#else
+
+static __m128 pLsrPrimV128Float32Zero = {0.0, 0.0, 0.0, 0.0};
+#define pLsrPrimV128PointwiseFloat32Plus(dest, a, b) ((dest) = _mm_add_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Minus(dest, a, b) ((dest) = _mm_sub_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Times(dest, a, b) ((dest) = _mm_mul_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Divide(dest, a, b) ((dest) = _mm_div_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Negate(dest, a) ((dest) = _mm_sub_ps(pLsrPrimV128Float32Zero, a))
+#define pLsrPrimV128PointwiseFloat32Max(dest, a, b) ((dest) = _mm_max_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Min(dest, a, b) ((dest) = _mm_min_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Sqrt(dest, a) ((dest) = _mm_sqrt_ps(a))
+
+#endif
+
+// Reductions (horizontals)
+
+// =====
+// Float
+// =====
+
+#ifdef PLSR_VECTOR_USE_ASM
+
+#define pLsrPrimV128ReduceAFloat32Plus(dest, init, a) reduceAFloat32Plus(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Max(dest, init, a) reduceAFloat32Max(dest, init, a)
+
+#define reduceAFloat32Plus(dest, init, a) \
+  do { \
+    __m128 rAFloat32Plus_a_ = a; \
+    float rAFloat32Plus_dest_; \
+    __asm__("haddps %1, %1;\n\t" \
+            "pextrd $0x0, %1, %0;\n\t" \
+            : "=r"(rAFloat32Plus_dest_) \
+            : "x"(rAFloat32Plus_a_)); \
+    (dest) = (rAFloat32Plus_dest_ + (init)); \
+  } while (0)
+
+#define reduceAFloat32Max(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Max_a_ = a; \
+    float reduceAFloat32Max_dest_; \
+    __asm__ ("shuhfps $0x0e, %1, %1;\n\t" \
+             "max_ps %1, %1;\n\t" \
+             "shufps $0x01, %1, %1;\n\t" \
+             "max_ps %1, %1;\n\t" \
+             "pextrd $0x0, %1, %0; \n\t" \
+             : "=r"(reduceAFloat32Max_dest_) \
+             : "x"(reduceAFloat32Max_a_)); \
+    (dest) = (max(reduceAFloat32Max_dest_, init)); \
+  } while (0)
+
+#else
+
+#define pLsrPrimV128ReduceAFloat32Plus(dest, init, a) reduceAFloat32Plus(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Times(dest, init, a) reduceAFloat32Times(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Max(dest, init, a) reduceAFloat32Max(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Min(dest, init, a) reduceAFloat32Min(dest, init, a)
+
+#define reduceAFloat32Plus(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Plus_a_ = a; \
+    __m128 reduceAFloat32Plus_tmp = _mm_hadd_ps(reduceAFloat32Plus_a_, reduceAFloat32Plus_a_); \
+    __m128 reduceAFloat32Plus_tmp2 = _mm_hadd_ps(reduceAFloat32Plus_tmp, reduceAFloat32Plus_tmp); \
+    float reduceAFloat32Plus_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Plus_dest_, 0, reduceAFloat32Plus_tmp2); \
+    (dest) = (reduceAFloat32Plus_dest_ + (init)); \
+  } while (0)
+
+#define reduceAFloat32Times(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Times_a_ = a; \
+    __m128 reduceAFloat32Times_atmp = _mm_shuffle_ps(reduceAFloat32Times_a_, reduceAFloat32Times_a_, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128 reduceAFloat32Times_b; \
+    pLsrPrimV128PointwiseFloat32Times(reduceAFloat32Times_b, reduceAFloat32Times_atmp, reduceAFloat32Times_a_); \
+    __m128 reduceAFloat32Times_btmp = _mm_shuffle_ps(reduceAFloat32Times_b, reduceAFloat32Times_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128 reduceAFloat32Times_c; \
+    pLsrPrimV128PointwiseFloat32Times(reduceAFloat32Times_c, reduceAFloat32Times_btmp, reduceAFloat32Times_b); \
+    float reduceAFloat32Times_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Times_dest_, 0, reduceAFloat32Times_c); \
+    (dest) = (reduceAFloat32Times_dest_ * (init)); \
+  } while (0)
+
+#define reduceAFloat32Max(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Max_a_ = a; \
+    __m128 reduceAFloat32Max_atmp = _mm_shuffle_ps(reduceAFloat32Max_a_, reduceAFloat32Max_a_, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128 reduceAFloat32Max_b = _mm_max_ps(reduceAFloat32Max_atmp, reduceAFloat32Max_a_); \
+    __m128 reduceAFloat32Max_btmp = _mm_shuffle_ps(reduceAFloat32Max_b, reduceAFloat32Max_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128 reduceAFloat32Max_c = _mm_max_ps(reduceAFloat32Max_btmp, reduceAFloat32Max_b); \
+    float reduceAFloat32Max_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Max_dest_, 0, reduceAFloat32Max_c); \
+    (dest) = max(reduceAFloat32Max_dest_, (init)); \
+  } while (0)
+
+#define reduceAFloat32Min(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Min_a_ = a; \
+    __m128 reduceAFloat32Min_atmp = _mm_shuffle_ps(reduceAFloat32Min_a_, reduceAFloat32Min_a_, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128 reduceAFloat32Min_b = _mm_min_ps(reduceAFloat32Min_atmp, reduceAFloat32Min_a_); \
+    __m128 reduceAFloat32Min_btmp = _mm_shuffle_ps(reduceAFloat32Min_b, reduceAFloat32Min_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128 reduceAFloat32Min_c = _mm_min_ps(reduceAFloat32Min_btmp, reduceAFloat32Min_b); \
+    float reduceAFloat32Min_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Min_dest_, 0, reduceAFloat32Min_c); \
+    (dest) = min(reduceAFloat32Min_dest_, (init)); \
+  } while (0)
+
+#endif
+
+// =====
+// Double
+// =====
+
+
+#ifdef PLSR_VECTOR_USE_ASM
+#else
+
+#define pLsrPrimV128ReduceAFloat64Plus(dest, init, a) reduceAFloat64Plus(dest, init, a)
+#define pLsrPrimV128ReduceAFloat64Times(dest, init, a) reduceAFloat64Times(dest, init, a)
+#define pLsrPrimV128ReduceAFloat64Max(dest, init, a) reduceAFloat64Max(dest, init, a)
+#define pLsrPrimV128ReduceAFloat64Min(dest, init, a) reduceAFloat64Min(dest, init, a)
+
+#define reduceAFloat64Plus(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Plus_a_ = a; \
+    __m128d reduceAFloat64Plus_tmp = _mm_hadd_pd(reduceAFloat64Plus_a_, reduceAFloat64Plus_a_); \
+    double reduceAFloat64Plus_dest_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Plus_dest_, 0, reduceAFloat64Plus_tmp); \
+    (dest) = (reduceAFloat64Plus_dest_ + (init)); \
+  } while (0)
+
+#define reduceAFloat64Times(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Times_a_ = a; \
+    double reduceAFloat64Times_dest0_; \
+    double reduceAFloat64Times_dest1_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Times_dest0_, 0, reduceAFloat64Times_a_); \
+    pLsrPrimV128DataSubF64(reduceAFloat64Times_dest1_, 1, reduceAFloat64Times_a_); \
+    (dest) = reduceAFloat64Times_dest0_ * reduceAFloat64Times_dest1_ * (init); \
+  } while (0)
+
+#define reduceAFloat64Max(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Max_a_ = a; \
+    double reduceAFloat64Max_dest0_; \
+    double reduceAFloat64Max_dest1_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Max_dest0_, 0, reduceAFloat64Max_a_); \
+    pLsrPrimV128DataSubF64(reduceAFloat64Max_dest1_, 1, reduceAFloat64Max_a_); \
+    (dest) = max(max(reduceAFloat64Max_dest0_, reduceAFloat64Max_dest1_), (init)); \
+  } while (0)
+
+#define reduceAFloat64Min(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Min_a_ = a; \
+    double reduceAFloat64Min_dest0_; \
+    double reduceAFloat64Min_dest1_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Min_dest0_, 0, reduceAFloat64Min_a_); \
+    pLsrPrimV128DataSubF64(reduceAFloat64Min_dest1_, 1, reduceAFloat64Min_a_); \
+    (dest) = min(min(reduceAFloat64Min_dest0_, reduceAFloat64Min_dest1_), (init)); \
+  } while (0)
+
+#endif
+
+
+#ifdef PLSR_VECTOR_USE_ASM
+
+#define pLsrPrimV128Float32Const(dest, c0, c1, c2, c3) vector128Float32Const(dest, c0, c1, c2, c3)
+#define pLsrPrimV128Float32Lift(dest, a) vector128Float32Lift(dest, a)
+#define pLsrPrimV128Float32Load(dest, arr, off, idx) pLsrPrimV128Float32LoadHelp (dest, arr, off, idx)
+#define pLsrPrimV128Float32Store(arr, off, idx, v) pLsrPrimV128Float32StoreHelp (arr, off, idx, v)
+
+#define pLsrPrimV128Float32LoadVector(dest, arr, off, idx) pLsrPrimV128Float32LoadVectorHelp (dest, arr, off, idx)
+
+/* Suspicious register constraint XXX -leaf */
+#define vector128Float32Lift(dest, a) \
+  do { \
+    float32 vector128Float32Lift_a_ = a; \
+    __m128 vector128Float32Lift_dest_; \
+    __asm__ ("movss %1, %0;\n\t" \
+             "shufps $0x0, %0, %0;" \
+             : "=x"(vector128Float32Lift_dest_) \
+             : "xm"(vector128Float32Lift_a_)); \
+    (dest) = vector128Float32Lift_dest_; \
+  } while (0)
+
+#define vector128Float32Const(dest, c0, c1, c2, c3) \
+  do { \
+    __m128 vector128Float32Const_dest_; \
+    float vector128Float32Const_c[4]; \
+    vector128Float32Const_c[0] = c0; \
+    vector128Float32Const_c[1] = c1; \
+    vector128Float32Const_c[2] = c2; \
+    vector128Float32Const_c[3] = c3; \
+    __asm__("movups (%1), %0;" \
+            : 
"=x"(vector128Float32Const_dest_) \ + : "r"(vector128Float32Const_c)); \ + (dest) = vector128Float32Const_dest_; \ + } while(0); + +#define pLsrPrimV128Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr; \ + float32 pLsrPrimV128Float32LoadVectorHelp_marr[4]; \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 0, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[0] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 1, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[1] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 2, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[2] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 3, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[3] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128Float32Load(dest, &pLsrPrimV128Float32LoadVectorHelp_marr, 0, 0); \ + } while (0) + + +#define pLsrPrimV128Float32LoadHelp(dest, arr, off, idx) \ + do { \ + __m128* pLsrPrimV128Float32LoadHelp_marr = (__m128*)((float*)((char*)arr+off) + idx); \ + __m128 pLsrPrimV128Float32LoadHelp_dest_; \ + __asm__("movups (%1), %0;" \ + : "=x"(pLsrPrimV128Float32LoadHelp_dest_) \ + : "r"(pLsrPrimV128Float32LoadHelp_marr)); \ + (dest) = pLsrPrimV128Float32LoadHelp_dest_; \ + } while (0) + +#define pLsrPrimV128Float32StoreHelp(arr, off, idx, v) \ + do { \ + float* pLsrPrimV128Float32StoreHelp_marr = ((float*)((char*)arr+off) + idx); \ + __asm__("movups %1, (%0);" \ + : \ + : "r"(pLsrPrimV128Float32StoreHelp_marr), "x"(v)); \ + } while (0) + +#else + +#define pLsrPrimV128Float32Const(dest, c0, c1, c2, c3) ((dest) = _mm_set_ps (c3, c2, c1, c0)) +#define pLsrPrimV128Float32Lift(dest, a) ((dest) = _mm_set1_ps(a)) +#define pLsrPrimV128Float32Load(dest, arr, off, idx) pLsrPrimV128Float32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV128Float32LoadVector(dest, arr, off, idx) \ + pLsrPrimV128Float32LoadVectorHelp (dest, arr, off, idx) +#define pLsrPrimV128Float32Store(arr, off, idx, v) pLsrPrimV128Float32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV128Float32LoadHelp(dest, arr, off, idx) \ + do { \ + float32* pLsrPrimV128Float32LoadHelp_marr = (float32*)((char*)arr+off)+idx; \ + (dest) = _mm_loadu_ps(pLsrPrimV128Float32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV128Float32StoreHelp(arr, off, idx, v) \ + do { \ + float32* pLsrPrimV128Float32StoreHelp_marr = (float32*)((char*)arr+off) + idx; \ + _mm_storeu_ps(pLsrPrimV128Float32StoreHelp_marr, v); \ + } while (0) + +#define pLsrPrimV128Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr0; \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr1; \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr2; \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr3; \ + \ + pLsrPrimV128Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(arr, 0); \ + pLsrPrimV128Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(arr, 1); \ + pLsrPrimV128Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(arr, 2); \ + pLsrPrimV128Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(arr, 3); \ + (dest) = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr0 + off) + idx), \ + 
*(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr3 + off) + idx)); \ + } while (0) + +#define pLsrPrimV128Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128Float32LoadVectorStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128Float32StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#endif + +// ====== +// Double +// ====== + +#ifdef PLSR_VECTOR_USE_ASM + + + +#else + +#define pLsrPrimV128PointwiseFloat64Plus(dest, a, b) ((dest) = _mm_add_pd(a, b)) +#define pLsrPrimV128PointwiseFloat64Minus(dest, a, b) ((dest) = _mm_sub_pd(a, b)) +#define pLsrPrimV128PointwiseFloat64Times(dest, a, b) ((dest) = _mm_mul_pd(a, b)) +#define pLsrPrimV128PointwiseFloat64Divide(dest, a, b) ((dest) = _mm_div_pd(a, b)) + +#define pLsrPrimV128Float64Const(dest, c0, c1) ((dest) = _mm_set_pd(c1, c0)) +#define pLsrPrimV128Float64Lift(dest, a) ((dest) = _mm_set1_pd(a)) +#define pLsrPrimV128Float64Load(dest, arr, off, idx) pLsrPrimV128Float64LoadHelp (dest, arr, off, idx) +#define pLsrPrimV128Float64Store(arr, off, idx, v) pLsrPrimV128Float64StoreHelp (arr, off, idx, v) + +#define pLsrPrimV128Float64LoadHelp(dest, arr, off, idx) \ + do { \ + float64* pLsrPrimV128Float64LoadHelp_marr = (float64*)((char*)arr+off) + idx; \ + (dest) = _mm_loadu_pd(pLsrPrimV128Float64LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV128Float64StoreHelp(arr, off, idx, v) \ + do { \ + float64* pLsrPrimV128Float64StoreHelp_marr = (float64*)((char*)arr+off) + idx; \ + _mm_storeu_pd(pLsrPrimV128Float64StoreHelp_marr, v); \ + } while (0) + +#endif + +#define pLsrPrimV128Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128Float64StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + + + +// ================ +// Ref - 128-bit +// ================ + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128RefLift(dest, a) vector128RefLift(dest, a) +#define pLsrPrimV128RefConst(dest, c0, c1, c2, c3) vector128RefConst(dest, c0, c1, c2, c3) +#define pLsrPrimV128RefLoad(dest, arr, off, idx) vector128RefLoadHelp (dest, arr, off, idx) +#define pLsrPrimV128RefStore(arr, off, idx, v) vector128RefStoreHelp (arr, off, idx, v) + +#define vector128RefLift(dest, a) \ + do { \ + __m128r vector128RefLift_b; \ + __asm__ ("movd %1, %0;\n\t" \ + "pshufd $0x0, %0, %0;\n\t" \ + : "=x"(vector128RefLift_b) \ + : "r"(a)); \ + (dest) = vector128RefLift_b; \ + } while (0) + + // SSE 4.1 + //__m128r dest + // __asm__ ("pinsrd $0x0, %1, %0;\n\t" + // "pinsrd $0x1, %2, %0;\n\t" + // "pinsrd $0x2, %3, %0;\n\t" + // "pinsrd $0x3, %4, %0;\n\t" + // : "=x"(dest) + // : "rm"(c0), "rm"(c1), "rm"(c2), "rm"(c3)); + +#define vector128RefConst(dest, c0, c1, c2, c3) \ + do { \ + int vector128RefConst_c[4]; \ + __m128r vector128RefConst_r; \ + vector128RefConst_c[0] = c0; \ + vector128RefConst_c[1] = c1; \ + vector128RefConst_c[2] = c2; \ + vector128RefConst_c[3] = c3; \ + __asm__("movdqu (%1), %0;" \ + : "=x"(vector128RefConst_r) \ + : "r"(vector128RefConst_c)); \ + dest = vector128RefConst_r; \ + } 
while (0) + +#define vector128RefLoadHelp(dest, arr, off, idx) \ + do { \ + __m128r* vector128RefLoadHelp_marr = (__m128r*)((uint32*)((char*)arr+off) + idx); \ + __m128r vector128RefLoadHelp_r; \ + __asm__("movdqu (%1), %0;\n" : "=x"(vector128RefLoadHelp_r) : "r"(vector128RefLoadHelp_marr)); \ + (dest) = vector128RefLoadHelp_r; \ + } while (0) + +#define vector128RefStoreHelp(arr, off, idx, v) \ + do { \ + __m128r* vector128RefStoreHelp_marr = (__m128r*)((uint32*)((char*)arr+off) + idx); \ + __asm__("movdqu %1, (%0);\n" : : "r"(vector128RefStoreHelp_marr), "x"(v)); \ + } while (0) + +#define pLsrPrimV128RefLoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128RefStoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#else + +#define pLsrPrimV128RefLoad(dest, arr, off, idx) pLsrPrimV128UInt32Load(dest, arr, off, idx) + +#define pLsrPrimV128RefLoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#endif + +#endif /* _PLSR_PRIMS_VECTOR_SSE_H_ */ diff --git a/runtime/include/hrc/plsr-prims-vector.h b/runtime/include/hrc/plsr-prims-vector.h new file mode 100644 index 0000000..e083ccd --- /dev/null +++ b/runtime/include/hrc/plsr-prims-vector.h @@ -0,0 +1,25 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_VECTOR_H_ +#define _PLSR_PRIMS_VECTOR_H_ + +#ifdef P_USE_VI_SSE +#define pLsrViWidth 128 +#include "hrc/plsr-prims-vector-sse.h" + +#elif P_USE_VI_AVX +#define pLsrViWidth 256 +#include "hrc/plsr-prims-vector-sse.h" +#include "hrc/plsr-prims-vector-avx.h" + +#elif P_USE_VI_MIC +#define pLsrViWidth 512 +#include "hrc/plsr-prims-vector-mic.h" + +#else +#define pLsrViWidth 128 + +#endif + +#endif /* _PLSR_PRIMS_VECTOR_H_ */ diff --git a/runtime/include/hrc/plsr-prims.h b/runtime/include/hrc/plsr-prims.h new file mode 100755 index 0000000..3d6c296 --- /dev/null +++ b/runtime/include/hrc/plsr-prims.h @@ -0,0 +1,12 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_H_ +#define _PLSR_PRIMS_H_ + +#include "hrc/plsr-prims-prims.h" +#include "hrc/plsr-prims-runtime.h" +#include "hrc/plsr-prims-vector.h" +#include "hrc/plsr-prims-ghc.h" + +#endif /* _PLSR_PRIMS_H_ */ diff --git a/runtime/include/hrc/plsr-ptk-thunk.h b/runtime/include/hrc/plsr-ptk-thunk.h new file mode 100644 index 0000000..0f1d7c7 --- /dev/null +++ b/runtime/include/hrc/plsr-ptk-thunk.h @@ -0,0 +1,570 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Unified Futures Thunk Implementation */ + +#ifndef _PLSR_THUNK_H_ +#define _PLSR_THUNK_H_ + +#ifdef PLSR_THUNK_INTERCEPT_CUTS + #ifndef P_USE_PILLAR + #error "Cut interception only supported on Pillar" + #endif +#endif + +/*** Types ***/ + +typedef enum { + PlsrThunkIndir = PtkFutureStatusUserDefined, + PlsrThunkCut +} PlsrFutureStatus; + +#ifdef PLSR_THUNK_INTERCEPT_CUTS +#define PlsrTypesMk(name, retType) \ + typedef PlsrFutureStatus PlsrThunkReturnType##name; \ + typedef PlsrThunkReturnType##name (*PlsrThunkCode##name)(PlsrThunkB##name); \ + /* PlsrObjectU must be a prefix of this structure */ \ + typedef struct PlsrThunkS##name { \ + PlsrVTable vtable; \ + PtkFutureBuffer future; \ + PilContinuation0 cutCont; \ + retType result; \ + char fvs[]; \ + } PlsrThunkU##name +#else /* !PLSR_THUNK_INTERCEPT_CUTS */ +#define PlsrTypesMk(name, retType) \ + typedef 
PlsrFutureStatus PlsrThunkReturnType##name; \ + typedef PlsrThunkReturnType##name (*PlsrThunkCode##name)(PlsrThunkB##name); \ + /* PlsrObjectU must be a prefix of this structure */ \ + typedef struct PlsrThunkS##name { \ + PlsrVTable vtable; \ + PtkFutureBuffer future; \ + retType result; \ + char fvs[]; \ + } PlsrThunkU##name +#endif /* !PLSR_THUNK_INTERCEPT_CUTS */ + +/* + * PlsrThunkCodeRef, pLsrThunkSRef, pLsrThunkURef + * PlsrThunkCode32, pLsrThunkS32, pLsrThunkU32 + * PlsrThunkCode64, pLsrThunkS64, pLsrThunkU64 + * PlsrThunkCodeFloat, pLsrThunkSFloat, pLsrThunkUFloat + * PlsrThunkCodeDouble, pLsrThunkSDouble, pLsrThunkUDouble + */ +PlsrTypesMk(Ref, PlsrRef); +PlsrTypesMk(32, uint32); +PlsrTypesMk(64, uint64); +PlsrTypesMk(Float, float32); +PlsrTypesMk(Double, float64); + +#define pLsrThunkRef(t) ((PlsrThunkURef*)(t)) +#define pLsrThunk32(t) ((PlsrThunkU32*)(t)) +#define pLsrThunk64(t) ((PlsrThunkU64*)(t)) +#define pLsrThunkFloat(t) ((PlsrThunkUFloat*)(t)) +#define pLsrThunkDouble(t) ((PlsrThunkUDouble*)(t)) + +#define pLsrThunkResultFieldRef(thnk) (pLsrThunkRef(thnk)->result) +#define pLsrThunkResultField32(thnk) (pLsrThunk32(thnk)->result) +#define pLsrThunkResultField64(thnk) (pLsrThunk64(thnk)->result) +#define pLsrThunkResultFieldFloat(thnk) (pLsrThunkFloat(thnk)->result) +#define pLsrThunkResultFieldDouble(thnk) (pLsrThunkDouble(thnk)->result) + +#ifdef PLSR_THUNK_SUBSUMPTION +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) thunk) +#else /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) 0) +#endif /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObject32(thunk) ((uint32) 0) +#define pLsrThunkCastToObject64(thunk) ((uint64) 0) +#define pLsrThunkCastToObjectFloat(thunk) ((float32) 0) +#define pLsrThunkCastToObjectDouble(thunk) ((float64) 0) + +/* Global Thunks */ + + #define pLsrThunkStaticValueMk(name, v, vt, retType, val) \ + static PlsrThunkU##name v = \ + { .vtable = (vt), \ + .future = ptkFutureStatic(PlsrThunkIndir, 0), \ + .result = (retType) (val) } + +/* These are marked ALWAYS_IMMUTABLE, so must be initialized atomically wrt the gc */ +#define pLsrThunkStaticValueRef(v, val) pLsrThunkStaticValueMk(Ref, v, pLsrThunkValVTableRef, PlsrRef, val) +#define pLsrThunkStaticValue32(v, val) pLsrThunkStaticValueMk(32, v, pLsrThunkValVTable32, uint32, val) +#define pLsrThunkStaticValue64(v, val) pLsrThunkStaticValueMk(64, v, pLsrThunkValVTable64, uint64, val) +#define pLsrThunkStaticValueFloat(v, val) pLsrThunkStaticValueMk(Float, v, pLsrThunkValVTableFloat, float32, val) +#define pLsrThunkStaticValueDouble(v, val) pLsrThunkStaticValueMk(Double, v, pLsrThunkValVTableDouble, float64, val) + +/* Creation */ + +/* This cannot engender a yield */ +#define pLsrThunkNewMk(name, dv, vt, sz, algn) \ + do { \ + pLsrAllocAligned(PlsrThunkB##name, (dv), (vt), (sz), (algn)); \ + assert((vt)->tag==VThunkTag); \ + } while (0) +#ifdef PLSR_ZERO_REFS +#define pLsrThunkNewRef(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Ref, dv, vt, sz, algn); \ + pLsrThunkResultFieldRef(dv) = NULL; \ + } while (0) +#else +#define pLsrThunkNewRef(dv, vt, sz, algn) pLsrThunkNewMk(Ref, dv, vt, sz, algn) +#endif +#define pLsrThunkNew32(dv, vt, sz, algn) pLsrThunkNewMk(32, dv, vt, sz, algn) +#define pLsrThunkNew64(dv, vt, sz, algn) pLsrThunkNewMk(64, dv, vt, sz, algn) +#define pLsrThunkNewFloat(dv, vt, sz, algn) pLsrThunkNewMk(Float, dv, vt, sz, algn) +#define pLsrThunkNewDouble(dv, vt, sz, algn) pLsrThunkNewMk(Double, dv, vt, sz, algn) + +/* Initialisation */ + +#define 
pLsrThunkSetInitMk(name, thnk, code) \ + (ptkFutureInit((PtkRef)(thnk), \ + (unsigned)&pLsrThunk##name(0)->future, \ + (PtkFutureCodePointer)code)) + +#define pLsrThunkSetInitRef(thnk, code) pLsrThunkSetInitMk(Ref, thnk, code) +#define pLsrThunkSetInit32(thnk, code) pLsrThunkSetInitMk(32, thnk, code) +#define pLsrThunkSetInit64(thnk, code) pLsrThunkSetInitMk(64, thnk, code) +#define pLsrThunkSetInitFloat(thnk, code) pLsrThunkSetInitMk(Float, thnk, code) +#define pLsrThunkSetInitDouble(thnk, code) pLsrThunkSetInitMk(Double, thnk, code) + +#define pLsrThunkSetValueRef(thnk, v) \ + (pLsrWriteBarrierRefBase((PlsrThunkBRef)(thnk), pLsrThunkRef(thnk)->result, (v)), \ + ptkFutureSetStatus((PtkRef)(thnk), \ + (unsigned)&pLsrThunkRef(0)->future, \ + (PtkFutureStatus)PlsrThunkIndir)) +/* This should not engender a yield */ +#define pLsrThunkSetValueNonWbMk(name, thnk, v) \ + (pLsrThunk##name(thnk)->result = (v), \ + ptkFutureSetStatus((PlsrThunkB##name)(thnk), \ + (unsigned)&pLsrThunk##name(0)->future, \ + (PtkFutureStatus)PlsrThunkIndir)) + +#define pLsrThunkSetValue32(thnk, v) pLsrThunkSetValueNonWbMk(32, thnk, v) +#define pLsrThunkSetValue64(thnk, v) pLsrThunkSetValueNonWbMk(64, thnk, v) +#define pLsrThunkSetValueFloat(thnk, v) pLsrThunkSetValueNonWbMk(Float, thnk, v) +#define pLsrThunkSetValueDouble(thnk, v) pLsrThunkSetValueNonWbMk(Double, thnk, v) + +#define pLsrThunkValueInitRef(thnk, v) \ + do { \ + pLsrThunkSetValueRef(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableRef); \ + } while (0) + +#define pLsrThunkValueInit32(thnk, v) \ + do { \ + pLsrThunkSetValue32(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTable32); \ + } while (0) + +#define pLsrThunkValueInit64(thnk, v) \ + do { \ + pLsrThunkSetValue64(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTable64); \ + } while (0) + +#define pLsrThunkValueInitFloat(thnk, v) \ + do { \ + pLsrThunkSetValueFloat(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableFloat); \ + } while (0) + +#define pLsrThunkValueInitDouble(thnk, v) \ + do { \ + pLsrThunkSetValueDouble(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableDouble); \ + } while (0) + +#define pLsrThunkNewValueMk(name, dv, vt, sz, algn, v) \ + do { \ + noyield { \ + pLsrThunkNew##name(dv, vt, sz, algn); \ + pLsrThunkSetValueNonWbMk(name, dv, v); \ + } \ + } while(0) + +#define pLsrThunkNewValueRef(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Ref, dv, vt, sz, algn, ((PlsrRef) v)) +#define pLsrThunkNewValue32(dv, vt, sz, algn, v) pLsrThunkNewValueMk(32, dv, vt, sz, algn, v) +#define pLsrThunkNewValue64(dv, vt, sz, algn, v) pLsrThunkNewValueMk(64, dv, vt, sz, algn, v) +#define pLsrThunkNewValueFloat(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Float, dv, vt, sz, algn, v) +#define pLsrThunkNewValueDouble(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Double, dv, vt, sz, algn, v) + +static PlsrThunkBRef pLsrThunkNewValRef(PlsrRef v) +{ + noyield { + PlsrThunkBRef res; + pLsrAllocAligned(PlsrThunkBRef, res, pLsrThunkValVTableRef, sizeof(PlsrThunkURef), sizeof(PlsrRef)); + pLsrThunkSetValueRef(res, v); + return res; + } +} + +/* Projection */ +#define pLsrThunkIsUnboxedRef(thnk) (0) +#define pLsrThunkIsUnboxed32(thnk) (0) +#define pLsrThunkIsUnboxed64(thnk) (0) +#define pLsrThunkIsUnboxedFloat(thnk) (0) +#define pLsrThunkIsUnboxedDouble(thnk) (0) + +#define pLsrThunkStatusMk(name, thnk) \ + (ptkFutureGetStatus((PlsrThunkB##name)(thnk), (unsigned)&pLsrThunk##name(0)->future)) +#define pLsrThunkStatusRef(thnk) 
pLsrThunkStatusMk(Ref, thnk) +#define pLsrThunkStatus32(thnk) pLsrThunkStatusMk(32, thnk) +#define pLsrThunkStatus64(thnk) pLsrThunkStatusMk(64, thnk) +#define pLsrThunkStatusFloat(thnk) pLsrThunkStatusMk(Float, thnk) +#define pLsrThunkStatusDouble(thnk) pLsrThunkStatusMk(Double, thnk) + +#define pLsrThunkIsEvaledRef(thnk) (pLsrThunkStatusRef(thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaled32(thnk) (pLsrThunkStatus32 (thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaled64(thnk) (pLsrThunkStatus64 (thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaledFloat(thnk) (pLsrThunkStatusFloat (thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaledDouble(thnk) (pLsrThunkStatusDouble (thnk) == PlsrThunkIndir) + +#define pLsrThunkGetValRef(thnk) pLsrThunkResultFieldRef(thnk) +#define pLsrThunkGetVal32(thnk) pLsrThunkResultField32(thnk) +#define pLsrThunkGetVal64(thnk) pLsrThunkResultField64(thnk) +#define pLsrThunkGetValFloat(thnk) pLsrThunkResultFieldFloat(thnk) +#define pLsrThunkGetValDouble(thnk) pLsrThunkResultFieldDouble(thnk) + + +/* Evaluation */ + +static PlsrRef pLsrThunkEvalStatusErrorRef(PlsrFutureStatus status) +{ + char msg[80]; + sprintf(msg, "Bad thunk status in Eval: %d", status); + pLsrRuntimeError(msg); + return 0; +} + +static PlsrRef pLsrThunkEvalSlowRef(PlsrThunkBRef thunk) +{ + for(;;) { + switch (pLsrThunkStatusRef(thunk)) { + case PtkFutureStatusUninit: + pLsrRuntimeError("Uninitialised thunk in Eval"); + return 0; +#ifdef P_USE_PARALLEL_FUTURES + case PtkFutureStatusSpawning: + case PtkFutureStatusSpawned: +#endif /* P_USE_PARALLEL_FUTURES */ + case PtkFutureStatusInit: + case PtkFutureStatusStarted: + ptkFutureWait((PtkRef)thunk, + (unsigned)&pLsrThunkRef(0)->future, + 1); +#ifdef PLSR_THUNK_INTERCEPT_CUTS + assert(pLsrThunkStatusRef(thunk) == PlsrThunkIndir || + pLsrThunkStatusRef(thunk) == PlsrThunkCut); +#else /* !PLSR_THUNK_INTERCEPT_CUTS */ + assert(pLsrThunkStatusRef(thunk) == PlsrThunkIndir); + return pLsrThunkGetValRef(thunk); +#endif /* !PLSR_THUNK_INTERCEPT_CUTS */ + break; + case PlsrThunkIndir: + return pLsrThunkGetValRef(thunk); +#ifdef PLSR_THUNK_INTERCEPT_CUTS + case PlsrThunkCut: + pilCutTo0(pLsrThunkRef(thunk)->cutCont); + return 0; +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ + default: + return pLsrThunkEvalStatusErrorRef(pLsrThunkStatusRef(thunk)); + } + } +} + + +#ifdef P_USE_PARALLEL_FUTURES +#ifdef PLSR_THUNK_INTERCEPT_CUTS +#define pLsrThunkEvalSlowNonRefMk(retType, name) \ + \ + static retType pLsrThunkEvalStatusError##name(PlsrFutureStatus status) \ + { \ + char msg[80]; \ + sprintf(msg, "Bad thunk status in Eval: %d", status); \ + pLsrRuntimeError(msg); \ + return 0; \ + } \ + \ + static retType pLsrThunkEvalSlow##name(PlsrThunkB##name thunk) \ + { \ + for(;;) { \ + switch (pLsrThunkStatus##name(thunk)) { \ + case PtkFutureStatusUninit: \ + pLsrRuntimeError("Uninitialised thunk in Eval"); \ + return 0; \ + case PtkFutureStatusSpawning: \ + case PtkFutureStatusSpawned: \ + case PtkFutureStatusInit: \ + case PtkFutureStatusStarted: \ + ptkFutureWait((PtkRef)thunk, \ + (unsigned)&pLsrThunk##name(0)->future, \ + 1); \ + assert(pLsrThunkStatus##name(thunk) == PlsrThunkIndir || \ + pLsrThunkStatus##name(thunk) == PlsrThunkCut); \ + break; \ + case PlsrThunkIndir: \ + return pLsrThunkGetVal##name(thunk); \ + case PlsrThunkCut: \ + pilCutTo0(pLsrThunk##name(thunk)->cutCont); \ + return 0; \ + default: \ + return pLsrThunkEvalStatusError##name(pLsrThunkStatus##name(thunk)); \ + } \ + } \ + } +#else /* !PLSR_THUNK_INTERCEPT_CUTS */ +#define 
pLsrThunkEvalSlowNonRefMk(retType, name) \ + \ + static retType pLsrThunkEvalStatusError##name(PlsrFutureStatus status) \ + { \ + char msg[80]; \ + sprintf(msg, "Bad thunk status in Eval: %d", status); \ + pLsrRuntimeError(msg); \ + return 0; \ + } \ + \ + static retType pLsrThunkEvalSlow##name(PlsrThunkB##name thunk) \ + { \ + for(;;) { \ + switch (pLsrThunkStatus##name(thunk)) { \ + case PtkFutureStatusUninit: \ + pLsrRuntimeError("Uninitialised thunk in Eval"); \ + return 0; \ + case PtkFutureStatusSpawning: \ + case PtkFutureStatusSpawned: \ + case PtkFutureStatusInit: \ + case PtkFutureStatusStarted: \ + ptkFutureWait((PtkRef)thunk, \ + (unsigned)&pLsrThunk##name(0)->future, \ + 1); \ + assert(pLsrThunkStatus##name(thunk) == PlsrThunkIndir); \ + break; \ + case PlsrThunkIndir: \ + return pLsrThunkGetVal##name(thunk); \ + default: \ + return pLsrThunkEvalStatusError##name(pLsrThunkStatus##name(thunk)); \ + } \ + } \ + } +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ +#else /* !P_USE_PARALLEL_FUTURES */ +#define pLsrThunkEvalSlowNonRefMk(retType, name) \ + \ + static retType pLsrThunkEvalStatusError##name(PlsrFutureStatus status) \ + { \ + char msg[80]; \ + sprintf(msg, "Bad thunk status in Eval: %d", status); \ + pLsrRuntimeError(msg); \ + return 0; \ + } \ + \ + static retType pLsrThunkEvalSlow##name(PlsrThunkB##name thunk) \ + { \ + for(;;) { \ + switch (pLsrThunkStatus##name(thunk)) { \ + case PtkFutureStatusUninit: \ + pLsrRuntimeError("Uninitialised thunk in Eval"); \ + return 0; \ + case PtkFutureStatusInit: \ + case PtkFutureStatusStarted: \ + ptkFutureWait((PtkRef)thunk, \ + (unsigned)&pLsrThunk##name(0)->future, \ + 1); \ + assert(pLsrThunkStatus##name(thunk) == PlsrThunkIndir); \ + return pLsrThunkGetVal##name(thunk); \ + case PlsrThunkIndir: \ + return pLsrThunkGetVal##name(thunk); \ + default: \ + return pLsrThunkEvalStatusError##name(pLsrThunkStatus##name(thunk)); \ + } \ + } \ + } + +#endif /* P_USE_PARALLEL_FUTURES */ + +/* + * pLsrThunkEval32 + * pLsrThunkEval64 + * pLsrThunkEvalFloat + * pLsrThunkEvalDouble + */ +pLsrThunkEvalSlowNonRefMk(uint32, 32); +pLsrThunkEvalSlowNonRefMk(uint64, 64); +pLsrThunkEvalSlowNonRefMk(float32, Float); +pLsrThunkEvalSlowNonRefMk(float64, Double); + +#define pLsrThunkEvalRef(thunk) pLsrThunkEvalSlowRef (thunk) +#define pLsrThunkEval32(thunk) pLsrThunkEvalSlow32 (thunk) +#define pLsrThunkEval64(thunk) pLsrThunkEvalSlow64 (thunk) +#define pLsrThunkEvalFloat(thunk) pLsrThunkEvalSlowFloat (thunk) +#define pLsrThunkEvalDouble(thunk) pLsrThunkEvalSlowDouble (thunk) + +#define pLsrThunkCallRef(thunk) pLsrThunkEvalSlowRef (thunk) +#define pLsrThunkCall32(thunk) pLsrThunkEvalSlow32 (thunk) +#define pLsrThunkCall64(thunk) pLsrThunkEvalSlow64 (thunk) +#define pLsrThunkCallFloat(thunk) pLsrThunkEvalSlowFloat (thunk) +#define pLsrThunkCallDouble(thunk) pLsrThunkEvalSlowDouble (thunk) + +#define pLsrThunkEvalDirectRef(code, thunk) pLsrThunkEvalSlowRef (thunk) +#define pLsrThunkEvalDirect32(code, thunk) pLsrThunkEvalSlow32 (thunk) +#define pLsrThunkEvalDirect64(code, thunk) pLsrThunkEvalSlow64 (thunk) +#define pLsrThunkEvalDirectFloat(code, thunk) pLsrThunkEvalSlowFloat (thunk) +#define pLsrThunkEvalDirectDouble(code, thunk) pLsrThunkEvalSlowDouble (thunk) + +#define pLsrThunkCallDirectRef(code, thunk) pLsrThunkEvalDirectRef(code, thunk) +#define pLsrThunkCallDirect32(code, thunk) pLsrThunkEvalDirect32 (code, thunk) +#define pLsrThunkCallDirect64(code, thunk) pLsrThunkEvalDirect64 (code, thunk) +#define pLsrThunkCallDirectFloat(code, thunk) 
pLsrThunkEvalDirectFloat (code, thunk) +#define pLsrThunkCallDirectDouble(code, thunk) pLsrThunkEvalDirectDouble (code, thunk) + +#define pLsrThunkTailEvalRef(code, thunk) TAILCALL(pLsrThunkEvalSlowRef (thunk)) +#define pLsrThunkTailEval32(code, thunk) TAILCALL(pLsrThunkEvalSlow32 (thunk)) +#define pLsrThunkTailEval64(code, thunk) TAILCALL(pLsrThunkEvalSlow64 (thunk)) +#define pLsrThunkTailEvalFloat(code, thunk) TAILCALL(pLsrThunkEvalSlowFloat (thunk)) +#define pLsrThunkTailEvalDouble(code, thunk) TAILCALL(pLsrThunkEvalSlowDouble (thunk)) + +#define pLsrThunkTailCallRef(thunk) TAILCALL(pLsrThunkEvalSlowRef (thunk)) +#define pLsrThunkTailCall32(thunk) TAILCALL(pLsrThunkEvalSlow32 (thunk)) +#define pLsrThunkTailCall64(thunk) TAILCALL(pLsrThunkEvalSlow64 (thunk)) +#define pLsrThunkTailCallFloat(thunk) TAILCALL(pLsrThunkEvalSlowFloat (thunk)) +#define pLsrThunkTailCallDouble(thunk) TAILCALL(pLsrThunkEvalSlowDouble (thunk)) + +#define pLsrThunkTailEvalDirectRef(code, thunk) TAILCALL(pLsrThunkEvalSlowRef (thunk)) +#define pLsrThunkTailEvalDirect32(code, thunk) TAILCALL(pLsrThunkEvalSlow32 (thunk)) +#define pLsrThunkTailEvalDirect64(code, thunk) TAILCALL(pLsrThunkEvalSlow64 (thunk)) +#define pLsrThunkTailEvalDirectFloat(code, thunk) TAILCALL(pLsrThunkEvalSlowFloat (thunk)) +#define pLsrThunkTailEvalDirectDouble(code, thunk) TAILCALL(pLsrThunkEvalSlowDouble (thunk)) + +#define pLsrThunkTailCallDirectRef(code, thunk) TAILCALL(pLsrThunkEvalDirectRef(code, thunk)) +#define pLsrThunkTailCallDirect32(code, thunk) TAILCALL(pLsrThunkEvalDirect32 (code, thunk)) +#define pLsrThunkTailCallDirect64(code, thunk) TAILCALL(pLsrThunkEvalDirect64 (code, thunk)) +#define pLsrThunkTailCallDirectFloat(code, thunk) TAILCALL(pLsrThunkEvalDirectFloat (code, thunk)) +#define pLsrThunkTailCallDirectDouble(code, thunk) TAILCALL(pLsrThunkEvalDirectDouble (code, thunk)) + +#define pLsrThunkReturnRef(thnk, val) \ + do { \ + pLsrWriteBarrierRefBase((thnk), pLsrThunkResultFieldRef(thnk), (val)); \ + return PlsrThunkIndir; \ + } while (0) + +#define pLsrThunkReturnNonRefMk(name, thnk, val) \ + do { \ + pLsrThunkResultField##name(thnk) = (val); \ + return PlsrThunkIndir; \ + } while (0) + +#define pLsrThunkReturn32(thnk, val) pLsrThunkReturnNonRefMk(32, thnk, val) +#define pLsrThunkReturn64(thnk, val) pLsrThunkReturnNonRefMk(64, thnk, val) +#define pLsrThunkReturnFloat(thnk, val) pLsrThunkReturnNonRefMk(Float, thnk, val) +#define pLsrThunkReturnDouble(thnk, val) pLsrThunkReturnNonRefMk(Double, thnk, val) + +#ifdef PLSR_THUNK_INTERCEPT_CUTS + +#define pLsrThunkCutMk(name, thnk, cont) \ + do { \ + pLsrThunk##name(thnk)->cutCont = (cont); \ + return PlsrThunkCut; \ + cut to cont; \ + } while (0) +#define pLsrThunkCutRef(thnk, cont) pLsrThunkCutMk(Ref, thnk, cont) +#define pLsrThunkCut32(thnk, cont) pLsrThunkCutMk(32, thnk, cont) +#define pLsrThunkCut64(thnk, cont) pLsrThunkCutMk(64, thnk, cont) +#define pLsrThunkCutFloat(thnk, cont) pLsrThunkCutMk(Float, thnk, cont) +#define pLsrThunkCutDouble(thnk, cont) pLsrThunkCutMk(Double, thnk, cont) + +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ + +/* Printing */ + +static void pLsrThunkPrintRef(PlsrThunkBRef t) +{ + pLsrValuePrint((PlsrObjectB)pLsrThunkEvalRef(t)); +} + +/* Spawning */ + +#ifdef P_USE_PARALLEL_FUTURES + +#define pLsrThunkSpawnMk(name) \ + static void pLsrThunkSpawn##name(PlsrThunkB##name thunk) \ + { \ + ptkFutureSpawn((PtkRef)thunk, (unsigned)&pLsrThunk##name(0)->future); \ + } + +/* + * pLsrThunkSpawnRef + * pLsrThunkSpawn32 + * pLsrThunkSpawn64 + * pLsrThunkSpawnFloat + * 
pLsrThunkSpawnDouble + */ +pLsrThunkSpawnMk(Ref); +pLsrThunkSpawnMk(32); +pLsrThunkSpawnMk(64); +pLsrThunkSpawnMk(Float); +pLsrThunkSpawnMk(Double); + +#endif /* P_USE_PARALLEL_FUTURES */ + +#define pLsrThunkBlackHoleRef(thunk) +#define pLsrThunkBlackHole32(thunk) +#define pLsrThunkBlackHole64(thunk) +#define pLsrThunkBlackHoleFloat(thunk) +#define pLsrThunkBlackHoleDouble(thunk) + +#define pLsrThunkClaimRef(thunk) +#define pLsrThunkClaim32(thunk) +#define pLsrThunkClaim64(thunk) +#define pLsrThunkClaimFloat(thunk) +#define pLsrThunkClaimDouble(thunk) + + +#define pLsrThunkZeroFV(zero) zero + +/*** Check Object Model ***/ + +/* Generated code defines: + * pLsrThunkFixedSizeRef + * pLsrThunkFixedSize64 + * pLsrThunkFixedSize32 + * pLsrThunkFixedSizeFloat + * pLsrThunkFixedSizeDouble + * pLsrThunkResultOffsetRef + * pLsrThunkResultOffset64 + * pLsrThunkResultOffset32 + * pLsrThunkResultOffsetFloat + * pLsrThunkResultOffsetDouble + */ + +static void pLsrThunkCheck() +{ + /*printf("Thunk check: %d/%d, %d/%d, %d/%d\n", pLsrThunkFixedSizeRef, + sizeof(PlsrThunkURef), pLsrThunkFixedSize32, sizeof(PlsrThunkU32), + pLsrThunkFixedSize64, sizeof(PlsrThunkU64));*/ + if (pLsrThunkFixedSizeRef != sizeof(PlsrThunkURef)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize32 != sizeof(PlsrThunkU32)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize64 != sizeof(PlsrThunkU64)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeFloat != sizeof(PlsrThunkUFloat)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeDouble != sizeof(PlsrThunkUDouble)) + pLsrRuntimeError("Bad thunk object model!\n"); + + if (pLsrThunkResultOffsetRef != ((unsigned)(&pLsrThunkResultFieldRef(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffset32 != ((unsigned)(&pLsrThunkResultField32(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffset64 != ((unsigned)(&pLsrThunkResultField64(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetFloat != ((unsigned)(&pLsrThunkResultFieldFloat(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetDouble != ((unsigned)(&pLsrThunkResultFieldDouble(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); +} + +#endif /* !_PLSR_THUNK_H_ */ diff --git a/runtime/include/hrc/plsr-rational.h b/runtime/include/hrc/plsr-rational.h new file mode 100755 index 0000000..d2f4f86 --- /dev/null +++ b/runtime/include/hrc/plsr-rational.h @@ -0,0 +1,311 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Arbitrary precision numbers */ + +#ifndef _PLSR_RATIONAL_H_ +#define _PLSR_RATIONAL_H_ + +/********************************************************************** + * This file implements the Rational type in terms of + * an arbitrary precision rational package APRat, with associated + * operations. 
+ */ + +#include "hrc/plsr-ap-rational.h" + +static void pLsrRationalRegisterVTables() +{ + pLsrAPRatRegisterVTables(); +} + +static void pLsrRationalRegisterGlobals() { + pLsrAPRatRegisterGlobals(); +} + + +#ifdef P_USE_TAGGED_RATIONALS + +#include "hrc/plsr-tagged-int32.h" + +/* Basic constructors */ + +typedef PlsrAPRat PlsrRational; +typedef PlsrAPRatS PlsrRationalU; + +#define pLsrRationalIsTagged(a) (pLsrTaggedInt32TaggedIntIsTagged(a)) + +#define pLsrRationalUnTag(a) (pLsrTaggedInt32TaggedIntUnTag(a)) + +/* Basic queries and destructors */ + +#define pLsrRationalIsIntegral(a) \ + (pLsrRationalIsTagged(a) || pLsrAPRatIsIntegral((PlsrAPRat) a)) + +#define pLsrRationalStaticUnboxedDef pLsrAPRatStaticUnboxedDef +#define pLsrRationalStaticInit pLsrAPRatStaticInit + +#define pLsrSmallRationalFromSInt32(a) (pLsrTaggedInt32TaggedIntFromSmallInt32(PlsrRational, a)) + +#define pLsrSmallRationalMax pLsrTaggedInt32Max +#define pLsrSmallRationalMin pLsrTaggedInt32Min + + +/* Conversions */ +/* Helpers */ +static void pLsrIntegersFromRational(PlsrInteger* numO, PlsrInteger* denO, PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + /* This conversion could be optimized */ + pLsrIntegerFromSInt32(*numO, pLsrRationalUnTag(r)); + *denO = pLsrIntegerOne; + } else { + pLsrIntegersFromAPRat(numO, denO, r); + } +} + +static PlsrInteger pLsrRatNumeratorSlow(PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + PlsrInteger i; + /* This conversion could be optimized */ + pLsrIntegerFromSInt32(i, pLsrRationalUnTag(r)); + return i; + } else { + return pLsrAPRatNumerator(r); + } +} + +#define pLsrRatNumerator pLsrRatNumeratorSlow + +static PlsrInteger pLsrRatDenominatorSlow(PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + return pLsrIntegerOne; + } else { + return pLsrAPRatDenominator(r); + } +} + +#define pLsrRatDenominator pLsrRatDenominatorSlow + +/************* Runtime/Compiler API ***************/ + + +#define pLsrRationalFromSInt8 pLsrRationalFromSInt32 +#define pLsrSInt8FromRational pLsrSInt32FromRational + +#define pLsrRationalFromSInt16 pLsrRationalFromSInt32 +#define pLsrSInt16FromRational pLsrSInt32FromRational + +#define pLsrRationalFromSInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrRational, sint32, pLsrAPRatFromSInt32, dest, a) +#define pLsrSInt32FromRational(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint32, PlsrRational, PlsrAPRat, pLsrSInt32FromAPRat, dest, a) + +#define pLsrRationalFromSIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrRational, sintp, pLsrAPRatFromSIntp, dest, a) +#define pLsrSIntpFromRational(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sintp, PlsrRational, PlsrAPRat, pLsrSIntpFromAPRat, dest, a) + +#define pLsrRationalFromSInt64(dest, a) \ + pLsrTaggedInt64NumConvFastTaggedIntFromSIntX(PlsrRational, sint64, pLsrAPRatFromSInt64, dest, a) +#define pLsrSInt64FromRational(dest, a) \ + pLsrTaggedInt64NumConvFastSIntXFromTaggedInt(sint64, PlsrRational, PlsrAPRat, pLsrSInt64FromAPRat, dest, a) + +#define pLsrRationalFromUInt8 pLsrRationalFromUInt32 +#define pLsrUInt8FromRational pLsrUInt32FromRational + +#define pLsrRationalFromUInt16 pLsrRationalFromUInt32 +#define pLsrUInt16FromRational pLsrUInt32FromRational + +#define pLsrRationalFromUInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrRational, uint32, pLsrAPRatFromUInt32, dest, a) +#define pLsrUInt32FromRational(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uint32, PlsrRational, PlsrAPRat, pLsrUInt32FromAPRat, dest, a) + 
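+/* Illustrative sketch only (the helper name and the PLSR_TAGGED_INT32_TEST
+ * guard are ours, not part of the runtime API): the conversion macros above
+ * are statement macros that assign into their first argument, so a round
+ * trip through the rational type looks like the function below.  Values in
+ * the tagged range stay on the inline fast path; anything larger is boxed
+ * through the APRat slow path. */
+#ifdef PLSR_TAGGED_INT32_TEST
+static sint32 pLsrRationalExampleRoundTripSInt32(sint32 n)
+{
+    PlsrRational r;
+    sint32 back;
+    pLsrRationalFromSInt32(r, n);    /* tag n if it fits, else allocate an APRat */
+    pLsrSInt32FromRational(back, r); /* untag, or convert the APRat back down */
+    return back;
+}
+#endif /* PLSR_TAGGED_INT32_TEST */
+ 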
+#define pLsrRationalFromUIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrRational, uintp, pLsrAPRatFromUIntp, dest, a) +#define pLsrUIntpFromRational(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uintp, PlsrRational, PlsrAPRat, pLsrUIntpFromAPRat, dest, a) + +#define pLsrRationalFromUInt64(dest, a) \ + pLsrTaggedInt64NumConvFastTaggedIntFromUIntX(PlsrRational, uint64, pLsrAPRatFromUInt64, dest, a) +#define pLsrUInt64FromRational(dest, a) \ + pLsrTaggedInt64NumConvFastUIntXFromTaggedInt(uint64, PlsrRational, PlsrAPRat, pLsrUInt64FromAPRat, dest, a) + +#define pLsrRationalFromFloat32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromFloat32(PlsrRational, pLsrAPRatFromFloat32, dest, a) +#define pLsrFloat32FromRational(dest, a) \ + pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(float32, PlsrRational, PlsrAPRat, pLsrFloat32FromAPRat, dest, a) + +#define pLsrRationalFromFloat64(dest, a) \ + pLsrTaggedInt64NumConvFastTaggedIntFromFloat64(PlsrRational, pLsrAPRatFromFloat64, dest, a) +#define pLsrFloat64FromRational(dest, a) \ + pLsrTaggedInt64NumConvFastFloatXFromTaggedInt(float64, PlsrRational, PlsrAPRat, pLsrFloat64FromAPRat, dest, a) + +static PlsrRational pLsrRationalFromIntegerSlow(PlsrInteger i) +{ + if (pLsrIntegerFitsInSInt32(i)) { + sint32 si; + PlsrRational r; + pLsrSInt32FromInteger(si, i); + pLsrRationalFromSInt32(r, si); + return r; + } else { + return (PlsrRational) pLsrAPRatFromInteger(i); + } +} + +#define pLsrRationalFromInteger(dest, i) ((dest) = (pLsrRationalFromIntegerSlow(i))) + +static PlsrInteger pLsrIntegerFromRationalSlow(PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + PlsrInteger i; + /* This conversion could be optimized */ + pLsrIntegerFromSInt32(i, pLsrRationalUnTag(r)); + return i; + } else { + return pLsrIntegerFromAPRat(r); + } +} + +#define pLsrIntegerFromRational(dest, r) ((dest) = (pLsrIntegerFromRationalSlow(r))) + +#define pLsrRationalToUInt32Checked(dest, a) \ + do { \ + if (pLsrRationalIsTagged(a)) { \ + sint32 aa = pLsrRationalUnTag(a); \ + if (aa < 0) {(dest) = 0xffffffff;} \ + else {(dest) = ((uint32) aa);} \ + } else { \ + ((dest) = pLsrAPRatToUInt32Checked(a)); \ + } \ + } while(0) + +/* pLsrCStringFromRational */ +pLsrTaggedInt32MkCStringFrom(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrCStringFromAPRat); + +/* pLsrRationalFromCString */ +pLsrTaggedInt32MkFromCString(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, \ + pLsrSInt32FromAPRat, pLsrAPRatLe, pLsrAPRatFromCString); + +/* Arithmetic */ + +#define pLsrRationalNegate(dest, r) \ + pLsrTaggedInt32TaggedIntNeg(PlsrRational, PlsrAPRat, pLsrAPRatNeg, pLsrAPRatFromSInt32, dest, r) + +/* pLsrTaggedInt32RationalAddSlow*/ +pLsrTaggedInt32MkTaggedIntAddSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatPlus); +#define pLsrRationalPlus(dest, a, b) \ + pLsrTaggedInt32TaggedIntAdd(PlsrRational, PlsrAPRat, pLsrTaggedInt32RationalAddSlow, dest, a, b) + +/* pLsrTaggedInt32RationalSubSlow*/ +pLsrTaggedInt32MkTaggedIntSubSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatMinus); +#define pLsrRationalMinus(dest, a, b) \ + pLsrTaggedInt32TaggedIntSub(PlsrRational, PlsrAPRat, pLsrTaggedInt32RationalSubSlow, dest, a, b) + +/* pLsrTaggedInt32RationalMulSlow*/ +pLsrTaggedInt32MkTaggedIntMulSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatFromSInt64, pLsrAPRatMult); +#define pLsrRationalTimes(dest, a, b) \ + pLsrTaggedInt32TaggedIntMul(PlsrRational, PlsrAPRat, pLsrTaggedInt32RationalMulSlow, dest, a, b) 
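+ 
+/* Illustrative sketch only (hypothetical helper, not part of the runtime
+ * API): the arithmetic macros above are likewise statement macros, so a
+ * compound expression such as a*a + b*b is written as a sequence of
+ * assignments.  Each step is expected to take the inline tagged fast path
+ * for small operands and to fall back to the generated *Slow helpers when
+ * an operand is boxed or the result leaves the tagged range. */
+#ifdef PLSR_TAGGED_INT32_TEST
+static PlsrRational pLsrRationalExampleSumOfSquares(PlsrRational a, PlsrRational b)
+{
+    PlsrRational aa, bb, s;
+    pLsrRationalTimes(aa, a, a); /* aa = a * a */
+    pLsrRationalTimes(bb, b, b); /* bb = b * b */
+    pLsrRationalPlus(s, aa, bb); /* s  = aa + bb */
+    return s;
+}
+#endif /* PLSR_TAGGED_INT32_TEST */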
+ +/* Comparisons */ +pLsrTaggedInt32MkTaggedIntEqSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatEq); +pLsrTaggedInt32MkTaggedIntNeSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatNe); +pLsrTaggedInt32MkTaggedIntLtSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatLt); +pLsrTaggedInt32MkTaggedIntGtSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatGt); +pLsrTaggedInt32MkTaggedIntLeSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatLe); +pLsrTaggedInt32MkTaggedIntGeSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatGe); + +#define pLsrRationalEQ(dest, a, b) pLsrTaggedInt32TaggedIntEq(pLsrTaggedInt32RationalEqSlow, dest, a, b) +#define pLsrRationalNE(dest, a, b) pLsrTaggedInt32TaggedIntNe(pLsrTaggedInt32RationalNeSlow, dest, a, b) +#define pLsrRationalLT(dest, a, b) pLsrTaggedInt32TaggedIntLt(pLsrTaggedInt32RationalLtSlow, dest, a, b) +#define pLsrRationalGT(dest, a, b) pLsrTaggedInt32TaggedIntGt(pLsrTaggedInt32RationalGtSlow, dest, a, b) +#define pLsrRationalLE(dest, a, b) pLsrTaggedInt32TaggedIntLe(pLsrTaggedInt32RationalLeSlow, dest, a, b) +#define pLsrRationalGE(dest, a, b) pLsrTaggedInt32TaggedIntGe(pLsrTaggedInt32RationalGeSlow, dest, a, b) + +/* Constants */ +#define pLsrRationalOne pLsrTaggedInt32One +#define pLsrRationalZero pLsrTaggedInt32Zero + +#define pLsrRationalCheckTaggedInt32Assertions() pLsrTaggedInt32Check(PlsrRational, PlsrAPRat) + +static void pLsrRationalCheckAssertions() { + assert(pLsrTaggedInt32Max == pLsrSmallRationalMax); + assert(pLsrTaggedInt32Min == pLsrSmallRationalMin); + pLsrRationalCheckTaggedInt32Assertions(); +} + +#else /* P_USE_TAGGED_RATIONALS*/ + +/********************************************************************** + * This file implements the Rational type in terms of + * an arbitrary precision rational package APRat, with associated + * operations. 
+ */ + +/* Basic constructors */ +typedef PlsrAPRat PlsrRational; +typedef PlsrAPRatS PlsrRationalU; + +/* Basic queries and destructors */ + +#define pLsrRationalIsIntegral pLsrAPRatIsIntegral +#define pLsrRationalStaticInit pLsrAPRatStaticInit +#define pLsrRationalStaticUnboxedDef pLsrAPRatStaticUnboxedDef + +/* Conversions */ + +#define pLsrRationalFromIntegers pLsrAPRatFromIntegers +#define pLsrIntegersFromRational pLsrIntegersFromAPRat + +#define pLsrRatNumerator pLsrAPRatNumerator +#define pLsrRatDenominator pLsrAPRatDenominator + +#define pLsrRationalFromInteger(dest, a) ((dest) = pLsrAPRatFromInteger(a)) + + +#define pLsrIntegerFromRational(dest, a) ((dest) = pLsrIntegerFromAPRat(a)) + +#define pLsrRationalFromSIntp(dest, a) ((dest) = pLsrAPRatFromSIntp(a)) +#define pLsrSIntpFromRational(dest, a) ((dest) = pLsrSIntpFromAPRat(a)) + +#define pLsrRationalFromUIntp(dest, a) ((dest) = pLsrAPRatFromUIntp(a)) +#define pLsrUIntpFromRational(dest, a) ((dest) = pLsrUIntpFromAPRat(a)) + +#define pLsrRationalFromSInt32(dest, a) ((dest) = pLsrAPRatFromSInt32(a)) +#define pLsrSInt32FromRational(dest, a) ((dest) = pLsrSInt32FromAPRat(a)) + +#define pLsrSmallRationalFromSInt32 pLsrRationalFromSInt32 + +#define pLsrRationalFromUInt32(dest, a) ((dest) = pLsrAPRatFromUInt32(a)) +#define pLsrUInt32FromRational(dest, a) ((dest) = pLsrUInt32FromAPRat(a)) + +#define pLsrRationalToUInt32Checked(dest, a) ((dest) = pLsrAPRatToUInt32Checked(a)) + +#define pLsrCStringFromRational pLsrCStringFromAPRat +#define pLsrRationalFromCString pLsrAPRatFromCString + +/* Arithmetic */ + +#define pLsrRationalNegate(dest, a) ((dest) = pLsrAPRatNeg(a)) +#define pLsrRationalPlus(dest, a, b) ((dest) = pLsrAPRatPlus(a, b)) +#define pLsrRationalMinus(dest, a, b) ((dest) = pLsrAPRatMinus(a, b)) +#define pLsrRationalTimes(dest, a, b) ((dest) = pLsrAPRatMult(a, b)) + +/* Comparisons */ +#define pLsrRationalEQ(dest, a, b) ((dest) = pLsrAPRatEq(a, b)) +#define pLsrRationalNE(dest, a, b) ((dest) = pLsrAPRatNe(a, b)) +#define pLsrRationalLT(dest, a, b) ((dest) = pLsrAPRatLt(a, b)) +#define pLsrRationalGT(dest, a, b) ((dest) = pLsrAPRatGt(a, b)) +#define pLsrRationalLE(dest, a, b) ((dest) = pLsrAPRatLe(a, b)) +#define pLsrRationalGE(dest, a, b) ((dest) = pLsrAPRatGe(a, b)) + +/* Constants */ +#define pLsrRationalOne pLsrAPRatOne +#define pLsrRationalZero pLsrAPRatZero + +static void pLsrRationalCheckAssertions() { +} + +#endif /* P_USE_TAGGED_RATIONALS*/ + +#endif /* !_PLSR_RATIONAL_H_ */ diff --git a/runtime/include/hrc/plsr-synchronization.h b/runtime/include/hrc/plsr-synchronization.h new file mode 100644 index 0000000..c9359db --- /dev/null +++ b/runtime/include/hrc/plsr-synchronization.h @@ -0,0 +1,244 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Synchronization primitives */ + +#ifndef _PLSR_SYNCHRONIZATION_H_ +#define _PLSR_SYNCHRONIZATION_H_ + +#ifdef P_USE_PILLAR + +#include "pgc/pgc.h" + +#ifdef __pillar2c__ +#define SYNCHCC __pcdecl +#else +#define SYNCHCC __cdecl +#endif + +static PrtBool SYNCHCC iFlcSyncEqual(volatile void *location, void *data) +{ + return (*(void **)location == data) ? PrtTrue : PrtFalse; +} + +static void iFlcSynchWaitNull(ref loc, uintp off) +{ + prtYieldUntilMovable(iFlcSyncEqual, &pLsrObjectField(loc, off, void**), NULL, + PrtInfiniteWaitCycles64, PrtGcTagOffset, (void*)off); +} + +static PrtBool SYNCHCC iFlcSyncNotEqual(volatile void *location, void *data) +{ + return (*(void **)location != data) ? 
PrtTrue : PrtFalse; +} + +static void iFlcSynchWaitNonNull(ref loc, uintp off) +{ + prtYieldUntilMovable(iFlcSyncNotEqual, &pLsrObjectField(loc, off, void**), NULL, + PrtInfiniteWaitCycles64, PrtGcTagOffset, (void*)off); +} + +static void iFlcSynchWaitEqualVoidS(ref loc, uintp off, void *data) +{ + prtYieldUntilMovable(iFlcSyncEqual, &pLsrObjectField(loc, off, void**), data, + PrtInfiniteWaitCycles64, PrtGcTagOffset, (void*)off); +} + +/* Note that this function should not be __pcdecl as it takes and returns refs */ +static inline ref pLsrSynchLockedCmpxchgRef(ref* loc, ref cmpValue, ref newValue) +{ + return (ref)pgc_write_ref_cas(loc,newValue,cmpValue); +} + +/* Taken from mcrt/src/mcrtthreads/mcrtia32.h */ +/* Attempts to atomically set *loc to newValue only if *loc==cmpValue. Returns the value read + * from *loc: cmpValue if successful, otherwise the contents of loc. +*/ +static inline uint32 SYNCHCC pLsrSynchLockedCmpxchgUInt32(volatile uint32 * loc, uint32 cmpValue, uint32 newValue) +{ +#ifdef PLSR_GNU_ASM + uint32 value; + __asm__ __volatile__( + "lock; cmpxchgl %3,(%1)" + : "=a"(value) + : "r"(loc), "a"(cmpValue), "d"(newValue) + : "memory"); + return value; +#else +#ifdef INTEL64 + register uint32 value; + __asm { + mov rcx, loc + mov edx, newValue + mov eax, cmpValue + lock cmpxchg dword ptr[rcx], edx + mov value, eax + } + return value; +#else // !INTEL64 + register uint32 value; + __asm { + mov ecx, loc + mov edx, newValue + mov eax, cmpValue + lock cmpxchg dword ptr[ecx], edx + mov value, eax + } + return value; +#endif // INTEL64 +#endif +} + +static inline uint64 SYNCHCC pLsrSynchLockedCmpxchgUInt64(volatile uint64 * loc, uint64 cmpValue, uint64 newValue) +{ +#ifdef PLSR_GNU_ASM + uint64 result; + __asm__ __volatile__( + "lock; cmpxchg %3,(%1)" + : "=a"(result) + : "r"(loc), "a"(cmpValue), "r"(newValue) + : "memory"); + return result; +#else +#ifdef INTEL64 + uint64 value; + __asm { + mov rcx, loc; + mov rdx, newValue; + mov rax, cmpValue; + lock cmpxchg qword ptr[rcx], rdx; + mov value, rax; + } + return value; +#else // !INTEL64 + uint32 hiCmp = (uint32) (UINT64_C(0xffffffff) & (cmpValue >> 32)); + uint32 lowCmp = (uint32) (UINT64_C(0xffffffff) & cmpValue); + + uint32 hiNew = (uint32) (UINT64_C(0xffffffff) & (newValue >> 32)); + uint32 lowNew = (uint32) (UINT64_C(0xffffffff) & newValue); + + uint32 lowRes; + uint32 hiRes; + + __asm { + mov edx, hiCmp + mov eax, lowCmp + mov ecx, hiNew + mov ebx, lowNew + mov esi, loc + lock cmpxchg8b qword ptr[esi] + mov lowRes, eax + mov hiRes, edx + } + { + uint64 res = (((uint64)hiRes)<<32)|((uint64)lowRes); + return res; + } +#endif // INTEL64 +#endif +} + +#if (P_WORD_SIZE==4) +#define pLsrSynchLockedCmpxchgUIntp pLsrSynchLockedCmpxchgUInt32 +#else +#if (P_WORD_SIZE==8) +#define pLsrSynchLockedCmpxchgUIntp pLsrSynchLockedCmpxchgUInt64 +#else +#error "Unsupported word size" +#endif /* P_WORD_SIZE=64*/ +#endif /* P_WORD_SIZE=32*/ + +static inline int SYNCHCC pLsrSynchCmpAndSetUInt32(volatile uint32 * loc, uint32 cmpValue, uint32 newValue) +{ + return (pLsrSynchLockedCmpxchgUInt32(loc, cmpValue, newValue) == cmpValue); +} + +static inline int SYNCHCC pLsrSynchCmpAndSetUInt64(volatile uint64 * loc, uint64 cmpValue, uint64 newValue) +{ + return (pLsrSynchLockedCmpxchgUInt64(loc, cmpValue, newValue) == cmpValue); +} + +static inline int SYNCHCC pLsrSynchCmpAndSetUIntp(volatile uintp * loc, uintp cmpValue, uintp newValue) +{ + return (pLsrSynchLockedCmpxchgUIntp(loc, cmpValue, newValue) == cmpValue); +} + +/* N.B. 
Timeouts are in absolute cycle time, not deltas from now. + * XXX Change this to PrtInfiniteTimeoutCycles64 when it becomes available + */ +#define pLsrSynchInfiniteTimeoutCycles64 UINT64_C(0xFFFFFFFFFFFFFFFF) + +PrtBool SYNCHCC pLsrSynchYieldPredicateNotEqualUInt32(volatile void *location, void *data) { + return ((*(volatile uint32*)location) != (uint32) data) ? PrtTrue : PrtFalse; +} + +PrtBool SYNCHCC pLsrSynchYieldPredicateNotEqualUIntp(volatile void *location, void *data) { + return ((*(volatile uintp*)location) != (uintp) data) ? PrtTrue : PrtFalse; +} + +PrtBool SYNCHCC pLsrSynchYieldPredicateEqualUInt32(volatile void *location, void *data) { + return ((*(volatile uint32*)location) == (uint32) data) ? PrtTrue : PrtFalse; +} + +PrtBool SYNCHCC pLsrSynchYieldPredicateEqualUIntp(volatile void *location, void *data) { + return ((*(volatile uintp*)location) == (uintp) data) ? PrtTrue : PrtFalse; +} + +void pLsrSynchYieldUntilNotEqualUInt32(volatile uint32*loc, uint32 value) +{ + prtYieldUntil(pLsrSynchYieldPredicateNotEqualUInt32, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +void pLsrSynchYieldUntilNotEqualUIntp(volatile uintp*loc, uintp value) +{ + prtYieldUntil(pLsrSynchYieldPredicateNotEqualUIntp, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +void pLsrSynchYieldUntilEqualUInt32(volatile uint32*loc, uint32 value) +{ + prtYieldUntil(pLsrSynchYieldPredicateEqualUInt32, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +void pLsrSynchYieldUntilEqualUIntp(volatile uintp*loc, uintp value) +{ + prtYieldUntil(pLsrSynchYieldPredicateEqualUIntp, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +static inline uintp pLsrSynchAtomicTakeUIntp(volatile uintp*loc, uintp emptyValue) { + while (1) { + uintp contents =*loc; + if (contents != emptyValue) { + if (pLsrSynchCmpAndSetUIntp(loc, contents, emptyValue)) { + return contents; + } + } else { + pLsrSynchYieldUntilNotEqualUIntp(loc, emptyValue); + } + } +} + +static inline void SYNCHCC pLsrSynchAtomicPutUIntp(volatile uintp*loc, uintp value){ + *loc = value; +} + +/* This should probably have some spinning and exponential backoff */ +static inline uint32 pLsrSynchAtomicTakeUInt32(volatile uint32*loc, uint32 emptyValue) { + while (1) { + uint32 contents =*loc; + if (contents != emptyValue) { + if (pLsrSynchCmpAndSetUInt32(loc, contents, emptyValue)) { + return contents; + } + } else { + pLsrSynchYieldUntilNotEqualUInt32(loc, emptyValue); + } + } +} + +static inline void SYNCHCC pLsrSynchAtomicPutUInt32(volatile uint32*loc, uint32 value){ + *loc = value; +} + +#endif + +#endif /* !_PLSR_SYNCHRONIZATION_H_ */ diff --git a/runtime/include/hrc/plsr-tagged-int32.h b/runtime/include/hrc/plsr-tagged-int32.h new file mode 100755 index 0000000..c8eeb74 --- /dev/null +++ b/runtime/include/hrc/plsr-tagged-int32.h @@ -0,0 +1,1209 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_TAGGED_INT32_ +#define _PLSR_TAGGED_INT32_ + +/* Defining PLSR_TAGGED_INT32_TEST will cause several non-static + * test functions to be defined which are convenient for inspecting + * the code generated by different compilers. + */ + +/* The package operations are defined in terms of three types. + * TaggedInt: the user defined tagged ptr/int type + * APType: the user defined ap (ptr) type + * TaggedInt32: the 32 bit tagged integer type. 
+ * required: sizeof(TaggedInt) == sizeof(APType) >= sizeof(int32) + * + * Implementation specific assumption 1: we assume that for values within the range + * of TaggedInt32, casting to and from TaggedInt preserves value. This is guaranteed if + * TaggedInt is a signed integer type with at least as much precision as TaggedInt32, but + * implementation defined if TaggedInt is a ptr type. + * + * Implementation specific assumption 2: we assume that casting between + * TaggedInt and APType is value preserving. C99 says that pointer + * casts are implementation defined. + * + * Implementation specific assumption 3: we assume (and check) that UINT32_MAX = 2^32-1. + * It's possible that this can be relaxed somewhat, but the proofs would need to be + * checked. + * + * Implementation specific assumption 4: the non-portable version assumes some + * implementation defined behavior, most notably two's complement representation, + * and that >> on signed integers is implemented as arithmetic right shift. + */ + +/* Commentary on integer representations and operations. + * + * Notation. + * + * Implementation defined behavior means that a given implementation + * is free to choose what to do, but must do something predictable and documented. + * + * Undefined behavior means that the meaning of the program is completely unspecified. + * The compiler is free to reject such a program, or to optimize as if the behavior + * cannot happen, or to cause a runtime error. It is *not* required to give a + * predictable/repeatable semantics to the behavior. + * + * Relying on implementation defined behavior could cause portability problems between + * compilers/platforms, but should give predictable results on a given compiler/platform. + * + * Relying on undefined behavior is very dangerous, since a given compiler/platform + * may intentionally or accidentally exploit this in such a way as to make the + * semantics completely unpredictable. For example, signed integer overflow is undefined. + * This implies (for example) that the test (x + 1) < x can be treated as false for + * signed positive integer x, since either x+1 does not overflow (in which case the + * result is false) or else x+1 does overflow (in which case the result is undefined + * and the program can behave arbitrarily). There are anecdotal reports of some + * compilers exploiting properties such as this. + * + * Representation. + * + * The c99 spec leaves signed and unsigned integer representations fairly + * implementation defined. Most notably, it allows for arbitrary padding bits, + * with essentially arbitrary values. For a given precision, it imposes + * minimum ranges of representable numbers, but not maximum ranges. It allows + * for several different encodings of signed integers. In practice, all of + * the platforms we care about will use two's complement arithmetic, with + * (semantically) no padding. So while it might be desirable in general to + * avoid relying on properties of the encoding, the result in most cases + * is well-defined for a given compiler/platform. The stdint.h header + * defines precise two's complement integer types with no padding which + * would be well-suited to this use. However, this is not available in the + * VC99 libraries which we are using. + * + * Note that the unsigned char type representation is completely well-defined. + * + * Operations. + * + * Generally speaking, most operations on unsigned integers are fairly + * completely defined. 
Operations on signed integers however are + * frequently implementation defined or undefined in particular cases. + * + * Some specific relevant cases. + * + * Conversion from signed to unsigned is well-defined. + * Conversion from unsigned to signed is implementation defined when + * the value is not representable in the new type. + * Left shift is well-defined on unsigned. + * Left shift is well-defined on positive signed for which no overflow + * occurs. Otherwise it is UNDEFINED. + * Right shift is well-defined on unsigned. + * Right shift is well-defined on positive signed, and implementation + * defined on negative signed. + * For all arithmetic operations, overflow/underflow on unsigned is + * well-defined. + * For all arithmetic operations, overflow/underflow on signed is + * UNDEFINED. + * Logical (and/or/xor) operations are well-defined on both signed + * and unsigned integers. This is less useful than it seems however, + * since the representations are implementation defined. + * + * Key points: + * UNDEFINED: left shift on signed integers + * UNDEFINED: overflow/underflow on signed integers + */ + +/* There are two implementations provided here, chosen between by the + * P_TAGGED_INT32_PORTABLE flag. All of the differences between the + * implementations are restricted to the pLsrTaggedInt32InternalXXX + * operations. Both implementations avoid relying on undefined + * properties. The portable version also avoids implementation + * defined properties, at some expense in the quality of the + * generated code. + * + * The generated code can be inspected by defining PLSR_TAGGED_INT32_TEST + * and inspecting the assembly code for the test functions defined at the + * bottom. + * + * Implementation 1 (portable). + * + * We represent signed integers x in the range -2^30 <= x < 2^30 as + * 2*x + 1. This allows them to be distinguished from pointers under + * the implementation specific assumption that pointers are always even. + * Tagging and untagging are done via well-defined arithmetic operations. + * + * Implementation 2 (non-portable). + * + * We represent signed integers x in the range -2^30 <= x < 2^30 as + * 2*x & 0x1. This allows them to be distinguished from pointers + * under various implementation specific assumptions about the + * representation of integers (all of which should be true on all + * platforms we care about). + * N.B. Even under implementation specific assumptions, x >> 1 != x/2. + */ + + +/************************************************************************/ +/********** This section defines the core internal operations ***********/ +/************************************************************************/ + +/* We begin by defining the basic internal operations on tagged 32 bit integers. 
*/ +typedef sint32 PlsrTaggedInt32; + +/* Some constants */ +#define pLsrTaggedInt32InternalMax (SINT32_MAX / 2) +#define pLsrTaggedInt32InternalMin (SINT32_MIN / 2) +#define pLsrTaggedInt32InternalTwoToThe31 0x80000000 +#define pLsrTaggedInt32InternalTwoToThe30 0x40000000 + + +#if (defined(P_TAGGED_INT32_ASSUME_SMALL) || defined(P_TAGGED_INT32_ASSERT_SMALL)) + +/************************ Tagging **********************************/ +#define pLsrTaggedInt32InternalTag(a) ((PlsrTaggedInt32) (a)) + +/************************ UnTagging ********************************/ +#define pLsrTaggedInt32InternalUnTag(a) ((PlsrTaggedInt32) a) + +/********************** Tag checking *******************************/ +#define pLsrTaggedInt32InternalIsTagged(a) 1 + +/***************** Two operand tag checking ************************/ +#define pLsrTaggedInt32InternalAreTagged(a, b) 1 + +/********************* Range checking *****************************/ +#ifdef P_TAGGED_INT32_ASSERT_SMALL +#define pLsrTaggedInt32InternalSIntInRange(a) \ + ((((a) <= pLsrTaggedInt32InternalMax) && \ + ((a) >= pLsrTaggedInt32InternalMin)) || \ + (pLsrRuntimeError_("SInt out of range"), 0)) +#define pLsrTaggedInt32InternalUIntInRange(a) \ + (((a) <= pLsrTaggedInt32InternalMax) || \ + (pLsrRuntimeError_("UInt out of range"), 0)) +#else +#define pLsrTaggedInt32InternalSIntInRange(a) 1 +#define pLsrTaggedInt32InternalUIntInRange(a) 1 +#endif /* P_TAGGED_INT32_ASSERT_SMALL */ + +#define pLsrTaggedInt32InternalSInt32InRange pLsrTaggedInt32InternalSIntInRange +#define pLsrTaggedInt32InternalUInt32InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalSInt64InRange pLsrTaggedInt32InternalSIntInRange +#define pLsrTaggedInt32InternalUInt64InRange pLsrTaggedInt32InternalUIntInRange + +/******************** Check assumptions **************************/ +#define pLsrTaggedInt32InternalCheck(APType, TaggedInt) \ + do { \ + assert(sizeof(APType) >= sizeof(PlsrTaggedInt32)); \ + assert(sizeof(TaggedInt) >= (sizeof(APType))); \ + } while (0) + +#else /* !P_TAGGED_INT32_ASSERT_SMALL && !P_TAGGED_INT32_ASSUME_SMALL*/ +#ifdef P_TAGGED_INT32_PORTABLE + + +/************************ Tagging **********************************/ +/* The tagged version of an integer x in the appropriate range is 2*x + 1. + * Note that this is guaranteed not to overflow for 32 bit integers according to the + * c99 spec. + */ +#define pLsrTaggedInt32InternalTag(a) ((((PlsrTaggedInt32) (a)) * 2) + 1) + + +/************************ UnTagging ********************************/ +/* The untagged version of a tagged integer x is (x-1)/2. + */ +#define pLsrTaggedInt32InternalUnTag(a) ((((PlsrTaggedInt32) (a)) -1 )/2) + + +/********************** Tag checking *******************************/ +/* We test if x is tagged by checking (((uint32) x) mod 2): that is, + * x is tagged iff it is odd. Note that conversion to uint32 preserves parity + * so long as UINT32_MAX + 1 is even. The Intel C compiler generates good + * code for this in some contexts but not others. + */ +#define pLsrTaggedInt32InternalIsTagged(a) (((uint32) ((PlsrTaggedInt32) (a))) % 2) + + +/***************** Two operand tag checking ************************/ +/* We test if x and y are both tagged by computing + * (((unsigned char) (((uint32) x) mod 2)) & ((unsigned char) (((uint32) y) mod 2))) + * Using bitwise and avoids a double jump and produces better code. The conversion + * to unsigned char eliminates any padding bits. 
+ */ +#define pLsrTaggedInt32InternalAreTagged(a, b) \ + (pLsrTaggedInt32InternalIsTagged(a)) && \ + (pLsrTaggedInt32InternalIsTagged(b)) + + +/********************* Range checking *****************************/ + +/* We can test any integer for being in range by comparing to the appropriate + * max and min values. However, this does not generate very good code. This is + * fully portable. + */ +#define pLsrTaggedInt32InternalSIntInRange(a) \ + (((a) <= pLsrTaggedInt32InternalMax) && ((a) >= pLsrTaggedInt32InternalMin)) +#define pLsrTaggedInt32InternalUIntInRange(a) ((a) <= pLsrTaggedInt32InternalMax) +#define pLsrTaggedInt32InternalUInt32InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalUInt64InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalSInt32InRange pLsrTaggedInt32InternalSIntInRange +#define pLsrTaggedInt32InternalSInt64InRange pLsrTaggedInt32InternalSIntInRange + +/******************** Check assumptions **************************/ +#define pLsrTaggedInt32InternalCheck(APType, TaggedInt) \ + do { \ + assert(sizeof(APType) >= sizeof(PlsrTaggedInt32)); \ + assert(sizeof(TaggedInt) >= (sizeof(APType))); \ + } while (0) + + +#else /* !P_TAGGED_INT32_PORTABLE && !P_TAGGED_INT32_ASSUME_SMALL && !P_TAGGED_INT32_ASSERT_SMALL */ + + +#define pLsrTaggedInt32InternalSetLowBit(a) ((a) | 0x1) +#define pLsrTaggedInt32InternalGetLowBit(a) ((a) & 0x1) +#define pLsrTaggedInt32InternalASR(a) ((a) >> 1) + + +/************************ Tagging **********************************/ +/* The tagged version of an integer x in the appropriate range is 2*x | 0x1. + * Under reasonable assumptions about representation, this sets the low bit + * only. + */ +#define pLsrTaggedInt32InternalTag(a) \ + (pLsrTaggedInt32InternalSetLowBit(((PlsrTaggedInt32) (a)) * 2)) + + +/************************ UnTagging ********************************/ +/* The untagged version of a tagged integer x is x >> 1. Under reasonable + * assumptions about representation and the implementation of >>, this + * does an arithmetic shift right. + */ +#define pLsrTaggedInt32InternalUnTag(a) \ + (pLsrTaggedInt32InternalASR((PlsrTaggedInt32) (a))) + + +/********************** Tag checking *******************************/ +/* An integer is tagged if the low bit is set. */ +#define pLsrTaggedInt32InternalIsTagged(a) \ + (pLsrTaggedInt32InternalGetLowBit((PlsrTaggedInt32) (a))) + + +/***************** Two operand tag checking ************************/ +/* We test if x and y are both tagged by checking that both low bits are set */ +#define pLsrTaggedInt32InternalAreTagged(a, b) \ + (pLsrTaggedInt32InternalGetLowBit((((uint32) ((PlsrTaggedInt32) (a))) & \ + ((uint32) ((PlsrTaggedInt32) (b)))))) + + +/********************* Range checking *****************************/ + +/* We can test any integer for being in range by comparing to the appropriate + * max and min values. However, this does not generate very good code. This is + * fully portable. 
+ */ +#define pLsrTaggedInt32InternalSIntInRange(a) \ + (((a) <= pLsrTaggedInt32InternalMax) && ((a) >= pLsrTaggedInt32InternalMin)) +#define pLsrTaggedInt32InternalUIntInRange(a) ((a) <= pLsrTaggedInt32InternalMax) + +/* Hard to do better than this for uints */ +#define pLsrTaggedInt32InternalUInt32InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalUInt64InRange pLsrTaggedInt32InternalUIntInRange + +/* For signed integer types with known precision, we can define + * a test which generates better code as follows, under the implementation + * specific assumptions that 1) UINT32_MAX = 2^32-1, and + * 2) SINT32_MAX = 2^32-1 and 3) SINT32_MAX = -2^32 (actually, any power of + * two). + * + * We test if x is in the representable range (that is -2^30 <= x < 2^30) + * by casting it to uint32, adding 2^30 and testing against 2^31. See the + * proof below for a formal justification of this in terms of the c99 spec, + * but informally what this is doing is as follows: + * The conversion to an unsigned int preserves the bit layout in two's + * complement. For negative numbers, x is representable if the leading + * two bits are 11, and for positive numbers, x is representable if + * the leading two bits are 00. Otherwise, the number is not representable. + * So we wish to tell whether or not the leading bits differ. We do this + * by adding 2^30 to the number. If the leading bits are 11, it overflows + * and the leading bits are 00. If the leading bits are 00, the leading + * bits become 01. In both cases, the number is now less than 2^31. If + * the bits differ, there is no overflow, and the leading bit will be 1. + * Therefore the number will be greater than or equal to 2^31. + * + * This explanation assumes two's complement representation with a particular + * bit order: however the proof below shows that this method is in + * fact fully general under very limited assumptions about the range of the + * respective types. These assumptions could be eliminated if we had + * access to the stdint.h types. + */ +#define pLsrTaggedInt32InternalSInt32InRange(a) \ + ((((uint32) (a)) + pLsrTaggedInt32InternalTwoToThe30) < pLsrTaggedInt32InternalTwoToThe31) +#define pLsrTaggedInt32InternalSInt64InRange(a) \ + ((((uint64) (a)) + pLsrTaggedInt32InternalTwoToThe30) < pLsrTaggedInt32InternalTwoToThe31) + +/* Proof of correctness for range checking. + * + * According the c99 spec, converting from a signed to an unsigned type of the + * same size: + * 1) preserves the value if the value is representable (positive) + * 2) is the result of repeatedly adding one more than the maximum value of + * the new type to the old value + * So, assuming a is a negative sint32, then + * ((uint32) a) = (UINT32_MAX + 1) + a + * For this proof, I assume that (UINT32_MAX + 1) = 2^32. Strictly speaking, + * I'm not sure this is guaranteed - I believe the spec permits additional + * range. In practice, this isn't a concern. I believe this proof could be + * generalized to relax this assumption somewhat. The proof goes through + * fine with any power of two for UINT32_MAX. + * + * For a 32 bit signed int k, -2^31 <= k < 2^31 + * Let S' = ((uint32) k) + 2^30 + * and let S = S' mod 2^32 + * Case 1: k is positive. Then: + * - ((uint32) k) = k, so + * - S = (k + 2^30) mod 2^32 + * = k + 2^30 (since k + 2^30 < 2^32) + * + * Suppose S < 2^31. Then + * S = k + 2^30 < 2^31 + * k + 2^30 < 2^30 + 2^30 + * k < 2^30 + * Suppose S >= 2^31. 
Then + * S = k + 2^30 >= 2^31 + * k + 2^30 >= 2^30 + 2^30 + * k >= 2^30 + * + * Case 2: k is negative (-1 >= k >= -2^31). Then: + * - By the c99 spec (assuming UINT32_MAX = 2^32-1) + * ((uint32) k) = 2^32 + k + * - So, S' = 2^32 + k + 2^30 + * + * Suppose k >= -2^30. Note that + * S = S' mod 2^32 + * = (2^32 + k + 2^30) mod 2^32 + * = (k + 2^30) mod 2^32 + * But note that 0 <= k + 2^30 < 2^30, so + * = (k + 2^30) + * And so + * S < 2^30 + * Finally, by contrapositive: + * S >= 2^30 => k < -2^30 + * So clearly + * S >= 2^31 => k < -2^30 + * + * Suppose k < -2^30. Note that + * S = S' mod 2^32 + * = (2^32 + k + 2^30) mod 2^32 + * But note that + * -2^31 + 2^30 <= k + 2^30 < -2^30 + 2^30, i.e. + * - 2^30 <= k + 2^30 < 0 + * So + * 2^32 - 2^30 <= S' < 2^32 + 0 + * 2^31 + 2^30 <= S' < 2^32 + * So + * S = S' mod 2^32 = S' >= 2^31 + 2^30 + * Finally, by contrapositive, + * S < 2^31 + 2^30 => k >= -2^30 + * So clearly + * S < 2^31 => k >= -2^30 + * + * So we have: + * S < 2^31 => k < 2^30 (k positive) + * S < 2^31 => k >= -2^30 (k negative) + * And + * S >= 2^31 => k >= 2^30 (k positive) + * S >= 2^31 => k < -2^30 (k negative) + * + * So S < 2^31 => -2^30 <= k < 2^30 + * + */ + +/******************** Check assumptions **************************/ +#define pLsrTaggedInt32InternalCheck(APType, TaggedInt) \ + do { \ + assert(sizeof(APType) >= sizeof(PlsrTaggedInt32)); \ + assert(sizeof(TaggedInt) >= (sizeof(APType))); \ + assert(UINT32_MAX == 0xFFFFFFFF); \ + assert(SINT32_MIN == 0x80000000); \ + assert(SINT32_MAX == 0x7FFFFFFF); \ + } while (0) + +#endif /* P_TAGGED_INT32_PORTABLE */ +#endif /* P_TAGGED_INT32_ASSUME_SMALL || P_TAGGED_INT32_ASSERT_SMALL */ + + +/************************************************************************/ +/******************* This is the main implementation ********************/ +/************************************************************************/ + +#define pLsrTaggedInt32Check pLsrTaggedInt32InternalCheck + +/*********************** Conversions ************************************/ + +#define pLsrTaggedInt32Max (pLsrTaggedInt32InternalMax) +#define pLsrTaggedInt32Min (pLsrTaggedInt32InternalMin) +#define pLsrTaggedInt32Zero (pLsrTaggedInt32InternalTag(0)) +#define pLsrTaggedInt32One (pLsrTaggedInt32InternalTag(1)) +#define pLsrTaggedInt32MinusOne (pLsrTaggedInt32InternalTag(-1)) + + +#define pLsrTaggedInt32TaggedIntIsTagged(a) (pLsrTaggedInt32InternalIsTagged(a)) +#define pLsrTaggedInt32TaggedIntUnTag(a) \ + (assert(pLsrTaggedInt32InternalIsTagged(a)), (pLsrTaggedInt32InternalUnTag(a))) +#define pLsrTaggedInt32TaggedIntFromSmallInt32(TaggedInt, a) \ + ((TaggedInt) (pLsrTaggedInt32InternalTag(a))) + +/* The NumConv macros are statement macros. 
*/ + +/* Convert a value of type sintx to the tagged int type */ +#define pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(TaggedInt, SIntX, apFromSIntX, dest, a) \ + do { \ + if (sizeof(SIntX) <= sizeof(sint32)) { \ + if (pLsrTaggedInt32InternalSInt32InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromSIntX(a)));} \ + } else if (sizeof(SIntX) == sizeof(sint64)) { \ + if (pLsrTaggedInt32InternalSInt64InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromSIntX(a)));} \ + } else { \ + if (pLsrTaggedInt32InternalSIntInRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromSIntX(a)));} \ + } \ + } while (0) + +/* Convert a value of the tagged int type to type sintx */ +#define pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(SIntX, TaggedInt, APType, sIntXFromAP, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged(a)) \ + {(dest) = ((SIntX) (pLsrTaggedInt32InternalUnTag(a)));} \ + else \ + {(dest) = ((sIntXFromAP((APType) (a))));} \ + } while (0) + +/* Convert a value of type uintx to the tagged int type */ +#define pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(TaggedInt, UIntX, apFromUIntX, dest, a) \ + do { \ + if (sizeof(UIntX) <= sizeof(uint32)) { \ + if (pLsrTaggedInt32InternalUInt32InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromUIntX(a)));} \ + } else if (sizeof(UIntX) == sizeof(uint64)) { \ + if (pLsrTaggedInt32InternalUInt64InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromUIntX(a)));} \ + } else { \ + if (pLsrTaggedInt32InternalUIntInRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromUIntX(a)));} \ + } \ + } while (0) + +/* Convert a value of the tagged int type to type uintx */ +#define pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(UIntX, TaggedInt, APType, uIntXFromAP, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged(a)) \ + {(dest) = ((UIntX) (pLsrTaggedInt32InternalUnTag(a)));} \ + else \ + {(dest) = ((uIntXFromAP((APType) (a))));} \ + } while (0) + +/* Convert a value of type floatx to the tagged int type */ +/* This might be a bit of a hack. Find a better way to do this? 
-leaf */ +#define pLsrTaggedInt32NumConvFastTaggedIntFromFloat32(TaggedInt, apFromFloat32, dest, a) \ + do { \ + if ((((float32)(sint32)a) == ((float32) a)) && \ + (pLsrTaggedInt32InternalSInt32InRange((sint32)a))) { \ + (dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag((sint32)a))); \ + } else { \ + (dest) = ((TaggedInt) (apFromFloat32(a))); \ + } \ + } while (0) + +#define pLsrTaggedInt32NumConvFastTaggedIntFromFloat64(TaggedInt, apFromFloat64, dest, a) \ + do { \ + if ((((float64)(sint32)a) == ((float64) a)) && \ + (pLsrTaggedInt32InternalSInt32InRange((sint32)a))) { \ + (dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag((sint32)a))); \ + } else { \ + (dest) = ((TaggedInt) (apFromFloat64(a))); \ + } \ + } while (0) + +/* Convert a value of the tagged int type to type floatX */ +#define pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(FloatX, TaggedInt, APType, floatXFromAP, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged(a)) \ + {(dest) = ((FloatX) (pLsrTaggedInt32InternalUnTag(a)));} \ + else \ + {(dest) = ((floatXFromAP((APType) (a))));} \ + } while (0) + +#define pLsrTaggedInt32MkCStringFrom(TName, TaggedInt, APType, apFromSInt32, cStringFromAP) \ + static char* pLsrCStringFrom##TName (TaggedInt a) { \ + { \ + if(pLsrTaggedInt32InternalIsTagged(a)) { \ + return cStringFromAP(apFromSInt32(pLsrTaggedInt32InternalUnTag(a))); \ + } else { \ + return cStringFromAP((APType) a); \ + } \ + } \ + } \ + +#define pLsrTaggedInt32MkFromCString(TName, TaggedInt, APType, apFromSInt32, sint32FromAP, apLessOrEqual, apFromCString) \ + static TaggedInt pLsr##TName##FromCString (char* s) { \ + { \ + APType ap = apFromCString(s); \ + APType max = apFromSInt32(pLsrTaggedInt32InternalMax); \ + APType min = apFromSInt32(pLsrTaggedInt32InternalMin); \ + if (apLessOrEqual (min, ap) && apLessOrEqual(ap, max)) { \ + return (TaggedInt) pLsrTaggedInt32InternalTag(sint32FromAP(ap)); \ + } else { \ + return (TaggedInt) ap; \ + } \ + } \ + } \ + +/**************************** Bitwise *********************************/ + +/* Can't make a tagged value non-taggable */ +#define pLsrTaggedInt32TaggedIntBNot(TaggedInt, APType, apBNot, apFromSInt32, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged((PlsrTaggedInt32) (a))) { \ + sint32 pLsrTaggedInt32TaggedIntBNot_tmp = \ + ~((sint32) (pLsrTaggedInt32InternalUnTag(a))); \ + (dest) = ((TaggedInt) pLsrTaggedInt32InternalTag(pLsrTaggedInt32TaggedIntBNot_tmp)); \ + } else { \ + (dest) = ((TaggedInt) (apBNot((APType) a))); \ + } \ + } while(0) + +/* Most of the binary bitwise operations can't overflow into the high bits + * and hence can use these versions. 
*/ +#define pLsrTaggedInt32InternalBitwiseSlowPathReturnNO(TaggedInt, APType, apFromSInt32, apBin, a, b) \ + do { \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA; \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = (APType)b; \ + } \ + return (TaggedInt) apBin(pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA, \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB); \ + } while(0) + +#define pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, OpName, TaggedInt, APType, apFromSInt32, apBin, binOp) \ + static TaggedInt pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 tmp = \ + binOp(((sint32) (pLsrTaggedInt32InternalUnTag(a))), \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + return (TaggedInt) pLsrTaggedInt32InternalTag(tmp); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturnNO(TaggedInt, APType, apFromSInt32, apBin, a, b); \ + } \ + } + + +#define pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, binOp, binOpSlow, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 pLsrTaggedInt32MkTaggedIntBitwiseBinNO_tmp = \ + binOp(((sint32) (pLsrTaggedInt32InternalUnTag(a))), \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + (dest) = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32MkTaggedIntBitwiseBinNO_tmp)); \ + } else { \ + (dest) = binOpSlow(a, b); \ + } \ + } while(0) + +/* Currently just use the slow version for things which might overflow into + * the high bits. 
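+ * (The shifts are the case in point: pLsrTaggedInt32TaggedIntBShiftL and
+ * pLsrTaggedInt32TaggedIntBShiftR below always defer to their slow
+ * variants, which convert both operands to the arbitrary-precision type
+ * and do the shift there, rather than attempting a tagged fast path.)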
+ */ + +#define pLsrTaggedInt32InternalBitwiseSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b) \ + do { \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA; \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = (APType)b; \ + } \ + return (TaggedInt) apBin(pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA, \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB); \ + } while(0) + +#define pLsrTaggedInt32MkTaggedIntBitwiseBinSlow(TName, OpName, TaggedInt, APType, apFromSInt32, apBin, binOp) \ + static TaggedInt pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b); \ + } + +#define pLsrTaggedInt32MkTaggedIntBitwiseBin(TaggedInt, APType, binOp, binOpSlow, dest, a, b) \ + do { \ + (dest) = binOpSlow(a, b); \ + } while(0) + + +#define pLsrTaggedInt32TaggedIntBAndOp(a, b) (a & b) +#define pLsrTaggedInt32MkTaggedIntBAndSlow(TName, TaggedInt, APType, apFromSInt32, apBAnd) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, BAnd, TaggedInt, APType, apFromSInt32, \ + apBAnd, pLsrTaggedInt32TaggedIntBAndOp) +#define pLsrTaggedInt32TaggedIntBAnd(TaggedInt, APType, bAndSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, pLsrTaggedInt32TaggedIntBAndOp, \ + bAndSlow, dest, aa, bb) + +#define pLsrTaggedInt32TaggedIntBOrOp(a, b) (a | b) +#define pLsrTaggedInt32MkTaggedIntBOrSlow(TName, TaggedInt, APType, apFromSInt32, apBOr) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, BOr, TaggedInt, APType, apFromSInt32, apBOr, \ + pLsrTaggedInt32TaggedIntBOrOp) +#define pLsrTaggedInt32TaggedIntBOr(TaggedInt, APType, bOrSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, pLsrTaggedInt32TaggedIntBOrOp, bOrSlow, \ + dest, aa, bb) + +/* Can overflow */ +#define pLsrTaggedInt32TaggedIntBShiftLOp(a, b) (assert(0))) +#define pLsrTaggedInt32MkTaggedIntBShiftLSlow(TName, TaggedInt, APType, apFromSInt32, apBShiftL) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlow(TName, BShiftL, TaggedInt, APType, apFromSInt32, apBShiftL, \ + pLsrTaggedInt32TaggedIntBShiftLOp) +#define pLsrTaggedInt32TaggedIntBShiftL(TaggedInt, APType, bShiftLSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBin(TaggedInt, APType, \ + pLsrTaggedInt32TaggedIntBShiftLOp, bShiftLSlow, dest, aa, bb) + +/* a >> b is only defined for 0 <= b <= 31 */ +#define pLsrTaggedInt32TaggedIntBShiftROp(a, b) ((b >= 0 && b < 32) ? (a >> b) : \ + ((b < 0 && b > -32) ? (a << -b) : \ + ((a < 0) ? 
-1 : 0))) +#define pLsrTaggedInt32MkTaggedIntBShiftRSlow(TName, TaggedInt, APType, apFromSInt32, apBShiftR) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlow(TName, BShiftR, TaggedInt, APType, apFromSInt32, apBShiftR, \ + pLsrTaggedInt32TaggedIntBShiftROp) +#define pLsrTaggedInt32TaggedIntBShiftR(TaggedInt, APType, bShiftRSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBin(TaggedInt, APType, \ + pLsrTaggedInt32TaggedIntBShiftROp, bShiftRSlow, dest, aa, bb) + +#define pLsrTaggedInt32TaggedIntBXorOp(a, b) (a ^ b) +#define pLsrTaggedInt32MkTaggedIntBXorSlow(TName, TaggedInt, APType, apFromSInt32, apBXor) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, BXor, TaggedInt, APType, apFromSInt32, apBXor, \ + pLsrTaggedInt32TaggedIntBXorOp) +#define pLsrTaggedInt32TaggedIntBXor(TaggedInt, APType, bXorSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, \ + pLsrTaggedInt32TaggedIntBXorOp, bXorSlow, dest, aa, bb) + +/**************************** Arithmetic *********************************/ + +#define pLsrTaggedInt32TaggedIntNeg(TaggedInt, APType, apNeg, apFromSInt32, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged((PlsrTaggedInt32) (a))) { \ + sint32 pLsrTaggedInt32TaggedIntNeg_tmp = \ + -((sint32) (pLsrTaggedInt32InternalUnTag(a))); \ + if ((pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32TaggedIntNeg_tmp))) \ + {(dest) = ((TaggedInt) pLsrTaggedInt32InternalTag(pLsrTaggedInt32TaggedIntNeg_tmp));} \ + else \ + {(dest) = ((TaggedInt) apFromSInt32(pLsrTaggedInt32TaggedIntNeg_tmp));} \ + } else { \ + (dest) = ((TaggedInt) (apNeg((APType) a))); \ + } \ + } while(0) + +#define pLsrTaggedInt32InternalArithSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b) \ + do { \ + APType pLsrTaggedInt32InternalArithSlowPathReturn_aA; \ + APType pLsrTaggedInt32InternalArithSlowPathReturn_bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + pLsrTaggedInt32InternalArithSlowPathReturn_aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + pLsrTaggedInt32InternalArithSlowPathReturn_aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + pLsrTaggedInt32InternalArithSlowPathReturn_bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + pLsrTaggedInt32InternalArithSlowPathReturn_bB = (APType)b; \ + } \ + return (TaggedInt) apBin(pLsrTaggedInt32InternalArithSlowPathReturn_aA, \ + pLsrTaggedInt32InternalArithSlowPathReturn_bB); \ + } while(0) + +/* Addition and subtraction */ +#define pLsrTaggedInt32MkTaggedIntAddSubSlow(TName, OpName, TaggedInt, APType, apFromSInt32, apBin, binOp) \ + static TaggedInt pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 tmp = \ + (((sint32) (pLsrTaggedInt32InternalUnTag(a))) binOp \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + if (pLsrTaggedInt32InternalSInt32InRange(tmp)) { \ + return (TaggedInt) pLsrTaggedInt32InternalTag(tmp); \ + } \ + else { \ + return (TaggedInt) apFromSInt32(tmp); \ + } \ + } else { \ + pLsrTaggedInt32InternalArithSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b); \ + } \ + } + +#define pLsrTaggedInt32MkTaggedIntAddSub(TaggedInt, APType, binOp, binOpSlow, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 pLsrTaggedInt32MkTaggedIntAddSub_tmp = \ + (((sint32) (pLsrTaggedInt32InternalUnTag(a))) binOp \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + if (pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32MkTaggedIntAddSub_tmp)) { \ + 
(dest) = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32MkTaggedIntAddSub_tmp)); \
+            continue; \
+        } \
+    } \
+    (dest) = binOpSlow(a, b); \
+    } while (0)
+
+#define pLsrTaggedInt32MkTaggedIntAddSlow(TName, TaggedInt, APType, apFromSInt32, apAdd) \
+    pLsrTaggedInt32MkTaggedIntAddSubSlow(TName, Add, TaggedInt, APType, apFromSInt32, apAdd, +)
+#define pLsrTaggedInt32TaggedIntAdd(TaggedInt, APType, addSlow, dest, aa, bb) \
+    pLsrTaggedInt32MkTaggedIntAddSub(TaggedInt, APType, +, addSlow, dest, aa, bb)
+
+#define pLsrTaggedInt32MkTaggedIntSubSlow(TName, TaggedInt, APType, apFromSInt32, apSub) \
+    pLsrTaggedInt32MkTaggedIntAddSubSlow(TName, Sub, TaggedInt, APType, apFromSInt32, apSub, -)
+#define pLsrTaggedInt32TaggedIntSub(TaggedInt, APType, subSlow, dest, aa, bb) \
+    pLsrTaggedInt32MkTaggedIntAddSub(TaggedInt, APType, -, subSlow, dest, aa, bb)
+
+
+/* Multiplication */
+#define pLsrTaggedInt32MkTaggedIntMulSlow(TName, TaggedInt, APType, apFromSInt32, apFromSInt64, apMul) \
+    static TaggedInt pLsrTaggedInt32##TName##MulSlow (TaggedInt a, TaggedInt b) { \
+        if(pLsrTaggedInt32InternalAreTagged(a, b)) { \
+            sint64 tmp = \
+                (((sint64)(sint32) (pLsrTaggedInt32InternalUnTag(a))) * \
+                 ((sint64)(sint32) (pLsrTaggedInt32InternalUnTag(b)))); \
+            if (pLsrTaggedInt32InternalSInt64InRange(tmp)) { \
+                return (TaggedInt) pLsrTaggedInt32InternalTag(tmp); \
+            } else { \
+                return (TaggedInt) apFromSInt64(tmp); \
+            } \
+        } else { \
+            pLsrTaggedInt32InternalArithSlowPathReturn(TaggedInt, APType, apFromSInt32, apMul, a, b); \
+        } \
+    }
+
+/* We use inline assembly here to work around an icl limitation. The compiler
+ * fails to turn add64((int64) a, b) into imul (a, b) when b is a small
+ * constant. Using assembly here probably eliminates some optimization
+ * opportunities.
+ * We could use a more customized asm sequence, but defining this via
+ * a saturating multiply has the advantage of playing relatively nicely with
+ * the various versions of tagging/untagging/range checking. Note that this
+ * is not a "proper" saturating multiply, since a negative result saturates
+ * to SINT32_MAX. We're really just using SINT32_MAX as an error result here.
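+ *
+ * For example (illustrative): with untagged operands a = b = 100000 the
+ * 32-bit product overflows, the overflow flag is set, and the destination
+ * is forced to SINT32_MAX; that value fails the
+ * pLsrTaggedInt32InternalSInt32InRange check in pLsrTaggedInt32TaggedIntMul,
+ * so the operation falls back to mulSlow, which redoes the multiply at 64
+ * bits and boxes the result via apFromSInt64 if it no longer fits the
+ * tagged range.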
+ */ +#ifdef PLSR_GNU_ASM +#ifdef __KNC__ +/* avoid cmovo for KNC */ +#define pLsrTaggedInt32InternalSInt32MulSaturating(D, A, B) \ + __asm__("imul %1, %0;jno 1f;mov %2, %0;1:" \ + : "=r"(D) \ + : "rm"(B), "rm"(SINT32_MAX), "0"(A)); +#else +/* D, A, and B should be variables here */ +/* This produces significantly better code */ +#define pLsrTaggedInt32InternalSInt32MulSaturating(D, A, B) \ + __asm__("imul %1, %0; cmovo %2, %0" \ + : "=r"(D) \ + : "rm"(B), "rm"(SINT32_MAX), "0"(A)); +#endif /* __KNC__ */ +#else /* PLSR_GNU_ASM */ +#ifdef PLSR_MS_ASM +#define pLsrTaggedInt32InternalSInt32MulSaturating(D, A, B) \ + __asm { __asm mov eax, A \ + __asm mov ecx, SINT32_MAX \ + __asm imul eax, B \ + __asm cmovo eax, ecx \ + __asm mov D, eax \ + }; +#else /* PLSR_MS_ASM */ +#error "Inline assembly unsupported" +#endif /* PLSR_MS_ASM */ +#endif /* PLSR_GNU_ASM */ + +#define pLsrTaggedInt32TaggedIntMul(TaggedInt, APType, mulSlow, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 pLsrTaggedInt32TaggedIntMul_a = pLsrTaggedInt32InternalUnTag(a); \ + sint32 pLsrTaggedInt32TaggedIntMul_b = pLsrTaggedInt32InternalUnTag(b); \ + sint32 pLsrTaggedInt32TaggedIntMul_tmp; \ + pLsrTaggedInt32InternalSInt32MulSaturating(pLsrTaggedInt32TaggedIntMul_tmp, \ + pLsrTaggedInt32TaggedIntMul_a, \ + pLsrTaggedInt32TaggedIntMul_b);\ + /* pLsrTaggedInt32TaggedIntMul_tmp = (((sint64) pLsrTaggedInt32TaggedIntMul_a) * */ \ + /* ((sint64) pLsrTaggedInt32TaggedIntMul_b));*/ \ + if (pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32TaggedIntMul_tmp)) { \ + (dest) = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32TaggedIntMul_tmp)); \ + continue; \ + } \ + } \ + (dest) = mulSlow(a, b); \ + } while(0) + +/* Division */ + +typedef enum { + PlsrTaggedInt32InternalDivKindT, + PlsrTaggedInt32InternalDivKindE, + PlsrTaggedInt32InternalDivKindF +} PlsrTaggedInt32InternalDivKind; + +#define pLsrTaggedInt32InternalMkTaggedIntDivModSlow(Kind, kind, TName, TaggedInt, APType, apFromSInt32, apDivMod) \ + static void pLsrTaggedInt32##TName##DivMod##Kind##Slow (TaggedInt* q, TaggedInt* r, TaggedInt a, TaggedInt b) \ + { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 ai = pLsrTaggedInt32InternalUnTag(a); \ + sint32 bi = pLsrTaggedInt32InternalUnTag(b); \ + sint32 qi = ai/bi; \ + sint32 ri = ai%bi; \ + if (kind == PlsrTaggedInt32InternalDivKindF) { \ + if (((ri > 0) - (ri < 0)) == -((bi > 0) - (bi < 0))) { \ + qi--; \ + ri += bi; \ + } \ + } else if (kind == PlsrTaggedInt32InternalDivKindE) { \ + if (ri < 0) { \ + if (bi > 0) { \ + qi--; \ + ri += bi; \ + } else { \ + qi++; \ + ri -= bi; \ + } \ + } \ + } \ + /* |ai % bi| < |bi| */ \ + *r = (TaggedInt) (pLsrTaggedInt32InternalTag(ri)); \ + /* plsrTaggedInt32InternalMin/-1 > plsrTaggedInt32InternalMax */ \ + if (pLsrTaggedInt32InternalSInt32InRange(qi)) { \ + *q = (TaggedInt) (pLsrTaggedInt32InternalTag(qi)); \ + } else { \ + *q = (TaggedInt) (apFromSInt32(qi)); \ + } \ + } else { \ + APType aA; \ + APType bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + bB = (APType)b; \ + } \ + apDivMod((APType*) q, (APType*) r, aA, bB); \ + } \ + } + + +#define pLsrTaggedInt32InternalTaggedIntDivMod(kind, TaggedInt, APType, apFromSInt32, divModSlow, q, r, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 
pLsrTaggedInt32InternalTaggedIntDivMod_ai = pLsrTaggedInt32InternalUnTag(a); \ + sint32 pLsrTaggedInt32InternalTaggedIntDivMod_bi = pLsrTaggedInt32InternalUnTag(b); \ + sint32 pLsrTaggedInt32InternalTaggedIntDivMod_qi = \ + pLsrTaggedInt32InternalTaggedIntDivMod_ai / pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + sint32 pLsrTaggedInt32InternalTaggedIntDivMod_ri = \ + pLsrTaggedInt32InternalTaggedIntDivMod_ai % pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + \ + if (kind == PlsrTaggedInt32InternalDivKindF) { \ + \ + if ( \ + ((pLsrTaggedInt32InternalTaggedIntDivMod_ri > 0) - \ + (pLsrTaggedInt32InternalTaggedIntDivMod_ri < 0)) \ + == \ + -((pLsrTaggedInt32InternalTaggedIntDivMod_bi > 0) - \ + (pLsrTaggedInt32InternalTaggedIntDivMod_bi < 0)) \ + ) \ + { \ + pLsrTaggedInt32InternalTaggedIntDivMod_qi--; \ + pLsrTaggedInt32InternalTaggedIntDivMod_ri += pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + } \ + } else if (kind == PlsrTaggedInt32InternalDivKindE) { \ + if (pLsrTaggedInt32InternalTaggedIntDivMod_ri < 0) { \ + if (pLsrTaggedInt32InternalTaggedIntDivMod_bi > 0) \ + { \ + pLsrTaggedInt32InternalTaggedIntDivMod_qi--; \ + pLsrTaggedInt32InternalTaggedIntDivMod_ri += pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + } \ + else { \ + pLsrTaggedInt32InternalTaggedIntDivMod_qi++; \ + pLsrTaggedInt32InternalTaggedIntDivMod_ri -= pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + } \ + } \ + } \ + \ + /* |ai % bi| < |bi| */ \ + r = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32InternalTaggedIntDivMod_ri)); \ + \ + /* plsrTaggedInt32InternalMin/-1 > plsrTaggedInt32InternalMax */ \ + if (pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32InternalTaggedIntDivMod_qi)) { \ + q = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32InternalTaggedIntDivMod_qi)); \ + } else { \ + q = (TaggedInt) (apFromSInt32(pLsrTaggedInt32InternalTaggedIntDivMod_qi)); \ + } \ + \ + } else { \ + TaggedInt PlsrTaggedInt32InternalTaggedIntDivMod_qq = NULL; \ + TaggedInt PlsrTaggedInt32InternalTaggedIntDivMod_rr = NULL; \ + divModSlow(&PlsrTaggedInt32InternalTaggedIntDivMod_qq, &PlsrTaggedInt32InternalTaggedIntDivMod_rr, a, b); \ + q = PlsrTaggedInt32InternalTaggedIntDivMod_qq; \ + r = PlsrTaggedInt32InternalTaggedIntDivMod_rr; \ + } \ + } while(0) + +#define pLsrTaggedInt32InternalTaggedIntDiv(kind, TaggedInt, APType, apFromSInt32, divModSlow, q, a, b) \ + do { \ + TaggedInt pLsrTaggedInt32InternalTaggedIntDiv_r = NULL; \ + pLsrTaggedInt32InternalTaggedIntDivMod(kind, TaggedInt, APType, apFromSInt32, divModSlow, q, \ + pLsrTaggedInt32InternalTaggedIntDiv_r, a, b); \ + } while (0) + +#define pLsrTaggedInt32InternalTaggedIntMod(kind, TaggedInt, APType, apFromSInt32, divModSlow, r, a, b) \ + do { \ + TaggedInt pLsrTaggedInt32InternalTaggedIntMod_q = NULL; \ + pLsrTaggedInt32InternalTaggedIntDivMod(kind, TaggedInt, APType, apFromSInt32, \ + divModSlow, pLsrTaggedInt32InternalTaggedIntMod_q, r, a, b); \ + } while (0) + +#define pLsrTaggedInt32MkTaggedIntDivModTSlow(TName, TaggedInt, APType, apFromSInt32, apDivModT) \ + pLsrTaggedInt32InternalMkTaggedIntDivModSlow(T, PlsrTaggedInt32InternalDivKindT, \ + TName, TaggedInt, APType, apFromSInt32, apDivModT) \ + +#define pLsrTaggedInt32TaggedIntDivModT(TaggedInt, APType, apFromSInt32, divModTSlow, q, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntDivMod(PlsrTaggedInt32InternalDivKindT, \ + TaggedInt, APType, apFromSInt32, divModTSlow, q, r, a, b) \ + +#define pLsrTaggedInt32TaggedIntDivT(TaggedInt, APType, apFromSInt32, divModTSlow, q, a, b) \ + 
pLsrTaggedInt32InternalTaggedIntDiv(PlsrTaggedInt32InternalDivKindT, \ + TaggedInt, APType, apFromSInt32, divModTSlow, q, a, b) \ + +#define pLsrTaggedInt32TaggedIntModT(TaggedInt, APType, apFromSInt32, divModTSlow, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntMod(PlsrTaggedInt32InternalDivKindT, \ + TaggedInt, APType, apFromSInt32, divModTSlow, r, a, b) \ + +#define pLsrTaggedInt32MkTaggedIntDivModESlow(TName, TaggedInt, APType, apFromSInt32, apDivModE) \ + pLsrTaggedInt32InternalMkTaggedIntDivModSlow(E, PlsrTaggedInt32InternalDivKindE, \ + TName, TaggedInt, APType, apFromSInt32, apDivModE) \ + +#define pLsrTaggedInt32TaggedIntDivModE(TaggedInt, APType, apFromSInt32, divModESlow, q, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntDivMod(PlsrTaggedInt32InternalDivKindE, \ + TaggedInt, APType, apFromSInt32, divModESlow, q, r, a, b) \ + +#define pLsrTaggedInt32TaggedIntDivE(TaggedInt, APType, apFromSInt32, divModESlow, q, a, b) \ + pLsrTaggedInt32InternalTaggedIntDiv(PlsrTaggedInt32InternalDivKindE, \ + TaggedInt, APType, apFromSInt32, divModESlow, q, a, b) \ + +#define pLsrTaggedInt32TaggedIntModE(TaggedInt, APType, apFromSInt32, divModESlow, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntMod(PlsrTaggedInt32InternalDivKindE, \ + TaggedInt, APType, apFromSInt32, divModESlow, r, a, b) \ + +#define pLsrTaggedInt32MkTaggedIntDivModFSlow(TName, TaggedInt, APType, apFromSInt32, apDivModF) \ + pLsrTaggedInt32InternalMkTaggedIntDivModSlow(F, PlsrTaggedInt32InternalDivKindF, \ + TName, TaggedInt, APType, apFromSInt32, apDivModF) \ + +#define pLsrTaggedInt32TaggedIntDivModF(TaggedInt, APType, apFromSInt32, divModFSlow, q, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntDivMod(PlsrTaggedInt32InternalDivKindF, \ + TaggedInt, APType, apFromSInt32, divModFSlow, q, r, a, b) \ + +#define pLsrTaggedInt32TaggedIntDivF(TaggedInt, APType, apFromSInt32, divModFSlow, q, a, b) \ + pLsrTaggedInt32InternalTaggedIntDiv(PlsrTaggedInt32InternalDivKindF, \ + TaggedInt, APType, apFromSInt32, divModFSlow, q, a, b) \ + +#define pLsrTaggedInt32TaggedIntModF(TaggedInt, APType, apFromSInt32, divModFSlow, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntMod(PlsrTaggedInt32InternalDivKindF, \ + TaggedInt, APType, apFromSInt32, divModFSlow, r, a, b) + +#define pLsrTaggedInt32MkTaggedIntGcd(TName, TaggedInt, APType, apFromSInt32, apLessOrEqual, sint32FromAP, apGcd) \ + static TaggedInt pLsrTaggedInt32##TName##Gcd(TaggedInt a, TaggedInt b) \ + { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 ai = pLsrTaggedInt32InternalUnTag(a); \ + sint32 bi = pLsrTaggedInt32InternalUnTag(b); \ + if (ai<0) ai = -ai; \ + if (bi<0) bi = -bi; \ + while (bi != 0) { \ + sint32 r = ai%bi; \ + ai = bi; \ + bi = r; \ + } \ + return (TaggedInt) (pLsrTaggedInt32InternalTag(ai)); \ + } else { \ + APType aA, bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + aA = apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + bB = apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + bB = (APType)b; \ + } \ + APType r = apGcd(aA, bB); \ + APType min = apFromSInt32(pLsrTaggedInt32InternalMin); \ + APType max = apFromSInt32(pLsrTaggedInt32InternalMax); \ + if (apLessOrEqual (min, r) && apLessOrEqual(r, max)) { \ + return (TaggedInt) pLsrTaggedInt32InternalTag(sint32FromAP(r)); \ + } else { \ + return (TaggedInt) r; \ + } \ + } \ + } + +/* Comparisons */ + +#define pLsrTaggedInt32MkTaggedIntCmpSlow(TName, OpName, TaggedInt, APType, apFromSInt32, apCmp, cmpOp) \ + static 
PlsrBoolean pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + return (PlsrBoolean) (((PlsrTaggedInt32) a) cmpOp \ + ((PlsrTaggedInt32) b)); \ + } else { \ + APType aA, bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + aA = apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + bB = apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + bB = (APType)b; \ + } \ + return apCmp(aA, bB); \ + } \ + } + +#define pLsrTaggedInt32MkTaggedIntCmp(cmpSlow, cmpOp, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged((a), (b))) { \ + (dest) = ((PlsrBoolean) (((PlsrTaggedInt32) (a)) cmpOp \ + ((PlsrTaggedInt32) (b)))); \ + } else { \ + (dest) = (cmpSlow((a), (b))); \ + } \ + } while(0) + +#define pLsrTaggedInt32MkTaggedIntEqSlow(TName, TaggedInt, APType, apFromSInt32, apEq) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Eq, TaggedInt, APType, apFromSInt32, apEq, ==) +#define pLsrTaggedInt32TaggedIntEq(eqSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(eqSlow, ==, dest, aa, bb) + + +#define pLsrTaggedInt32MkTaggedIntNeSlow(TName, TaggedInt, APType, apFromSInt32, apNe) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Ne, TaggedInt, APType, apFromSInt32, apNe, !=) +#define pLsrTaggedInt32TaggedIntNe(neSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(neSlow, !=, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntLtSlow(TName, TaggedInt, APType, apFromSInt32, apLt) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Lt, TaggedInt, APType, apFromSInt32, apLt, <) +#define pLsrTaggedInt32TaggedIntLt(ltSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(ltSlow, <, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntGtSlow(TName, TaggedInt, APType, apFromSInt32, apGt) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Gt, TaggedInt, APType, apFromSInt32, apGt, >) +#define pLsrTaggedInt32TaggedIntGt(gtSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(gtSlow, >, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntLeSlow(TName, TaggedInt, APType, apFromSInt32, apLe) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Le, TaggedInt, APType, apFromSInt32, apLe, <=) +#define pLsrTaggedInt32TaggedIntLe(leSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(leSlow, <=, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntGeSlow(TName, TaggedInt, APType, apFromSInt32, apGe) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Ge, TaggedInt, APType, apFromSInt32, apGe, >=) +#define pLsrTaggedInt32TaggedIntGe(geSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(geSlow, >=, dest, aa, bb) + +/************************ Hashing **********************************/ + +#define pLsrTaggedInt32TaggedIntHash(TaggedInt, APType, apHash, a) \ + ((pLsrTaggedInt32InternalIsTagged((PlsrTaggedInt32) (a))) ? 
\ + pLsrSInt32Hash(pLsrTaggedInt32InternalUnTag(a)) : \ + apHash((APType)a)) + +/************************ Testing **********************************/ +#ifdef PLSR_TAGGED_INT32_TEST + +/************************ Tagging **********************************/ +void pLsrTaggedInt32TestTag(sint32 x) { + PlsrTaggedInt32 r = pLsrTaggedInt32InternalTag(x); + printf("Tagged version of %d is %d\n", x, r); +} + +/************************ UnTagging ********************************/ +void pLsrTaggedInt32TestUnTag(PlsrTaggedInt32 x) { + sint32 r = (sint32) pLsrTaggedInt32InternalUnTag(x); + printf("UnTagged version of %d is %d\n", x, r); +} + +/********************** Tag checking *******************************/ +void pLsrTaggedInt32TestIsTagged(PlsrTaggedInt32 x) { + if (pLsrTaggedInt32InternalIsTagged(x)) { + printf("%d is tagged", x); + } else { + printf("%d is not tagged", x); + } +} + +/***************** Two operand tag checking ************************/ +void pLsrTaggedInt32TestAreTagged(PlsrTaggedInt32 x, PlsrTaggedInt32 y) { + if (pLsrTaggedInt32InternalAreTagged(x, y)) { + printf("%d and %d are tagged", x, y); + } else { + printf("%d and/or %d is not tagged", x, y); + } +} + + +/********************* Range checking *****************************/ +void pLsrTaggedInt32TestSIntInRange(sint32 x, sint64 y) { + if (pLsrTaggedInt32InternalSInt32InRange(x)) { + printf("in range"); + } else { + printf("not in range"); + } + if (pLsrTaggedInt32InternalSInt64InRange(y)) { + printf("in range"); + } else { + printf("not in range"); + } +} + +void pLsrTaggedInt32TestUIntInRange(uint32 x, uint64 y) { + if (pLsrTaggedInt32InternalUInt32InRange(x)) { + printf("in range"); + } else { + printf("not in range"); + } + if (pLsrTaggedInt32InternalUInt64InRange(y)) { + printf("in range"); + } else { + printf("not in range"); + } +} +#endif /* PLSR_TAGGED_INT32_TEST */ + +#endif /* _PLSR_TAGGED_INT32_ */ + diff --git a/runtime/include/hrc/plsr-thunk.h b/runtime/include/hrc/plsr-thunk.h new file mode 100644 index 0000000..c1067c7 --- /dev/null +++ b/runtime/include/hrc/plsr-thunk.h @@ -0,0 +1,10 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Unified Futures Thunk Implementation */ + +#ifdef PLSR_LIGHTWEIGHT_THUNKS + #include "hrc/plsr-lightweight-thunk.h" +#else + #include "hrc/plsr-ptk-thunk.h" +#endif diff --git a/runtime/include/hrc/plsr-util.h b/runtime/include/hrc/plsr-util.h new file mode 100644 index 0000000..ad10e3e --- /dev/null +++ b/runtime/include/hrc/plsr-util.h @@ -0,0 +1,182 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Utilities */ + +#ifndef _PLSR_UTIL_H_ +#define _PLSR_UTIL_H_ + +#include +#include +#include "hrc/pil.h" + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#ifdef PLSR_LINUX +#include +#endif + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif + +/* This is the type of Mil booleans. Mil booleans + * are integers in {0, 1}. Use toPlsrBoolean to + * convert C booleans (i.e. zero/non-zero values) + * to P booleans. + */ +typedef uintp PlsrBoolean; +#define toPlsrBoolean(e) ((PlsrBoolean)(!!(e))) + +/* Absolute value. 
According the c99 spec, converting from a signed to an + * unsigned type of the same size: + * 1) preserves the value if the value is representable (positive) + * 2) is the result of repeatedly adding one more than the maximum value of + * the new type to the old value + * So, assuming a is a negative sint32, then + * ((uint32) a) = (UINT32_MAX + 1) + a (using non-modular arithmetic). This is + * guaranteed to be representable, since |SINT32_MIN| < UINT32_MAX + * So using true arithmetic, we have: + * UINT32_MAX - ((uint32) a) + 1 = UINT32_MAX - (UINT32_MAX + 1 + a) + 1 + * = UINT32_MAX - UINT32_MAX -1 -a + 1 + * = -a + * But note that since SINT32_MIN <= a < 0, + * 0 < ((uint32) a) = (UINT32_MAX + 1) + a <= UINT32_MAX must be true, so + * UINT32_MAX - ((uint32) a) cannot underflow, and must be less than + * UINT32_MAX (since ((uint32) a) must be positive. + * Therefore UINT32_MAX - ((uint32) a) + 1 can be computed precisely in + * 32 bit modular arithmetic. + */ + +#define pLsrUInt32FromNegativeSInt32Abs(a) (UINT32_MAX - ((uint32) a) + 1) +#define pLsrUInt64FromNegativeSInt64Abs(a) (UINT64_MAX - ((uint64) a) + 1) +#define pLsrUIntpFromNegativeSIntpAbs(a) (UINTP_MAX - ((uintp) a) + 1) + +static uint32 pLsrAbs32(sint32 a) { + if (a < 0) { + return UINT32_MAX - ((uint32) a) + 1; + } else { + return a; + } +} + +static uint64 pLsrAbs64(sint64 a) { + if (a < 0) { + return UINT64_MAX - ((uint64) a) + 1; + } else { + return a; + } +} + +static uintp pLsrAbsP(sintp a) { + if (a < 0) { + return UINTP_MAX - ((uintp) a) + 1; + } else { + return a; + } +} + +#ifndef PRT_NORETURN +#ifdef __GNUC__ +#define PRT_NORETURN __attribute__((noreturn)) +#else +#define PRT_NORETURN __declspec(noreturn) +#endif +#endif + +static void pLsrExit(int status) +{ +#ifdef P_USE_PARALLEL_FUTURES + ptkFutureSystemShutdown(); +#endif +#ifdef P_USE_AGC + pgc_kill(); +#endif /* P_USE_AGC */ +#ifdef P_USE_PILLAR + prtExit(status); +#else /* !P_USE_AGC */ + exit(status); +#endif /* !P_USE_AGC */ +} + +#define pLsrHalt(status) \ + do { \ + pLsrExit(status); \ + return 0; \ + } while (0) + +#define pLsrHaltV(status) \ + do { \ + pLsrExit(status); \ + return; \ + } while (0) + +static PRT_NORETURN void pLsrHaltNegOne() +{ + fflush(stdout); + assert(0); + pLsrExit(-1); +} + +#define pLsrRuntimeError(msg) \ + (printf("Runtime error: %s at line %d in file %s\n", msg, __LINE__, __FILE__), \ + pLsrHaltNegOne(), \ + 0) + +static PRT_NORETURN void pLsrRuntimeError_(char * msg) { + pLsrRuntimeError(msg); +} + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static PRT_NORETURN void pLsrRuntimeErrorUnmanaged(char * msg) { + printf("Runtime error: %s at line %d in file %s\n", msg, __LINE__, __FILE__), \ + fflush(stdout); + assert(0); + exit(-1); +} + +# undef to +#pragma pillar_managed(on) +#endif + +void pLsrDisableErrorBox(); +uint64 pLsrEventsTimeStamp(); + +#if (defined(PLSR_ENABLE_EVENTS)) && (defined(_WIN32)) + +static uint64 pLsrEventsStartTime; +static FILE* pLsrEventsLogFile; + +void pLsrEventsInit() +{ + pLsrEventsStartTime = pLsrEventsTimeStamp(); + pLsrEventsLogFile = fopen("events.log", "wt"); +} + +#define pLsrEventsTransition(action, state) \ + fprintf(pLsrEventsLogFile, "%d,%p,%s,%s\n", \ + (uint32) (pLsrEventsTimeStamp() - pLsrEventsStartTime), \ + prtGetTaskHandle(), \ + action, state); \ + +void pLsrEventsShutdown() +{ + fclose(pLsrEventsLogFile); +} + +#else /* PLSR_ENABLE_EVENTS */ + +#define pLsrEventsInit() +#define pLsrEventsTransition(action, state) +#define pLsrEventsShutdown() + +#endif 
/* PLSR_ENABLE_EVENTS */ + +#endif /* !_PLSR_UTIL_H_ */ diff --git a/runtime/include/hrc/plsr-value.h b/runtime/include/hrc/plsr-value.h new file mode 100755 index 0000000..f379f4a --- /dev/null +++ b/runtime/include/hrc/plsr-value.h @@ -0,0 +1,941 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Various P Values */ + +#ifndef _PLSR_VALUE_H_ +#define _PLSR_VALUE_H_ + + +/********************************************************************** + * Option Sets and Types + */ + +/* Option sets based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPSetVTable_, VSetTag, "*option set*", pLsrPSetPadding); +#define pLsrPSetVTable (&pLsrPSetVTable_) + +/* Generated code defines: + * pLsrPSetSize + * pLsrPSetOffset + * pLsrPTypeSize + */ + +static PlsrPAny pLsrPSetNew(PlsrPAny v) +{ + PlsrPAny s; + noyield { + pLsrAlloc(PlsrPAny, s, pLsrPSetVTable, pLsrPSetSize); + pLsrWriteBarrierRefOptBase + (s, pLsrObjectField(s, pLsrPSetOffset, PlsrPAny*), v); + } + return s; +} + +#define pLsrPSetGet(s) (pLsrObjectField(s, pLsrPSetOffset, PlsrPAny*)) +#define pLsrPSetIsEmpty(s) ((PlsrBoolean)(!pLsrPSetGet(s))) + +/* Types based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPTypeVTable_, VTypeTag, "*type*", pLsrPTypePadding); +#define pLsrPTypeVTable (&pLsrPTypeVTable_) + +/********************************************************************** + * Rationals + */ + +/* Boxed rationals based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. 
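+ * Depending on P_PRAT_IS_SINTP, the boxed payload is either a raw sintp or
+ * a PlsrRational reference; the conversion macros below dispatch on that
+ * choice, and pLsrPRatContainsRef records whether the payload word must be
+ * scanned as a reference.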
+ */ +pLsrVTableStatic(pLsrPRatVTable_, VRatTag, "*rat*", pLsrPRatPadding); +#define pLsrPRatVTable (&pLsrPRatVTable_) + +/* Generated code defines: + * pLsrPRatSize + * pLsrPRatOffset + */ + +#ifdef P_PRAT_IS_SINTP + +#define pLsrPRatContainsRef 0 + +#define pLsrPRatUnimp(s, dest) \ + do { \ + pLsrRuntimeError("PRat unimplemented"); \ + (dest) = 0; \ + } while (0) + +#define pLsrSIntpFromPRat(pLsrSIntpFromPRatDest, pLsrSIntpFromPRatArg1) \ + do { \ + (pLsrSIntpFromPRatDest) = \ + ((sintp)pLsrObjectField(pLsrSIntpFromPRatArg1, pLsrPRatOffset, sintp*)); \ + } while (0) + +#define pLsrPRatFromRational(pLsrPRatFromRationalDest, pLsrPRatFromRationalArg1) \ + pLsrPRatUnimp("pLsrPRatFromRational", pLsrPRatFromRationalDest) + +#define pLsrRationalFromPRat(pLsrRationalFromPRatDest, pLsrRationalFromPRatArg1)\ + do { \ + sintp pLsrRationalFromPRatI; \ + pLsrSIntpFromPRat(pLsrRationalFromPRatI, pLsrRationalFromPRatArg1); \ + pLsrRationalFromSIntp(pLsrRationalFromPRatDest, pLsrRationalFromPRatI); \ + } while (0) + +/* Conversions */ + +#define pLsrPRatFromUInt32(pLsrPRatFromUInt32Dest, pLsrPRatFromUInt32Arg1) \ + do { \ + noyield { \ + pLsrAlloc(PlsrPAny, (pLsrPRatFromUInt32Dest), pLsrPRatVTable, pLsrPRatSize); \ + pLsrObjectField(pLsrPRatFromUInt32Dest, pLsrPRatOffset, uintp*) = ((uintp) pLsrPRatFromUInt32Arg1); \ + } \ + } while (0) + +#define pLsrPRatFromInteger(pLsrPRatFromIntegerDest, pLsrPRatFromIntegerArg1) \ + pLsrPRatUnimp("pLsrPRatFromInteger", pLsrPRatFromIntegerDest) + +#define pLsrUInt32FromPRat(pLsrUInt32FromPRatDest, pLsrUInt32FromPRatArg1) \ + do { \ + sintp pLsrUInt32FromPRatI; \ + pLsrSIntpFromPRat(pLsrUInt32FromPRatI, pLsrUInt32FromPRatArg1); \ + if (pLsrUInt32FromPRatI < 0 || pLsrUInt32FromPRatI > UINT32_MAX) { \ + pLsrRuntimeError_("pLsrUInt32FromPRat: not in range"); \ + } \ + pLsrUInt32FromPRatDest = (uint32) pLsrUInt32FromPRatI; \ + } while (0) + +#define pLsrUIntpFromPRat(pLsrUIntpFromPRatDest, pLsrUIntpFromPRatArg1) \ + pLsrPRatUnimp("pLsrUIntpFromPRat", pLsrUIntpFromPRatDest) + + +#else /* !P_PRAT_IS_SINTP */ + +#define pLsrPRatContainsRef 1 + +#define pLsrPRatFromRational(pLsrPRatFromRationalDest, pLsrPRatFromRationalArg1) \ + do { \ + noyield { \ + pLsrAlloc(PlsrPAny, (pLsrPRatFromRationalDest), pLsrPRatVTable, pLsrPRatSize); \ + pLsrWriteBarrierRefOptBase \ + (pLsrPRatFromRationalDest, \ + pLsrObjectField(pLsrPRatFromRationalDest, pLsrPRatOffset, PlsrRational*), \ + pLsrPRatFromRationalArg1); \ + } \ + } while (0) + +#define pLsrRationalFromPRat(pLsrRationalFromPRatDest, pLsrRationalFromPRatArg1)\ + do { \ + (pLsrRationalFromPRatDest) = \ + ((PlsrRational)pLsrObjectField(pLsrRationalFromPRatArg1, pLsrPRatOffset, PlsrRational*)); \ + } while (0) + +/* Conversions */ + +#define pLsrPRatFromUInt32(pLsrPRatFromUInt32Dest, pLsrPRatFromUInt32Arg1) \ + do { \ + PlsrRational pLsrPRatFromUInt32R; \ + pLsrRationalFromUInt32(pLsrPRatFromUInt32R, pLsrPRatFromUInt32Arg1); \ + pLsrPRatFromRational(pLsrPRatFromUInt32Dest, pLsrPRatFromUInt32R); \ + } while (0) + +#define pLsrPRatFromInteger(pLsrPRatFromIntegerDest, pLsrPRatFromIntegerArg1) \ + do { \ + PlsrRational pLsrPRatFromIntegerR; \ + pLsrRationalFromInteger(pLsrPRatFromIntegerR, pLsrPRatFromIntegerArg1); \ + pLsrPRatFromRational(pLsrPRatFromIntegerDest, pLsrPRatFromIntegerR); \ + } while (0) + +#define pLsrUInt32FromPRat(pLsrUInt32FromPRatDest, pLsrUInt32FromPRatArg1)\ + do { \ + PlsrRational pLsrUInt32FromPRatR; \ + pLsrRationalFromPRat(pLsrUInt32FromPRatR, pLsrUInt32FromPRatArg1); \ + 
pLsrUInt32FromRational(pLsrUInt32FromPRatDest, pLsrUInt32FromPRatR); \ + } while (0) + +#define pLsrUIntpFromPRat(pLsrUIntpFromPRatDest, pLsrUIntpFromPRatArg1) \ + do { \ + PlsrRational pLsrUIntpFromPRatR; \ + pLsrRationalFromPRat(pLsrUIntpFromPRatR, pLsrUIntpFromPRatArg1); \ + pLsrUIntpFromRational(pLsrUIntpFromPRatDest, pLsrUIntpFromPRatR); \ + } while (0) + +#endif /* P_PRAT_IS_SINTP */ +/********************************************************************** + * Names + */ + +pLsrVTableStatic(pLsrPNameVTable_, VNameTag, "*name*", 0); +#define pLsrPNameVTable (&pLsrPNameVTable_) + +/* PlsrObjectU must be a prefix of this structure */ +/* This structure must always be initialised immediately after allocation + * without an intervening GC. If this is not the case, the mutability + * property of the vtable must be changed. + */ +typedef struct { + PlsrVTable vtable; + uintp tag; + uintp hash; + uintp strLen; + char str[]; +} PlsrPNameU; + +#define pLsrPNameStatic(v, n, h, sl, ...) \ + static PlsrPNameU v = { \ + .vtable = pLsrPNameVTable, \ + .tag = (n), \ + .hash = (h), \ + .strLen = (sl), \ + .str = { __VA_ARGS__ } \ + } + +#define pLsrPNameGetTag(n) (((PlsrPNameU*)(n))->tag) +#define pLsrPNameGetHash(n) (((PlsrPNameU*)(n))->hash) +#define pLsrPNameGetStringLen(n) (((PlsrPNameU*)(n))->strLen) +#define pLsrPNameGetString(n) (((PlsrPNameU*)(n))->str) + +/********************************************************************** + * Floating Point + */ + +/* Generated code defines: + * pLsrPFloatOffset + * pLsrPFloatSize + * pLsrPDoubleOffset + * pLsrPDoubleSize + */ + +/* Boxed floats based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPFloatVTable_, VFloatTag, "*float*", pLsrPFloatPadding); +#define pLsrPFloatVTable (&pLsrPFloatVTable_) + +/* Boxed doubles based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPDoubleVTable_, VDoubleTag, "*double*", pLsrPDoublePadding); +#define pLsrPDoubleVTable (&pLsrPDoubleVTable_) + +#define pLsrPFloatGet(v) (pLsrObjectField((v), pLsrPFloatOffset, float*)) +#define pLsrPDoubleGet(v) (pLsrObjectField((v), pLsrPDoubleOffset, double*)) + + +/********************************************************************** + * Indexes + */ + +typedef struct { + PlsrPAny name; + uintp offset; +} PlsrIdxE; + +/* PlsrObjectU must be a prefix of this structure */ +typedef struct { + PlsrVTable vtable; + uintp len; + PlsrIdxE elts[]; +} PlsrIdxU; + +#define pLsrIdxPadding (sizeof(PlsrIdxU) - (sizeof(PlsrVTable) + sizeof(uintp))) +pLsrVTableStatic(pLsrIdxVTable_, VNoneTag, "*index*", pLsrIdxPadding); +#define pLsrIdxVTable (&pLsrIdxVTable_) + +typedef PlsrIdxU* PlsrIdxB; + +#define pLsrIdxEltStatic {0, 0} + +#define pLsrIdxEmpty(v) \ + static PlsrIdxU v = \ + { .vtable = pLsrIdxVTable, .len = 0, .elts = { pLsrIdxEltStatic } } + +#define pLsrIdxStatic(v, dlen, ... 
) \
+    static PlsrIdxU v = \
+    { .vtable = pLsrIdxVTable, .len = (dlen), .elts = { __VA_ARGS__ } }
+
+static PlsrIdxB pLsrIdxNew(uintp len)
+{
+    uintp dlen = 1;
+    uintp i;
+    PlsrIdxB dct;
+    uintp size;
+
+    /* Choose a power of two at least 1.5 times as large as len*/
+    while(2*dlen < 3*len) dlen *= 2;
+    size = sizeof(PlsrIdxU) + dlen*sizeof(PlsrIdxE);
+    pLsrAlloc(PlsrIdxB, dct, pLsrIdxVTable, size);
+    dct->len = dlen;
+    for(i=0;i<dlen;i++) dct->elts[i].name=0;
+    return dct;
+}
+
+static void pLsrIdxSet(PlsrIdxB idx, PlsrPAny n, uintp e)
+{
+    uintp mask = idx->len-1;
+    PlsrIdxE* elts = idx->elts;
+    uintp j;
+    uintp hash;
+    if (!n) pLsrRuntimeError("Name out of range");
+    hash = pLsrPNameGetHash(n);
+    for(j = hash & mask; elts[j].name; j = (j+1) & mask);
+    pLsrWriteBarrierRefBase(idx, elts[j].name, n);
+    elts[j].offset = e;
+}
+
+static uintp pLsrIdxGet(PlsrIdxB idx, PlsrPAny n)
+{
+    uintp mask = idx->len-1;
+    PlsrIdxE* elts = idx->elts;
+    uintp hash = pLsrPNameGetHash(n);
+    uintp j;
+    for(j = hash & mask; elts[j].name != n; j = (j+1) & mask);
+    return elts[j].offset;
+}
+
+/**********************************************************************
+ * Tuples
+ */
+
+/* Generated code defines:
+ * pLsrPArrayOLenOffset
+ * pLsrPArrayOEltOffset
+ * pLsrPArrayOBaseSize
+ * pLsrPArrayILenOffset
+ * pLsrPArrayIEltOffset
+ * pLsrPArrayIIdxOffset
+ * pLsrPArrayIBaseSize
+ */
+
+/* Base tuples */
+
+#define pLsrTupleStatic(v, t, ...) static t v = { __VA_ARGS__ }
+
+#define pLsrTupleStaticV(v, t, vtb, sz, ...) \
+    static t v = { .vtable = vtb, .f_0 = sz, .extras = { __VA_ARGS__ } }
+
+#define pLsrTupleNewFixed(dest, vt, sz, algn) \
+    pLsrAllocAligned(PlsrObjectB, (dest), (vt), (sz), algn)
+#define pLsrTupleNewVariable(dest, vt, fsz, esz, c, algn) \
+    pLsrAllocAligned(PlsrObjectB, (dest), (vt), (fsz)+(esz)*(c), algn)
+
+#define pLsrTupleNewPinnedFixed(dest, vt, sz, algn) \
+    pLsrAllocAlignedPinned(PlsrObjectB, (dest), (vt), (sz), algn)
+#define pLsrTupleNewPinnedVariable(dest, vt, fsz, esz, c, algn) \
+    pLsrAllocAlignedPinned(PlsrObjectB, (dest), (vt), (fsz)+(esz)*(c), algn)
+
+/* Ordinal arrays */
+
+/* This vtable should only be used with arrays of refs
+ */
+pLsrVTableStatic(pLsrPArrayOVTable_, VArrayTag, "*ordinal array*", pLsrPArrayOPadding);
+#define pLsrPArrayOVTable (&pLsrPArrayOVTable_)
+
+/* This function should only be used for arrays of references */
+static PlsrPAny pLsrPArrayONew(uintp c)
+{
+    PlsrPAny res;
+    pLsrTupleNewVariable(res, pLsrPArrayOVTable, pLsrPArrayOBaseSize,
+                         sizeof(PlsrObjectB), c, sizeof(PlsrObjectB));
+    pLsrObjectField(res, pLsrPArrayOLenOffset, uintp*) = c;
+    return res;
+}
+
+#define pLsrPArrayOGetLen(arr) \
+    (pLsrObjectField((arr), pLsrPArrayOLenOffset, uintp*))
+
+/* This macro is for arrays of references */
+#define pLsrPArrayOElt(arr, i) \
+    (pLsrObjectField((arr), pLsrPArrayOEltOffset+sizeof(PlsrObjectB)*(i), \
+                     PlsrObjectB*))
+
+static void pLsrPArrayOSet(PlsrPAny arr, PlsrPAny i, PlsrObjectB e)
+{
+    uintp ui;
+    pLsrUIntpFromPRat(ui, i);
+    pLsrWriteBarrierRefBase(arr, pLsrPArrayOElt(arr, ui), e);
+}
+
+static PlsrObjectB pLsrPArrayOGet(PlsrPAny arr, PlsrPAny i)
+{
+    uintp ui;
+    pLsrUIntpFromPRat(ui, i);
+    return pLsrPArrayOElt(arr, ui);
+}
+
+#define pLsrPArrayOEltEval(dest, a, i) pLsrObjectEval((dest), pLsrPArrayOElt((a), (i)))
+
+/* Indexed arrays */
+
+/* This vtable should not be used with accurate GC, as it does not
+ * determine its GC info unambiguously. However, the runtime uses
+ * it for arrays with all reference elements.
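+ * Lookups into such arrays go through the PlsrIdxU table above: pLsrIdxSet
+ * and pLsrIdxGet use open addressing with linear probing on the name hash,
+ * and pLsrIdxNew sizes the table to a power of two at least 1.5 times the
+ * element count, so a probe always terminates at a free or matching slot.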
+ */ +pLsrVTableStatic(pLsrPArrayIVTable_, VArrayIdxTag, "*indexed array*", pLsrPArrayIPadding); +#define pLsrPArrayIVTable (&pLsrPArrayIVTable_) + +/* This function should only be used for arrays of references */ +static PlsrPAny pLsrPArrayINew(uintp c, PlsrIdxB idx) +{ + PlsrPAny res; + noyield { + pLsrTupleNewVariable(res, + pLsrPArrayIVTable, + pLsrPArrayIBaseSize, + sizeof(PlsrObjectB), + c, + sizeof(PlsrObjectB)); + pLsrObjectField(res, pLsrPArrayILenOffset, uintp*) = c; + pLsrWriteBarrierRefOptBase(res, + pLsrObjectField(res, pLsrPArrayIIdxOffset, + PlsrIdxB*), + idx); + } + return res; +} + +/* This macro is for arrays of references */ +#define pLsrPArrayIElt(arr, i) \ + (pLsrObjectField((arr), pLsrPArrayIEltOffset+sizeof(PlsrObjectB)*(i), \ + PlsrObjectB*)) + +#define pLsrPArrayIGetLen(arr) \ + (pLsrObjectField((arr), pLsrPArrayILenOffset, uintp*)) + +#define pLsrPArrayIGetIdx(arr) \ + (pLsrObjectField((arr), pLsrPArrayIIdxOffset, PlsrIdxB*)) + +static PlsrObjectB pLsrPArrayIGet(PlsrPAny arr, PlsrPAny n) +{ + uintp i = pLsrIdxGet(pLsrPArrayIGetIdx(arr), n); + return pLsrPArrayIElt(arr, i); +} + +/* Utilities */ + +static PlsrBoolean pLsrPArrayHasIdx(PlsrPAny arr) +{ + return pLsrVTableGetTag(pLsrObjectGetVTable(arr)) == VArrayIdxTag; +} + + +/********************************************************************** + * P Functions + */ + +/* When using accurate GC, this vtable should be used only with closures + * with no free variables. + */ +pLsrVTableStatic(pLsrClosureVTable_, VFunctionTag, "*function*", pLsrPFunctionPadding); +#define pLsrClosureVTable (&pLsrClosureVTable_) + +/* Generated constants: + * pLsrPFunctionCodeOffset + * pLsrPFunctionSize (* for no free variables *) + */ + +typedef PlsrObjectB (*PlsrPFunctionCodeRef)(PlsrObjectB); + +#define pLsrPFunctionApplyRef(clos, arg) \ + (pLsrObjectField(clos, pLsrPFunctionCodeOffset, PlsrPFunctionCodeRef*) \ + (arg)) + +/********************************************************************** + * Sums + */ + +/* All this code is for sums over references only */ + +/* Objects created from this vtable must always be initialised immediately + * after allocation + * without an intervening GC. If this is not the case, the mutability + * property of the vtable must be changed. 
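+ * A sum cell pairs a tag (a name) with a value: pLsrPSumNew below fills
+ * both fields inside a noyield block using write barriers, and
+ * pLsrPSumNewT additionally wraps the value in a value thunk first, so the
+ * stored payload is always a thunk reference.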
+ */ +pLsrVTableStatic(pLsrPSumVTable_, VSumTag, "*sum*", pLsrPSumPadding); +#define pLsrPSumVTable (&pLsrPSumVTable_) + +/* Generated constants: + * pLsrPSumTagOffset + * pLsrPSumValOffset + * pLsrPSumSize + */ + +static PlsrPAny pLsrPSumNew(PlsrPAny tag, PlsrObjectB value) +{ + PlsrPAny v; + noyield { + pLsrAlloc(PlsrPAny, v, pLsrPSumVTable, pLsrPSumSize); + pLsrWriteBarrierRefOptBase + (v, pLsrObjectField(v, pLsrPSumTagOffset, PlsrPAny*), tag); + pLsrWriteBarrierRefOptBase + (v, pLsrObjectField(v, pLsrPSumValOffset, PlsrObjectB*), value); + } + return v; +} + +static PlsrPAny pLsrPSumNewT(PlsrPAny tag, PlsrPAny value) +{ + PlsrThunkBRef valthnk = pLsrThunkNewValRef((PlsrRef) value); + return pLsrPSumNew(tag, (PlsrObjectB)valthnk); +} + +#define pLsrPSumGetTag(s) (pLsrObjectField(s, pLsrPSumTagOffset, PlsrPAny*)) +#define pLsrPSumGetVal(s) (pLsrObjectField(s, pLsrPSumValOffset, PlsrObjectB*)) +#define pLsrPSumGetValEval(dest, s) pLsrObjectEval((dest), pLsrPSumGetVal(s)) + + +/********************************************************************** + * Strings + */ + +static PlsrPAny pLsrCoreCharOrd = 0; + +static void pLsrRegisterCoreCharOrd(PlsrPAny n) +{ + assert(!pLsrCoreCharOrd); // Should only be registered once + assert(n); + pLsrCoreCharOrd = n; +} + +static PlsrBoolean pLsrIsNiceChar(int c) +{ + return c>=32 && c<=126; +} + +static PlsrBoolean pLsrValueToChar(char* c, PlsrPAny v) +{ + assert(pLsrCoreCharOrd); + if (pLsrVTableGetTag(pLsrObjectGetVTable(v)) == VSumTag) { + if(pLsrPSumGetTag(v) == pLsrCoreCharOrd) { + PlsrPAny cv; + pLsrPSumGetValEval(cv, v); + if (cv) { + char i; + pLsrUInt32FromPRat(i, cv); + if (pLsrIsNiceChar(i)) { + *c = i; + return 1; + } + } + } + } + return 0; +} + +static char* pLsrValueToCStringMaybe(PlsrPAny v) +{ + if (!pLsrCoreCharOrd || + pLsrVTableGetTag(pLsrObjectGetVTable(v)) != VArrayTag || + pLsrPArrayHasIdx(v) || + pLsrPArrayOGetLen(v) == 0) + return 0; + { + uintp c = pLsrPArrayOGetLen(v); + char* s = (char*)pLsrAllocC((c+1) * sizeof(char)); + uintp i; + for(i=0; i0) printf(", "); + pLsrValuePrint(pLsrPArrayOElt(v, i)); + } + printf("}"); + } + break; + } + case VArrayIdxTag: { + uintp alen = pLsrPArrayIGetLen(v); + /* Don't stack allocate until Pillar compiler alloca bug is fixed. 
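+ * names[] is C-heap scratch that inverts the index table, mapping each
+ * element offset back to its field name so the fields print in element
+ * order; it is released with pLsrFreeC once printing is done.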
*/ + PlsrPAny *names = (PlsrPAny*) pLsrAllocC(alen*sizeof(PlsrPAny)); + { + uintp i; + PlsrIdxB idx = pLsrPArrayIGetIdx(v); + uintp dlen = idx->len; + + for(i=0; i < dlen; i++) { + if (idx->elts[i].name) { + names[idx->elts[i].offset] = idx->elts[i].name; + } + } + } + { + uintp i; + bool first = 1; + + printf("{"); + for(i=0; i < alen; i++) { + if (!first) printf(", "); + else first=0; + pLsrNamePrint(names[i]); + printf(" => "); + pLsrValuePrint(pLsrPArrayIElt(v, i)); + } + } + printf("}"); + pLsrFreeC(names); + break; + } + case VSetTag: + printf("SET{"); + if (!pLsrPSetIsEmpty(v)) + pLsrValuePrint(pLsrPSetGet(v)); + printf("}"); + break; + case VTypeTag: + printf("TypePH"); + break; + case VSumTag: + printf("sum("); + pLsrNamePrint(pLsrPSumGetTag(v)); + printf(") "); + pLsrValuePrint(pLsrPSumGetVal(v)); + break; + case VFunctionTag: + printf("A Closure"); + break; + case VThunkTag: + pLsrThunkPrintRef((PlsrThunkBRef)v); + break; + default: + printf("Something else: possibly missed a case in ValuePrint"); + break; + } +}; + +/********************************************************************** + * Register Runtime VTables + */ + +#define pLsrPNameSize (sizeof(PlsrPNameU)+1) // plus one for zero byte +#define pLsrPNameLenOff ((unsigned)(&((PlsrPNameU*)0)->strLen)) +#define pLsrIdxSize (sizeof(PlsrIdxU)) +#define pLsrIdxLenOff ((unsigned)(&((PlsrIdxU*)0)->len)) +#define pLsrIdxEltSize (sizeof(PlsrIdxE)) +#define pLsrThunkCutSize () + +/* Note: for PgcIsRef arrays which are all zeroes, we rely on + * c99 static initialization semantics to initialize the fields to zero. + * By convention, we write this as {0, } . + */ +static void pLsrValueRegisterVTables() +{ + static PgcIsRef pLsrPRatRefs[pLsrPRatSize/P_WORD_SIZE] = { 0, pLsrPRatContainsRef }; + static PgcIsRef pLsrPNameRefs[pLsrPNameSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPFloatRefs[pLsrPFloatSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPDoubleRefs[pLsrPDoubleSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPSumRefs[] = { 0, 1, 1 }; + static PgcIsRef pLsrPArrayORefs[pLsrPArrayOBaseSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPArrayIRefs[] = { 0, 0, 1 }; + static PgcIsRef pLsrPFunctionRefs[pLsrPFunctionSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPSetRefs[] = { 0, 1 }; + static PgcIsRef pLsrPTypeRefs[pLsrPTypeSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrIdxRefs[pLsrIdxSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrIdxWRefs[pLsrIdxSize/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrIdxEltRefs[] = { 1, 0 }; +#ifdef PLSR_LIGHTWEIGHT_THUNKS + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 2); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 1}; + static PgcIsRef pLsrThunkCutRefs[sizeof(PlsrThunkURef)/P_WORD_SIZE] = {0, 1}; +#else /* !PLSR_LIGHTWEIGHT_THUNKS */ +#ifdef PLSR_THUNK_SYNCHRONIZE +#ifdef PLSR_THUNK_INTERCEPT_CUTS + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 6); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 0, 0, 1}; +#else /* PLSR_THUNK_INTERCEPT_CUTS*/ + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 5); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 0, 1}; +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ +#else /* !PLSR_THUNK_SYNCHRONIZE */ +#ifdef PLSR_THUNK_INTERCEPT_CUTS + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 5); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 0, 1}; +#else /* PLSR_THUNK_INTERCEPT_CUTS*/ + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 4); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 1}; +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ +#endif /* !PLSR_THUNK_SYNCHRONIZE */ 
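+    /* Each PgcIsRef array above is a word-by-word reference map for the
+     * corresponding thunk layout; the single 1 entry marks the stored
+     * result slot, whose offset shifts with the PLSR_THUNK_SYNCHRONIZE and
+     * PLSR_THUNK_INTERCEPT_CUTS configuration, hence the per-configuration
+     * asserts on sizeof(PlsrThunkURef). */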
+#endif /* !PLSR_LIGHTWEIGHT_THUNKS */ + static PgcIsRef pLsrThunkValRefs32[sizeof(PlsrThunkU32)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkValRefs64[sizeof(PlsrThunkU64)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkValRefsFloat[sizeof(PlsrThunkUFloat)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkValRefsDouble[sizeof(PlsrThunkUDouble)/P_WORD_SIZE] = {0, }; + +#ifdef PLSR_LIGHTWEIGHT_THUNKS + static PgcIsRef pLsrThunkEvalRefsRef[sizeof(PlsrThunkURef)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefs32[sizeof(PlsrThunkU32)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefs64[sizeof(PlsrThunkU64)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefsFloat[sizeof(PlsrThunkUFloat)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefsDouble[sizeof(PlsrThunkUDouble)/P_WORD_SIZE] = {0, }; +#endif /* PLSR_LIGHTWEIGHT_THUNKS */ + + assert(pLsrPRatSize/P_WORD_SIZE == 2); + assert(pLsrPSumSize/P_WORD_SIZE == 3); + assert(pLsrPArrayIBaseSize/P_WORD_SIZE == 3); + assert(pLsrPSetSize/P_WORD_SIZE == 2); + assert(pLsrIdxEltSize/P_WORD_SIZE == 2); + + pLsrVTableRegister(pLsrPRatVTable, pLsrDefaultAlignment, pLsrPRatSize, pLsrPRatRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPNameVTable, pLsrDefaultAlignment, pLsrPNameSize, pLsrPNameRefs, 1, + pLsrPNameLenOff, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPFloatVTable, pLsrDefaultAlignment, pLsrPFloatSize, pLsrPFloatRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPDoubleVTable, pLsrDefaultAlignment, pLsrPDoubleSize, pLsrPDoubleRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPSumVTable, pLsrDefaultAlignment, pLsrPSumSize, pLsrPSumRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPArrayOVTable, pLsrDefaultAlignment, pLsrPArrayOBaseSize, pLsrPArrayORefs, + P_WORD_SIZE, pLsrPArrayOLenOffset, 1, + PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrPArrayIVTable, pLsrDefaultAlignment, pLsrPArrayIBaseSize, pLsrPArrayIRefs, + P_WORD_SIZE, pLsrPArrayILenOffset, 1, + PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrClosureVTable, pLsrDefaultAlignment, pLsrPFunctionSize, pLsrPFunctionRefs, + 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrPSetVTable, pLsrDefaultAlignment, pLsrPSetSize, pLsrPSetRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPTypeVTable, pLsrDefaultAlignment, pLsrPTypeSize, pLsrPTypeRefs, 0, 0, + 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegisterV(pLsrIdxVTable, pLsrDefaultAlignment, pLsrIdxSize, pLsrIdxRefs, + pLsrIdxEltSize, pLsrIdxLenOff, pLsrIdxEltRefs, + PGC_CREATED_MUTABLE, 0, pLsrIdxWRefs, NULL); +#ifdef PLSR_GC_INDIRECTIONS + pLsrIndirectionVTableRegister(pLsrThunkValVTableRef, sizeof(PlsrThunkURef), pLsrThunkResultOffsetRef); +#else + pLsrVTableRegister(pLsrThunkValVTableRef, pLsrDefaultAlignment, sizeof(PlsrThunkURef), + pLsrThunkValRefsRef, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); +#endif + pLsrVTableRegister(pLsrThunkValVTable32, pLsrDefaultAlignment, sizeof(PlsrThunkU32), + pLsrThunkValRefs32, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrThunkValVTable64, pLsrDefaultAlignment, sizeof(PlsrThunkU64), + pLsrThunkValRefs64, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrThunkValVTableFloat, pLsrDefaultAlignment, sizeof(PlsrThunkUFloat), + pLsrThunkValRefsFloat, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrThunkValVTableDouble, pLsrDefaultAlignment, sizeof(PlsrThunkUDouble), + pLsrThunkValRefsDouble, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + +#ifdef 
PLSR_LIGHTWEIGHT_THUNKS + pLsrVTableRegister(pLsrThunkCutVTable, pLsrDefaultAlignment, sizeof(PlsrThunkURef), + pLsrThunkCutRefs, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + + pLsrVTableRegister(pLsrThunkEvalVTableRef, pLsrDefaultAlignment, sizeof(PlsrThunkURef), + pLsrThunkEvalRefsRef, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTable32, pLsrDefaultAlignment, sizeof(PlsrThunkU32), + pLsrThunkEvalRefs32, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTable64, pLsrDefaultAlignment, sizeof(PlsrThunkU64), + pLsrThunkEvalRefs64, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTableFloat, pLsrDefaultAlignment, sizeof(PlsrThunkUFloat), + pLsrThunkEvalRefsFloat, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTableDouble, pLsrDefaultAlignment, sizeof(PlsrThunkUDouble), + pLsrThunkEvalRefsDouble, 0, 0, 0, PGC_CREATED_MUTABLE, 0); +#endif +} + +/********************************************************************** + * Register Runtime Globals + */ + +#define pLsrValueGlobalRefsCount 1 + +static PlsrRef* pLsrValueGlobalRefs[] = + { + (PlsrRef *) &pLsrCoreCharOrd, + (PlsrRef *) NULL /* This must be last */ + }; + +static void pLsrValueRegisterGlobalRefs() { + assert(pLsrValueGlobalRefs[pLsrValueGlobalRefsCount] == NULL); + pLsrGcRegisterGlobalRefs ((void **)pLsrValueGlobalRefs, pLsrValueGlobalRefsCount); +}; + +static void pLsrValueRegisterGlobals() { + pLsrValueRegisterGlobalRefs(); +}; + +/********************************************************************** + * Runtime Assertions + */ + +static void pLsrValueCheck() +{ + if (pLsrObjectFieldsBase > pLsrPSetOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPSetSize < pLsrPSetOffset+sizeof(PlsrPAny)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPTypeSize) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPRatOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPRatSize < pLsrPRatOffset+sizeof(PlsrRational)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPFloatOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPFloatSize < pLsrPFloatOffset+sizeof(float)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPDoubleOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPDoubleSize < pLsrPDoubleOffset+sizeof(double)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPArrayOLenOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayOEltOffset < pLsrPArrayOLenOffset+sizeof(uintp)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayOBaseSize < pLsrPArrayOEltOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPArrayILenOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayIIdxOffset < pLsrPArrayILenOffset+sizeof(uintp)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayIEltOffset < pLsrPArrayIIdxOffset+sizeof(void*)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayIBaseSize < pLsrPArrayIEltOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPFunctionCodeOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPSumTagOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPSumSize < pLsrPSumTagOffset+sizeof(PlsrPAny)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPSumSize < pLsrPSumValOffset+sizeof(PlsrObjectB)) + pLsrRuntimeError("Bad object model!\n"); 
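+    /* The remaining checks apply only when small (tagged) rationals or
+     * integers are enabled: they verify that the compiler-generated bounds
+     * agree with the runtime's notion of the taggable range. */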
+/**********************************************************************
+ * Runtime Assertions
+ */
+
+static void pLsrValueCheck()
+{
+    if (pLsrObjectFieldsBase > pLsrPSetOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSetSize < pLsrPSetOffset+sizeof(PlsrPAny))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPTypeSize)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPRatOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPRatSize < pLsrPRatOffset+sizeof(PlsrRational))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPFloatOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPFloatSize < pLsrPFloatOffset+sizeof(float))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPDoubleOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPDoubleSize < pLsrPDoubleOffset+sizeof(double))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPArrayOLenOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayOEltOffset < pLsrPArrayOLenOffset+sizeof(uintp))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayOBaseSize < pLsrPArrayOEltOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPArrayILenOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayIIdxOffset < pLsrPArrayILenOffset+sizeof(uintp))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayIEltOffset < pLsrPArrayIIdxOffset+sizeof(void*))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayIBaseSize < pLsrPArrayIEltOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPFunctionCodeOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPSumTagOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSumSize < pLsrPSumTagOffset+sizeof(PlsrPAny))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSumSize < pLsrPSumValOffset+sizeof(PlsrObjectB))
+        pLsrRuntimeError("Bad object model!\n");
+#ifdef P_USE_TAGGED_RATIONALS
+    if (pLsrPSmallRationalMax != pLsrSmallRationalMax)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSmallRationalMin != pLsrSmallRationalMin)
+        pLsrRuntimeError("Bad object model!\n");
+#endif
+#ifdef P_USE_TAGGED_INTEGERS
+    if (pLsrPSmallIntegerMax != pLsrSmallIntegerMax)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSmallIntegerMin != pLsrSmallIntegerMin)
+        pLsrRuntimeError("Bad object model!\n");
+#endif
+}
+
+#endif /* !_PLSR_VALUE_H_ */
diff --git a/runtime/include/hrc/plsr-wpo.h b/runtime/include/hrc/plsr-wpo.h
new file mode 100755
index 0000000..31df034
--- /dev/null
+++ b/runtime/include/hrc/plsr-wpo.h
@@ -0,0 +1,168 @@
+/* The Haskell Research Compiler */
+/* COPYRIGHT_NOTICE_1 */
+
+/* Weak Pointer Objects */
+
+#ifndef _PLSR_WPO_H_
+#define _PLSR_WPO_H_
+
+/***********************************************************************************
+ ************* Weak Pointer Object Interface ***************************************
+ **********************************************************************************/
+
+typedef PlsrRef PlsrWpo;
+
+/* Create a new weak pointer object from key, value and optional finalizer.
+ * finalizer may be NULL. Finalizer is assumed to be a PlsrThunkBRef.
+ */
+static PlsrWpo pLsrWpoNew(PlsrRef key, PlsrRef value, PlsrRef finalizer);
+
+/* Create a new weak pointer object from key, value and optional finalizer.
+ * finalizer may be NULL. Finalizer is run using the provided code pointer.
+ */
+static PlsrWpo pLsrWpoNewWithRun(PlsrRef key, PlsrRef value, PlsrRef finalizer, PlsrFinalizerCode run);
+
+/* Read the value (if present) of the weak pointer object. Returns NULL
+ * if not present.
+ */
+static PlsrRef pLsrWpoRead(PlsrWpo wpo);
+
+/* Tombstone the weak pointer object and immediately run its finalizer.
+ * Returns when the finalizer has completed.
+ */
+static void pLsrWpoFinalize(PlsrWpo wpo);
+
+/* Registration code */
+static void pLsrWpoRegisterVTables();
+static void pLsrWpoRegisterGlobals();
+static void pLsrWpoCheckAssertions();
+
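A minimal usage sketch for the interface above, assuming key, value and finalizerThunk are live PlsrRef values produced elsewhere (none of those names are defined by this patch):

    PlsrWpo w = pLsrWpoNew(key, value, finalizerThunk);
    PlsrRef v = pLsrWpoRead(w);   /* NULL once the object has been tombstoned */
    ...
    pLsrWpoFinalize(w);           /* force the finalizer to run eagerly */

pLsrWpoNewWithRun is the variant to use when the finalizer is not a PlsrThunkBRef and must be run through a caller-supplied PlsrFinalizerCode entry point.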
+/***********************************************************************************
+ ************* Weak Pointer Object Implementation **********************************
+ **********************************************************************************/
+
+
+typedef struct PlsrWpoS {
+    PlsrVTable vtable;
+    PlsrRef key;
+    volatile PlsrRef value;
+    PlsrRef finalizer;
+    PlsrFinalizerCode run;
+} PlsrWpoU;
+
+#define pLsrWpoPadding \
+    (sizeof(PlsrWpoU) - sizeof(PlsrVTable) - 3*sizeof(PlsrRef) - sizeof(PlsrFinalizerCode))
+pLsrVTableStatic(pLsrWpoVTable_, VNoneTag, "*weak pointer object*", pLsrWpoPadding);
+#define pLsrWpoVTable (&pLsrWpoVTable_)
+
+
+#ifdef P_USE_PILLAR
+
+
+static PlsrWpo pLsrWpoNewWithRun(PlsrRef key, PlsrRef value, PlsrRef finalizer, PlsrFinalizerCode run)
+{
+    PlsrRef res;
+    pLsrAlloc_(PlsrRef, res, pLsrWpoVTable, sizeof(PlsrWpoU));
+    ((PlsrWpoU*)res)->key=key;
+    ((PlsrWpoU*)res)->value=value;
+    ((PlsrWpoU*)res)->finalizer=finalizer;
+    ((PlsrWpoU*)res)->run=run;
+#ifdef PLSR_WPO_TRACE
+    printf("Creating WPO object %p with finalizer object %p and value %p\n", res, finalizer, value);
+    fflush(stdout);
+#endif
+    return res;
+}
+
+void pLsrWpoRunFinalizer(PlsrRef finalizer)
+{
+    pLsrThunkEvalRef(finalizer);
+}
+
+static PlsrWpo pLsrWpoNew(PlsrRef key, PlsrRef value, PlsrRef finalizer)
+{
+    return pLsrWpoNewWithRun(key, value, finalizer, pLsrWpoRunFinalizer);
+}
+
+static PlsrRef pLsrWpoRead(PlsrWpo wpo)
+{
+#ifdef PLSR_WPO_TRACE
+    printf("Reading WPO object %p with value %p\n", wpo, ((PlsrWpoU*)wpo)->value);
+    fflush(stdout);
+#endif
+
+    return ((PlsrWpoU*)wpo)->value;
+}
+
+static void pLsrWpoFinalize(PlsrWpo wpo)
+{
+    ((PlsrWpoU*)wpo)->value = NULL;
+    if (((PlsrWpoU*)wpo)->finalizer) {
+        ((PlsrWpoU*)wpo)->run(((PlsrWpoU*)wpo)->finalizer);
+    }
+}
+
+
+#pragma pillar_managed(off)
+
+static void pLsrWpoTombstone(void * wpo)
+{
+#ifdef PLSR_WPO_TRACE
+    printf("Tombstoning WPO object %p with value %p\n", wpo, ((PlsrWpoU*)wpo)->value);
+    fflush(stdout);
+#endif
+    assert(((PlsrWpoU*)wpo)->value);
+    ((PlsrWpoU*)wpo)->value = NULL;
+    if (((PlsrWpoU*)wpo)->finalizer) {
+        pLsrFinalizerAdd(((PlsrWpoU*)wpo)->run, ((PlsrWpoU*)wpo)->finalizer);
+    }
+}
+
+#pragma pillar_managed(on)
+
+#define pLsrWpoSize (sizeof(PlsrWpoU))
+
+static void pLsrWpoRegisterVTables()
+{
+    /* XXX Temporarily disable weak pointer semantics pending GC fixes. -leaf */
+    static PgcIsRef pLsrWpoRefs[] = { 0, 1, 1, 1, 0 };
+    static PgcIsRef pLsrWpoWRefs[] = { 0, 0, 0, 0, 0 };
+    pLsrVTableRegisterV(pLsrWpoVTable, pLsrDefaultAlignment, pLsrWpoSize, pLsrWpoRefs,
+                        0, 0, 0,
+                        PGC_ALWAYS_MUTABLE, 0, pLsrWpoWRefs, NULL);
+    pgc_set_wpo_vtable((struct VTable *) pLsrWpoVTable);
+    pgc_set_wpo_finalizer(pLsrWpoTombstone);
+}
+
+static void pLsrWpoRegisterGlobals()
+{
+}
+
+static void pLsrWpoCheckAssertions()
+{
+}
+
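When the cleanup action is ordinary native code rather than a thunk, pLsrWpoNewWithRun lets the caller supply the entry point directly; pLsrWpoTombstone then hands that (run, finalizer) pair to pLsrFinalizerAdd instead of evaluating a thunk. A hedged sketch follows; myRelease and handle are hypothetical, and PlsrFinalizerCode is assumed to be a void function over PlsrRef, as suggested by pLsrWpoRunFinalizer above.

    static void myRelease(PlsrRef handle)
    {
        /* release whatever external resource handle stands for */
    }

    PlsrWpo w = pLsrWpoNewWithRun(key, value, handle, myRelease);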
+#else /* !P_USE_PILLAR */
+
+static PlsrWpo pLsrWpoNew(PlsrRef key, PlsrRef value, PlsrRef finalizer)
+{
+    pLsrRuntimeError("Weak pointer objects not supported");
+    return 0;
+}
+
+static void pLsrWpoRegisterVTables()
+{
+}
+
+static void pLsrWpoRegisterGlobals()
+{
+}
+
+static void pLsrWpoCheckAssertions()
+{
+}
+
+#endif /* P_USE_PILLAR */
+
+#endif /* !_PLSR_WPO_H_ */
diff --git a/runtime/include/hrc/plsr.h b/runtime/include/hrc/plsr.h
new file mode 100644
index 0000000..edbcc69
--- /dev/null
+++ b/runtime/include/hrc/plsr.h
@@ -0,0 +1,183 @@
+/* The Haskell Research Compiler */
+/* COPYRIGHT_NOTICE_1 */
+
+/* The P language-specific runtime */
+
+#ifndef _PLSR_H_
+#define _PLSR_H_
+
+/**********************************************************************
+ * Control
+ */
+
+/* User Control macros:
+ *   P_WORD_SIZE - number of bytes in a pointer
+ *   P_USE_PILLAR - compiled for pillar
+ *   P_USE_CGC - use BDW conservative garbage collector
+ *   P_USE_AGC - use a PSL accurate garbage collector
+ *             - value specifies kind (see PlsrAgcKind)
+ *   P_AGC_LOCK_PARAM - the lock parameter to pass to pgc_init
+ *   PLSR_ENABLE_EVENTS - Turn on the event logging code.
+ *   P_USE_FAST_ALLOC - use an inlined fast path for allocation
+ *   PLSR_INSTRUMENT_ALLOCATION - collect and report allocation stats
+ *   PLSR_INSTRUMENT_GCS - collect and report per gc allocation stats
+ *   PLSR_INSTRUMENT_VTB_ALC - collect and report per vtable allocation stats
+ *   P_USE_GC_WRITE_BARRIERS - use write barriers for the GC
+ *   P_ALL_BARRIERS - do all barriers for barrier validation
+ *   P_USE_PARALLEL_FUTURES - use the parallel futures system
+ *   P_DO_VTABLE_CHANGE - do vtable changing for immutability etc.
+ *   P_PRAT_USE_SINTP - use sintp to represent P Rationals (unsafe)
+ *   P_USE_TAGGED_INTEGERS - use low bit tagging on small ap integers
+ *   P_USE_TAGGED_RATIONALS - use low bit tagging on small ap rationals
+ *   P_TAGGED_INT32_PORTABLE - see commentary in plsr-tagged-int32.h
+ *   P_TAGGED_INT32_ASSERT_SMALL - use 32 bit integers with runtime checks
+ *   P_TAGGED_INT32_ASSUME_SMALL - use 32 bit integers with no checks
+ *   PLSR_DISABLE_TAILCALL - disable tailcalls
+ *   PLSR_GMP_USE_GCMALLOC - use gc malloc for gmp wrappers
+ *   PLSR_GMP_USE_PCDECL - use pcdecl to avoid pinning gmp wrappers
+ *   PLSR_GMP_USE_MALLOC - use malloc to avoid pinning gmp wrappers
+ *   PLSR_GMP_FORCE_GC - force malloc heap limits for gmp wrappers
+ *   PLSR_GMP_USE_PINNING - use pinning in gmp wrappers
+ *   PLSR_GMP_USE_GALLOCATE - use guaranteed allocation in gmp wrappers
+ *   PLSR_GMP_USE_DEFAULT - use a backend default gmp implementation
+ *   PLSR_TAGGED_INT32_TEST - generate test functions
+ *   PLSR_TAGGED_INTEGER_RECOVER - Check AP int output for taggable integers
+ *   PLSR_THUNK_INTERCEPT_CUTS - Intercept cuts out of thunks
+ *   PLSR_THUNK_SYNCHRONIZE - use synchronized thunks
+ *   PLSR_NO_GMP_INTEGERS - don't use the GMP
+ *   PLSR_STACK_SIZE_WORKER - default stack size (mb) for ipc worker threads
+ *   PLSR_STACK_SIZE_MAIN - default stack size (mb) for ipc main threads
+ *   PLSR_SINGLE_THREADED - no runtime concurrency allowed
+ * Derived Control macros:
+ *   P_USE_MCRT - compiled for C and use mcrt
+ *   P_USE_PTHREADS - use pthreads for threading
+ *   PLSR_GNU_ASM - use gnu style asm
+ *   PLSR_MS_ASM - use ms style asm
+ */
+
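As a hedged illustration of how these knobs are supplied (the particular combination below is an example, not a configuration mandated by this patch), a 64-bit single-threaded C build on Linux could simply pass the macros on the compiler command line; plsr-client.c is a hypothetical translation unit:

    gcc -c -DP_WORD_SIZE=8 -DPLSR_LINUX -DPLSR_SINGLE_THREADED plsr-client.c

Whatever combination is chosen must also satisfy the consistency checks that follow.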
+#error "Can't use parallel futures with the single threaded runtime" +#endif + +#if defined(P_USE_PARALLEL_FUTURES) && !defined(PLSR_THUNK_SYNCHRONIZE) +#error "Must use synchronized thunks with parallel futures" +#endif + +#ifdef USE_PTHREADS +#define P_USE_PTHREADS +#else +#if !defined(P_USE_PILLAR) && defined(P_USE_PARALLEL_FUTURES) +#define P_USE_MCRT +#endif /* !P_USE_PILLAR && P_USE_PARALLEL_FUTURES */ +#endif /* USE_PTHREADS */ + +/* Memory allocation */ +#ifdef P_USE_CGC +# define GC_NOT_DLL +# ifdef DEBUG +# define GC_DEBUG +# endif /* DEBUG */ +#endif /* P_USE_CGC */ + +/********************************************************************** + * C header files + */ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef PLSR_LINUX +#include +#endif +#include + +/* Memory allocation and collection utilities */ + +#define PgcIsRef bool + +enum PlsrAgcKind { + PlsrAKMf = 1, + PlsrAKCgc = 2, + PlsrAKTgc = 3 +}; + +#ifdef P_USE_CGC +# ifdef P_USE_PILLAR +# error "CGC and Pillar combination not supported" +# endif /* P_USE_PILLAR */ +# include "toolkit/gc.h" +# define P_VTABLE_RESERVE P_WORD_SIZE +#else /* !P_USE_CGC */ +#ifdef P_USE_AGC +# ifdef P_USE_PILLAR +# undef PgcIsRef +# include "pgc/pgc.h" +# define P_VTABLE_RESERVE PGC_VTABLE_RESERVE +# else /* !P_USR_PILLAR */ +# error "AGC and non-Pillar combination not support" +# endif /* !P_USE_PILLAR */ +#else /* !P_USE_AGC */ +# include +# define P_VTABLE_RESERVE P_WORD_SIZE +#endif /* !P_USE_AGC */ +#endif /* !P_USE_CGC */ + +#if !defined(P_USE_PTHREADS) && (defined(P_USE_MCRT) || defined(P_USE_PARALLEL_FUTURES)) +#include +#endif /* defined(P_USE_MCRT) || defined(P_USE_PARALLEL_FUTURES) */ + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +# include "prt/prt.h" +# include "prt/prtcodegenerator.h" +#endif /* P_USE_PILLAR */ + +/* Toolkit headers - these work in both C and Pillar */ + +#if defined(P_USE_PARALLEL_FUTURES) || defined(PLSR_THUNK_SYNCHRONIZE) +#define CONCURRENCY +#else /* !P_USE_PARALLEL_FUTURES */ +#undef CONCURRENCY +#endif /* !P_USE_PARALLEL_FUTURES */ +#include "toolkit/ptkfuture.h" +typedef ref PtkRef; + +/********************************************************************** + * Runtime proper + */ + +#include "hrc/plsr-util.h" +#include "hrc/plsr-objects.h" +#include "hrc/plsr-synchronization.h" +#include "hrc/plsr-gc.h" +#include "hrc/plsr-finalizer.h" +#include "hrc/plsr-numeric.h" +#include "hrc/plsr-thunk.h" +#include "hrc/plsr-value.h" +#include "hrc/plsr-wpo.h" +#include "hrc/plsr-prims.h" +#include "hrc/plsr-params.h" +#include "hrc/plsr-main.h" + +#endif /* !_PLSR_H_ */