diff --git a/.gitignore b/.gitignore index c5c04ee..95a8876 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ compiler/version.sml # /runtime /runtime/Makefile /runtime/Makefile.in +/runtime/ghc/*.a diff --git a/Makefile.in b/Makefile.in index dcebc5f..f6c2c11 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,3 +1,4 @@ -include ihc-makefile.inc +include hrc-makefile.inc -IHC_PREFIX=@prefix@ +HRC_PREFIX=@prefix@ +PLATFORM_CPPFLAGS=@PLATFORM_CPPFLAGS@ diff --git a/compiler/as-to-mil/ghc-prim.sml b/compiler/as-to-mil/ghc-prim.sml index 0f45739..4477428 100644 --- a/compiler/as-to-mil/ghc-prim.sml +++ b/compiler/as-to-mil/ghc-prim.sml @@ -83,7 +83,7 @@ struct type argsAndTyps = (Mil.simple * Mil.typ) list type block = HsToMilUtils.MS.t - val pkgName = "ihc/plsr-prims-ghc" + val pkgName = "hrc/plsr-prims-ghc" (* mil types *) val boolTyp = M.TBoolean val int64ArbTyp = IntArb.T (IntArb.S64, IntArb.Signed) diff --git a/compiler/back-end/back-end.sml b/compiler/back-end/back-end.sml index 0700f6f..d37e749 100644 --- a/compiler/back-end/back-end.sml +++ b/compiler/back-end/back-end.sml @@ -28,11 +28,10 @@ struct val name = "BackEnd" val indent = 0 end) - + (* val runtimeDirectory = fn config => Path.snoc (Config.home config, "runtime") - (* val iflcLibDirectory = fn config => Config.iflcLibDirectory config @@ -591,6 +590,15 @@ struct in [file] end + fun runtimeLibraries (config) = + let + val hrcGhcRtLib = + case Config.targetWordSize config + of Config.Ws32 => "libhrc_ghc_runtime32.a" + | Config.Ws64 => "libhrc_ghc_runtime64.a" + in [hrcGhcRtLib] + end + fun unmanagedLibraries (config) = let val threads = @@ -606,11 +614,12 @@ struct val (prtBegin, prtEnd) = ([ifDebug (config, "pillar2c_crt_begind.obj", "pillar2c_crt_begin.obj")], [ifDebug (config, "pillar2c_crt_endd.obj", "pillar2c_crt_end.obj")]) - val gcLibs = gcLibraries (config) - val futureLibs = futureLibraries (config) - val unmanagedLibs = unmanagedLibraries (config) + val gcLibs = gcLibraries config + val futureLibs = futureLibraries config + val runtimeLibs = runtimeLibraries config + val unmanagedLibs = unmanagedLibraries config val pre = prtBegin - val post = List.concat [futureLibs, prtEnd, gcLibs, unmanagedLibs] + val post = List.concat [futureLibs, prtEnd, gcLibs, runtimeLibs, unmanagedLibs] in (pre, post) end diff --git a/compiler/back-end/mil-to-pil.sml b/compiler/back-end/mil-to-pil.sml index 0af0822..f7b8f5f 100644 --- a/compiler/back-end/mil-to-pil.sml +++ b/compiler/back-end/mil-to-pil.sml @@ -3357,25 +3357,11 @@ struct Pil.D.blank, omDefs, Pil.D.blank, - Pil.D.includeLocalFile "ihc/pil", - Pil.D.includeLocalFile "ihc/plsr", + Pil.D.includeLocalFile "hrc/pil", + Pil.D.includeLocalFile "hrc/plsr", Pil.D.blank, incs, -(* - There are a number of reasons we don't want to print externs in the output file: - - 1. GHC's extern declaration doesn't exactly match C function's type, for example, - CInt is internally represented as Int# in GHC Core, which translates to int64 - on x86_64, and if we have already included a C header, it will become a type - mis-match, and GCC will error (ICC will warn). - - 2. We have already included all header files, which will be inlined by pilicl, - and extern definitions following the header files are just redundant. - - Unless we can fix 1, or the assumption in 2 is no longer valid, we can safety - exclude extern declarations in the output. 
-*) - (* exts, *) + exts, Pil.D.blank, Pil.D.comment "Types", typs, diff --git a/configure.ac b/configure.ac index eb7882c..37016d8 100644 --- a/configure.ac +++ b/configure.ac @@ -2,11 +2,27 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.60]) -AC_INIT([ihc], [0.1], [BUG-REPORT-ADDRESS]) +AC_INIT([hrc], [0.1], [BUG-REPORT-ADDRESS]) AC_CHECK_HEADERS([assert.h float.h immintrin.h limits.h locale.h malloc.h math.h nmmintrin.h signal.h stdarg.h stdio.h stdlib.h string.h time.h sys/time.h sys/timeb.h termios.h]) PKG_CHECK_MODULES([IFLC_LIB], [iflc-lib]) +PLATFORM_CPPFLAGS= +AC_CANONICAL_HOST +case "$host" in + *-mingw*|*-*-cygwin*) + ;; + *-*-linux*) + PLATFORM_CPPFLAGS="-DPLSR_LINUX" + ;; + *) + AC_MSG_ERROR([Unsupported host: $host]) + ;; +esac + +AC_SUBST([PLATFORM_CPPFLAGS]) + +AM_SUBST_NOTMAKE AC_CONFIG_FILES([Makefile]) AC_OUTPUT diff --git a/hrc-makefile.inc b/hrc-makefile.inc new file mode 100644 index 0000000..e094a2e --- /dev/null +++ b/hrc-makefile.inc @@ -0,0 +1,510 @@ +# The Haskell Research Compiler +# COPYRIGHT_NOTICE_1 + +all: all2 + +# TODO: Tidy this up +HRC_EXE = $(IHC_EXE) + +INSTALL_BINDIR = $(HRC_PREFIX)/bin +INSTALL_INCLUDEDIR = $(HRC_PREFIX)/include/hrc +INSTALL_LIBDIR = $(HRC_PREFIX)/lib + +BINDIR = bin +RUNDIR = runtime +RUNDIR_INCLUDEDIR = $(RUNDIR)/include + +install: $(HRC_GHC_RUNTIME_LIBS) $(HRC_EXE) + install -d $(INSTALL_BINDIR) $(INSTALL_INCLUDEDIR) $(INSTALL_INCLUDEDIR)/ghc $(INSTALL_LIBDIR) + install $(HRC_GHC_RUNTIMEDIR)/*.a $(INSTALL_LIBDIR) + install $(HRC_EXE) $(INSTALL_BINDIR) + install $(RUNDIR_INCLUDEDIR)/hrc/*.h $(INSTALL_INCLUDEDIR) + install $(RUNDIR_INCLUDEDIR)/hrc/ghc/*.h $(INSTALL_INCLUDEDIR)/ghc + +ifeq (Windows_NT, $(OS)) + EXE := .exe +else + EXE := +endif + +### HRC-GHC runtime +CC = gcc +AR = ar + +ARFLAGS = rvs +CPPFLAGS = -I"$(RUNDIR_INCLUDEDIR)" $(PLATFORM_CPPFLAGS) `pkg-config --cflags iflc-lib` -Wall +CFLAGS = $(CPPFLAGS) + +HRC_GHC_RUNTIMEDIR = $(RUNDIR)/ghc + +HRC_GHC_RUNTIME_LIBS = \ + $(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime32.a \ + $(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime64.a + +HRC_GHC_RUNTIME_OBJECTS32 = \ + $(HRC_GHC_RUNTIMEDIR)/float32.o \ + $(HRC_GHC_RUNTIMEDIR)/Globals32.o \ + $(HRC_GHC_RUNTIMEDIR)/plsr-util32.o \ + $(HRC_GHC_RUNTIMEDIR)/thread32.o \ + $(HRC_GHC_RUNTIMEDIR)/TTY32.o + +HRC_GHC_RUNTIME_OBJECTS64 = \ + $(HRC_GHC_RUNTIMEDIR)/float64.o \ + $(HRC_GHC_RUNTIMEDIR)/Globals64.o \ + $(HRC_GHC_RUNTIMEDIR)/plsr-util64.o \ + $(HRC_GHC_RUNTIMEDIR)/thread64.o \ + $(HRC_GHC_RUNTIMEDIR)/TTY64.o + +$(HRC_GHC_RUNTIMEDIR)/%32.o: $(HRC_GHC_RUNTIMEDIR)/%.c + $(CC) $(CFLAGS) -DP_WORD_SIZE=4 $< -c -o $@ + +$(HRC_GHC_RUNTIMEDIR)/%64.o: $(HRC_GHC_RUNTIMEDIR)/%.c + $(CC) $(CFLAGS) -DP_WORD_SIZE=8 $< -c -o $@ + +$(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime32.a: $(HRC_GHC_RUNTIME_OBJECTS32) + $(AR) $(ARFLAGS) $@ $< + +$(HRC_GHC_RUNTIMEDIR)/libhrc_ghc_runtime64.a: $(HRC_GHC_RUNTIME_OBJECTS64) + $(AR) $(ARFLAGS) $@ $^ + +hrc-ghc-runtime: $(HRC_GHC_RUNTIME_LIBS) + +### Old P Front End + +OFE = frontend-old +OFE_EXE = $(BINDIR)/$(OFE)$(EXE) +OFE_SRCS = frontend-old/p.cpp + +ofe: $(OFE_EXE) + +$(OFE_EXE): build/vs2010/$(OFE)/$(OFE)/$(OFE).vcxproj $(OFE_SRCS) + cd build/vs2010/$(OFE)/$(OFE); devenv $(OFE).vcxproj /rebuild Debug; cp ../Debug/frontend-old.exe ../../../../bin +#$(OFE_EXE): build/vc60/$(OFE)/$(OFE).dsp $(OFE_SRCS) +# cd build/vc60/$(OFE); msdev $(OFE).dsp /MAKE "p - Win32 Debug" /REBUILD + +### New P Front End + +NFE = frontend-new +NFE_EXE = $(BINDIR)/$(NFE)$(EXE) +NFE_SRCS = $(NFE)/p.hs + +nfe: 
$(NFE_EXE) + +$(NFE_EXE): $(NFE_SRCS) + ghc -fno-gen-manifest $(NFE_SRCS) -o $(NFE_EXE) + +RR_EXE = $(BINDIR)/p-runtime$(EXE) +RR_SRCS = $(NFE)/p-runtime.hs + +$(RR_EXE): $(RR_SRCS) + ghc $(RR_SRCS) -o $(RR_EXE) + +### Main part of the compiler + +IFLC_VERSION = v1.0 + +# keep these in alpabetic order +COMPILER_SRCS = \ + compiler/back-end/back-end.mlb \ + compiler/back-end/back-end.sml \ + compiler/back-end/mil-to-pil.sml \ + compiler/back-end/outputter.sml \ + compiler/back-end/pil.sml \ + compiler/back-end/runtime.sml \ + compiler/common/chat.sml \ + compiler/common/common.mlb \ + compiler/common/compare.sml \ + compiler/common/config.sml \ + compiler/common/dataflow.sml \ + compiler/common/dominance.sml \ + compiler/common/effect.sml \ + compiler/common/fail.sml \ + compiler/common/globals.sml \ + compiler/common/graph.sml \ + compiler/common/identifier.sml \ + compiler/common/int-arb.sml \ + compiler/common/intr.sml \ + compiler/common/lub.sml \ + compiler/common/pass.sml \ + compiler/common/path.sml \ + compiler/common/rat.sml \ + compiler/common/rename.sml \ + compiler/common/topo-sort.sml \ + compiler/common/tuple.sml \ + compiler/common/try.sml \ + compiler/common/type-rep.sml \ + compiler/common/utils.sml \ + compiler/common/z-coding.sml \ + compiler/driver.sml \ + compiler/mil/analyse.sml \ + compiler/mil/bound-vars.sml \ + compiler/mil/call-graph.sml \ + compiler/mil/cfg.sml \ + compiler/mil/check.sml \ + compiler/mil/code-copy.sml \ + compiler/mil/compile.mlb \ + compiler/mil/compile.sml \ + compiler/mil/dataflow-analysis.sml \ + compiler/mil/dependence-analysis.sml \ + compiler/mil/extended-layout.sml \ + compiler/mil/fmil.sml \ + compiler/mil/free-vars.sml \ + compiler/mil/imil/block.sml \ + compiler/mil/imil/both-mil.sml \ + compiler/mil/imil/common.sml \ + compiler/mil/imil/def.sml \ + compiler/mil/imil/enumerate.sml \ + compiler/mil/imil/func.sml \ + compiler/mil/imil/global.sml \ + compiler/mil/imil/instr.sml \ + compiler/mil/imil/item.sml \ + compiler/mil/imil/layout.sml \ + compiler/mil/imil/t.sml \ + compiler/mil/imil/types.sml \ + compiler/mil/imil/use.sml \ + compiler/mil/imil/var.sml \ + compiler/mil/imil/workset.sml \ + compiler/mil/imil/imil.mlb \ + compiler/mil/imil/imil.sml \ + compiler/mil/layout.sml \ + compiler/mil/loop.sml \ + compiler/mil/lower/lower.mlb \ + compiler/mil/lower/mil-to-core-mil.sml \ + compiler/mil/lower/vector.sml \ + compiler/mil/mil.mlb \ + compiler/mil/mil.sml \ + compiler/mil/name-small-values.sml \ + compiler/mil/number-instructions.sml \ + compiler/mil/optimise/annotated-cg-printer.sml \ + compiler/mil/optimise/branch-remove.sml \ + compiler/mil/optimise/cfg-simplify.sml \ + compiler/mil/optimise/contify.sml \ + compiler/mil/optimise/cse.sml \ + compiler/mil/optimise/double-diamond.sml \ + compiler/mil/optimise/fun-known.sml \ + compiler/mil/optimise/fx-analysis.sml \ + compiler/mil/optimise/inline-aggressive.sml \ + compiler/mil/optimise/inline-leaves.sml \ + compiler/mil/optimise/inline-profile.sml \ + compiler/mil/optimise/inline-rewrite.sml \ + compiler/mil/optimise/inline-small.sml \ + compiler/mil/optimise/iv-cse.sml \ + compiler/mil/optimise/licm.sml \ + compiler/mil/optimise/loop-invert.sml \ + compiler/mil/optimise/optimise.mlb \ + compiler/mil/optimise/rep/analyze.sml \ + compiler/mil/optimise/rep/base.sml \ + compiler/mil/optimise/rep/dead-code.sml \ + compiler/mil/optimise/rep/driver.sml \ + compiler/mil/optimise/rep/flatten.sml \ + compiler/mil/optimise/rep/flowgraph.sml \ + compiler/mil/optimise/rep/node.sml \ + 
compiler/mil/optimise/rep/object.sml \ + compiler/mil/optimise/rep/optimize.sml \ + compiler/mil/optimise/rep/prep.sml \ + compiler/mil/optimise/rep/reconstruct.sml \ + compiler/mil/optimise/rep/seq.sml \ + compiler/mil/optimise/rep/show.sml \ + compiler/mil/optimise/rep/summary.sml \ + compiler/mil/optimise/rep/rep.mlb \ + compiler/mil/optimise/rep/rep.sml \ + compiler/mil/optimise/simple-escape.sml \ + compiler/mil/optimise/simplify.sml \ + compiler/mil/optimise/thunks.sml \ + compiler/mil/optimise/vectorize.sml \ + compiler/mil/p-object-model.sml \ + compiler/mil/parse.sml \ + compiler/mil/prims.sml \ + compiler/mil/prims-utils.sml \ + compiler/mil/profile.sml \ + compiler/mil/rename.sml \ + compiler/mil/rewrite.sml \ + compiler/mil/shape-analysis.sml \ + compiler/mil/stats.sml \ + compiler/mil/stream.sml \ + compiler/mil/stream2.sml \ + compiler/mil/transform.sml \ + compiler/mil/type.sml \ + compiler/mil/utils.sml \ + compiler/mil/utils2.sml + +COMPILER_STD_OPTS = @MLton -- \ + -verbose 1 -runtime 'use-mmap true' \ + -codegen native \ + -native-split 900000 +# The following don't work on 64-bit +# -ieee-fp true -max-heap 800m + +ifeq (y, $(fast)) + COMPILER_OPTS :=$(COMPILER_STD_OPTS) +else + COMPILER_OPTS :=$(COMPILER_STD_OPTS) -const 'Exn.keepHistory true' +endif + +# keep these in alpabetic order +HASKELL_SRCS = \ + compiler/anorm-strict/anorm-strict-analyze.sml \ + compiler/anorm-strict/anorm-strict-clone.sml \ + compiler/anorm-strict/anorm-strict-closure-convert.sml \ + compiler/anorm-strict/anorm-strict-free-vars.sml \ + compiler/anorm-strict/anorm-strict-layout.sml \ + compiler/anorm-strict/anorm-strict.mlb \ + compiler/anorm-strict/anorm-strict-optimize.sml \ + compiler/anorm-strict/anorm-strict-rewrite.sml \ + compiler/anorm-strict/stats.sml \ + compiler/anorm-strict/anorm-strict.sml \ + compiler/anorm-strict/anorm-strict-utils.sml \ + compiler/as-to-mil/as-to-mil.mlb \ + compiler/as-to-mil/ghc-prim.sml \ + compiler/as-to-mil/to-mil.sml \ + compiler/as-to-mil/utils.sml \ + compiler/anorm-lazy/anorm-lazy.sml \ + compiler/anorm-lazy/anorm-lazy-analyze.sml \ + compiler/anorm-lazy/abs-core.sml \ + compiler/anorm-lazy/abs-eval.sml \ + compiler/anorm-lazy/stats.sml \ + compiler/anorm-lazy/strictness.sml \ + compiler/anorm-lazy/to-abs-core.sml \ + compiler/ch-to-as/ch-to-as.mlb \ + compiler/ch-to-as/to-lazy.sml \ + compiler/ch-to-as/to-strict.sml \ + compiler/core-hs/core-hs.mlb \ + compiler/core-hs/core-hs.sml \ + compiler/core-hs/ghc-prim-op.sml \ + compiler/core-hs/ghc-prim-type.sml \ + compiler/core-hs/layout.sml \ + compiler/core-hs/normalize.sml \ + compiler/core-hs/parse.sml \ + compiler/core-hs/link-option.sml \ + compiler/core-hs/core-hs.lex.sml \ + compiler/core-hs/core-hs.grm.sig \ + compiler/core-hs/core-hs.grm.sml \ + compiler/haskell.sml + +# keep these in alpabetic order +P_SRCS = \ + compiler/core-op/core-op.mlb \ + compiler/core-op/core-p.sml \ + compiler/core-op/fix-up.sml \ + compiler/core-op/layout.sml \ + compiler/core-op/parse.sml \ + compiler/core-op/prims.sml \ + compiler/core-op/stats.sml \ + compiler/core-op/utils.sml \ + compiler/core-p/common.sml \ + compiler/core-p/common-utils.sml \ + compiler/core-p/core-p.mlb \ + compiler/core-p/linear.sml \ + compiler/core-p/linear-analyse.sml \ + compiler/core-p/linear-dce.sml \ + compiler/core-p/linear-free-variables.sml \ + compiler/core-p/linear-layout.sml \ + compiler/core-p/linear-optimise.sml \ + compiler/core-p/linear-simplify.sml \ + compiler/core-p/linear-sink1.sml \ + compiler/core-p/linear-sink2.sml \ + 
compiler/core-p/linear-stats.sml \ + compiler/core-p/linear-type.sml \ + compiler/core-p/linear-utils.sml \ + compiler/core-p/natives.sml \ + compiler/core-p/parse.sml \ + compiler/core-p/prims.sml \ + compiler/core-p/raw-layout.sml \ + compiler/core-p/raw-stats.sml \ + compiler/core-p/raw-to-linear.sml \ + compiler/core-p/raw-utils.sml \ + compiler/core-p/raw.sml \ + compiler/core-p/type.sml \ + compiler/cop-to-old-hil/acp.sml \ + compiler/cop-to-old-hil/analysis.sml \ + compiler/cop-to-old-hil/compute-work.sml \ + compiler/cop-to-old-hil/cop-to-hil.mlb \ + compiler/cop-to-old-hil/cp-to-acp.sml \ + compiler/cop-to-old-hil/cp-to-hil.sml \ + compiler/cop-to-old-hil/direct.sml \ + compiler/cop-to-old-hil/effect-analysis.sml \ + compiler/cop-to-old-hil/evaled.sml \ + compiler/cop-to-old-hil/hil-builder.sml \ + compiler/cop-to-old-hil/hybrid.sml \ + compiler/cop-to-old-hil/thunks.sml \ + compiler/cop-to-old-hil/transform.sml \ + compiler/cop-to-old-hil/two-version-analysis.sml \ + compiler/cop-to-old-hil/two-version-compute-work.sml \ + compiler/cop-to-old-hil/two-version-transform.sml \ + compiler/cop-to-old-hil/two-version-utils.sml \ + compiler/cop-to-old-hil/two-version-version-analysis.sml \ + compiler/cop-to-old-hil/two-version.sml \ + compiler/cop-to-old-hil/type-analyse.sml \ + compiler/cop-to-old-hil/utils.sml \ + compiler/cp-to-hil/analyse.sml \ + compiler/cp-to-hil/check.sml \ + compiler/cp-to-hil/cp-to-hil.mlb \ + compiler/cp-to-hil/cp-to-hil.sml \ + compiler/cp-to-hil/interface.sml \ + compiler/cp-to-hil/transform.sml \ + compiler/cp-to-hil/simple-analyse.sml \ + compiler/hil/analyse.sml \ + compiler/hil/check.sml \ + compiler/hil/dce.sml \ + compiler/hil/free-vars.sml \ + compiler/hil/hil.mlb \ + compiler/hil/hil.sml \ + compiler/hil/layout.sml \ + compiler/hil/optimise.sml \ + compiler/hil/simplifier.sml \ + compiler/hil/utils.sml \ + compiler/hil-to-mil/analyze.sml \ + compiler/hil-to-mil/hil-to-mil.mlb \ + compiler/hil-to-mil/hil-to-mil.sml \ + compiler/hil-to-mil/interface.sml \ + compiler/hil-to-mil/rewrite.sml \ + compiler/hil-to-mil/simple-analyze.sml \ + compiler/old-hil/check.sml \ + compiler/old-hil/duplicate.sml \ + compiler/old-hil/free-vars.sml \ + compiler/old-hil/hil.mlb \ + compiler/old-hil/hil.sml \ + compiler/old-hil/inline.sml \ + compiler/old-hil/layout.sml \ + compiler/old-hil/optimise.sml \ + compiler/old-hil/stats.sml \ + compiler/old-hil/utils.sml \ + compiler/old-hil-to-mil/analyze.sml \ + compiler/old-hil-to-mil/hil-in-mil.sml \ + compiler/old-hil-to-mil/hil-to-mil.mlb \ + compiler/old-hil-to-mil/hil-to-mil.sml \ + compiler/old-hil-to-mil/interface.sml \ + compiler/old-hil-to-mil/rewrite.sml \ + compiler/old-hil-to-mil/simple-analyze.sml \ + compiler/p.sml + +# make-version +make-version: $(BINDIR)/make-version.sh + $(BINDIR)/make-version.sh "$(IFLC_VERSION)" compiler/version.sml $(HRC_PREFIX) + +# iflc + +IFLC = iflc +IFLC_EXE = $(BINDIR)/$(IFLC)$(EXE) + +IFLC_SRCS = $(COMPILER_SRCS) $(HASKELL_SRCS) $(P_SRCS) compiler/iflc.mlb compiler/iflc.sml + +iflc: $(IFLC_EXE) + +$(IFLC_EXE): make-version $(IFLC_SRCS) + cd compiler; \ + rm -f iflc.*.c iflc.*.s; \ + mlton $(COMPILER_OPTS) -stop g iflc.mlb && \ + mlton $(COMPILER_OPTS) -output ../$(IFLC_EXE) iflc.*.c iflc.*.s && \ + rm -f iflc.*.c iflc.*.s; + +### ihc + +IHC = ihc +IHC_EXE = $(BINDIR)/$(IHC)$(EXE) + +IHC_SRCS = $(COMPILER_SRCS) $(HASKELL_SRCS) compiler/ihc.mlb compiler/ihc.sml + +ihc: $(IHC_EXE) + +$(IHC_EXE): make-version $(IHC_SRCS) compiler/ihc.mlb + cd compiler; \ + rm -f ihc.*.c ihc.*.s; \ + 
mlton $(COMPILER_OPTS) -stop g $(IHC).mlb && \ + mlton $(COMPILER_OPTS) -output ../$(IHC_EXE) ihc.*.c ihc.*.s && \ + rm -f ihc.*.c ihc.*.s + +### New P Runtime Support + +NP_RUNTIME = $(RUNDIR)/hil.mil + +npr: $(NP_RUNTIME) + +$(RUNDIR)/hil.mil: compiler/hil-to-mil/hil.mil + cp $< $@ + +### P LSR + +# Keep these in alphabetic order +PLSR_SRCS = \ + $(RUNDIR_INCLUDEDIR)/hrc/pil.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-ap-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-ap-rational.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-gc.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-gmp-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-gmp-integer-gallocate.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-iflc-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-integer.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-lightweight-thunk.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-main.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-numeric.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-objects.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-params.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-prims.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-runtime.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-vector-avx.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-vector-sse.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims-vector.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-prims.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-ptk-thunk.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-rational.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-tagged-int32.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-thunk.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-util.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-value.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr-vector.h \ + $(RUNDIR_INCLUDEDIR)/hrc/plsr.h + +### Main stuff + +iflc-all: nfe iflc npr + +all2: hrc-ghc-runtime ihc + +partclean: + rm -f runtime/ghc/*.a + rm -f `find bin build compiler runtime benchmarks \ + -name \\*~ -o -name \\*.obj -o -name \\*.o -o -name gc.log -o -name .pilink -o -name .pilicl` + rm -f $(NFE)/*.{hi,o,exe} $(NFE)/*~ + rm -rf build/vc60/$(OFE)/Debug + rm -f build/vc60/$(OFE)/$(OFE).{ncb,opt,plg} + rm -rf build/vc60/pdebug/Debug + rm -rf build/vc60/pdebug/Release + rm -rf build/vc60/pdebug/pdebug.{ncb,opt,plg} + rm -f `find tests -name \\*~ -o -name \\*.exe -o \ + -name \\*.obj -o -name \\*.o -o -name \\*.cmp -o -name \\*.exp -o \ + -name \\*.lib -o -name \\*.pdb -o -name \\*.suo -o -name \\*.ilk -o -name gc.log` + rm -f `find tests -name \\*.c -a -not -name matrix-multiply-c.c` + -make -C benchmarks/barnes-hut clean + -make -C benchmarks/cloth-nobc clean + -make -C benchmarks/matrix-multiply clean + -make -C benchmarks/quicksort clean + -make -C benchmarks/raytracer clean + -make -C benchmarks/smvp clean + -make -C benchmarks/smvp-direct clean + -make -C benchmarks/sudoku clean + -make -C benchmarks/nofib clean + rm -f *~ + rm -f mlmon.out gc.log root_map.txt *.ppm compiler/version.sml + +clean: partclean + rm -f $(OFE_EXE) $(NFE_EXE) $(IFLC_EXE) $(IHC_EXE) $(NP_RUNTIME) regressions.log smvp.res + +### Testing + +test: iflc-all + $(BINDIR)/regress.sh -short -nomake + +regress: iflc-all + $(BINDIR)/regress.sh -nomake + +torture: iflc-all + $(BINDIR)/regress.sh -torture -nomake + +### ML-Yacc and ML-Lex +compiler/core-hs/core-hs.lex.sml: compiler/core-hs/core-hs.lex compiler/core-hs/core-hs.grm.sml + mllex $< +compiler/core-hs/core-hs.grm.sml: compiler/core-hs/core-hs.grm + mlyacc $< +compiler/core-hs/core-hs.grm.sig: compiler/core-hs/core-hs.grm + mlyacc $< diff --git a/runtime/ghc/Globals.c b/runtime/ghc/Globals.c index fb90782..f51c379 100755 --- a/runtime/ghc/Globals.c +++ b/runtime/ghc/Globals.c @@ -1,10 +1,9 @@ /* The Haskell Research Compiler */ /* 
COPYRIGHT_NOTICE_1 */ -typedef enum { - ISK_SystemEventThreadEventManager, ISK_SystemEventThreadIOManager, ISK_GHCConcWindowsPendingDelays, - ISK_GHCConcWindowsIOManagerThread, ISK_GHCConcWindowsProdding, ISK_GHCConcSignalSignalHandler, ISK_Num -} IhrStoreKey; +#include +#include "hrc/plsr-util.h" +#include "hrc/ghc/Globals.h" static void* ihrStore[ISK_Num] = {NULL, }; static struct prtMutex* ihrStoreLock; @@ -74,7 +73,7 @@ void blockUserSignals() {} void unblockUserSignals() {} -void stopTimer() {} +void stopTimer() {} void startTimer() {} diff --git a/runtime/ghc/TTY.c b/runtime/ghc/TTY.c index 7ea41cd..cacb69e 100644 --- a/runtime/ghc/TTY.c +++ b/runtime/ghc/TTY.c @@ -22,6 +22,8 @@ #include #endif +#include "hrc/ghc/TTY.h" + // Here we save the terminal settings on the standard file // descriptors, if we need to change them (eg. to support NoBuffering // input). @@ -52,7 +54,7 @@ resetTerminalSettings (void) // if we changed them. See System.Posix.Internals.tcSetAttr for // more details, including the reason we termporarily disable // SIGTTOU here. - { + { int fd; sigset_t sigset, old_sigset; sigemptyset(&sigset); diff --git a/runtime/ghc/float.c b/runtime/ghc/float.c index b0667f2..c655be5 100644 --- a/runtime/ghc/float.c +++ b/runtime/ghc/float.c @@ -9,55 +9,28 @@ #include #include - -#define IEEE_FLOATING_POINT 1 - -/* - * Encoding and decoding Doubles. Code based on the HBC code - * (lib/fltcode.c). - */ - -#if IEEE_FLOATING_POINT -#define MY_DMINEXP ((DBL_MIN_EXP) - (DBL_MANT_DIG) - 1) -/* DMINEXP is defined in values.h on Linux (for example) */ -#define DHIGHBIT 0x00100000 -#define DMSBIT 0x80000000 - -#define MY_FMINEXP ((FLT_MIN_EXP) - (FLT_MANT_DIG) - 1) -#define FHIGHBIT 0x00800000 -#define FMSBIT 0x80000000 -#endif - -#if defined(WORDS_BIGENDIAN) || defined(FLOAT_WORDS_BIGENDIAN) -#define L 1 -#define H 0 -#else -#define L 0 -#define H 1 -#endif - -#define __abs(a) (( (a) >= 0 ) ? 
(a) : (-(a))) +#include "hrc/ghc/float.h" StgDouble __2Int_encodeDouble (I_ j_high, I_ j_low, I_ e) { StgDouble r; - + /* assuming 32 bit ints */ ASSERT(sizeof(int ) == 4 ); r = (StgDouble)((unsigned int)j_high); r *= 4294967296.0; /* exp2f(32); */ r += (StgDouble)((unsigned int)j_low); - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + /* sign is encoded in the size */ if (j_high < 0) r = -r; - + return r; } @@ -66,13 +39,13 @@ StgDouble __word_encodeDouble (W_ j, I_ e) { StgDouble r; - + r = (StgDouble)j; - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + return r; } @@ -81,17 +54,17 @@ StgDouble __int_encodeDouble (I_ j, I_ e) { StgDouble r; - + r = (StgDouble)__abs(j); - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + /* sign is encoded in the size */ if (j < 0) r = -r; - + return r; } @@ -100,17 +73,17 @@ StgFloat __int_encodeFloat (I_ j, I_ e) { StgFloat r; - + r = (StgFloat)__abs(j); - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + /* sign is encoded in the size */ if (j < 0) r = -r; - + return r; } @@ -119,13 +92,13 @@ StgFloat __word_encodeFloat (W_ j, I_ e) { StgFloat r; - + r = (StgFloat)j; - + /* Now raise to the exponent */ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */ r = ldexp(r, e); - + return r; } @@ -217,4 +190,3 @@ __decodeFloat_Int (I_ *man, I_ *exp, StgFloat flt) *man = - *man; } } - diff --git a/runtime/ghc/thread.c b/runtime/ghc/thread.c index c405cfb..eee8e2b 100644 --- a/runtime/ghc/thread.c +++ b/runtime/ghc/thread.c @@ -1,6 +1,8 @@ /* The Haskell Research Compiler */ /* COPYRIGHT_NOTICE_1 */ +#include "hrc/ghc/thread.h" + /* place holders for now */ I_ rts_getThreadId (W_ tid) { diff --git a/runtime/include/hrc/ghc/Globals.h b/runtime/include/hrc/ghc/Globals.h new file mode 100644 index 0000000..5c7c112 --- /dev/null +++ b/runtime/include/hrc/ghc/Globals.h @@ -0,0 +1,33 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _GHC_GLOBALS_H_ +#define _GHC_GLOBALS_H_ + +#include "hrc/ghc/float.h" + +typedef enum { + ISK_SystemEventThreadEventManager, ISK_SystemEventThreadIOManager, ISK_GHCConcWindowsPendingDelays, + ISK_GHCConcWindowsIOManagerThread, ISK_GHCConcWindowsProdding, ISK_GHCConcSignalSignalHandler, ISK_Num +} IhrStoreKey; + +void ihrSetNCapabilities(uint32 n); +void ihrGlobalInit(); +void* getOrSetKey(IhrStoreKey k, void* p); +void* getOrSetSystemEventThreadEventManagerStore(void* p); +void* getOrSetSystemEventThreadIOManagerThreadStore(void* p); +void* getOrSetGHCConcWindowsPendingDelaysStore(void* p); +void* getOrSetGHCConcWindowsIOManagerThreadStore(void* p); +void* getOrSetGHCConcWindowsProddingStore(void* p); +void* getOrSetGHCConcSignalSignalHandlerStore(void* p); +void sysErrorBelch(char* s); +void blockUserSignals(); +void unblockUserSignals(); +void stopTimer(); +void startTimer(); +void stackOverflow(); +// int lockFile(int fd, uint64 dev, uint64 ino, int for_writing); +// int unlockFile(int fd); +uint64 getMonotonicNSec(); + +#endif diff --git a/runtime/include/hrc/ghc/TTY.h b/runtime/include/hrc/ghc/TTY.h new file mode 100644 index 0000000..c9f6221 --- /dev/null +++ b/runtime/include/hrc/ghc/TTY.h @@ -0,0 +1,20 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + 
+#ifndef _GHC_TTY_H_ +#define _GHC_TTY_H_ + +#include "hrc/ghc/float.h" + +#ifdef HAVE_TERMIOS_H +#include +#endif +#ifdef HAVE_SIGNAL_H +#include +#endif + +void* __hscore_get_saved_termios(I_ fd0); +void __hscore_set_saved_termios(I_ fd0, void* ts); +void resetTerminalSettings (void); + +#endif diff --git a/runtime/include/hrc/ghc/float.h b/runtime/include/hrc/ghc/float.h new file mode 100644 index 0000000..8bd2d43 --- /dev/null +++ b/runtime/include/hrc/ghc/float.h @@ -0,0 +1,64 @@ +/* ----------------------------------------------------------------------------- + * + * (c) Lennart Augustsson + * (c) The GHC Team, 1998-2000 + * + * Miscellaneous support for floating-point primitives + * + * ---------------------------------------------------------------------------*/ + +#ifndef _GHC_FLOAT_H_ +#define _GHC_FLOAT_H_ + +#include +#include +#include +#include "hrc/pil.h" + +#define IEEE_FLOATING_POINT 1 + +/* + * Encoding and decoding Doubles. Code based on the HBC code + * (lib/fltcode.c). + */ + +#if IEEE_FLOATING_POINT +#define MY_DMINEXP ((DBL_MIN_EXP) - (DBL_MANT_DIG) - 1) +/* DMINEXP is defined in values.h on Linux (for example) */ +#define DHIGHBIT 0x00100000 +#define DMSBIT 0x80000000 + +#define MY_FMINEXP ((FLT_MIN_EXP) - (FLT_MANT_DIG) - 1) +#define FHIGHBIT 0x00800000 +#define FMSBIT 0x80000000 +#endif + +#if defined(WORDS_BIGENDIAN) || defined(FLOAT_WORDS_BIGENDIAN) +#define L 1 +#define H 0 +#else +#define L 0 +#define H 1 +#endif + +#define __abs(a) (( (a) >= 0 ) ? (a) : (-(a))) + +typedef void* StgStablePtr; +#define ASSERT assert +//typedef uint32 nat; +typedef float StgFloat; +typedef double StgDouble; +typedef sintp I_; +typedef uintp W_; +#define SIZEOF_FLOAT 4 +#define SIZEOF_DOUBLE 8 + +StgDouble __2Int_encodeDouble (I_ j_high, I_ j_low, I_ e); +StgDouble __word_encodeDouble (W_ j, I_ e); +StgDouble __int_encodeDouble (I_ j, I_ e); +StgFloat __int_encodeFloat (I_ j, I_ e); +StgFloat __word_encodeFloat (W_ j, I_ e); +void __decodeDouble_2Int (I_ *man_sign, W_ *man_high, W_ *man_low, I_ *exp, StgDouble dbl); +void __decodeFloat_Int (I_ *man, I_ *exp, StgFloat flt); + +#endif diff --git a/runtime/include/hrc/ghc/thread.h b/runtime/include/hrc/ghc/thread.h new file mode 100644 index 0000000..8e7cbd7 --- /dev/null +++ b/runtime/include/hrc/ghc/thread.h @@ -0,0 +1,19 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _GHC_THREAD_H_ +#define _GHC_THREAD_H_ + +#include "hrc/ghc/float.h" + +I_ rts_getThreadId (W_ tid); +I_ cmp_thread (W_ tidA, W_ tidB); +I_ rtsSupportsBoundThreads(); + +#if defined (__MINGW32__) || defined(WIN32) +HANDLE getIOManagerEvent(); +W_ readIOManagerEvent(); +void sendIOManagerEvent(W_ e); +#endif + +#endif diff --git a/runtime/include/hrc/pil.h b/runtime/include/hrc/pil.h new file mode 100644 index 0000000..98b1c2b --- /dev/null +++ b/runtime/include/hrc/pil.h @@ -0,0 +1,251 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* This file defines the Pil abstractions to make C and Pillar look as close + * as possible. 
+ */ + +#ifndef _PIL_H_ +#define _PIL_H_ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include +#include +#include + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +# include "prt/prt.h" +#endif /* P_USE_PILLAR */ + +/**********************************************************************/ +/* Define definite sized integral types */ + +#ifdef WIN32 /* MS VC++ compiler */ + +typedef __int64 sint64; +typedef __int32 sint32; +typedef __int8 sint8; +typedef __int16 sint16; +typedef unsigned __int64 uint64; +typedef unsigned __int32 uint32; +typedef unsigned __int8 uint8; +typedef unsigned __int16 uint16; +typedef float float32; +typedef double float64; + +#define SINT32_MAX INT_MAX +#define SINT32_MIN INT_MIN +#define UINT32_MAX ULONG_MAX + +#define SINT64_MAX _I64_MAX +#define SINT64_MIN _I64_MIN +#define UINT64_MAX _UI64_MAX + +#define INFINITY32 ((float)(1.0/0.0)) +#define NAN32 ((float)0.0/0.0) +#define INFINITY64 ((double)(1.0/0.0)) +#define NAN64 ((double)0.0/0.0) + +#define SINT64_C(c) c ## L +#define UINT64_C(c) c ## UL + +#else /* !WIN32 */ +#ifdef __INTEL_COMPILER /* Intel C Compiler */ + +typedef long long sint64; +typedef unsigned long long uint64; +typedef int sint32; +typedef unsigned uint32; +typedef short sint16; +typedef unsigned short uint16; +typedef char sint8; +typedef unsigned char uint8; +typedef float float32; +typedef double float64; + +#define SINT32_MAX INT_MAX +#define SINT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX + +#define SINT64_MAX _I64_MAX +#define SINT64_MIN _I64_MIN +#define UINT64_MAX _UI64_MAX + +#define INFINITY32 ((float)(1.0/0.0)) +#define NAN32 ((float)0.0/0.0) +#define INFINITY64 ((double)(1.0/0.0)) +#define NAN64 ((double)0.0/0.0) + +#else /* !WIN32 && !__INTEL_COMPILER */ +#ifdef __GNUC__ /* gcc */ + +#ifndef P_USE_PILLAR +#ifndef __cdecl +#define __cdecl __attribute__((cdecl)) +#endif /* __cdecl */ +#endif /* P_USE_PILLAR */ + +#include + +typedef long long int sint64; +typedef unsigned long long int uint64; +typedef int sint32; +typedef unsigned uint32; +typedef short sint16; +typedef unsigned short uint16; +typedef char sint8; +typedef unsigned char uint8; +typedef float float32; +typedef double float64; + +#define SINT32_MAX INT32_MAX +#define SINT32_MIN INT32_MIN + +#define SINT64_MAX INT64_MAX +#define SINT64_MIN INT64_MIN + +#define INFINITY32 ((float)(1.0/0.0)) +#define NAN32 ((float)0.0/0.0) +#define INFINITY64 ((double)(1.0/0.0)) +#define NAN64 ((double)0.0/0.0) + +#ifdef INTEL64 +#define SINT64_C(c) c ## L +#define UINT64_C(c) c ## UL +#else +#define SINT64_C INT64_C +#endif + +#else + +#error Unknown compiler + +#endif /* !WIN32 && !__INTEL_COMPILER && !__GNUC__ */ +#endif /* !WIN32 && !__INTEL_COMPILER */ +#endif /* !WIN32 */ + +#if P_WORD_SIZE == 4 + +typedef sint32 sintp; +typedef uint32 uintp; +#define SINTP_MAX SINT32_MAX +#define SINTP_MIN SINT32_MIN +#define UINTP_MAX UINT32_MAX + +#elif P_WORD_SIZE == 8 + +typedef sint64 sintp; +typedef uint64 uintp; +#define SINTP_MAX SINT64_MAX +#define SINTP_MIN SINT64_MIN +#define UINTP_MAX UINT64_MAX + +#else /* P_WORD_SIZE notin {4, 8} */ + +#error P_WORD_SIZE not defined or not a supported size + +#endif /* P_WORD_SIZE notin {4, 8} */ + +#ifdef __GNUC__ +#define pil_aligned(a) __attribute__((aligned(a))) +#else +#define pil_aligned(a) __declspec(align(a)) +#endif + +typedef uintp bool; + +static void pilCheck() +{ + if (sizeof(sint32)!=4) { fprintf(stderr, "pil: bad sint32!\n"); exit(-1); } + if (sizeof(uint32)!=4) { 
fprintf(stderr, "pil: bad uint32!\n"); exit(-1); } + if (sizeof(sint64)!=8) { fprintf(stderr, "pil: bad sint64!\n"); exit(-1); } + if (sizeof(uint64)!=8) { fprintf(stderr, "pil: bad uint64!\n"); exit(-1); } + if (sizeof(sintp)!=P_WORD_SIZE) + { fprintf(stderr, "pil: bad sintp!\n"); exit(-1); } + if (sizeof(uintp)!=P_WORD_SIZE) + { fprintf(stderr, "pil: bad uint!\n"); exit(-1); } + if (sizeof(void*)!=P_WORD_SIZE) + { fprintf(stderr, "pil: bad word size!\n"); exit(-1); } + if (sizeof(float32)!=4) { fprintf(stderr, "pil: bad float32!\n"); exit(-1); } + if (sizeof(float64)!=8) { fprintf(stderr, "pil: bad float64!\n"); exit(-1); } +} + +/**********************************************************************/ +/* Control stuff */ + +#ifdef P_USE_PILLAR + +#ifdef __pillar2c__ +# define PilContinuation0 continuation_type +# define PilContinuation(...) continuation_type< __VA_ARGS__ > +#else +# define PilContinuation0 g4 +# define PilContinuation(...) g4 +#endif +#define pilContinuationLocal(cv) +#define pilContinuationMake(v, cl, cv) (v) = (cv) +#define pilContinuation0(cl, cv) continuation cv(): +#define pilContinuation(cl, cv, ...) continuation cv(__VA_ARGS__): +#define pilCutTo0(c) cut to c +#ifdef __pillar2c__ + #define pilCutToA(c, ...) cut to c with (__VA_ARGS__) + #define pilCutToC(...) also cuts to (__VA_ARGS__) +#else /* !__pillar2c__ */ + #define pilCutToA(c, ...) cut to c(__VA_ARGS__) + #define pilCutToC(...) also cuts to __VA_ARGS__ +#endif /* !__pillar2c__ */ + +#if !PLSR_DISABLE_TAILCALL + #define TAILCALL(e) return tailcall e + #define TAILCALLV(e) tailcall e; return +#else + #define TAILCALL(e) return e + #define TAILCALLV(e) e;return +#endif + +#ifdef USE_PTHREADS +#define YIELDCHECK() (*(unsigned volatile*)(((char *)prtGetTaskHandle()) + (3*P_WORD_SIZE))) +#else // USE_PTHREADS +#ifdef NO_PRSCALL +#define YIELDCHECK() (*((*((short volatile**)(((char *)prtGetTaskHandle()) +(2*P_WORD_SIZE)))) + 0x76)) +#else // NO_PRSCALL +#define YIELDCHECK() (*(short volatile*)(**(char volatile* volatile* volatile*)((char *)prtGetTaskHandle()+(2*P_WORD_SIZE))+0x76)) +#endif // NO_PRSCALL +#endif // USE_PTHREADS + +#define pilYieldDec uintp spin=0 +#define pilYield() \ + do { \ + if (++spin&15==0 && YIELDCHECK()) prtYield(); \ + } while (0) + +#else /* !P_USE_PILLAR */ + +#include + +#define PilContinuation0 jmp_buf* +#define PilContinuation(...) jmp_buf* +#define pilContinuationLocal(cv) jmp_buf cv +#define pilContinuationMake(v, cl, cv) \ + do { (v) = &(cv); if (setjmp((cv))) goto cl; } while (0) +#define pilContinuation0(cl, cv) +#define pilContinuation(cl, cv, ...) +#define pilCutTo0(c) longjmp(*c, 1) +#define pilCutToA(c, ...) longjmp(*c, 1) +#define pilCutToC(...) + +#define TAILCALL(e) return (e) +#define TAILCALLV(e) do { e; return; } while (0) +#define noyield +#define pilYieldDec +#define pilYield() + +#endif /* !P_USE_PILLAR */ + +#endif /* !_PIL_H */ diff --git a/runtime/include/hrc/plsr-ap-integer.h b/runtime/include/hrc/plsr-ap-integer.h new file mode 100644 index 0000000..7bcd5b9 --- /dev/null +++ b/runtime/include/hrc/plsr-ap-integer.h @@ -0,0 +1,20 @@ + /* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifdef PLSR_GMP_USE_DEFAULT +#ifdef __pillar2c__ +#define PLSR_GMP_USE_GALLOCATE +#else /* ! __pillar2c__ */ +#define PLSR_GMP_USE_PINNING +#endif /* ! 
__pillar2c__ */ +#endif /* !PLSR_GMP_USE_DEFAULT */ + +#ifdef PLSR_NO_GMP_INTEGERS +#include "hrc/plsr-iflc-integer.h" +#else /* !PLSR_NO_GMP_INTEGERS */ +#ifdef PLSR_GMP_USE_GALLOCATE +#include "hrc/plsr-gmp-integer-gallocate.h" +#else /* !PLSR_GMP_USE_GALLOCATE */ +#include "hrc/plsr-gmp-integer.h" +#endif /* !PLSR_GMP_USE_GALLOCATE */ +#endif /* !PLSR_NO_GMP_INTEGERS */ diff --git a/runtime/include/hrc/plsr-ap-rational.h b/runtime/include/hrc/plsr-ap-rational.h new file mode 100755 index 0000000..bee6740 --- /dev/null +++ b/runtime/include/hrc/plsr-ap-rational.h @@ -0,0 +1,500 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Arbitrary precision numbers */ + +#ifndef _PLSR_AP_RATIONAL_H_ +#define _PLSR_AP_RATIONAL_H_ + +/********************************************************************** + * Arbitrary precision rationals + */ + +/* Represents the rational x such that c0+c1*x=0 + * Invariants: gcd(c0,c1)=1 and c1>0 + */ + +typedef struct PlsrAPRatS_ { + PlsrVTable vtable; + PlsrInteger c0; + PlsrInteger c1; +} PlsrAPRatS; + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrAPRat; +#else /* !P_USE_PILLAR */ +typedef PlsrAPRatS* PlsrAPRat; +#endif /* !P_USE_PILLAR */ + +#define pLsrAPRatRep(l) ((PlsrAPRatS*)(l)) +#define pLsrAPRatC0(l) pLsrAPRatRep(l)->c0 +#define pLsrAPRatC1(l) pLsrAPRatRep(l)->c1 + + +#define pLsrAPRatPadding \ + (sizeof(PlsrAPRatS) - sizeof(PlsrVTable) - sizeof(PlsrInteger) - sizeof(PlsrInteger)) +pLsrVTableStatic(pLsrAPRatVTable_, VNoneTag, "*rational*", pLsrAPRatPadding); +#define pLsrAPRatVTable &pLsrAPRatVTable_ + +/* Note: the parameter names are significant. Using c0 and c1 breaks, + * since apparently the gnu and Intel C compilers confuse the parameter + * with the field name. Uggh. + */ +#define pLsrAPRatStaticUnboxedDef(uvar) \ + static PlsrAPRatS uvar = { .vtable = pLsrAPRatVTable, } + +#define pLsrAPRatStaticInitFromIntegers(var, c00, c10) \ + do { \ + if (pLsrIntegerIsRef) { \ + pLsrWriteBarrierRefBase(var, pLsrAPRatC0(var), c00); \ + pLsrWriteBarrierRefBase(var, pLsrAPRatC1(var), c10); \ + } else { \ + pLsrAPRatC0(var) = c00; \ + pLsrAPRatC1(var) = c10; \ + } \ + } while (0) + +#define pLsrAPRatStaticInit(var, c00, c10) \ + pLsrAPRatStaticInitFromIntegers(var, pLsrIntegerFromCString(c00), pLsrIntegerFromCString(c10)) + +#define pLsrAPRatStaticRef(uvar) \ + ((PlsrAPRat) &uvar) + +/* Constants */ + +static PlsrAPRatS pLsrAPRatZero_ = { .vtable = pLsrAPRatVTable, + .c0 = NULL, + .c1 = NULL }; +#define pLsrAPRatZero ((PlsrAPRat) &pLsrAPRatZero_) + +static PlsrAPRatS pLsrAPRatOne_ = { .vtable = pLsrAPRatVTable, + .c0 = NULL, + .c1 = NULL }; +#define pLsrAPRatOne ((PlsrAPRat) &pLsrAPRatOne_) + +static PlsrAPRatS pLsrAPRatUInt32Max_ = { .vtable = pLsrAPRatVTable, + .c0 = NULL, + .c1 = NULL }; +#define pLsrAPRatUInt32Max ((PlsrAPRat) &pLsrAPRatUInt32Max_) + +/* Constructors */ + +/* Assumes that c0 and c1 are already in canonical form */ +static PlsrAPRat pLsrAPRatMk(PlsrInteger c0, PlsrInteger c1) { + uintp size = sizeof(PlsrAPRatS); + PlsrAPRat a; + noyield { + pLsrAlloc(PlsrAPRat, a, pLsrAPRatVTable, size); + if (pLsrIntegerIsRef) { + pLsrWriteBarrierRefOptBase(a, pLsrAPRatC0(a), c0); + pLsrWriteBarrierRefOptBase(a, pLsrAPRatC1(a), c1); + } else { + pLsrAPRatC0(a) = c0; + pLsrAPRatC1(a) = c1; + } + } + return a; +} + +/* Create a rational number r from two integers c0, c1 such + * that r = -c0/c1, where gcd(c0, c1) may not be 1 */ +static PlsrAPRat pLsrAPRatFromIntegersNorm(PlsrInteger num, PlsrInteger den) +{ + PlsrInteger g = 
pLsrIntegerGcd(num, den); + PlsrBoolean b; + pLsrIntegerLT(b, den, pLsrIntegerZero); + if (b) { + pLsrIntegerNegate(num, num); + pLsrIntegerNegate(den, den); + } + pLsrIntegerDivT(num, num, g); + pLsrIntegerDivT(den, den, g); + return pLsrAPRatMk(num, den); +} + +static PlsrAPRat pLsrAPRatFromSIntps(sintp a, sintp b) { + PlsrInteger i1; + pLsrIntegerFromSIntp(i1, -a); + PlsrInteger i2; + pLsrIntegerFromSIntp(i2, b); + return pLsrAPRatFromIntegersNorm(i1, i2); +} + + +/* Basic queries and destructors */ + +static PlsrBoolean pLsrAPRatIsIntegral(PlsrAPRat r) +{ + PlsrBoolean b; + pLsrIntegerEQ(b, pLsrAPRatC1(r), pLsrIntegerOne); + return b; +} + +static PlsrInteger pLsrAPRatNumerator(PlsrAPRat r) { + PlsrInteger i = pLsrAPRatC0(r); + pLsrIntegerNegate(i, i); + return i; +} + +static PlsrInteger pLsrAPRatDenominator(PlsrAPRat r) { + return pLsrAPRatC1(r); +} + +/* Conversions */ + +static PlsrAPRat pLsrAPRatFromIntegers(PlsrInteger a, PlsrInteger b) { + PlsrInteger n; + pLsrIntegerNegate(n, a); + return pLsrAPRatFromIntegersNorm(n, b); +} + +static void pLsrIntegersFromAPRat(PlsrInteger* numO, PlsrInteger* denO, PlsrAPRat r) { + pLsrIntegerNegate(*numO, pLsrAPRatC0(r)); + *denO = pLsrAPRatC1(r); + return; +} + +static PlsrAPRat pLsrAPRatFromInteger(PlsrInteger i) +{ + return pLsrAPRatFromIntegers(i, pLsrIntegerOne); +} + +static PlsrInteger pLsrIntegerFromAPRat(PlsrAPRat r) +{ + assert(pLsrAPRatIsIntegral(r)); + PlsrInteger n; + pLsrIntegerNegate(n, pLsrAPRatC0(r)); + return n; +} + +static PlsrAPRat pLsrAPRatFromSInt32(sint32 i) +{ + PlsrInteger ai; + pLsrIntegerFromSInt32(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static sint32 pLsrSInt32FromAPRat(PlsrAPRat r) +{ + sint32 i; + pLsrSInt32FromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromSInt64(sint64 i) +{ + PlsrInteger ai; + pLsrIntegerFromSInt64(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static sint32 pLsrSInt64FromAPRat(PlsrAPRat r) +{ + sint64 i; + pLsrSInt64FromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromUInt32(uint32 i) +{ + PlsrInteger ai; + pLsrIntegerFromUInt32(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static uint32 pLsrUInt32FromAPRat(PlsrAPRat r) +{ + uint32 i; + pLsrUInt32FromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromSIntp(sintp i) +{ + PlsrInteger ai; + pLsrIntegerFromSIntp(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static sintp pLsrSIntpFromAPRat(PlsrAPRat r) +{ + sintp i; + pLsrSIntpFromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromUIntp(uintp i) +{ + PlsrInteger ai; + pLsrIntegerFromUIntp(ai, i); + return pLsrAPRatFromInteger(ai); +} + +static uintp pLsrUIntpFromAPRat(PlsrAPRat r) +{ + uintp i; + pLsrUIntpFromInteger(i, pLsrIntegerFromAPRat(r)); + return i; +} + +static PlsrAPRat pLsrAPRatFromFloat32(float32 f) { + pLsrRuntimeError("pLsrAPRatFromFloat32 not implemented"); + return 0; +} + +static PlsrAPRat pLsrAPRatFromFloat64(float64 f) { + pLsrRuntimeError("pLsrAPRatFromFloat64 not implemented"); + return 0; +} + +static float32 pLsrFloat32FromAPRat(PlsrAPRat a) { + pLsrRuntimeError("pLsrFloat32FromAPRat not implemented"); + return 0; +} + +static float64 pLsrFloat64FromAPRat(PlsrAPRat a) { + pLsrRuntimeError("pLsrFloat64FromAPRat not implemented"); + return 0; +} + +static PlsrBoolean pLsrAPRatLt(PlsrAPRat a, PlsrAPRat b); +static PlsrBoolean pLsrAPRatGt(PlsrAPRat a, PlsrAPRat b); + +static uintp pLsrAPRatToUInt32Checked(PlsrAPRat r) +{ + 
if (!pLsrAPRatIsIntegral(r) + || pLsrAPRatLt(r, pLsrAPRatZero) + || pLsrAPRatGt(r, pLsrAPRatUInt32Max)) + return UINTP_MAX; + return pLsrUInt32FromAPRat(r); +}; + +static char* pLsrCStringFromAPRat(PlsrAPRat r) { + PlsrInteger numI = pLsrAPRatNumerator(r); + PlsrInteger denI = pLsrAPRatDenominator(r); + char *num = pLsrCStringFromInteger(numI); + char *w; + PlsrBoolean b; + pLsrIntegerEQ(b, denI, pLsrIntegerOne); + if (b) { + w = (char*)pLsrAllocC(sizeof(char) * (strlen(num) + 1)); + sprintf(w, "%s", num); + } else { + char *den = pLsrCStringFromInteger(denI); + w = (char*)pLsrAllocC(sizeof(char) * (strlen(num) + strlen(den) + 1 + 1)); + sprintf(w, "%s/%s", num, den); + pLsrFreeC(den); + } + pLsrFreeC(num); + + return w; +} + +static PlsrAPRat pLsrAPRatFromCString(char* s) { + char *num; + char *den; + PlsrInteger numI; + PlsrInteger denI; + int i; + for(i=0;(s[i] != '/') && (s[i] != '\0');i++); + num = (char*) pLsrAllocC(sizeof(char) * (i + 1)); + strncpy(num, s, i); + numI = pLsrIntegerFromCString(num); + pLsrFreeC(num); + if (s[i] == '\0') { + denI = pLsrIntegerOne; + } else { + den = s + i + 1; + denI = pLsrIntegerFromCString(den); + } + return pLsrAPRatFromIntegers(numI, denI); + } + +/* Arithmetic */ + +static PlsrAPRat pLsrAPRatNeg(PlsrAPRat r) +{ + PlsrInteger n; + pLsrIntegerNegate(n, pLsrAPRatC0(r)); + return pLsrAPRatMk(n, pLsrAPRatC1(r)); + /* Normalisation should not be necessary */ +}; + +static PlsrAPRat pLsrAPRatPlus(PlsrAPRat r1, + PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrInteger p3; + PlsrInteger c0; + PlsrInteger c1; + pLsrIntegerTimes(p1, r1c0, r2c1); + pLsrIntegerTimes(p2, r1c1, r2c0); + pLsrIntegerPlus(c0, p1, p2); + pLsrIntegerTimes(c1, r1c1, r2c1); + return pLsrAPRatFromIntegersNorm(c0, c1); +} + +static PlsrAPRat pLsrAPRatMinus(PlsrAPRat r1, + PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrInteger p3; + PlsrInteger c0; + PlsrInteger c1; + pLsrIntegerTimes(p1, r1c0, r2c1); + pLsrIntegerTimes(p2, r1c1, r2c0); + pLsrIntegerMinus(c0, p1, p2); + pLsrIntegerTimes(c1, r1c1, r2c1); + return pLsrAPRatFromIntegersNorm(c0, c1); +} + +static PlsrAPRat pLsrAPRatMult(PlsrAPRat r1, + PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger c0; + PlsrInteger c1; + pLsrIntegerNegate(r1c0, r1c0); + pLsrIntegerTimes(c0, r1c0, r2c0); + pLsrIntegerTimes(c1, r1c1, r2c1); + return pLsrAPRatFromIntegersNorm(c0, c1); +} + +/* Comparisons */ + +static PlsrBoolean pLsrAPRatEq(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrBoolean b1, b2; + pLsrIntegerEQ(b1, r1c0, r2c0); + if (!b1) return 0; + pLsrIntegerEQ(b2, r1c1, r2c1); + return b2; +} + +static PlsrBoolean pLsrAPRatNe(PlsrAPRat r1, PlsrAPRat r2) +{ + return !pLsrAPRatEq(r1, r2); +} + +static PlsrBoolean pLsrAPRatLt(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; 
+ pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerLT(b, p1, p2); + return b; +} + +static PlsrBoolean pLsrAPRatGt(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; + pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerGT(b, p1, p2); + return b; +} + +static PlsrBoolean pLsrAPRatLe(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; + pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerLE(b, p1, p2); + return b; +} + +static PlsrBoolean pLsrAPRatGe(PlsrAPRat r1, PlsrAPRat r2) +{ + PlsrInteger r1c0 = pLsrAPRatC0(r1); + PlsrInteger r1c1 = pLsrAPRatC1(r1); + PlsrInteger r2c0 = pLsrAPRatC0(r2); + PlsrInteger r2c1 = pLsrAPRatC1(r2); + PlsrInteger p1; + PlsrInteger p2; + PlsrBoolean b; + pLsrIntegerTimes(p1, r1c1, r2c0); + pLsrIntegerTimes(p2, r1c0, r2c1); + pLsrIntegerGE(b, p1, p2); + return b; +} + +static sint32 pLsrAPRatCheckRangeSInt32(PlsrAPRat a, sint32 upper, sint32 lower) { + if (pLsrAPRatIsIntegral(a)) { + PlsrInteger im = pLsrAPRatNumerator(a); + PlsrInteger i; + pLsrIntegerNegate(i, im); + return pLsrIntegerCheckRangeSInt32(i, upper, lower); + } + return SINT32_MIN; +} + +#define pLsrAPRatSize (sizeof(PlsrAPRatS)) +#define pLsrAPRatAlignment 4 + +static void pLsrAPRatRegisterVTables() +{ + static PgcIsRef pLsrAPRatRefs[pLsrAPRatSize/P_WORD_SIZE] = { 0, pLsrIntegerIsRef, pLsrIntegerIsRef }; + + assert(pLsrAPRatSize/P_WORD_SIZE == 3); + + pLsrVTableRegister(pLsrAPRatVTable, pLsrAPRatAlignment, pLsrAPRatSize, pLsrAPRatRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); +} + +#define pLsrAPRatGlobalsCount 3 + +static PlsrObjectB pLsrAPRatGlobals[] = + { + (PlsrObjectB) pLsrAPRatZero, + (PlsrObjectB) pLsrAPRatOne, + (PlsrObjectB) pLsrAPRatUInt32Max, + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrAPRatRegisterGlobals() { + assert(pLsrAPRatGlobals[pLsrAPRatGlobalsCount] == NULL); + pLsrGcRegisterGlobals(pLsrAPRatGlobals, pLsrAPRatGlobalsCount); +} + +static void pLsrAPRatInitialize() { + pLsrAPRatStaticInitFromIntegers(pLsrAPRatZero, pLsrIntegerZero, pLsrIntegerOne); + pLsrAPRatStaticInitFromIntegers(pLsrAPRatOne, pLsrIntegerMinusOne, pLsrIntegerOne); + pLsrAPRatStaticInitFromIntegers(pLsrAPRatUInt32Max, pLsrIntegerMinusUInt32Max, pLsrIntegerOne); +} + +#endif /* !_PLSR_AP_RATIONAL_H_ */ diff --git a/runtime/include/hrc/plsr-finalizer.h b/runtime/include/hrc/plsr-finalizer.h new file mode 100644 index 0000000..0cc9c43 --- /dev/null +++ b/runtime/include/hrc/plsr-finalizer.h @@ -0,0 +1,350 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Finalizers */ + +#ifndef _PLSR_FINALIZER_H_ +#define _PLSR_FINALIZER_H_ + +typedef void (*PlsrFinalizerCode)(PlsrRef); + +#ifdef P_USE_PILLAR + +#define pLsrFinalizerInitialSize 500 +typedef struct PlsrFinalizerS {PlsrFinalizerCode f;volatile PlsrRef object;} PlsrFinalizerU, *PlsrFinalizer; + +typedef struct PlsrFinalizerChunkS { + volatile uintp count; + uintp length; + volatile struct PlsrFinalizerChunkS* next; + volatile PlsrFinalizerU entries[]; +} PlsrFinalizerChunkU, *PlsrFinalizerChunk; + +typedef struct PlsrFinalizerGlobalsS { + uintp busy; + uintp shutdown; 
+ uintp done; + uintp signaled; + struct prtMutex *lock; + struct prtCondition *notify; + volatile PlsrFinalizerChunk inbox; + volatile PlsrFinalizerChunk todo; +} PlsrFinalizerGlobalsU, *PlsrFinalizerGlobals; + +static PlsrFinalizerGlobalsU pLsrFinalizerGlobals; + +/* Finalizer addition */ +#pragma pillar_managed(off) + +/* Notify the finalizer thread of activity +* (shutdown or new work). +*/ +static void pLsrFinalizerNotify() +{ + if (!pLsrFinalizerGlobals.signaled) { + fflush(stdout); + prtMutexLock(pLsrFinalizerGlobals.lock); + fflush(stdout); + pLsrFinalizerGlobals.signaled = 1; + prtConditionSignal(pLsrFinalizerGlobals.notify); + fflush(stdout); + prtMutexUnlock(pLsrFinalizerGlobals.lock); + fflush(stdout); + } +} + +/* Wait for notification of new work */ +static void pLsrFinalizerWaitForNotification() +{ + prtMutexLock(pLsrFinalizerGlobals.lock); + if (!pLsrFinalizerGlobals.signaled) { + prtConditionWait(pLsrFinalizerGlobals.notify, pLsrFinalizerGlobals.lock); + } + pLsrFinalizerGlobals.signaled = 0; + prtMutexUnlock(pLsrFinalizerGlobals.lock); +} + +static PlsrFinalizerChunk pLsrFinalizerTakeInbox() +{ + return (PlsrFinalizerChunk) pLsrSynchAtomicTakeUIntp((volatile uintp*) &pLsrFinalizerGlobals.inbox, 1); +} + +static void pLsrFinalizerPutInbox(PlsrFinalizerChunk inbox) +{ + pLsrSynchAtomicPutUIntp((volatile uintp*) &pLsrFinalizerGlobals.inbox, (uintp) inbox); + pLsrFinalizerNotify(); +} + +static PlsrFinalizerChunk pLsrFinalizerNewChunkUnmanaged(uintp length) +{ + PlsrFinalizerChunk chunk = + (PlsrFinalizerChunk) pLsrAllocCUnmanaged(sizeof(PlsrFinalizerChunkU) + length*sizeof(PlsrFinalizerU)); + chunk->count=0; + chunk->length=length; + chunk->next=NULL; + return chunk; +} + +static void pLsrFinalizerAdd(PlsrFinalizerCode f, volatile PlsrRef object) +{ +#if (defined(PLSR_FINALIZER_TRACE) && (PLSR_FINALIZER_TRACE>0)) + printf("Adding finalizer object %p\n", object); + fflush(stdout); +#endif +#ifdef PLSR_SINGLE_THREADED + printf("Can't use finalizers with the single threaded runtime\n"); + fflush(stdout); + assert(0); + exit(-1); +#else + PlsrFinalizerChunk inbox = pLsrFinalizerTakeInbox(); + if (inbox->count >= inbox->length) { + uintp length = inbox->length; + PlsrFinalizerChunk next = inbox; + inbox = pLsrFinalizerNewChunkUnmanaged(length * 2); + inbox->next = next; + } + uintp count=inbox->count; + inbox->entries[count].f = f; + inbox->entries[count].object = object; + inbox->count = count + 1; + pLsrFinalizerPutInbox(inbox); +#endif +} + +#pragma pillar_managed(on) + +/* Root set enumeration */ +#pragma pillar_managed(off) +/* These run on a pillar task, can yield */ + +static void pLsrFinalizerEnumerateChunk(PrtRseCallback rse, void* env, PlsrFinalizerChunk chunk) +{ + while (chunk != NULL) { + for(int i=0; icount;i++) { +#if (defined(PLSR_FINALIZER_TRACE) && (PLSR_FINALIZER_TRACE>0)) + printf("Enumerating finalizer object %p\n", chunk->entries[i].object); + fflush(stdout); +#endif + + rse(env, (void**) &(chunk->entries[i].object), PrtGcTagDefault, 0); + } + chunk=(PlsrFinalizerChunk) chunk->next; + } +} + +static void pLsrFinalizerReportRoots(PrtRseCallback rse, void* env) +{ +#ifdef PLSR_FINALIZER_TRACE + printf("Finalizer enumeration starting\n"); + fflush(stdout); +#endif + pLsrFinalizerEnumerateChunk(rse, env, pLsrFinalizerGlobals.inbox); + pLsrFinalizerEnumerateChunk(rse, env, pLsrFinalizerGlobals.todo); /* Finalizer thread is paused */ +#ifdef PLSR_FINALIZER_TRACE + printf("Finalizer enumeration finished\n"); + fflush(stdout); +#endif +} +#pragma pillar_managed(on) 
+ +static PlsrFinalizerChunk pLsrFinalizerNewChunk(uintp length) { + PlsrFinalizerChunk chunk = + (PlsrFinalizerChunk) pLsrAllocC(sizeof(PlsrFinalizerChunkU) + length*sizeof(PlsrFinalizerU)); + chunk->count=0; + chunk->length=length; + chunk->next=NULL; + return chunk; +} + +static void pLsrFinalizerDeleteChunk(PlsrFinalizerChunk chunk) { + pLsrFreeC(chunk); +} + + +/* We walk through the todo set, removing an element one + * at a time. All but the last (removed) element, remain + * in the set to be enumerated by the root set enumerator. + * The last (removed) element, is on the stack of this managed thread, + * and hence will be enumerated by pillar + */ +static void pLsrFinalizerFinalizeTodo() +{ +#ifdef PLSR_FINALIZER_TRACE + int processed = 0; + printf("Finalizer finalize loop starting\n"); + fflush(stdout); +#endif + pLsrFinalizerGlobals.busy = 1; + /* Atomically */ + noyield { + pLsrFinalizerGlobals.todo = pLsrFinalizerGlobals.inbox; + int length = + (pLsrFinalizerInitialSize < pLsrFinalizerGlobals.inbox->length/2) + ? pLsrFinalizerGlobals.inbox->length/2 + : pLsrFinalizerInitialSize; + pLsrFinalizerGlobals.inbox = pLsrFinalizerNewChunk(length); + } + + /* We keep a copy here. GC may move the contents + * of the fields, but nothing else modifies todo itself */ + PlsrFinalizerChunk todo = pLsrFinalizerGlobals.todo; + while (todo != NULL) { + while (todo->count > 0) { + PlsrFinalizerCode f; + PlsrRef object; + noyield { + int i = todo->count-1; + f = todo->entries[i].f; + object = todo->entries[i].object; + todo->count = todo->count - 1; + } + +#if (defined(PLSR_FINALIZER_TRACE) && (PLSR_FINALIZER_TRACE>0)) + printf("Finalizer finalizing object %p\n", object); + fflush(stdout); +#endif +#ifdef PLSR_FINALIZER_TRACE + processed++; +#endif + f(object); + } + PlsrFinalizerChunk next = (PlsrFinalizerChunk) todo->next; + pLsrFinalizerGlobals.todo = next; + pLsrFinalizerDeleteChunk(todo); + todo=next; + } + pLsrFinalizerGlobals.busy = 0; +#ifdef PLSR_FINALIZER_TRACE + printf("Finalizer finalize loop finished finalizing %d objects\n", processed); + fflush(stdout); +#endif + +} + +static void pLsrFinalizerRun() +{ + pilYieldDec; + +#ifdef PLSR_FINALIZER_TRACE + printf("Starting finalizer thread\n"); + fflush(stdout); +#endif + + while (!pLsrFinalizerGlobals.shutdown) { + assert(pLsrFinalizerGlobals.todo == NULL); + if (pLsrFinalizerGlobals.inbox->count > 0) { + pLsrFinalizerFinalizeTodo(); + } else { + pLsrFinalizerWaitForNotification(); + } + pilYield(); + } +#ifdef PLSR_FINALIZER_TRACE + printf("Starting finalizer thread shutdown sequence\n"); + fflush(stdout); +#endif + +#ifdef PLSR_FINALIZER_SHUTDOWN_LOOP + pgc_force_gc(); + while (pLsrFinalizerGlobals.inbox->count > 0) { +#ifdef PLSR_FINALIZER_TRACE + printf("Iterating finalizer thread shutdown sequence\n"); + fflush(stdout); +#endif + pLsrFinalizerFinalizeTodo(); + pgc_force_gc(); + pilYield(); + } +#endif + +#ifdef PLSR_FINALIZER_TRACE + printf("Completed finalizer thread shutdown sequence\n"); + fflush(stdout); +#endif + + pLsrFinalizerGlobals.done = 1; +} + +static void pLsrFinalizerStart() +{ + pLsrFinalizerGlobals.shutdown = 0; + pLsrFinalizerGlobals.done = 0; + pLsrFinalizerGlobals.busy = 0; + pLsrFinalizerGlobals.lock = prtMutexCreate(NULL); + pLsrFinalizerGlobals.notify = prtConditionInit(NULL); + pLsrFinalizerGlobals.signaled = 0; + pLsrFinalizerGlobals.inbox = pLsrFinalizerNewChunk(pLsrFinalizerInitialSize); + pLsrFinalizerGlobals.todo = NULL; + +#ifndef PLSR_SINGLE_THREADED + pcall pLsrFinalizerRun(); +#endif +} + +/* Shutdown 
the system. If cleanup, then complete finalization + * before exiting, otherwise just exit. + */ +static void pLsrFinalizerShutdown(PlsrBoolean cleanup) +{ +#ifndef PLSR_SINGLE_THREADED + if (cleanup) { + pLsrFinalizerGlobals.shutdown = 1; + pLsrFinalizerNotify(); + pLsrSynchYieldUntilEqualUIntp(&pLsrFinalizerGlobals.done, 1); + } +#endif +} + +static void pLsrFinalizerYieldToFinalizer() +{ +#ifndef PLSR_SINGLE_THREADED + if (pLsrFinalizerGlobals.signaled) { + pLsrSynchYieldUntilEqualUIntp(&pLsrFinalizerGlobals.signaled, 0); + } + if (pLsrFinalizerGlobals.busy) { + pLsrSynchYieldUntilEqualUIntp(&pLsrFinalizerGlobals.busy, 0); + } +#else + if (pLsrFinalizerGlobals.inbox->count > 0) { + pLsrFinalizerFinalizeTodo(); + } +#endif +} + +static void pLsrFinalizerRegisterVTables() +{ +} + +static void pLsrFinalizerRegisterGlobals() +{ +} + +static void pLsrFinalizerCheckAssertions() +{ +} +#else /* !P_USE_PILLAR */ + +static void pLsrFinalizerStart() +{ +} + +static void pLsrFinalizerShutdown(PlsrBoolean cleanup) +{ +} + +static void pLsrFinalizerRegisterVTables() +{ +} + +static void pLsrFinalizerRegisterGlobals() +{ +} + +static void pLsrFinalizerCheckAssertions() +{ +} + +#endif /* P_USE_PILLAR */ + +#endif /* !_PLSR_FINALIZER_H_ */ diff --git a/runtime/include/hrc/plsr-gc.h b/runtime/include/hrc/plsr-gc.h new file mode 100755 index 0000000..5a056ba --- /dev/null +++ b/runtime/include/hrc/plsr-gc.h @@ -0,0 +1,730 @@ + /* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Memory Allocation and Garbage Collection */ + +#ifndef _PLSR_GC_H_ +#define _PLSR_GC_H_ + +#ifdef PLSR_LINUX +#define min(x,y) ((x)<(y)?(x):(y)) +#endif + +/**********************************************************************/ +/*** Allocation instrumentation ***/ + +static uint64 pLsrNumObjectsAllocated = 0; +static uint64 pLsrNumBytesAllocated = 0; +static uint64 pLsrNumPaddingBytesAllocated = 0; + +static uint64 pLsrNumObjectsAllocatedSinceLast = 0; +static uint64 pLsrNumBytesAllocatedSinceLast = 0; + +static uint64 pLsrNumObjectsAllocatedUnmanaged = 0; +static uint64 pLsrNumBytesAllocatedUnmanaged = 0; + +static uint64 pLsrNumObjectsFreedUnmanaged = 0; + +static uint64 pLsrNumObjectsAllocatedUnmanagedSinceLast = 0; +static uint64 pLsrNumBytesAllocatedUnmanagedSinceLast = 0; + +static uint64 pLsrNumObjectsFreedUnmanagedSinceLast = 0; + +#ifdef PLSR_INSTRUMENT_ALLOCATION +#define pLsrAllocInstrument1(vtable, size) \ + do { \ + pLsrNumObjectsAllocated++; \ + pLsrNumBytesAllocated += (size); \ + pLsrNumPaddingBytesAllocated += (vtable)->padding; \ + } while (0) +#define pLsrAllocCInstrument1(size) \ + do { \ + pLsrNumObjectsAllocatedUnmanaged++; \ + pLsrNumBytesAllocatedUnmanaged += (size); \ + } while (0); +#define pLsrFreeCInstrument1() \ + do { \ + pLsrNumObjectsFreedUnmanaged++; \ + pLsrNumObjectsFreedUnmanagedSinceLast++; \ + } while (0); +#else /* !PLSR_INSTRUMENT_ALLOCATION */ +#define pLsrAllocInstrument1(vtable, size) +#define pLsrAllocCInstrument1(size) +#define pLsrFreeCInstrument1() +#endif /* !PLSR_INSTRUMENT_ALLOCATION */ + +#ifdef PLSR_INSTRUMENT_VTB_ALC +#define pLsrAllocInstrument2(vtable, size) \ + do { \ + (vtable)->numObjects++; \ + (vtable)->numBytes += (size); \ + } while (0) +#else /* !PLSR_INSTRUMENT_VTB_ALC */ +#define pLsrAllocInstrument2(vtable, size) +#endif /* !PLSR_INSTRUMENT_VTB_ALC */ + +#ifdef PLSR_INSTRUMENT_GCS +#define pLsrAllocInstrument3(size) \ + do { \ + pLsrNumObjectsAllocatedSinceLast++; \ + pLsrNumBytesAllocatedSinceLast += (size); \ + } while (0) +#define 
pLsrAllocCInstrument2(size) \ + do { \ + pLsrNumObjectsAllocatedUnmanagedSinceLast++; \ + pLsrNumBytesAllocatedUnmanagedSinceLast += (size); \ + } while (0); +#else /* !PLSR_INSTRUMENT_GCS */ +#define pLsrAllocInstrument3(size) +#define pLsrAllocCInstrument2(size) +#endif /* !PLSR_INSTRUMENT_GCS */ + +#ifdef PLSR_FORCE_REGC +uint32 pLsrAllocREGC_signal = 0; +#define pLsrAllocREGC() do {if (pLsrAllocREGC_signal) {pgc_force_gc(); pLsrAllocREGC_signal = 0;}} while (0) +#define pLsrAllocREGCSet() (pLsrAllocREGC_signal = 1) +#else +#define pLsrAllocREGC() +#define pLsrAllocREGCSet() +#endif + + +/**********************************************************************/ +/*** Heap Allocation ***/ + +#define pLsrDefaultAlignment 4 + +#define pLsrAllocStart() pLsrAllocREGC() + +#define pLsrAllocFinish(r, vt, size) \ + do { \ + pLsrAllocInstrument1(vt, size); \ + pLsrAllocInstrument2(vt, size); \ + pLsrAllocInstrument3(size); \ + ((PlsrObjectU*)(r))->vtable = vt; \ + } while(0) + +#define pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocator) \ + do { \ + pLsrAllocStart(); \ + (dest) = (t) pLsrAllocator(size, vtable); \ + pLsrAllocFinish(dest, vtable, size); \ + } while(0) + + +#ifdef P_USE_AGC +/* Wrap the unmanaged call in a managed function, and inline + * the managed function call. This keeps the code size smaller. + * alignment is based on the vtable properties */ +PlsrObjectB pLsrAllocSlowCapacity(uintp size, PlsrVTable vtable) { +#ifdef __pillar2c__ + PlsrObjectB res; + if (!(res = pgc_allocate_or_null(size, vtable))) { + return pgc_allocate(size, vtable); + } + return res; +#else + return pgc_allocate(size, vtable); +#endif +} +PlsrObjectB pLsrAllocSlow(uintp size, PlsrVTable vtable) { + return pgc_allocate(size, vtable); +} + +PlsrObjectB pLsrAllocSlowOrNull(uintp size, PlsrVTable vtable) { + PlsrObjectB res; + if (!(res = pgc_allocate_or_null(size, vtable))) { + pLsrRuntimeError_ ("Allocate or null returned null"); + } + return res; +} + +#define BITS_PER_BYTE 8 +#define NEXT_TO_HIGH_BIT_SET_MASK (1<<((sizeof(uintp) * BITS_PER_BYTE)-2)) + +/* Wrap the unmanaged call in a managed function, and inline + * the managed function call. This keeps the code size smaller. 
+ * alignment is based on the vtable properties */ +PlsrObjectB pLsrAllocSlowPinned(uintp size, PlsrVTable vtable) { + if ((P_USE_AGC == PlsrAKMf)) + return pgc_allocate(size | NEXT_TO_HIGH_BIT_SET_MASK, vtable); + else + return pgc_allocate(size, vtable); +} + +/* Accurate GC, pinned */ +#define pLsrAllocPinned(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlowPinned) + +#define pLsrAllocAlignedPinned(t, dest, vtable, size, alignment) \ + pLsrAllocPinned(t, dest, vtable, size) + +#define pLsrAllocPinnedFinalizable pLsrAllocPinned + +#define pLsrAllocFinalizable(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlow) + +#ifdef P_USE_FAST_ALLOC + +/* Accurate GC, fast path */ +typedef struct { + char *tls_current_free; + char *tls_current_ceiling; + void *chunk; + void *curr_alloc_block; +} GC_Nursery_Info; + +#ifdef __pillar2c__ + +extern unsigned g_tls_offset_bytes; +#define orp_local_to_gc_local(handle) (void *)((char*)(handle) + g_tls_offset_bytes) +#ifdef TLS0 +#define pLsrGetAllocNursery() (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)prtGetTaskHandle())) +#define pLsrGetAllocNurseryTh(task) (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)task)) +#else // TLS0 +#define pLsrGetAllocNursery() (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)((char*)prtGetTaskHandle() + P_WORD_SIZE))) +#define pLsrGetAllocNurseryTh(task) (GC_Nursery_Info *)orp_local_to_gc_local(*((void**)((char*)task + P_WORD_SIZE))) +#endif + +#else // __pillar2c__ +#define pLsrGetAllocNursery() (GC_Nursery_Info *) prtGetTls() +#endif + +/* alignment is power of two */ +#define pLsrAllocAligned(t, dest, vtable, size, alignment) \ + do { \ + uintp pLsrAllocSize = ((size) + 3) & 0xFFffFFfc; \ + uintp pLsrAllocAlignment = (alignment <=4) ? 
0 : (alignment - 1); \ + GC_Nursery_Info *pLsrAllocNursery = pLsrGetAllocNursery(); \ + pLsrAllocStart(); \ + noyield { \ + char * pLsrAllocFrontier = pLsrAllocNursery->tls_current_free; \ + char * pLsrAllocLimit = pLsrAllocNursery->tls_current_ceiling; \ + pLsrAllocFrontier = (char *) ((((uintp) pLsrAllocFrontier) + pLsrAllocAlignment) & (~pLsrAllocAlignment)); \ + if (pLsrAllocFrontier + pLsrAllocSize <= pLsrAllocLimit) { \ + (dest) = (t)pLsrAllocFrontier; \ + pLsrAllocNursery->tls_current_free = pLsrAllocFrontier + pLsrAllocSize; \ + } else { \ + (dest) = (t)pLsrAllocSlowCapacity(pLsrAllocSize, vtable); \ + } \ + } \ + pLsrAllocFinish(dest, vtable, pLsrAllocSize); \ + } while(0) + +#define pLsrAlloc(t, dest, vtable, size) pLsrAllocAligned(t, dest, vtable, size, 1) + +/* Accurate GC, skip local nursery */ +#define pLsrAlloc_(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlow) + +/* alignment is power of two */ +#define pLsrAllocNoGC(t, dest, vtable, size) \ + do { \ + uintp pLsrAllocSize = ((size) + 3) & 0xFFffFFfc; \ + GC_Nursery_Info *pLsrAllocNursery = pLsrGetAllocNursery(); \ + pLsrAllocStart(); \ + char * pLsrAllocFrontier = pLsrAllocNursery->tls_current_free; \ + char * pLsrAllocLimit = pLsrAllocNursery->tls_current_ceiling; \ + if (pLsrAllocFrontier + pLsrAllocSize <= pLsrAllocLimit) { \ + (dest) = (t)pLsrAllocFrontier; \ + pLsrAllocNursery->tls_current_free = pLsrAllocFrontier + pLsrAllocSize; \ + } else { /* could fall back to allocate or null here */ \ + pLsrRuntimeErrorUnmanaged("Insufficient space for guaranteed allocation"); \ + (dest) = (t)(0); \ + } \ + pLsrAllocFinish(dest, vtable, pLsrAllocSize); \ + } while(0) + +#else /* !P_USE_FAST_ALLOC */ + +/* Accurate GC, slow path */ +#define pLsrAllocAligned(t, dest, vtable, size, alignment) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlow) + +#define pLsrAlloc(t, dest, vtable, size) pLsrAllocAligned(t, dest, vtable, size, 1) + +#define pLsrAlloc_(t, dest, vtable, size) pLsrAlloc(t, dest, vtable, size) + +#define pLsrAllocNoGC(t, dest, vtable, size) \ + pLsrAllocInstrumentWrap(t, dest, vtable, size, pLsrAllocSlowOrNull) + +#endif /* P_USE_FAST_ALLOC */ +#else /* !P_USE_AGC */ + +/* Conservative GC*/ +#ifdef P_USE_CGC +#define pLsrAlloc(t, dest, vtable, size) \ + do { \ + pLsrAllocStart(); \ + (dest) = (t) GC_MALLOC(size); \ + if (!(dest)) pLsrRuntimeError_("Out of memory"); \ + pLsrAllocFinish(dest, vtable, size); \ + } while(0) + +#define pLsrAllocPinned pLsrAlloc +#define pLsrAllocAligned(t, dest, vtable, size, algn) pLsrAlloc(t, dest, vtable, size) +#define pLsrAlloc_ pLsrAlloc +#define pLsrAllocPinnedFinalizable pLsrAlloc +#define pLsrAllocFinalizable pLsrAlloc + +#else /* !P_USE_CGC */ + +/* NO GC */ +#define pLsrAlloc(t, dest, vtable, size) \ + do { \ + pLsrAllocStart(); \ + dest = (t) malloc(size); \ + if (!(dest)) pLsrRuntimeError_("Out of memory"); \ + pLsrAllocFinish(dest, vtable, size); \ + } while(0) + +#define pLsrAllocPinned pLsrAlloc +#define pLsrAllocAligned(t, dest, vtable, size, algn) pLsrAlloc(t, dest, vtable, size) +#define pLsrAlloc_ pLsrAlloc +#define pLsrAllocPinnedFinalizable pLsrAlloc +#define pLsrAllocFinalizable pLsrAlloc + +#endif /*P_USE_CGC */ +#endif /*P_USE_AGC */ + +/**********************************************************************/ +/*** C Allocation ***/ + + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +static void* pLsrAllocCUnmanaged(uintp size) +{ + void* res = malloc(size); + 
pLsrAllocCInstrument1(size); + pLsrAllocCInstrument2(size); + if (res) return res; + printf("Out of memory (C unmanaged alloc)"); + exit(-1); + return 0; +} + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif + + +static void* pLsrAllocC(uintp size) +{ + void* res = malloc(size); + pLsrAllocCInstrument1(size); + pLsrAllocCInstrument2(size); + if (res) return res; + pLsrRuntimeError_("Out of memory (C alloc)"); + return 0; +} + +static void* pLsrReAllocC(void * obj, uintp osize, uintp nsize) +{ + void* res = realloc(obj, nsize); + pLsrFreeCInstrument1(); + pLsrAllocCInstrument1(nsize); + pLsrAllocCInstrument2(nsize); + if (res) return res; + pLsrRuntimeError_("Out of memory (C realloc)"); + return 0; +} + +static void pLsrFreeC(void* obj) +{ + pLsrFreeCInstrument1(); + free(obj); +} + +/**********************************************************************/ +/*** Options and Initialisation ***/ + +#ifdef P_USE_AGC +#pragma pillar_managed(off) +static void pLsrRuntimeReportRoots(PrtRseCallback, void*); +static void pLsrPPilerReportRoots(PrtRseCallback, void*); +static void pLsrGcReportRoots(PrtRseCallback rse, void* env) +{ +#ifdef PLSR_INSTRUMENT_GCS + printf("Total allocation: %I64u bytes (%I64u objects) managed, %I64u bytes (%I64u objects) unmanaged, %I64u objects freed\n", + pLsrNumBytesAllocated, pLsrNumObjectsAllocated, + pLsrNumBytesAllocatedUnmanaged, pLsrNumObjectsAllocatedUnmanaged, + pLsrNumObjectsFreedUnmanaged); + printf("Since last: %I64u bytes (%I64u objects) managed, %I64u bytes (%I64u objects) unmanaged, %I64u objects freed\n", + pLsrNumBytesAllocatedSinceLast, pLsrNumObjectsAllocatedSinceLast, + pLsrNumBytesAllocatedUnmanagedSinceLast, pLsrNumObjectsAllocatedUnmanagedSinceLast, + pLsrNumObjectsFreedUnmanagedSinceLast); + pLsrNumBytesAllocatedSinceLast = 0; + pLsrNumObjectsAllocatedSinceLast = 0; + pLsrNumBytesAllocatedUnmanagedSinceLast = 0; + pLsrNumObjectsAllocatedUnmanagedSinceLast = 0; + pLsrNumObjectsFreedUnmanagedSinceLast = 0; +#endif + pLsrRuntimeReportRoots(rse, env); + pLsrPPilerReportRoots(rse, env); + pLsrAllocREGCSet(); +} +#pragma pillar_managed(on) +#endif /* P_USE_AGC */ + +static void pLsrGcOption(const char* name, const char* arg) +{ +#ifdef P_USE_AGC + pgc_next_command_line_argument(name, arg); +#else /* !P_USE_AGC */ + pLsrRuntimeError("GC option not implemented"); +#endif /* !P_USE_AGC */ +} + +static void pLsrGcInit(sintp initHeap, uintp maxHeap) +{ + char buf[100]; +#ifdef P_USE_CGC + if (initHeap<0) initHeap = 50; + initHeap *= 1024 * 1024; + /* GC Bug. 
The GC tends to run out of memory when initHeap is set to maxHeap */ + initHeap -= initHeap / 10; + maxHeap *= 1024 * 1024; + GC_init(); + GC_expand_hp(initHeap); + GC_set_max_heap_size (maxHeap); +#else /* !P_USE_CGC */ +#ifdef P_USE_AGC + if (P_USE_AGC == PlsrAKCgc || P_USE_AGC == PlsrAKTgc) { + /* If only maxHeap set, prefer maxHeap */ + if (initHeap<=0 && maxHeap>0) {initHeap = maxHeap;} + else if (initHeap>0 && maxHeap==0) {maxHeap = initHeap;} + else if (initHeap >= 0 && maxHeap > 0 && initHeap != maxHeap) + { pLsrRuntimeError("GCInit: initHeap <> maxHeap not supported on this GC");} + } + if (initHeap>0) { + sprintf(buf, "-Xms%dm", initHeap); + pgc_next_command_line_argument("-Xms", buf); + } + if (maxHeap>0) { + sprintf(buf, "-Xmx%dm", maxHeap); + pgc_next_command_line_argument("-Xmx", buf); + } + // lockParm = false for MF, true for V4 + pgc_init(pLsrGcReportRoots, P_AGC_LOCK_PARAM); +#endif /* P_USR_AGC */ +#endif /* !P_USE_CGC */ +} + +/**********************************************************************/ +/*** Vtable registration and Enumeration stuff ***/ + +static PlsrVTable pLsrAllVTables = NULL; + +#ifdef P_USE_AGC + +#define pLsrGcRegisterGlobals pgc_register_global_objects +#define pLsrGcRegisterGlobalRefs pgc_register_global_refs + +/* vtable, object size (bytes), offset of the indirection slot (bytes) */ +static void pLsrIndirectionVTableRegister(PlsrVTable vt, uintp size, uintp offset) +{ +#ifdef PLSR_INSTRUMENT_VTB_ALC + vt->next = pLsrAllVTables; + pLsrAllVTables = vt; +#endif + pgc_new_indirection_object((struct VTable*)vt, size, offset); + }; + +static void pLsrVTableRegisterV(PlsrVTable vt, uintp alignment, + uintp fs, PgcIsRef frefs[], + uintp vs, uintp vlo, + PgcIsRef vrefs[], + enum PGC_MUTABILITY mutability, + uintp pinned, + PgcIsRef wrefs[], + void (* __cdecl finalizer)(Managed_Object_Handle)) +{ + /* alignment is a power of two, so just count shifts, offset by two */ + uintp powerOfTwoBaseFour = 0; + uintp adjusted = alignment >> 2; + while ((adjusted >>=1) > 0) { powerOfTwoBaseFour++; } + /* alignment requirement is 2^(2+powerOfTwoBaseFour).*/ + struct AlignmentInfo ai = {.alignArray = 0, .powerOfTwoBaseFour = powerOfTwoBaseFour}; + +#ifdef PLSR_INSTRUMENT_VTB_ALC + vt->next = pLsrAllVTables; + pLsrAllVTables = vt; +#endif + pgc_new_object_format((struct VTable*)vt, + fs, + frefs, + vs, + vlo, + vrefs, + ai, + mutability, + pinned, + wrefs, + finalizer + ); +} + +/* + * vt is the vtable + * alignment is the required object alignment in bytes (must be power of two) + * fs is the fixed size in bytes + * frefs indicates for each word offset starting at zero (i.e. starting with the vtable) is it a ref + * vs is the element size for the variable portion (in bytes) + * If vs=0, then there is no variable portion, and vlo and vref are ignored. + * vlo is the offset from base of object of the length field (if applicable). 
+ * vref indicates the traceability of the first word of the elements in the variable portion + * mutability indicates the mutability of objects allocated with this vtable + * finalizer is an unmanged code pointer to be run on finalization (NULL if none) + */ +static void pLsrVTableRegisterFinalizable(PlsrVTable vt, uintp alignment, + uintp fs, PgcIsRef frefs[], + uintp vs, uintp vlo, PgcIsRef vref, + enum PGC_MUTABILITY mutability, + uintp pinned, + void (* __cdecl finalizer)(Managed_Object_Handle) + ) +{ + PgcIsRef vrefs[2]; + vrefs[0] = vref; + vrefs[1] = 0; + PgcIsRef wrefs[fs/P_WORD_SIZE]; + for(int i = 0;i < fs/P_WORD_SIZE;i++) { + wrefs[i]=0; + } + pLsrVTableRegisterV(vt, alignment, fs, frefs, vs, vlo, vrefs, mutability, pinned, wrefs, finalizer); +} + +#define pLsrVTableRegister(vt, alignment, fcs, frefs, vs, vlo, vref, mutability, pinned) \ + pLsrVTableRegisterFinalizable(vt, alignment, fcs, frefs, vs, vlo, vref, mutability, pinned, NULL) + +#else /* !P_USE_AGC */ + +enum PGC_MUTABILITY { + PGC_ALWAYS_MUTABLE = 0, + PGC_CREATED_MUTABLE = 1, + PGC_ALWAYS_IMMUTABLE = 2 +}; + +#define pLsrGcRegisterGlobals(gs, num) +#define pLsrGcRegisterGlobalRefs(gs, num) +#define pLsrIndirectionVTableRegister(vt, size, offset) +#define pLsrVTableRegister(vt, alignment, fs, frefs, vs, vlo, vref, m, pinned) +#define pLsrVTableRegisterFinalizable(vt, alignment, fs, frefs, vs, vlo, vref, m, pinned, finalizer) +#define pLsrVTableRegisterV(vt, alignment, fs, frefs, vs, vlo, vrefs, m, pinned, wrefs, finalizer) + +#endif /* !P_USE_AGC */ + +/**********************************************************************/ +/*** Barriers ***/ + +#ifdef P_USE_GC_WRITE_BARRIERS +#ifdef P_USE_AGC +#define pLsrWriteBarrierRef(t, s) \ + (pgc_write_ref_slot((struct Object**)&(t), (struct Object*)(s))) +#define pLsrWriteBarrierRefBase(b, t, s) \ + (pgc_write_ref_slot_with_base((struct Object*)(b), \ + (struct Object**)&(t), \ + (struct Object*)(s))) +#ifdef P_ALL_BARRIERS +#define pLsrWriteBarrierRefOpt(t, s) \ + (pgc_write_ref_slot((struct Object**)&(t), (struct Object*)(s))) +#define pLsrWriteBarrierRefOptBase(b, t, s) \ + (pgc_write_ref_slot_with_base((struct Object*)(b), \ + (struct Object**)&(t), \ + (struct Object*)(s))) +#else /* !P_ALL_BARRIERS */ +#define pLsrWriteBarrierRefOpt(t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefOptBase(b, t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#endif /* !P_ALL_BARRIERS */ +#else /* !P_USE_AGC */ +#error "write barriers only work with agc" +#endif /* !P_USE_AGC */ +#else /* !P_USE_GC_WRITE_BARRIERS */ +#define pLsrWriteBarrierRef(t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefBase(b, t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefOpt(t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#define pLsrWriteBarrierRefOptBase(b, t, s) \ + ((*((struct Object**)&(t)))=((struct Object*)(s))) +#endif /* !P_USE_GC_WRITE_BARRIERS */ + +/**********************************************************************/ +/*** Changing VTables ***/ + +#ifdef P_USE_AGC + +#define pLsrObjectChangeVTableMandatory(obj, vtb) \ + pgc_modify_object_vtable(((struct Object*)(obj)), ((struct VTable*)(vtb))) + +/* XXX Extract this out to pgc.h -leaf */ +#define pLsrObjectCmpAndSetVTableMandatory(obj, vtbOld, vtbNew) \ + pLsrSynchCmpAndSetUIntp(((volatile uintp*)(obj)), (uintp) vtbOld, (uintp) vtbNew) + +#else /* !P_USE_AGC */ + +#define 
pLsrObjectChangeVTableMandatory(obj, vtb) (obj)->vtable = (vtb) + +#define pLsrObjectCmpAndSetVTableMandatory(obj, vtbOld, vtbNew) \ + pLsrSynchCmpAndSetUIntp(((struct Object*)(obj)), (uintp) vtbOld, (uintp) vtbNew) + +#endif /* !P_USE_AGC */ + +#ifdef P_DO_VTABLE_CHANGE + +#define pLsrObjectChangeVTable(obj, vtb) pLsrObjectChangeVTableMandatory(obj, vtb) + +#else /* !P_DO_VTABLE_CHANGE */ + +#define pLsrObjectChangeVTable(obj, vtb) + +#endif /* !P_DO_VTABLE_CHANGE */ + + +/**********************************************************************/ +/*** Malloc in the GC heap ***/ + +#ifdef P_USE_AGC + +typedef struct PlsrGCHeapMallocObjS_ { + PlsrVTable vtable; + struct PlsrGCHeapMallocObjS_* prev; + struct PlsrGCHeapMallocObjS_* next; + uintp length; + char bytes[]; +} PlsrGCHeapMallocObjS; + +#define pLsrGCHeapMallocObjPadding \ + (sizeof(PlsrGCHeapMallocObjS) - sizeof(PlsrVTable) - 2*sizeof(struct PlsrGCHeapMallocObjS_*) - sizeof(uintp)) +pLsrVTableStatic(pLsrGCHeapMallocObjVTable_, VNoneTag, "*heap malloc*", pLsrGCHeapMallocObjPadding); +#define pLsrGCHeapMallocObjVTable (&pLsrGCHeapMallocObjVTable_) + +#define pLsrGCHeapMallocObjSize (sizeof(PlsrGCHeapMallocObjS)) + +static PlsrGCHeapMallocObjS pLsrGCHeapMallocObjectList_ = + {.vtable = pLsrGCHeapMallocObjVTable, + .prev = NULL, + .next = NULL, + .length = 0}; +static PlsrGCHeapMallocObjS* pLsrGCHeapMallocObjList = &pLsrGCHeapMallocObjectList_; + +static void pLsrGCHeapMallocObjListInsert(PlsrGCHeapMallocObjS* list, PlsrGCHeapMallocObjS* node) +{ + node->prev = list->prev; + node->prev->next = node; + node->next = list; + list->prev = node; +} + +static void pLsrGCHeapMallocObjListRemove(PlsrGCHeapMallocObjS* node) +{ + node->next->prev = node->prev; + node->prev->next = node->next; + node->prev = NULL; + node->next = NULL; +} + +#define pLsrGCHeapPtrToObject(ptr) ((PlsrGCHeapMallocObjS *) (((char*) ptr) - pLsrGCHeapMallocObjSize)) + +static void* pLsrGCHeapMalloc(uintp size) +{ + PlsrGCHeapMallocObjS *node; + assert(pLsrGCHeapMallocObjSize == (uintp) &((PlsrGCHeapMallocObjS *)0)->bytes); + pLsrAllocPinned(PlsrGCHeapMallocObjS*, node, pLsrGCHeapMallocObjVTable, pLsrGCHeapMallocObjSize + size); + PlsrGCHeapMallocObjS *list = (PlsrGCHeapMallocObjS *) pLsrSynchAtomicTakeUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, 1); + pLsrGCHeapMallocObjListInsert(list, node); + pLsrSynchAtomicPutUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, (uintp) list); + node->length=size; + return &node->bytes; +} + +static void pLsrGCHeapFree(void *ptr) +{ + + PlsrGCHeapMallocObjS *node = pLsrGCHeapPtrToObject(ptr); + PlsrGCHeapMallocObjS *list = (PlsrGCHeapMallocObjS *) pLsrSynchAtomicTakeUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, 1); + pLsrGCHeapMallocObjListRemove(node); + pLsrSynchAtomicPutUIntp((volatile uintp*) &pLsrGCHeapMallocObjList, (uintp) list); + return; +} + +static void* pLsrGCHeapReAlloc(void * ptr, uintp osize, uintp nsize) +{ + PlsrGCHeapMallocObjS *old = pLsrGCHeapPtrToObject(ptr); + assert(old->length >= osize); + if (old->length >= nsize) { + return ptr; + } else { + void *newPtr = pLsrGCHeapMalloc(nsize); + uintp csize = min(nsize, osize); + memcpy(newPtr, ptr, csize); + pLsrGCHeapFree(ptr); + return newPtr; + } +} + +static void pLsrGCHeapMallocRegisterVTables() +{ + static PgcIsRef refs[pLsrGCHeapMallocObjSize/P_WORD_SIZE] = { 0, 1, 1, 0}; + uintp pinned = 1; + assert(pLsrGCHeapMallocObjSize/P_WORD_SIZE == 4); + assert((uintp)&(((PlsrGCHeapMallocObjS*)(0))->length) == 3*P_WORD_SIZE); + 
pLsrVTableRegister(pLsrGCHeapMallocObjVTable, pLsrDefaultAlignment, pLsrGCHeapMallocObjSize, + refs, 1, 3*P_WORD_SIZE, 0, PGC_ALWAYS_MUTABLE, pinned); +} + +#define pLsrGCHeapMallocGlobalsCount 1 + +static PlsrObjectB pLsrGCHeapMallocGlobals[] = + { + (PlsrObjectB) (&pLsrGCHeapMallocObjectList_), + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrGCHeapMallocRegisterGlobals() +{ + assert(pLsrGCHeapMallocGlobals[pLsrGCHeapMallocGlobalsCount] == NULL); + pLsrGcRegisterGlobals (pLsrGCHeapMallocGlobals, pLsrGCHeapMallocGlobalsCount); +}; + + +static void pLsrGCHeapMallocInitialize() +{ + pLsrGCHeapMallocObjList->prev=pLsrGCHeapMallocObjList; + pLsrGCHeapMallocObjList->next=pLsrGCHeapMallocObjList; +} + +#else /* !P_USE_AGC */ + +#define pLsrGCHeapMalloc pLsrAllocC +#define pLsrGCHeapReAlloc pLsrReAllocC +#define pLsrGCHeapFree pLsrFreeC + +static void pLsrGCHeapMallocRegisterVTables() {} +static void pLsrGCHeapMallocRegisterGlobals() {} +static void pLsrGCHeapMallocInitialize() {} + +#endif /* P_USE_AGC */ + +static void pLsrGCRegisterVTables() +{ + pLsrGCHeapMallocRegisterVTables(); +} +static void pLsrGCRegisterGlobals() +{ + pLsrGCHeapMallocRegisterGlobals(); +} +static void pLsrGCInitialize() +{ + pLsrGCHeapMallocInitialize(); +} + +#endif /* !_PLSR_GC_H_ */ diff --git a/runtime/include/hrc/plsr-gmp-integer-gallocate.h b/runtime/include/hrc/plsr-gmp-integer-gallocate.h new file mode 100644 index 0000000..3ea116a --- /dev/null +++ b/runtime/include/hrc/plsr-gmp-integer-gallocate.h @@ -0,0 +1,1188 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_AP_INTEGER_H_ +#define _PLSR_AP_INTEGER_H_ + +#define PLSR_GMP_REPLACE_MALLOC +#define PLSR_GMP_USE_PCDECL + +#ifndef __pillar2c__ +#error "GMP integer guaranteed allocation only supported on ipc" + +#endif /* __pillar2c__ */ + +# pragma pillar_push_cc(__pcdecl) +#include +# pragma pillar_pop_cc + +/* Types */ + +typedef struct PlsrAPIntS_ { + PlsrVTable vtable; + int alloc; + int size; + mp_limb_t d[]; +} PlsrAPIntS; + +#define pLsrAPIntPadding (sizeof(PlsrAPIntS) - sizeof(PlsrVTable) - sizeof(int) - sizeof(int)) +pLsrVTableStatic(pLsrAPIntVTable_, VNoneTag, "*ap integer*", pLsrAPIntPadding); +#define pLsrAPIntVTable (&pLsrAPIntVTable_) + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrAPInt; +#else /* !P_USE_PILLAR */ +typedef PlsrAPIntS* PlsrAPInt; +#endif /* !P_USE_PILLAR */ + +#define pLsrAPIntAlignment 4 +#define pLsrAPIntSize (sizeof(PlsrAPIntS)) + +#define pLsrMPZ_mp_alloc(i) ((i)[0]._mp_alloc) +#define pLsrMPZ_mp_size(i) ((i)[0]._mp_size) +#define pLsrMPZ_mp_d(i) ((i)[0]._mp_d) + +#define pLsrAPInt_mp_alloc(i) (((PlsrAPIntS*) (i))->alloc) +#define pLsrAPInt_mp_size(i) (((PlsrAPIntS*) (i))->size) +#define pLsrAPInt_mp_d(i) (&(((PlsrAPIntS*) (i))->d[0])) + +#define pLsrAPInt_mp_d_Offset ((uintp)(pLsrAPIntSize)) +#define pLsrAPIntMPZMemPtrToAPInt(mem) (((char*) mem) - (pLsrAPInt_mp_d_Offset)) + +#define pLsrAPIntToMPZ(dest, src) \ + do { \ + pLsrMPZ_mp_alloc(dest) = pLsrAPInt_mp_alloc(src); \ + pLsrMPZ_mp_size(dest) = pLsrAPInt_mp_size(src); \ + pLsrMPZ_mp_d(dest) = pLsrAPInt_mp_d(src); \ + } while (0) + +#define pLsrAPIntFromMPZ(dest, src) \ + do { \ + (dest) = pLsrAPIntMPZMemPtrToAPInt(pLsrMPZ_mp_d(src)); \ + assert(pLsrAPInt_mp_alloc(dest) == pLsrMPZ_mp_alloc(src)); \ + pLsrAPInt_mp_size(dest) = pLsrMPZ_mp_size(src); \ + } while (0) + +#define pLsrAPIntInitMPZFromAPInt(src) \ + {{pLsrAPInt_mp_alloc(src), pLsrAPInt_mp_size(src), pLsrAPInt_mp_d(src)}}; + + +static uintp pLsrGmpAllocationReserve = 
300; + +// Ask if the GC can definitely allocate size number of bytes without needing a collection. +EXTERN(PgcBool) __pcdecl pgc_can_allocate_without_collection_th(unsigned size, PrtTaskHandle task); +// Do whatever is necessary to make sure that size bytes can be allocated after this call without a collection. +// This call itself may or may not cause a collection. +EXTERN(void) PRT_CDECL pgc_require_allocate_without_collection_th(unsigned size, PrtTaskHandle task); +/* Prepare to allocate up to count gmp integers */ +#define pLsrGmpPrepareForAllocation(count) \ + do { \ + if (!pgc_can_allocate_without_collection_th(count*pLsrGmpAllocationReserve,prtGetTaskHandle())) { \ + pgc_require_allocate_without_collection_th(count*pLsrGmpAllocationReserve,prtGetTaskHandle()); \ + } \ + } while(0) + +/* For documentation purposes only*/ +#define pLsrAPIntBeginNoGC + +#define pLsrAPIntKeepLive1(v) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive1A; \ + pLsrAPIntKeepLive1A = v; \ + } while(0); + +#define pLsrAPIntKeepLive2(v0, v1) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive2A; \ + pLsrAPIntKeepLive2A = v0; \ + pLsrAPIntKeepLive2A = v1; \ + } while(0); + +#ifdef PLSR_AP_INT_TRACE +static void pLsrAPIntShow1(char * s, PlsrAPInt i1) +{ + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(i1); + char* res = pLsrAllocC(mpz_sizeinbase (t1, 10) + 2); + mpz_get_str(res, 10, t1); + printf("APInt: %s, %s\n", s, res); + pLsrFreeC(res); +} + +static void pLsrAPIntShow3(char * s, PlsrAPInt i1, PlsrAPInt i2, PlsrAPInt i3) +{ + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(i1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(i2); + mpz_t t3 = pLsrAPIntInitMPZFromAPInt(i3); + char* res1 = pLsrAllocC(mpz_sizeinbase (t1, 10) + 2); + mpz_get_str(res1, 10, t1); + char* res2 = pLsrAllocC(mpz_sizeinbase (t2, 10) + 2); + mpz_get_str(res2, 10, t2); + char* res3 = pLsrAllocC(mpz_sizeinbase (t3, 10) + 2); + mpz_get_str(res3, 10, t3); + printf("APInt: %s, (%s, %s, %s)\n", s, res1, res2, res3); + pLsrFreeC(res1); + pLsrFreeC(res2); + pLsrFreeC(res3); +} +#define pLsrAPIntTrace0(s) +#define pLsrAPIntTrace1(s, i1) pLsrAPIntShow1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) pLsrAPIntShow3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) +#else +#define pLsrAPIntTrace0(s) +#define pLsrAPIntTrace1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) +#endif + +static PlsrAPInt pLsrAPIntNewFromCString(char *s) +{ + pLsrGmpPrepareForAllocation(1); + pLsrAPIntBeginNoGC { + mpz_t t1; + if (mpz_init_set_str(t1, s, 0) != 0) { + pLsrRuntimeError("Failed to initialize ap integer"); + } + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +/* operation must be an init_set_* */ +#define pLsrAPIntNewFromXBody(operation, src) \ + do { \ + mpz_t t1; \ + pLsrGmpPrepareForAllocation(1); \ + pLsrAPIntBeginNoGC { \ + operation(t1, src); \ + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); \ + return res; \ + } \ + } while (0) + +static PlsrAPInt pLsrAPIntNewFromSignedLong(long int si) +{ + pLsrAPIntNewFromXBody(mpz_init_set_si, si); +} + +static PlsrAPInt pLsrAPIntNewFromUnsignedLong(unsigned long int ui) +{ + pLsrAPIntNewFromXBody(mpz_init_set_ui, ui); +} + +static PlsrAPInt pLsrAPIntNewFromFloat32(float32 f) +{ + pLsrAPIntNewFromXBody(mpz_init_set_d, (float64) f); +} + +static PlsrAPInt pLsrAPIntNewFromFloat64(float64 f) +{ + pLsrAPIntNewFromXBody(mpz_init_set_d, f); +} + + 
+static PlsrAPInt pLsrAPIntNewFromAPInt(PlsrAPInt a) +{ + pLsrGmpPrepareForAllocation(1); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_init_set(t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +/* Constants */ + +/* Globals */ + +#define PLSR_AP_INT_BACKPATCH_GLOBALS 1 + +/* This must match the definition of PlsrAPIntS */ +#define PlsrAPIntSGlobal(bytes) \ + struct { \ + PlsrVTable vtable; \ + int alloc; \ + int size; \ + mp_limb_t d[bytes]; \ + } + +#define pLsrAPIntStaticGlobalNew(dv, bytes) \ + static PlsrAPIntSGlobal(bytes) dv = {.vtable = pLsrAPIntVTable, \ + .alloc = (bytes + (sizeof(mp_limb_t) - 1))/sizeof(mp_limb_t), \ + .size = 0, \ + } + +void pLsrAPIntStaticGlobalInitFromAPInt(PlsrAPInt dest, PlsrAPInt src) +{ + int size = pLsrAbs32(pLsrAPInt_mp_size(src)); + assert(pLsrAPInt_mp_alloc(dest) >= size); + pLsrAPInt_mp_size(dest) = pLsrAPInt_mp_size(src); + memcpy(pLsrAPInt_mp_d(dest), pLsrAPInt_mp_d(src), size * sizeof(mp_limb_t)); +} + +/* These are global, and will never be freed, */ +#define pLsrAPIntStaticGlobalInitFromCString(dest, s) (pLsrAPIntStaticGlobalInitFromAPInt(dest, pLsrAPIntNewFromCString(s))) + +#define pLsrAPIntStaticGlobalInitFromSInt32(dest, si) (pLsrAPIntStaticGlobalInitFromAPInt(dest, pLsrAPIntNewFromSignedLong(si))) + +#define pLsrAPIntStaticGlobalInitFromUInt32(dest, ui) (pLsrAPIntStaticGlobalInitFromAPInt(dest, pLsrAPIntNewFromUnsignedLong(ui))) + +pLsrAPIntStaticGlobalNew(pLsrAPIntZero_, 0); +#define pLsrAPIntZero ((PlsrAPInt) &pLsrAPIntZero_) +pLsrAPIntStaticGlobalNew(pLsrAPIntOne_, 1); +#define pLsrAPIntOne ((PlsrAPInt) &pLsrAPIntOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusOne_, 1); +#define pLsrAPIntMinusOne ((PlsrAPInt) &pLsrAPIntMinusOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusUInt32Max_, sizeof(uint32)); +#define pLsrAPIntMinusUInt32Max ((PlsrAPInt) &pLsrAPIntMinusUInt32Max_) + +/* Conversions */ + +/* Unsigned Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromUInt32(uint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromUnsignedLong(i); + pLsrAPIntTrace1("New from uint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUInt64(uint64 i) +{ + PlsrAPInt res; + if (sizeof(uint64) <= sizeof(unsigned long int)) + res = pLsrAPIntNewFromUnsignedLong(i); + else if (i <= UINT32_MAX) { + res = pLsrAPIntFromUInt32(i); + } else { + uint64 ui = i; + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(z); + mpz_mul_2exp(t1, t1, 32); + mpz_add_ui(t1, t1, lower); + pLsrAPIntFromMPZ(res, t1); + } + } + pLsrAPIntTrace1("New from uint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUIntp(uintp i) +{ + PlsrAPInt res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrAPIntFromUInt32(i); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrAPIntFromUInt64(i); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("New from uintp", res); + return res; +} + +static uint32 pLsrUInt32FromAPInt(PlsrAPInt a) +{ + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 res = (uint32) mpz_get_ui(t1); + pLsrAPIntTrace1("To uint32", a); + return res; +} + +static uint64 pLsrUInt64FromAPInt(PlsrAPInt a) +{ + uint64 res = 0; + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(uint64) <= sizeof(unsigned long int) || mpz_fits_ulong_p(t1)) { + res = (uint64) 
mpz_get_ui(t1); + } else { + mpz_t tmp1; + mpz_init_set(tmp1, t1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (uint64) mpz_get_ui(tmp1); + res = res << 32; + mpz_set(tmp1, t1); + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (uint64) mpz_get_ui(tmp1); + } + } + pLsrAPIntTrace1("To uint64", a); + return res; +} + +static uintp pLsrUIntpFromAPInt(PlsrAPInt a) +{ + uintp res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrUInt32FromAPInt(a); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrUInt64FromAPInt(a); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("To uintp", a); + return res; +} + +/* Signed Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromSInt32(sint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromSignedLong(i); + pLsrAPIntTrace1("From sint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSInt64(sint64 i) +{ + PlsrAPInt res; + if (sizeof(sint64) <= sizeof(long int)) + res = pLsrAPIntNewFromSignedLong(i); + else if (i <= ((sint64) SINT32_MAX) && i >= ((sint64) SINT32_MIN)) { + res = pLsrAPIntFromSInt32((sint32) i); + } else { + uint64 ui = pLsrAbs64(i); + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(z); + mpz_mul_2exp(t1, t1, 32); + mpz_add_ui(t1, t1, lower); + if (i < 0) { + mpz_neg(t1, t1); + } + pLsrAPIntFromMPZ(res, t1); + } + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSIntp(sintp i) +{ + PlsrAPInt res; + if (sizeof(sintp) <= sizeof(sint32)) + res = pLsrAPIntFromSInt32(i); + else if (sizeof(sintp) <= sizeof(sint64)) + res = pLsrAPIntFromSInt64(i); + else { + pLsrRuntimeError("SIntp too large"); + res = 0; + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static sint32 pLsrSInt32FromAPInt(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + sint32 res = (sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("To sint32 %d", res); + return res; + } +} + +static sint64 pLsrSInt64FromAPInt(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("To sint64", a); + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(t1)) { + res = (sint64) mpz_get_si(t1); + } else { + int negate = 0; + mpz_t tmp1; + mpz_init_set(tmp1, t1); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + if (negate) { + mpz_neg(tmp1, t1); + } else { + mpz_set(tmp1, t1); + } + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (sint64) mpz_get_ui(tmp1); + if (negate) { + res = -res; + } + } + } + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static sintp pLsrSIntpFromAPInt(PlsrAPInt a) +{ + pLsrAPIntTrace1("To sintp", a); + if (sizeof(sintp) <= sizeof(sint32)) + return pLsrSInt32FromAPInt(a); + else if (sizeof(sintp) <= sizeof(sint64)) + return pLsrSInt64FromAPInt(a); + else { + pLsrRuntimeError("SIntp too large"); + return 0; + } +} + +/* String Conversions */ +static char* pLsrCStringFromAPInt(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + char* res = pLsrAllocC(mpz_sizeinbase (t1, 10) + 2); + mpz_get_str(res, 10, t1); + pLsrAPIntTrace1("To cstring", a); + return res; + } +} + +static PlsrAPInt pLsrAPIntFromCString(char* s) +{ + PlsrAPInt res = 
pLsrAPIntNewFromCString(s); + pLsrAPIntTrace1("From cstring", res); + return res; +} + +#define pLsrAPIntBinary(a, b, operator) \ + do { \ + pLsrGmpPrepareForAllocation(1); \ + pLsrAPIntBeginNoGC { \ + mpz_t t1; \ + mpz_init(t1); \ + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); \ + mpz_t t3 = pLsrAPIntInitMPZFromAPInt(b); \ + operator(t1, t2, t3); \ + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); \ + pLsrAPIntTrace3(#operator, a, b, res); \ + return res; \ + } \ + } while(0) + +/* Casts */ + +static sint8 pLsrAPIntCastToSInt8(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + sint8 res = (sint8)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to sint8 %d", res); + return res; + } +} + +static sint16 pLsrAPIntCastToSInt16(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + sint16 res = (sint16)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to sint16 %d", res); + return res; + } +} + +static sint64 pLsrAPIntCastToSInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to sint64", a); + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(t1)) { + res = (sint64) mpz_get_si(t1); + } else { + int negate = 0; + mpz_t tmp1; + mpz_init_set(tmp1, t1); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + if (negate) { + mpz_neg(tmp1, t1); + } else { + mpz_set(tmp1, t1); + } + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (sint64) mpz_get_ui(tmp1); + if (negate) { + res = -res; + } + } + } + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static uint8 pLsrAPIntCastToUInt8(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint8 res = (uint8)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to uint8 %d", res); + return res; + } +} + +static uint16 pLsrAPIntCastToUInt16(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint16 res = (uint16)(sint32) mpz_get_si(t1); + pLsrAPIntTraceFmt1("Cast to uint16 %d", res); + return res; + } +} + +static uint32 pLsrAPIntCastToUInt32(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 res = (uint32) mpz_get_ui(t1); + if (mpz_sgn(t1) == -1) { + res = -res; + } + pLsrAPIntTraceFmt1("Cast to uint32 %d", res); + return res; + } +} + +static sint32 pLsrAPIntCastToSInt32(PlsrAPInt a) +{ + return (sint32)pLsrAPIntCastToUInt32(a); +} + +static uint64 pLsrAPIntCastToUInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to uint64", a); + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(t1)) { + res = (sint64) mpz_get_si(t1); + } else { + int negate = 0; + mpz_t tmp1; + mpz_init_set(tmp1, t1); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + if (negate) { + mpz_neg(tmp1, t1); + } else { + mpz_set(tmp1, t1); + } + mpz_fdiv_r_2exp(tmp1, tmp1, 32); + res += (sint64) mpz_get_ui(tmp1); + if (negate) { + res = -res; + } + } + } + pLsrAPIntTraceFmt1("To uint64 %lld\n", res); + return (uint64)res; +} + +static float32 pLsrAPIntCastToFloat32(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float32 res = (float32) 
mpz_get_d(t1); + pLsrAPIntTraceFmt1("Cast to float32 %f\n", res); + return res; + } +} + +static float64 pLsrAPIntCastToFloat64(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float64 res = (float64) mpz_get_d(t1); + pLsrAPIntTraceFmt1("Cast to float64 %lf\n", res); + return res; + } +} + +/* Bitwise */ + +static PlsrAPInt pLsrAPIntBNot(PlsrAPInt a) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_com(t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + pLsrAPIntTrace2("BNot", a, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntBAnd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_and); +} + +static PlsrAPInt pLsrAPIntBOr(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_ior); +} + +static PlsrAPInt pLsrAPIntShiftLUIntL(PlsrAPInt a, unsigned long b) +{ + pLsrGmpPrepareForAllocation(1); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_mul_2exp(t1, t2, b); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +static PlsrAPInt pLsrAPIntShiftRUIntL(PlsrAPInt a, unsigned long b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_fdiv_q_2exp(t1, t2, b); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + return res; + } +} + +static PlsrAPInt pLsrAPIntBShiftL(PlsrAPInt a, PlsrAPInt b) +{ + long int count; + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(b); + if (mpz_fits_slong_p(t1)) { + count = mpz_get_si(t1); + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + count = 0; + } + } + PlsrAPInt res = 0; + if (count < 0) { + res = pLsrAPIntShiftRUIntL(a, -count); + } else { + res = pLsrAPIntShiftLUIntL(a, count); + } + return res; +} + +static PlsrAPInt pLsrAPIntBShiftR(PlsrAPInt a, PlsrAPInt b) +{ + long int count; + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(b); + if (mpz_fits_slong_p(t1)) { + count = mpz_get_si(t1); + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + count = 0; + } + } + PlsrAPInt res = 0; + if (count < 0) { + res = pLsrAPIntShiftLUIntL(a, -count); + } else { + res = pLsrAPIntShiftRUIntL(a, count); + } + return res; +} + +static PlsrAPInt pLsrAPIntBXor(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_xor); +} + + + +/* Arithmetic */ + +static PlsrAPInt pLsrAPIntNegate(PlsrAPInt a) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t t1; + mpz_init(t1); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(a); + mpz_neg(t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, t1); + pLsrAPIntTrace2("Negate", a, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntAdd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_add); +} + +static PlsrAPInt pLsrAPIntSub(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_sub); +} + +static PlsrAPInt pLsrAPIntMul(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_mul); +} + + +static void pLsrAPIntDivModE(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_qr(tq, tr, t1, t2); + if (mpz_sgn(tr) < 0) { + if (mpz_sgn(t2) > 0) { + mpz_sub_ui(tq, tq,1); + mpz_add(tr, tr, t2); + } else { + mpz_add_ui(tq, tq, 1); + mpz_sub(tr, tr, t2); + } 
+ } + pLsrAPIntFromMPZ(*quotO, tq); + pLsrAPIntFromMPZ(*remO, tr); + pLsrAPIntTrace4("DivModE", a, b, q, r); + return; + } +} + +static void pLsrAPIntDivModF(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_fdiv_qr(tq, tr, t1, t2); + pLsrAPIntFromMPZ(*quotO, tq); + pLsrAPIntFromMPZ(*remO, tr); + pLsrAPIntTrace4("DivModF", a, b, q, r); + return; + } +} + +static void pLsrAPIntDivModT(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_qr(tq, tr, t1, t2); + pLsrAPIntFromMPZ(*quotO, tq); + pLsrAPIntFromMPZ(*remO, tr); + pLsrAPIntTrace4("DivModT", a, b, q, r); + return; + } +} + +static PlsrAPInt pLsrAPIntDivE(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(4); + pLsrAPIntBeginNoGC { + mpz_t tq, tr; + mpz_init(tq); + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_qr(tq, tr, t1, t2); + if (mpz_sgn(tr) < 0) { + if (mpz_sgn(t2) > 0) { + mpz_sub_ui(tq, tq, 1); + } else { + mpz_add_ui(tq, tq, 1); + } + } + pLsrAPIntTrace4("DivE", a, b, q, r); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tq); + return res; + } +} + +static PlsrAPInt pLsrAPIntDivF(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t tq; + mpz_init(tq); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_fdiv_q(tq, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tq); + pLsrAPIntTrace3("DivF", a, b, res); + return res; + }; +} + +static PlsrAPInt pLsrAPIntDivT(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t tq; + mpz_init(tq); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_q(tq, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tq); + pLsrAPIntTrace3("DivT", a, b, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntModE(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(3); + pLsrAPIntBeginNoGC { + mpz_t tr; + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_r(tr, t1, t2); + if (mpz_sgn(tr) < 0) { + if (mpz_sgn(t2) > 0) { + mpz_add(tr, tr, t2); + } else { + mpz_sub(tr, tr, t2); + } + } + PlsrAPInt res; pLsrAPIntFromMPZ(res, tr); + pLsrAPIntTrace3("ModE", a, b, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntModF(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t tr; + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_fdiv_r(tr, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tr); + pLsrAPIntTrace3("ModF", a, b, res); + return res; + } +} + +static PlsrAPInt pLsrAPIntModT(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t tr; + mpz_init(tr); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_tdiv_r(tr, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tr); + return res; + } +} + +static PlsrAPInt pLsrAPIntGcd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrGmpPrepareForAllocation(2); + pLsrAPIntBeginNoGC { + mpz_t tres; + 
mpz_init(tres); + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + mpz_gcd(tres, t1, t2); + PlsrAPInt res; pLsrAPIntFromMPZ(res, tres); + pLsrAPIntTrace3("Gcd", a, b, res); + return res; + } +} + + +/* Comparisons */ +/* calls below assume that this does not incur a gc */ +static sintp pLsrAPIntCompare(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + mpz_t t2 = pLsrAPIntInitMPZFromAPInt(b); + sintp res = mpz_cmp(t1, t2); + pLsrAPIntTrace2("Compare", a, b); + return res; + } +} + + +static PlsrBoolean pLsrAPIntLess(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) < 0; +} + +static PlsrBoolean pLsrAPIntGreater(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) > 0; +} + +static PlsrBoolean pLsrAPIntEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) == 0; +} + +static PlsrBoolean pLsrAPIntNotEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) != 0; +} + +static PlsrBoolean pLsrAPIntLessOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) <= 0; +} + +static PlsrBoolean pLsrAPIntGreaterOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) >= 0; +} + + +/* Miscellaneous */ + +static uint32 pLsrAPIntFitsInSInt32(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 res = mpz_fits_slong_p(t1); + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (res) { + long int ai = mpz_get_si(t1); + res = ai >=SINT32_MIN && ai <= SINT32_MAX; + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (!res) { /* gc ok */ + res = (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(SINT32_MIN)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(SINT32_MAX))); + } + } + /* If they're equal, we're good. */ + return res; + } +} + +/* Returns SINT32_MIN if (a >= upper) or (a <= lower) + * Otherwise returns a. + * To be useful, (lower > SINT32_MIN) should be true. 
+ */ +static sint32 pLsrAPIntCheckRangeSInt32(PlsrAPInt a, sint32 upper, sint32 lower) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uint32 fitsInLong = mpz_fits_slong_p(t1); + sint32 res = SINT32_MIN; + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (fitsInLong) { + long int ai = mpz_get_si(t1); + if (ai >= ((long int) lower) && ai <= ((long int) upper)) { + res = (sint32) ai; + } + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(t1); + if (ai >= lower && ai <= upper) { + res = ai; + } + } else if (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(lower)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(upper))) { + res = (sint32) mpz_get_si(t1); + } + } else if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(t1); + if (ai >= lower && ai <= upper) { + res = ai; + } + } + return res; + } +} + +#define pLsrAPIntFromFloat32 pLsrAPIntNewFromFloat32 +#define pLsrAPIntFromFloat64 pLsrAPIntNewFromFloat64 + +static float32 pLsrFloat32FromAPInt(PlsrAPInt a) { + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float32 res = (float32) mpz_get_d(t1); + pLsrAPIntTrace1("To float32", a); + return res; + } +} + +static float64 pLsrFloat64FromAPInt(PlsrAPInt a) { + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + float64 res = (float64) mpz_get_d(t1); + pLsrAPIntTrace1("To float64", a); + return res; + } +} + +#define hashPair(h1, h2) ((h1)+((h2)<<5)+(h2)+720) + +/* no gc */ +static uintp pLsrAPIntMPZHash(mpz_t z) +{ + uintp h; + switch mpz_sgn(z) { + case -1: h = 987; break; + case 1: h = 0; break; + case 0: h = 0; break; + } + for(size_t l = mpz_size(z); l > 0; l--) { + h = hashPair(h, mpz_getlimbn(z, l-1)); + } + return h; +} + +static uintp pLsrAPIntHash(PlsrAPInt a) +{ + pLsrAPIntBeginNoGC { + mpz_t t1 = pLsrAPIntInitMPZFromAPInt(a); + uintp h = pLsrAPIntMPZHash(t1); + return h; + } +} + +/* This function must match the previous one */ +static uintp pLsrSInt32Hash(sint32 i) +{ + pLsrAPIntBeginNoGC { + uintp h; + mpz_t tmp; + mpz_init_set_si(tmp, i); + h = pLsrAPIntMPZHash(tmp); + return h; + } +} + +/* Initialization and Registration */ + +static void pLsrAPIntRegisterVTables() +{ + static PgcIsRef pLsrAPIntRefs[pLsrAPIntSize/P_WORD_SIZE] = { 0, }; + + pLsrVTableRegister(pLsrAPIntVTable, pLsrAPIntAlignment, pLsrAPIntSize, pLsrAPIntRefs, sizeof(mp_limb_t), P_WORD_SIZE, 0, + PGC_ALWAYS_MUTABLE, 0); + +} + +#define pLsrAPIntGlobalsCount 4 + +static PlsrObjectB pLsrAPIntGlobals[] = + { + (PlsrObjectB) &pLsrAPIntZero_, + (PlsrObjectB) &pLsrAPIntOne_, + (PlsrObjectB) &pLsrAPIntMinusOne_, + (PlsrObjectB) &pLsrAPIntMinusUInt32Max_, + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrAPIntRegisterGlobals() { + assert(pLsrAPIntGlobals[pLsrAPIntGlobalsCount] == NULL); + pLsrGcRegisterGlobals (pLsrAPIntGlobals, pLsrAPIntGlobalsCount); +}; + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static void* pLsrGMPMalloc(size_t size) +{ + PlsrAPIntS* mem; + pLsrAllocNoGC(PlsrAPIntS*, mem, pLsrAPIntVTable, sizeof(PlsrAPIntS) + size); + mem->alloc = size/sizeof(mp_limb_t); + assert(size % sizeof(mp_limb_t) == 0); + if (size > pLsrGmpAllocationReserve/2) pLsrGmpAllocationReserve = size*2; + return pLsrAPInt_mp_d(mem); +} + +static void pLsrGMPFree(void* object, size_t size) +{ + PlsrAPInt a; + assert((a = pLsrAPIntMPZMemPtrToAPInt(object), 
pLsrAPInt_mp_alloc(a)*sizeof(mp_limb_t) == size)); +} + +static void* pLsrGMPReAlloc(void* object, size_t osize, size_t nsize) +{ + PlsrAPIntS* mem; + void* s; + pLsrAllocNoGC(PlsrAPIntS*, mem, pLsrAPIntVTable, sizeof(PlsrAPIntS) + nsize); + s = pLsrAPInt_mp_d(mem); + memcpy(s, object, osize); + mem->alloc = nsize/sizeof(mp_limb_t); + assert(nsize % sizeof(mp_limb_t) == 0); + if (nsize > pLsrGmpAllocationReserve/2) pLsrGmpAllocationReserve = nsize*2; + return s; +} + +# undef to +#pragma pillar_managed(on) +#endif + + +static void pLsrAPIntInitialize(uintp memLimit) { + mp_set_memory_functions(pLsrGMPMalloc, pLsrGMPReAlloc, pLsrGMPFree); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntZero, 0); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntOne, 1); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntMinusOne, -1); + PlsrAPInt tmp = pLsrAPIntFromUInt32(UINT32_MAX); + tmp = pLsrAPIntNegate(tmp); + pLsrAPIntStaticGlobalInitFromAPInt(pLsrAPIntMinusUInt32Max, tmp); +}; + + +#endif /*_PLSR_AP_INTEGER_H_ */ diff --git a/runtime/include/hrc/plsr-gmp-integer.h b/runtime/include/hrc/plsr-gmp-integer.h new file mode 100644 index 0000000..ae65c8c --- /dev/null +++ b/runtime/include/hrc/plsr-gmp-integer.h @@ -0,0 +1,1178 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_AP_INTEGER_H_ +#define _PLSR_AP_INTEGER_H_ + +#ifdef PLSR_GMP_USE_MALLOC +#define pLsrAPIntIsPinned 0 +#else /* PLSR_GMP_USE_MALLOC */ + +#ifdef PLSR_GMP_USE_PCDECL + +#define pLsrAPIntIsPinned 0 + +#ifndef __pillar2c__ +#error "pcdecl only supported on pillar2c" +#endif /* __pillar2c__ */ + +#else /* PLSR_GMP_USE_PCDECL*/ + +#ifdef PLSR_GMP_USE_PINNING +#define pLsrAPIntIsPinned 1 + +#else /* !PLSR_GMP_USE_PINNING */ + +#ifdef PLSR_GMP_USE_GCMALLOC + +#define pLsrAPIntIsPinned 1 +#define PLSR_GMP_REPLACE_MALLOC + +#else /* !PLSR_GMP_USE_GCMALLOC */ + +#ifdef PLSR_GMP_USE_FORCE_GC +#define pLsrAPIntIsPinned 0 +#define PLSR_GMP_USE_PCDECL +#define PLSR_GMP_FORCE_GC +#define PLSR_GMP_REPLACE_MALLOC +#else /* !PLSR_GMP_USE_FORCE_GC */ +#error "No GMP implementation selected" +#endif /* !PLSR_GMP_USE_FORCE_GC */ + +#endif /* !PLSR_GMP_USE_GCMALLOC */ + +#endif /* PLSR_GMP_USE_PINNING */ + +#endif /* !PLSR_GMP_USE_PCDECL*/ + +#endif /* !PLSR_GMP_USE_MALLOC*/ + + +#ifdef PLSR_GMP_USE_PCDECL + +# pragma pillar_push_cc(__pcdecl) +#include +# pragma pillar_pop_cc + +#else /* PLSR_GMP_USE_PCDECL */ +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif +#endif /* !PLSR_GMP_USE_PCDECL */ +/* Types */ + +#ifdef PLSR_GMP_USE_MALLOC + +typedef struct PlsrAPIntS_ { + PlsrVTable vtable; + mpz_t *z; +} PlsrAPIntS; + +#define pLsrAPIntPadding (sizeof(PlsrAPIntS) - sizeof(PlsrVTable) - sizeof(mpz_t*)) + +#else /* PLSR_GMP_USE_MALLOC */ + +typedef struct PlsrAPIntS_ { + PlsrVTable vtable; + mpz_t z; +} PlsrAPIntS; + +#define pLsrAPIntPadding (sizeof(PlsrAPIntS) - sizeof(PlsrVTable) - sizeof(mpz_t)) + +#endif /* !PLSR_GMP_USE_MALLOC */ + +/* This must be pinned, and we must add finalizers for it */ +pLsrVTableStatic(pLsrAPIntVTable_, VNoneTag, "*ap integer*", pLsrAPIntPadding); +#define pLsrAPIntVTable (&pLsrAPIntVTable_) + + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrAPInt; +#else /* !P_USE_PILLAR */ +typedef PlsrAPIntS* PlsrAPInt; +#endif /* !P_USE_PILLAR */ + +#define pLsrAPIntAlignment 4 +#define pLsrAPIntSize (sizeof(PlsrAPIntS)) + +#ifdef PLSR_GMP_USE_MALLOC +#define pLsrAPIntGetZPtr(i) 
(((PlsrAPIntS*) i)->z) +#define pLsrAPIntGetZ(i) (*(((PlsrAPIntS*) i)->z)) +#else +#define pLsrAPIntGetZ(i) (((PlsrAPIntS*) i)->z) +#endif + +#define pLsrAPIntKeepLive1(v) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive1A; \ + pLsrAPIntKeepLive1A = v; \ + } while(0); + +#define pLsrAPIntKeepLive2(v0, v1) \ + do { \ + volatile PlsrAPInt pLsrAPIntKeepLive2A; \ + pLsrAPIntKeepLive2A = v0; \ + pLsrAPIntKeepLive2A = v1; \ + } while(0); + +#ifdef PLSR_AP_INT_TRACE +static void pLsrAPIntShow1(char * s, PlsrAPInt i) +{ + char* res = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i), 10) + 2); + mpz_get_str(res, 10, pLsrAPIntGetZ(i)); + printf("APInt: %s, %s\n", s, res); + pLsrFreeC(res); +} +static void pLsrAPIntShow2(char * s, PlsrAPInt i1, PlsrAPInt i2) +{ + char* res1 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i1), 10) + 2); + mpz_get_str(res1, 10, pLsrAPIntGetZ(i1)); + char* res2 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i2), 10) + 2); + mpz_get_str(res2, 10, pLsrAPIntGetZ(i2)); + printf("APInt: %s, (%s, %s)\n", s, res1, res2); + pLsrFreeC(res1); + pLsrFreeC(res2); +} +static void pLsrAPIntShow3(char * s, PlsrAPInt i1, PlsrAPInt i2, PlsrAPInt i3) +{ + char* res1 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i1), 10) + 2); + mpz_get_str(res1, 10, pLsrAPIntGetZ(i1)); + char* res2 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i2), 10) + 2); + mpz_get_str(res2, 10, pLsrAPIntGetZ(i2)); + char* res3 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i3), 10) + 2); + mpz_get_str(res3, 10, pLsrAPIntGetZ(i3)); + printf("APInt: %s, (%s, %s, %s)\n", s, res1, res2, res3); + pLsrFreeC(res1); + pLsrFreeC(res2); + pLsrFreeC(res3); +} +static void pLsrAPIntShow4(char * s, PlsrAPInt i1, PlsrAPInt i2, PlsrAPInt i3, PlsrAPInt i4) +{ + char* res1 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i1), 10) + 2); + mpz_get_str(res1, 10, pLsrAPIntGetZ(i1)); + char* res2 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i2), 10) + 2); + mpz_get_str(res2, 10, pLsrAPIntGetZ(i2)); + char* res3 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i3), 10) + 2); + mpz_get_str(res3, 10, pLsrAPIntGetZ(i3)); + char* res4 = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(i4), 10) + 2); + mpz_get_str(res4, 10, pLsrAPIntGetZ(i4)); + printf("APInt: %s, (%s, %s, %s, %s)\n", s, res1, res2, res3, res4); + pLsrFreeC(res1); + pLsrFreeC(res2); + pLsrFreeC(res3); + pLsrFreeC(res4); +} + +#define pLsrAPIntTrace0(s) printf("APInt: %s\n", s) +#define pLsrAPIntTrace1(s, i1) pLsrAPIntShow1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) pLsrAPIntShow2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) pLsrAPIntShow3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) pLsrAPIntShow4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) printf(s, i1) +#else +#define pLsrAPIntTrace0(s) +#define pLsrAPIntTrace1(s, i1) +#define pLsrAPIntTrace2(s, i1, i2) +#define pLsrAPIntTrace3(s, i1, i2, i3) +#define pLsrAPIntTrace4(s, i1, i2, i3, i4) +#define pLsrAPIntTraceFmt1(s, i1) +#endif + +/* Allocation */ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +static void pLsrAPIntFinalize(PlsrAPInt i) +{ + mpz_clear(pLsrAPIntGetZ(i)); + +#ifdef PLSR_GMP_USE_MALLOC + pLsrFreeC(pLsrAPIntGetZPtr(i)); +#endif + + pLsrAPIntKeepLive1(i); +} + +# undef to +#pragma pillar_managed(on) +#endif + +#ifdef PLSR_GMP_FORCE_GC +/* At some point, this should be made thread safe */ +static int pLsrGmpAllocated=0; +static int pLsrGmpMemLimit=100*1000*1000; +#define pLsrAPIntGCCheck() \ + do { \ + if (pLsrGmpAllocated > pLsrGmpMemLimit) { \ + pgc_force_gc(); \ + } \ + } while (0) +#else /* 
PLSR_GMP_FORCE_GC */ + +#define pLsrAPIntGCCheck() + +#endif /* PLSR_GMP_FORCE_GC */ + +/* NB: This does not initialize the mpz_t structure */ +static PlsrAPInt pLsrAPIntNewUninit() +{ + PlsrAPInt res; + pLsrAPIntGCCheck(); +#ifdef PLSR_GMP_USE_PINNING + pLsrAllocPinnedFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#else + pLsrAllocFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#endif +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(res) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + return res; +} + +static PlsrAPInt pLsrAPIntNew() +{ + PlsrAPInt res; + pLsrAPIntGCCheck(); +#ifdef PLSR_GMP_USE_PINNING + pLsrAllocPinnedFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#else + pLsrAllocFinalizable(PlsrAPInt, res, pLsrAPIntVTable, sizeof(PlsrAPIntS)); +#endif +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(res) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + mpz_init(pLsrAPIntGetZ(res)); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromCString(char *s) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + if (mpz_init_set_str(pLsrAPIntGetZ(res), s, 0) != 0) { + pLsrRuntimeError("Failed to initialize ap integer"); + } + pLsrAPIntTrace1("New from string", res); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromSignedLong(long int si) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_si(pLsrAPIntGetZ(res), si); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromUnsignedLong(unsigned long int ui) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_ui(pLsrAPIntGetZ(res), ui); + return res; +} + + +static PlsrAPInt pLsrAPIntNewFromFloat32(float32 f) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_d(pLsrAPIntGetZ(res), (float64) f); + pLsrAPIntTrace1("New from float32", res); + return res; +} + +static PlsrAPInt pLsrAPIntNewFromFloat64(float64 f) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set_d(pLsrAPIntGetZ(res), f); + pLsrAPIntTrace1("New from float64", res); + return res; +} + + +static PlsrAPInt pLsrAPIntNewFromAPInt(PlsrAPInt a) +{ + PlsrAPInt res = pLsrAPIntNewUninit(); + mpz_init_set(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + return res; +} + +static void pLsrAPIntDestroy(PlsrAPInt i) +{ + mpz_clear(pLsrAPIntGetZ(i)); +#ifdef PLSR_GMP_USE_MALLOC + pLsrFreeC(pLsrAPIntGetZPtr(i)); +#endif + pLsrAPIntKeepLive1(i); +} + +/* Constants */ + +/* Globals */ + +#define PLSR_AP_INT_BACKPATCH_GLOBALS 1 + +#define pLsrAPIntStaticGlobalNew(dv, bytes) \ + static PlsrAPIntS dv = {.vtable = pLsrAPIntVTable, } + +/* These are global, and will never be freed, */ +static void pLsrAPIntStaticGlobalInitFromCString(PlsrAPInt i, char* s) +{ +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(i) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + /* Use leading characters of string to determine base*/ + if (mpz_init_set_str(pLsrAPIntGetZ(i), s, 0) != 0) { + pLsrRuntimeError("Failed to initialize global ap integer"); + } +} + +static void pLsrAPIntStaticGlobalInitFromSInt32(PlsrAPInt z, sint32 si) +{ +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(z) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + mpz_init_set_si(pLsrAPIntGetZ(z), si); +} + +static void pLsrAPIntStaticGlobalInitFromUInt32(PlsrAPInt z, uint32 ui) +{ +#ifdef PLSR_GMP_USE_MALLOC + pLsrAPIntGetZPtr(z) = (mpz_t*) pLsrAllocC(sizeof(mpz_t)); +#endif + mpz_init_set_ui(pLsrAPIntGetZ(z), ui); +} + +pLsrAPIntStaticGlobalNew(pLsrAPIntZero_, 0); +#define pLsrAPIntZero ((PlsrAPInt) &pLsrAPIntZero_) +pLsrAPIntStaticGlobalNew(pLsrAPIntOne_, 0); +#define 
pLsrAPIntOne ((PlsrAPInt) &pLsrAPIntOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusOne_, 0); +#define pLsrAPIntMinusOne ((PlsrAPInt) &pLsrAPIntMinusOne_) +pLsrAPIntStaticGlobalNew(pLsrAPIntMinusUInt32Max_, 0); +#define pLsrAPIntMinusUInt32Max ((PlsrAPInt) &pLsrAPIntMinusUInt32Max_) + +/* Conversions */ + +/* Unsigned Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromUInt32(uint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromUnsignedLong(i); + pLsrAPIntTrace1("New from uint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUInt64(uint64 i) +{ + PlsrAPInt res; + if (sizeof(uint64) <= sizeof(unsigned long int)) + res = pLsrAPIntNewFromUnsignedLong(i); + else if (i <= UINT32_MAX) { + res = pLsrAPIntFromUInt32(i); + } else { + uint64 ui = i; + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + mpz_mul_2exp(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), 32); + mpz_add_ui(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), lower); + res = z; + } + pLsrAPIntTrace1("New from uint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromUIntp(uintp i) +{ + PlsrAPInt res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrAPIntFromUInt32(i); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrAPIntFromUInt64(i); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("New from uintp", res); + return res; +} + +static uint32 pLsrUInt32FromAPInt(PlsrAPInt a) +{ + uint32 res = (uint32) mpz_get_ui(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To uint32", a); + return res; +} + +static uint64 pLsrUInt64FromAPInt(PlsrAPInt a) +{ + uint64 res = 0; + if (sizeof(uint64) <= sizeof(unsigned long int) || mpz_fits_ulong_p(pLsrAPIntGetZ(a))) { + res = (uint64) mpz_get_ui(pLsrAPIntGetZ(a)); + } else { + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (uint64) mpz_get_ui(tmp1); + res = res << 32; + res += (uint64) mpz_get_ui(tmp2); + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To uint64", a); + return res; +} + +static uintp pLsrUIntpFromAPInt(PlsrAPInt a) +{ + uintp res; + if (sizeof(uintp) <= sizeof(uint32)) + res = pLsrUInt32FromAPInt(a); + else if (sizeof(uintp) <= sizeof(uint64)) + res = pLsrUInt64FromAPInt(a); + else { + pLsrRuntimeError("UIntp too large"); + res = 0; + } + pLsrAPIntTrace1("To uintp", a); + return res; +} + +/* Signed Integer Conversions */ + +static PlsrAPInt pLsrAPIntFromSInt32(sint32 i) +{ + PlsrAPInt res = pLsrAPIntNewFromSignedLong(i); + pLsrAPIntTrace1("From sint32", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSInt64(sint64 i) +{ + PlsrAPInt res; + if (sizeof(sint64) <= sizeof(long int)) + res = pLsrAPIntNewFromSignedLong(i); + else if (i <= ((sint64) SINT32_MAX) && i >= ((sint64) SINT32_MIN)) { + res = pLsrAPIntFromSInt32((sint32) i); + } else { + uint64 ui = pLsrAbs64(i); + uint32 upper = (uint32) (ui >> 32); + uint32 lower = (uint32) ui; + PlsrAPInt z = pLsrAPIntFromUInt32(upper); + mpz_mul_2exp(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), 32); + mpz_add_ui(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z), lower); + if (i < 0) { + mpz_neg(pLsrAPIntGetZ(z), pLsrAPIntGetZ(z)); + } + res = z; + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static PlsrAPInt pLsrAPIntFromSIntp(sintp i) +{ + PlsrAPInt res; + if (sizeof(sintp) <= sizeof(sint32)) + res = pLsrAPIntFromSInt32(i); + else if (sizeof(sintp) <= sizeof(sint64)) + res = 
pLsrAPIntFromSInt64(i); + else { + pLsrRuntimeError("SIntp too large"); + res = 0; + } + pLsrAPIntTrace1("From sint64", res); + return res; +} + +static sint32 pLsrSInt32FromAPInt(PlsrAPInt a) +{ + sint32 res = (sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("To sint32 %d", res); + return res; +} + +static sint64 pLsrSInt64FromAPInt(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("To sint64", a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(pLsrAPIntGetZ(a))) { + res = (sint64) mpz_get_si(pLsrAPIntGetZ(a)); + } else { + int negate = 0; + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + res += (sint64) mpz_get_ui(tmp2); + if (negate) { + res = -res; + } + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static sintp pLsrSIntpFromAPInt(PlsrAPInt a) +{ + pLsrAPIntTrace1("To sintp", a); + if (sizeof(sintp) <= sizeof(sint32)) + return pLsrSInt32FromAPInt(a); + else if (sizeof(sintp) <= sizeof(sint64)) + return pLsrSInt64FromAPInt(a); + else { + pLsrRuntimeError("SIntp too large"); + return 0; + } +} + +/* String Conversions */ +static char* pLsrCStringFromAPInt(PlsrAPInt a) +{ + char* res = pLsrAllocC(mpz_sizeinbase (pLsrAPIntGetZ(a), 10) + 2); + mpz_get_str(res, 10, pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To cstring", a); + return res; +} + +static PlsrAPInt pLsrAPIntFromCString(char* s) +{ + PlsrAPInt res = pLsrAPIntNewFromCString(s); + pLsrAPIntTrace1("From cstring", res); + return res; +} + +#define pLsrAPIntBinary(a, b, operator) \ + do { \ + PlsrAPInt res = pLsrAPIntNew(); \ + operator(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); \ + pLsrAPIntKeepLive2(a, b); \ + pLsrAPIntTrace3(#operator, a, b, res); \ + return res; \ + } while(0) + +/* Casts */ + +static sint8 pLsrAPIntCastToSInt8(PlsrAPInt a) +{ + sint8 res = (sint8)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to sint8 %d", res); + return res; +} + +static sint16 pLsrAPIntCastToSInt16(PlsrAPInt a) +{ + sint16 res = (sint16)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to sint16 %d", res); + return res; +} + +static sint64 pLsrAPIntCastToSInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to sint64", a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(pLsrAPIntGetZ(a))) { + res = (sint64) mpz_get_si(pLsrAPIntGetZ(a)); + } else { + int negate = 0; + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + res += (sint64) mpz_get_ui(tmp2); + if (negate) { + res = -res; + } + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("To sint64 %lld\n", res); + return res; +} + +static uint8 pLsrAPIntCastToUInt8(PlsrAPInt a) +{ + uint8 res = (uint8)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to uint8 %d", res); + return res; +} + +static uint16 pLsrAPIntCastToUInt16(PlsrAPInt a) +{ + uint16 res = (uint16)(sint32) mpz_get_si(pLsrAPIntGetZ(a)); + pLsrAPIntTraceFmt1("Cast to uint16 %d", res); + return res; +} + 
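For reference, here is a minimal standalone sketch (plain GMP, compiled outside the runtime; the helper names mpz_set_uint64 and uint64_from_mpz are illustrative only, not part of HRC) of the 32-bit split that pLsrAPIntFromUInt64 and pLsrUInt64FromAPInt above fall back to when unsigned long is narrower than 64 bits:

#include <stdint.h>
#include <stdio.h>
#include <gmp.h>

/* Load a non-negative 64-bit value into an mpz_t using only 32-bit chunks. */
static void mpz_set_uint64(mpz_t z, uint64_t v)
{
    mpz_set_ui(z, (unsigned long) (uint32_t) (v >> 32)); /* upper half */
    mpz_mul_2exp(z, z, 32);                              /* shift it into place */
    mpz_add_ui(z, z, (unsigned long) (uint32_t) v);      /* add the lower half */
}

/* Recover the low 64 bits of a non-negative value with a quotient/remainder split at bit 32. */
static uint64_t uint64_from_mpz(const mpz_t z)
{
    mpz_t q, r;
    mpz_init(q);
    mpz_init(r);
    mpz_fdiv_q_2exp(q, z, 32);
    mpz_fdiv_r_2exp(r, z, 32);
    uint64_t res = ((uint64_t) mpz_get_ui(q) << 32) | (uint64_t) mpz_get_ui(r);
    mpz_clear(q);
    mpz_clear(r);
    return res;
}

int main(void)
{
    mpz_t z;
    mpz_init(z);
    mpz_set_uint64(z, 0x123456789abcdef0ULL);
    printf("%llx\n", (unsigned long long) uint64_from_mpz(z)); /* prints 123456789abcdef0 */
    mpz_clear(z);
    return 0;
}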
+static uint32 pLsrAPIntCastToUInt32(PlsrAPInt a) +{ + uint32 res = (uint32) mpz_get_ui(pLsrAPIntGetZ(a)); + if (mpz_sgn(pLsrAPIntGetZ(a)) == -1) { + res = -res; + } + pLsrAPIntTraceFmt1("Cast to uint32 %d", res); + return res; +} + +static sint32 pLsrAPIntCastToSInt32(PlsrAPInt a) +{ + return (sint32)pLsrAPIntCastToUInt32(a); +} + +static uint64 pLsrAPIntCastToUInt64(PlsrAPInt a) +{ + sint64 res = 0; + pLsrAPIntTrace1("Cast to uint64", a); + if (sizeof(sint64) <= sizeof(long int) || mpz_fits_slong_p(pLsrAPIntGetZ(a))) { + res = (sint64) mpz_get_si(pLsrAPIntGetZ(a)); + } else { + int negate = 0; + mpz_t tmp1; + mpz_t tmp2; + mpz_init_set(tmp1, pLsrAPIntGetZ(a)); + if (mpz_sgn(tmp1) == -1) { + negate = 1; + mpz_neg(tmp1, tmp1); + } + mpz_init_set(tmp2, tmp1); + mpz_fdiv_q_2exp(tmp1, tmp1, 32); + mpz_fdiv_r_2exp(tmp2, tmp2, 32); + res = (sint64) mpz_get_ui(tmp1); + res = res << 32; + res += (sint64) mpz_get_ui(tmp2); + if (negate) { + res = -res; + } + mpz_clear(tmp1); + mpz_clear(tmp2); + } + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("To uint64 %lld\n", res); + return (uint64)res; +} + +static float32 pLsrAPIntCastToFloat32(PlsrAPInt a) +{ + float32 res = (float32) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("Cast to float32 %f\n", res); + return res; +} + +static float64 pLsrAPIntCastToFloat64(PlsrAPInt a) +{ + float64 res = (float64) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTraceFmt1("Cast to float64 %lf\n", res); + return res; +} + +/* Bitwise */ + +static PlsrAPInt pLsrAPIntBNot(PlsrAPInt a) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_com(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace2("BNot", a, res); + return res; +} + +static PlsrAPInt pLsrAPIntBAnd(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_and); +} + +static PlsrAPInt pLsrAPIntBOr(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_ior); +} + +static PlsrAPInt pLsrAPIntShiftLUIntL(PlsrAPInt a, unsigned long b) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_mul_2exp(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), b); + pLsrAPIntKeepLive1(a); + return res; +} + +static PlsrAPInt pLsrAPIntShiftRUIntL(PlsrAPInt a, unsigned long b) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_fdiv_q_2exp(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), b); + pLsrAPIntKeepLive1(a); + return res; +} + +static PlsrAPInt pLsrAPIntBShiftL(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt res = 0; + if (mpz_fits_slong_p(pLsrAPIntGetZ(b))) { + long int count = mpz_get_si(pLsrAPIntGetZ(b)); + if (count < 0) { + res = pLsrAPIntShiftRUIntL(a, -count); + } else { + res = pLsrAPIntShiftLUIntL(a, count); + } + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + } + return res; +} + +static PlsrAPInt pLsrAPIntBShiftR(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt res = 0; + if (mpz_fits_slong_p(pLsrAPIntGetZ(b))) { + long int count = mpz_get_si(pLsrAPIntGetZ(b)); + if (count < 0) { + res = pLsrAPIntShiftLUIntL(a, -count); + } else { + res = pLsrAPIntShiftRUIntL(a, count); + } + } else { + pLsrRuntimeError("GMP AP Int only supports shifts up to 2^32 bits"); + } + return res; +} + +static PlsrAPInt pLsrAPIntBXor(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_xor); +} + + + +/* Arithmetic */ + +static PlsrAPInt pLsrAPIntNegate(PlsrAPInt a) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_neg(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace2("Negate", a, res); + return res; +} + +static PlsrAPInt pLsrAPIntAdd(PlsrAPInt a, PlsrAPInt b) +{ + 
pLsrAPIntBinary(a, b, mpz_add); +} + +static PlsrAPInt pLsrAPIntSub(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_sub); +} + +static PlsrAPInt pLsrAPIntMul(PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntBinary(a, b, mpz_mul); +} + + +static void pLsrAPIntDivModE(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_qr(pLsrAPIntGetZ(q), pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + if (mpz_sgn(pLsrAPIntGetZ(r)) < 0) { + if (mpz_sgn(pLsrAPIntGetZ(b)) > 0) { + mpz_sub_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q),1); + mpz_add(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } else { + mpz_add_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q), 1); + mpz_sub(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } + } + *quotO = q; + *remO = r; + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace4("DivModE", a, b, q, r); + return; +} + +static void pLsrAPIntDivModF(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + PlsrAPInt r = pLsrAPIntNew(); + mpz_fdiv_qr(pLsrAPIntGetZ(q), pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + *quotO = q; + *remO = r; + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace4("DivModF", a, b, q, r); + return; +} + +static void pLsrAPIntDivModT(PlsrAPInt* quotO, PlsrAPInt* remO, PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_qr(pLsrAPIntGetZ(q), pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + *quotO = q; + *remO = r; + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace4("DivModT", a, b, q, r); + return; +} + +static PlsrAPInt pLsrAPIntDivE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + mpz_t r; + mpz_init(r); + mpz_tdiv_qr(pLsrAPIntGetZ(q), r, pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + if (mpz_sgn(r) < 0) { + if (mpz_sgn(pLsrAPIntGetZ(b)) > 0) { + mpz_sub_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q), 1); + } else { + mpz_add_ui(pLsrAPIntGetZ(q), pLsrAPIntGetZ(q), 1); + } + } + pLsrAPIntTrace4("DivE", a, b, q, r); + mpz_clear(r); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("DivE", a, b, q); + return q; + +} + +static PlsrAPInt pLsrAPIntDivF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + mpz_fdiv_q(pLsrAPIntGetZ(q), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("DivF", a, b, q); + return q; +} + +static PlsrAPInt pLsrAPIntDivT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt q = pLsrAPIntNew(); + mpz_tdiv_q(pLsrAPIntGetZ(q), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("DivT", a, b, q); + return q; +} + +static PlsrAPInt pLsrAPIntModE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_r(pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + if (mpz_sgn(pLsrAPIntGetZ(r)) < 0) { + if (mpz_sgn(pLsrAPIntGetZ(b)) > 0) { + mpz_add(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } else { + mpz_sub(pLsrAPIntGetZ(r), pLsrAPIntGetZ(r), pLsrAPIntGetZ(b)); + } + } + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("ModE", a, b, r); + return r; +} + +static PlsrAPInt pLsrAPIntModF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt r = pLsrAPIntNew(); + mpz_fdiv_r(pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("ModF", a, b, r); + return r; +} + +static PlsrAPInt pLsrAPIntModT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt r = pLsrAPIntNew(); + mpz_tdiv_r(pLsrAPIntGetZ(r), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + return r; 
+} + +static PlsrAPInt pLsrAPIntGcd(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt res = pLsrAPIntNew(); + mpz_gcd(pLsrAPIntGetZ(res), pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace3("Gcd", a, b, res); + return res; +} + + +/* Comparisons */ + +static sintp pLsrAPIntCompare(PlsrAPInt a, PlsrAPInt b) +{ + sintp res = mpz_cmp(pLsrAPIntGetZ(a), pLsrAPIntGetZ(b)); + pLsrAPIntKeepLive2(a, b); + pLsrAPIntTrace2("Compare", a, b); + return res; +} + + +static PlsrBoolean pLsrAPIntLess(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) < 0; +} + +static PlsrBoolean pLsrAPIntGreater(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) > 0; +} + +static PlsrBoolean pLsrAPIntEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) == 0; +} + +static PlsrBoolean pLsrAPIntNotEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) != 0; +} + +static PlsrBoolean pLsrAPIntLessOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) <= 0; +} + +static PlsrBoolean pLsrAPIntGreaterOrEqual(PlsrAPInt a, PlsrAPInt b) { + return pLsrAPIntCompare(a, b) >= 0; +} + + +/* Miscellaneous */ + +static uint32 pLsrAPIntFitsInSInt32(PlsrAPInt a) +{ + uint32 res = mpz_fits_slong_p(pLsrAPIntGetZ(a)); + + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (res) { + long int ai = mpz_get_si(pLsrAPIntGetZ(a)); + res = ai >=SINT32_MIN && ai <= SINT32_MAX; + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (!res) { + res = (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(SINT32_MIN)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(SINT32_MAX))); + } + } + /* If they're equal, we're good. */ + pLsrAPIntKeepLive1(a); + return res; +} + +/* Returns SINT32_MIN if (a >= upper) or (a <= lower) + * Otherwise returns a. + * To be useful, (lower > SINT32_MIN) should be true. 
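 * (More precisely: the value of a is returned whenever lower <= a <= upper, bounds included;
 *  SINT32_MIN signals out of range.)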
+ */ +static sint32 pLsrAPIntCheckRangeSInt32(PlsrAPInt a, sint32 upper, sint32 lower) +{ + uint32 fitsInLong = mpz_fits_slong_p(pLsrAPIntGetZ(a)); + sint32 res = SINT32_MIN; + if (sizeof(sint32) < sizeof(long int)) { + /* If it fits in a long int and the long int is small enough, + * it fits in an sint32 */ + if (fitsInLong) { + long int ai = mpz_get_si(pLsrAPIntGetZ(a)); + if (ai >= ((long int) lower) && ai <= ((long int) upper)) { + res = (sint32) ai; + } + } + } else if (sizeof(sint32) > sizeof(long int)) { + if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(pLsrAPIntGetZ(a)); + if (ai >= lower && ai <= upper) { + res = ai; + } + } else if (pLsrAPIntGreaterOrEqual(a, pLsrAPIntFromSInt32(lower)) && + pLsrAPIntLessOrEqual(a, pLsrAPIntFromSInt32(upper))) { + res = pLsrSInt32FromAPInt(a); + } + } else if (fitsInLong) { + sint32 ai = (sint32) mpz_get_si(pLsrAPIntGetZ(a)); + if (ai >= lower && ai <= upper) { + res = ai; + } + } + pLsrAPIntKeepLive1(a); + return res; +} + + +#define pLsrAPIntFromFloat32 pLsrAPIntNewFromFloat32 +#define pLsrAPIntFromFloat64 pLsrAPIntNewFromFloat64 + +static float32 pLsrFloat32FromAPInt(PlsrAPInt a) { + float32 res = (float32) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To float32", a); + return res; +} + +static float64 pLsrFloat64FromAPInt(PlsrAPInt a) { + float64 res = (float64) mpz_get_d(pLsrAPIntGetZ(a)); + pLsrAPIntKeepLive1(a); + pLsrAPIntTrace1("To float64", a); + return res; +} + +#define hashPair(h1, h2) ((h1)+((h2)<<5)+(h2)+720) + +static uintp pLsrAPIntMPZHash(mpz_t z) +{ + uintp h; + switch mpz_sgn(z) { + case -1: h = 987; break; + case 1: h = 0; break; + case 0: h = 0; break; + } + for(size_t l = mpz_size(z); l > 0; l--) { + h = hashPair(h, mpz_getlimbn(z, l-1)); + } + return h; +} +static uintp pLsrAPIntHash(PlsrAPInt i) +{ + uintp h = pLsrAPIntMPZHash(pLsrAPIntGetZ(i)); + pLsrAPIntKeepLive1(i); + return h; +} + +/* This function must match the previous one */ +static uintp pLsrSInt32Hash(sint32 i) +{ + uintp h; + mpz_t tmp; + mpz_init_set_si(tmp, i); + h = pLsrAPIntMPZHash(tmp); + mpz_clear(tmp); + return h; +} + +/* Initialization and Registration */ + +static void pLsrAPIntRegisterVTables() +{ + static PgcIsRef pLsrAPIntRefs[pLsrAPIntSize/P_WORD_SIZE] = { 0, }; + + pLsrVTableRegisterFinalizable(pLsrAPIntVTable, pLsrAPIntAlignment, pLsrAPIntSize, pLsrAPIntRefs, 0, 0, 0, + PGC_ALWAYS_MUTABLE, pLsrAPIntIsPinned, pLsrAPIntFinalize); + +} + +#define pLsrAPIntGlobalsCount 4 + +static PlsrObjectB pLsrAPIntGlobals[] = + { + (PlsrObjectB) &pLsrAPIntZero_, + (PlsrObjectB) &pLsrAPIntOne_, + (PlsrObjectB) &pLsrAPIntMinusOne_, + (PlsrObjectB) &pLsrAPIntMinusUInt32Max_, + (PlsrObjectB) NULL /* This must be last */ + }; + +static void pLsrAPIntRegisterGlobals() { + assert(pLsrAPIntGlobals[pLsrAPIntGlobalsCount] == NULL); + pLsrGcRegisterGlobals (pLsrAPIntGlobals, pLsrAPIntGlobalsCount); +}; + +#ifdef PLSR_GMP_USE_GCMALLOC +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static void* pLsrGMPMalloc(size_t size) +{ + return pLsrGCHeapMalloc((uintp) size); +} +static void pLsrGMPFree(void* object, size_t size) +{ + pLsrGCHeapFree(object); +} +static void* pLsrGMPReAlloc(void* object, size_t osize, size_t nsize) +{ + return pLsrGCHeapReAlloc(object, (uintp) osize, (uintp) nsize); +} + +# undef to +#pragma pillar_managed(on) +#endif +#else +#ifdef PLSR_GMP_FORCE_GC + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static void* pLsrGMPMalloc(size_t size) +{ + pLsrGmpAllocated += size; 
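    /* Accounting only; the block itself comes from the system malloc below. Once
     * pLsrGmpAllocated exceeds pLsrGmpMemLimit, pLsrAPIntGCCheck forces a collection
     * on the next AP integer allocation. */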
+ return malloc((uintp) size); +} +static void pLsrGMPFree(void* object, size_t size) +{ + pLsrGmpAllocated -= size; + free(object); +} +static void* pLsrGMPReAlloc(void* object, size_t osize, size_t nsize) +{ + if (osize > nsize) { + pLsrGmpAllocated -= (osize-nsize); + } else { + pLsrGmpAllocated += (nsize-osize); + } + return realloc(object, (uintp) nsize); +} + +# undef to +#pragma pillar_managed(on) +#endif + +#endif /* PLSR_GMP_FORCE_GC */ +#endif /* PLSR_GMP_USE_GCMALLOC*/ + +static void pLsrAPIntInitialize(uintp memLimit) { +#ifdef PLSR_GMP_FORCE_GC + if (memLimit != 0) pLsrGmpMemLimit = memLimit; +#endif +#ifdef PLSR_GMP_REPLACE_MALLOC + mp_set_memory_functions(pLsrGMPMalloc, pLsrGMPReAlloc, pLsrGMPFree); +#endif + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntZero, 0); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntOne, 1); + pLsrAPIntStaticGlobalInitFromSInt32(pLsrAPIntMinusOne, -1); + pLsrAPIntStaticGlobalInitFromUInt32(pLsrAPIntMinusUInt32Max, UINT32_MAX); + mpz_neg(pLsrAPIntGetZ(pLsrAPIntMinusUInt32Max), pLsrAPIntGetZ(pLsrAPIntMinusUInt32Max)); +}; + +#endif /*_PLSR_AP_INTEGER_H_ */ diff --git a/runtime/include/hrc/plsr-integer.h b/runtime/include/hrc/plsr-integer.h new file mode 100644 index 0000000..13232ee --- /dev/null +++ b/runtime/include/hrc/plsr-integer.h @@ -0,0 +1,651 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_INTEGER_H_ +#define _PLSR_INTEGER_H_ + +/********************************************************************** + * Arbitrary precision integers + */ + +#include "hrc/plsr-ap-integer.h" + +/* GC functions */ + +#define pLsrIntegerIsRef 1 +#define pLsrIntegerRegisterVTables pLsrAPIntRegisterVTables +#define pLsrIntegerRegisterGlobals pLsrAPIntRegisterGlobals + +#ifdef P_USE_TAGGED_INTEGERS + +#include "hrc/plsr-tagged-int32.h" + +/* Types */ + +typedef PlsrAPInt PlsrInteger; +typedef PlsrAPIntS PlsrIntegerU; + +/* Basic constructors */ +#define pLsrIntegerIsTagged(a) (pLsrTaggedInt32TaggedIntIsTagged(a)) + +/* Constants */ + +#define pLsrSmallIntegerMax pLsrTaggedInt32Max +#define pLsrSmallIntegerMin pLsrTaggedInt32Min + +#define pLsrIntegerZero ((PlsrInteger) pLsrTaggedInt32Zero) +#define pLsrIntegerOne ((PlsrInteger) pLsrTaggedInt32One) +#define pLsrIntegerMinusOne ((PlsrInteger) pLsrTaggedInt32MinusOne) +#define pLsrIntegerMinusUInt32Max pLsrAPIntMinusUInt32Max + + +#ifdef PLSR_AP_INT_BACKPATCH_GLOBALS + +#define pLsrIntegerStaticUnboxed pLsrAPIntStaticGlobalNew +#define pLsrIntegerStaticGlobalInit pLsrAPIntStaticGlobalInitFromCString +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & (uvar)) + +#else + +#define pLsrIntegerSignNeg PlsrAPNeg +#define pLsrIntegerSignPos PlsrAPPos +#define pLsrIntegerSignZero PlsrAPZero +#define pLsrIntegerDigitListStaticEmpty pLsrAPIntDigitListStaticEmpty +#define pLsrIntegerDigitListStaticConsUnboxedDef pLsrAPIntDigitListStaticConsUnboxedDef +#define pLsrIntegerDigitListStaticConsRef pLsrAPIntDigitListStaticConsRef +#define pLsrIntegerStaticUnboxedDef pLsrAPIntStaticUnboxedDef +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & uvar) + +#endif + + +/* Conversions */ + +#define pLsrSmallIntegerFromSInt32(a) (pLsrTaggedInt32TaggedIntFromSmallInt32(PlsrInteger, a)) + +#define pLsrIntegerFromSInt8 pLsrIntegerFromSInt32 +#define pLsrSInt8FromInteger pLsrSInt32FromInteger + +#define pLsrIntegerFromSInt16 pLsrIntegerFromSInt32 +#define pLsrSInt16FromInteger pLsrSInt32FromInteger + + +#define pLsrIntegerFromSInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrInteger, sint32, 
pLsrAPIntFromSInt32, dest, a) +#define pLsrSInt32FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint32, PlsrInteger, PlsrAPInt, pLsrSInt32FromAPInt, dest, a) + +#define pLsrIntegerFromSIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrInteger, sintp, pLsrAPIntFromSIntp, dest, a) +#define pLsrSIntpFromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sintp, PlsrInteger, PlsrAPInt, pLsrSIntpFromAPInt, dest, a) + +#define pLsrIntegerFromSInt64(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrInteger, sint64, pLsrAPIntFromSInt64, dest, a) +#define pLsrSInt64FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint64, PlsrInteger, PlsrAPInt, pLsrSInt64FromAPInt, dest, a) + +#define pLsrIntegerFromUInt8 pLsrIntegerFromUInt32 +#define pLsrUInt8FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt16 pLsrIntegerFromUInt32 +#define pLsrUInt16FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrInteger, uint32, pLsrAPIntFromUInt32, dest, a) +#define pLsrUInt32FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uint32, PlsrInteger, PlsrAPInt, pLsrUInt32FromAPInt, dest, a) + +#define pLsrIntegerFromUIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrInteger, uintp, pLsrAPIntFromUIntp, dest, a) +#define pLsrUIntpFromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uintp, PlsrInteger, PlsrAPInt, pLsrUIntpFromAPInt, dest, a) + +#define pLsrIntegerFromUInt64(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrInteger, uint64, pLsrAPIntFromUInt64, dest, a) +#define pLsrUInt64FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uint64, PlsrInteger, PlsrAPInt, pLsrUInt64FromAPInt, dest, a) + +#define pLsrIntegerFromFloat32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromFloat32(PlsrInteger, pLsrAPIntFromFloat32, dest, a) +#define pLsrFloat32FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(float32, PlsrInteger, PlsrAPInt, pLsrFloat32FromAPInt, dest, a) + +#define pLsrIntegerFromFloat64(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromFloat64(PlsrInteger, pLsrAPIntFromFloat64, dest, a) +#define pLsrFloat64FromInteger(dest, a) \ + pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(float64, PlsrInteger, PlsrAPInt, pLsrFloat64FromAPInt, dest, a) + + +/* pLsrCStringFromInteger */ +pLsrTaggedInt32MkCStringFrom(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrCStringFromAPInt); + +/* pLsrIntegerFromCString */ +pLsrTaggedInt32MkFromCString(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, \ + pLsrSInt32FromAPInt, pLsrAPIntLessOrEqual, pLsrAPIntFromCString); + +/* Casts */ + +/* NG: Note that we are assuming that casting from sint32 to sintX and uintX takes the bottom X bits + * This is not portable. 
+ */ + +#define pLsrIntegerCastToSInt8(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint8, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt8, dest, a) + +#define pLsrIntegerCastToSInt16(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint16, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt16, dest, a) + +#define pLsrIntegerCastToSInt32(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint32, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt32, dest, a) + +#define pLsrIntegerCastToSInt64(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint64, PlsrInteger, PlsrAPInt, pLsrAPIntCastToSInt64, dest, a) + +#define pLsrIntegerCastToUInt8(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint8, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt8, dest, a) + +#define pLsrIntegerCastToUInt16(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint16, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt16, dest, a) + +#define pLsrIntegerCastToUInt32(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint32, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt32, dest, a) + +#define pLsrIntegerCastToUInt64(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(uint64, PlsrInteger, PlsrAPInt, pLsrAPIntCastToUInt64, dest, a) + +#define pLsrIntegerCastToFloat32(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(float32, PlsrInteger, PlsrAPInt, pLsrAPIntCastToFloat32, dest, a) + +#define pLsrIntegerCastToFloat64(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(float64, PlsrInteger, PlsrAPInt, pLsrAPIntCastToFloat64, dest, a) + +/* Arithmetic */ + +#ifdef PLSR_TAGGED_INTEGER_RECOVER + +static PlsrInteger pLsrIntegerAPIntBNot(PlsrAPInt a) +{ + PlsrAPInt c = pLsrAPIntBNot(a); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBAnd(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBAnd(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBOr(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBOr(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBShiftL(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBShiftL(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBShiftR(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBShiftR(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntBXor(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntBXor(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntAdd(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntAdd(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, 
pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntSub(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntSub(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +/* Multiplication can only make things worse */ +#define pLsrIntegerAPIntMul pLsrAPIntMul + +static void pLsrIntegerAPIntDivModE(PlsrInteger* q, PlsrInteger* r, PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntDivModE(q, r, a, b); + sint32 qi = pLsrAPIntCheckRangeSInt32 (*q, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + sint32 ri = pLsrAPIntCheckRangeSInt32 (*r, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (qi > SINT32_MIN) { + *q = pLsrSmallIntegerFromSInt32(qi); + } + if (ri > SINT32_MIN) { + *r = pLsrSmallIntegerFromSInt32(ri); + } +} + +static void pLsrIntegerAPIntDivModF(PlsrInteger* q, PlsrInteger* r, PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntDivModF(q, r, a, b); + sint32 qi = pLsrAPIntCheckRangeSInt32 (*q, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + sint32 ri = pLsrAPIntCheckRangeSInt32 (*r, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (qi > SINT32_MIN) { + *q = pLsrSmallIntegerFromSInt32(qi); + } + if (ri > SINT32_MIN) { + *r = pLsrSmallIntegerFromSInt32(ri); + } +} + +static void pLsrIntegerAPIntDivModT(PlsrInteger* q, PlsrInteger* r, PlsrAPInt a, PlsrAPInt b) +{ + pLsrAPIntDivModT(q, r, a, b); + sint32 qi = pLsrAPIntCheckRangeSInt32 (*q, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + sint32 ri = pLsrAPIntCheckRangeSInt32 (*r, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (qi > SINT32_MIN) { + *q = pLsrSmallIntegerFromSInt32(qi); + } + if (ri > SINT32_MIN) { + *r = pLsrSmallIntegerFromSInt32(ri); + } +} + +static PlsrInteger pLsrIntegerAPIntDivE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntDivE(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntDivF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntDivF(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntDivT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntDivT(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntModE(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntModE(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntModF(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntModF(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + return c; + } +} + +static PlsrInteger pLsrIntegerAPIntModT(PlsrAPInt a, PlsrAPInt b) +{ + PlsrAPInt c = pLsrAPIntModT(a, b); + sint32 ci = pLsrAPIntCheckRangeSInt32 (c, pLsrTaggedInt32Max, pLsrTaggedInt32Min); + if (ci > SINT32_MIN) { + return pLsrSmallIntegerFromSInt32(ci); + } else { + 
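        /* Result is outside the tagged range (signalled by SINT32_MIN), so keep the boxed AP int. */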
return c; + } +} + +#else +#define pLsrIntegerAPIntBNot pLsrAPIntBNot +#define pLsrIntegerAPIntBAnd pLsrAPIntBAnd +#define pLsrIntegerAPIntBOr pLsrAPIntBOr +#define pLsrIntegerAPIntBShiftL pLsrAPIntBShiftL +#define pLsrIntegerAPIntBShiftR pLsrAPIntBShiftR +#define pLsrIntegerAPIntBXor pLsrAPIntBXor +#define pLsrIntegerAPIntAdd pLsrAPIntAdd +#define pLsrIntegerAPIntSub pLsrAPIntSub +#define pLsrIntegerAPIntMul pLsrAPIntMul +#define pLsrIntegerAPIntDivModE pLsrAPIntDivModE +#define pLsrIntegerAPIntDivModF pLsrAPIntDivModF +#define pLsrIntegerAPIntDivModT pLsrAPIntDivModT +#define pLsrIntegerAPIntDivE pLsrAPIntDivE +#define pLsrIntegerAPIntDivF pLsrAPIntDivF +#define pLsrIntegerAPIntDivT pLsrAPIntDivT +#define pLsrIntegerAPIntModE pLsrAPIntModE +#define pLsrIntegerAPIntModF pLsrAPIntModF +#define pLsrIntegerAPIntModT pLsrAPIntModT +#endif + + +/**************************** Bitwise *********************************/ + +/* pLsrTaggedInt32IntegerBNotSlow*/ +#define pLsrIntegerBNot(dest, a) \ + pLsrTaggedInt32TaggedIntBNot(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBNotSlow, dest, a) + +/* pLsrTaggedInt32IntegerBAndSlow*/ +pLsrTaggedInt32MkTaggedIntBAndSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBAnd); +#define pLsrIntegerBAnd(dest, a, b) \ + pLsrTaggedInt32TaggedIntBAnd(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBAndSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBOrSlow*/ +pLsrTaggedInt32MkTaggedIntBOrSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBOr); +#define pLsrIntegerBOr(dest, a, b) \ + pLsrTaggedInt32TaggedIntBOr(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBOrSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBShiftLSlow*/ +pLsrTaggedInt32MkTaggedIntBShiftLSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBShiftL); +#define pLsrIntegerBShiftL(dest, a, b) \ + pLsrTaggedInt32TaggedIntBShiftL(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBShiftLSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBShiftRSlow*/ +pLsrTaggedInt32MkTaggedIntBShiftRSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBShiftR); +#define pLsrIntegerBShiftR(dest, a, b) \ + pLsrTaggedInt32TaggedIntBShiftR(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBShiftRSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerBXorSlow*/ +pLsrTaggedInt32MkTaggedIntBXorSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntBXor); +#define pLsrIntegerBXor(dest, a, b) \ + pLsrTaggedInt32TaggedIntBXor(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerBXorSlow, dest, a, b) + + +/**************************** Arithmetic *********************************/ + +#define pLsrIntegerNegate(dest, r) \ + pLsrTaggedInt32TaggedIntNeg(PlsrInteger, PlsrAPInt, pLsrAPIntNegate, pLsrAPIntFromSInt32, dest, r) + +/* pLsrTaggedInt32IntegerAddSlow*/ +pLsrTaggedInt32MkTaggedIntAddSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntAdd); +#define pLsrIntegerPlus(dest, a, b) \ + pLsrTaggedInt32TaggedIntAdd(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerAddSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerSubSlow*/ +pLsrTaggedInt32MkTaggedIntSubSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntSub); +#define pLsrIntegerMinus(dest, a, b) \ + pLsrTaggedInt32TaggedIntSub(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerSubSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerMulSlow*/ +pLsrTaggedInt32MkTaggedIntMulSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntFromSInt64, + pLsrIntegerAPIntMul); +#define 
pLsrIntegerTimes(dest, a, b) \ + pLsrTaggedInt32TaggedIntMul(PlsrInteger, PlsrAPInt, pLsrTaggedInt32IntegerMulSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerDivModESlow */ +pLsrTaggedInt32MkTaggedIntDivModESlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntDivModE); +#define pLsrIntegerDivModE(q, r, a, b) \ + pLsrTaggedInt32TaggedIntDivModE(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModESlow, q, r, a, b) +#define pLsrIntegerDivE(dest, a, b) \ + pLsrTaggedInt32TaggedIntDivE(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModESlow, dest, a, b) +#define pLsrIntegerModE(dest, a, b) \ + pLsrTaggedInt32TaggedIntModE(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModESlow, dest, a, b) + +/* pLsrTaggedInt32IntegerDivModFSlow */ +pLsrTaggedInt32MkTaggedIntDivModFSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntDivModF); +#define pLsrIntegerDivModF(q, r, a, b) \ + pLsrTaggedInt32TaggedIntDivModF(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModFSlow, q, r, a, b) +#define pLsrIntegerDivF(dest, a, b) \ + pLsrTaggedInt32TaggedIntDivF(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModFSlow, dest, a, b) +#define pLsrIntegerModF(dest, a, b) \ + pLsrTaggedInt32TaggedIntModF(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModFSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerDivModTSlow */ +pLsrTaggedInt32MkTaggedIntDivModTSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrIntegerAPIntDivModT); +#define pLsrIntegerDivModT(q, r, a, b) \ + pLsrTaggedInt32TaggedIntDivModT(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModTSlow, q, r, a, b) +#define pLsrIntegerDivT(dest, a, b) \ + pLsrTaggedInt32TaggedIntDivT(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModTSlow, dest, a, b) +#define pLsrIntegerModT(dest, a, b) \ + pLsrTaggedInt32TaggedIntModT(PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrTaggedInt32IntegerDivModTSlow, dest, a, b) + +/* pLsrTaggedInt32IntegerGcd */ +pLsrTaggedInt32MkTaggedIntGcd(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, \ + pLsrAPIntLessOrEqual, pLsrSInt32FromAPInt, pLsrAPIntGcd); +#define pLsrIntegerGcd pLsrTaggedInt32IntegerGcd + +/* Comparisons */ +pLsrTaggedInt32MkTaggedIntEqSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntEqual); +pLsrTaggedInt32MkTaggedIntNeSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntNotEqual); +pLsrTaggedInt32MkTaggedIntLtSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntLess); +pLsrTaggedInt32MkTaggedIntGtSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntGreater); +pLsrTaggedInt32MkTaggedIntLeSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntLessOrEqual); +pLsrTaggedInt32MkTaggedIntGeSlow(Integer, PlsrInteger, PlsrAPInt, pLsrAPIntFromSInt32, pLsrAPIntGreaterOrEqual); + +#define pLsrIntegerEQ(dest, a, b) pLsrTaggedInt32TaggedIntEq(pLsrTaggedInt32IntegerEqSlow, dest, a, b) +#define pLsrIntegerNE(dest, a, b) pLsrTaggedInt32TaggedIntNe(pLsrTaggedInt32IntegerNeSlow, dest, a, b) +#define pLsrIntegerLT(dest, a, b) pLsrTaggedInt32TaggedIntLt(pLsrTaggedInt32IntegerLtSlow, dest, a, b) +#define pLsrIntegerGT(dest, a, b) pLsrTaggedInt32TaggedIntGt(pLsrTaggedInt32IntegerGtSlow, dest, a, b) +#define pLsrIntegerLE(dest, a, b) pLsrTaggedInt32TaggedIntLe(pLsrTaggedInt32IntegerLeSlow, dest, a, b) +#define pLsrIntegerGE(dest, a, b) 
pLsrTaggedInt32TaggedIntGe(pLsrTaggedInt32IntegerGeSlow, dest, a, b) + +/* Miscellaneous */ +#define pLsrIntegerFitsInSInt32(a) ((pLsrIntegerIsTagged(a)) || (pLsrAPIntFitsInSInt32(a))) + +static sint32 pLsrIntegerCheckRangeSInt32(PlsrInteger a, sint32 upper, sint32 lower) +{ + if (pLsrIntegerIsTagged(a)) { + sint32 ai; + pLsrSInt32FromInteger(ai, a); + return ((ai <= upper) && (ai >= lower)) ? ai : SINT32_MIN; + } else { + return pLsrAPIntCheckRangeSInt32(a, upper, lower); + } +} + +#define pLsrIntegerHash(i) pLsrTaggedInt32TaggedIntHash(PlsrInteger, PlsrAPInt, pLsrAPIntHash, i) + +#else /* !P_USE_TAGGED_INTEGERS */ + +#define pLsrIntegerIsTagged(a) (0) + +/* Types */ + +typedef PlsrAPInt PlsrInteger; +typedef PlsrAPIntS PlsrIntegerU; + +/* Constants */ + +#define pLsrSmallIntegerMax pLsrTaggedInt32Max +#define pLsrSmallIntegerMin pLsrTaggedInt32Min + +#define pLsrIntegerZero pLsrAPIntZero +#define pLsrIntegerOne pLsrAPIntOne +#define pLsrIntegerMinusOne pLsrAPIntMinusOne +#define pLsrIntegerMinusUInt32Max pLsrAPIntMinusUInt32Max + +#define pLsrIntegerSignNeg PlsrAPNeg +#define pLsrIntegerSignPos PlsrAPPos +#define pLsrIntegerSignZero PlsrAPZero + +#ifdef PLSR_AP_INT_BACKPATCH_GLOBALS + +#define pLsrIntegerStaticUnboxed pLsrAPIntStaticGlobalNew +#define pLsrIntegerStaticGlobalInit pLsrAPIntStaticGlobalInitFromCString +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & (uvar)) + +#else + +#define pLsrIntegerSignNeg PlsrAPNeg +#define pLsrIntegerSignPos PlsrAPPos +#define pLsrIntegerSignZero PlsrAPZero +#define pLsrIntegerDigitListStaticEmpty pLsrAPIntDigitListStaticEmpty +#define pLsrIntegerDigitListStaticConsUnboxedDef pLsrAPIntDigitListStaticConsUnboxedDef +#define pLsrIntegerDigitListStaticConsRef pLsrAPIntDigitListStaticConsRef +#define pLsrIntegerStaticUnboxedDef pLsrAPIntStaticUnboxedDef +#define pLsrIntegerStaticRef(uvar) ((PlsrAPInt) & uvar) + +#endif + + +/* Conversions */ + +#define pLsrSmallIntegerFromSInt32 pLsrAPIntFromSInt32 + +#define pLsrIntegerFromSInt8 pLsrIntegerFromSInt32 +#define pLsrSInt8FromInteger pLsrSInt32FromInteger + +#define pLsrIntegerFromSInt16 pLsrIntegerFromSInt32 +#define pLsrSInt16FromInteger pLsrSInt32FromInteger + +#define pLsrIntegerFromSInt32(dest, i) ((dest) = (pLsrAPIntFromSInt32(i))) +#define pLsrSInt32FromInteger(dest, i) ((dest) = (pLsrSInt32FromAPInt(i))) + +#define pLsrIntegerFromSInt64(dest, i) ((dest) = (pLsrAPIntFromSInt64(i))) +#define pLsrSInt64FromInteger(dest, i) ((dest) = (pLsrSInt64FromAPInt(i))) + +#define pLsrIntegerFromSIntp(dest, i) ((dest) = (pLsrAPIntFromSIntp(i))) +#define pLsrSIntpFromInteger(dest, i) ((dest) = (pLsrSIntpFromAPInt(i))) + + +#define pLsrIntegerFromUInt8 pLsrIntegerFromUInt32 +#define pLsrUInt8FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt16 pLsrIntegerFromUInt32 +#define pLsrUInt16FromInteger pLsrUInt32FromInteger + +#define pLsrIntegerFromUInt32(dest, i) ((dest) = (pLsrAPIntFromUInt32(i))) +#define pLsrUInt32FromInteger(dest, i) ((dest) = (pLsrUInt32FromAPInt(i))) + +#define pLsrIntegerFromUInt64(dest, i) ((dest) = (pLsrAPIntFromUInt64(i))) +#define pLsrUInt64FromInteger(dest, i) ((dest) = (pLsrUInt64FromAPInt(i))) + +#define pLsrIntegerFromUIntp(dest, i) ((dest) = (pLsrAPIntFromUIntp(i))) +#define pLsrUIntpFromInteger(dest, i) ((dest) = (pLsrUIntpFromAPInt(i))) + +#define pLsrCStringFromInteger pLsrCStringFromAPInt +#define pLsrIntegerFromCString pLsrAPIntFromCString + +/* Bitwise */ + +#define pLsrIntegerBNot(dest, i) ((dest) = (pLsrAPIntBNot(i))) +#define pLsrIntegerBAnd(dest, a, b) ((dest) = 
(pLsrAPIntBAnd(a, b))) +#define pLsrIntegerBOr(dest, a, b) ((dest) = (pLsrAPIntBOr(a, b))) +#define pLsrIntegerBShiftL(dest, a, b) ((dest) = (pLsrAPIntBShiftL(a, b))) +#define pLsrIntegerBShiftR(dest, a, b) ((dest) = (pLsrAPIntBShiftR(a, b))) +#define pLsrIntegerBXor(dest, a, b) ((dest) = (pLsrAPIntBXor(a, b))) + +/* Arithmetic */ + +#define pLsrIntegerNegate(dest, i) ((dest) = (pLsrAPIntNegate(i))) +#define pLsrIntegerPlus(dest, a, b) ((dest) = (pLsrAPIntAdd(a, b))) +#define pLsrIntegerMinus(dest, a, b) ((dest) = (pLsrAPIntSub(a, b))) +#define pLsrIntegerTimes(dest, a, b) ((dest) = (pLsrAPIntMul(a, b))) + +#define pLsrIntegerDivModE(q, r, a, b) \ + do { \ + PlsrAPInt pLsrIntegerDivModE_q = NULL; \ + PlsrAPInt pLsrIntegerDivModE_r = NULL; \ + pLsrAPIntDivModE(&pLsrIntegerDivModE_q, &pLsrIntegerDivModE_r, a, b); \ + q = pLsrIntegerDivModE_q; \ + r = pLsrIntegerDivModE_r; \ + } while (0) +#define pLsrIntegerDivModF(q, r, a, b) \ + do { \ + PlsrAPInt pLsrIntegerDivModF_q = NULL; \ + PlsrAPInt pLsrIntegerDivModF_r = NULL; \ + pLsrAPIntDivModF(&pLsrIntegerDivModF_q, &pLsrIntegerDivModF_r, a, b); \ + q = pLsrIntegerDivModF_q; \ + r = pLsrIntegerDivModF_r; \ + } while (0) +#define pLsrIntegerDivModT(q, r, a, b) \ + do { \ + PlsrAPInt pLsrIntegerDivModT_q = NULL; \ + PlsrAPInt pLsrIntegerDivModT_r = NULL; \ + pLsrAPIntDivModT(&pLsrIntegerDivModT_q, &pLsrIntegerDivModT_r, a, b); \ + q = pLsrIntegerDivModT_q; \ + r = pLsrIntegerDivModT_r; \ + } while (0) + + +#define pLsrIntegerDivE(dest, a, b) ((dest) = (pLsrAPIntDivE(a, b))) +#define pLsrIntegerModE(dest, a, b) ((dest) = (pLsrAPIntModE(a, b))) + +#define pLsrIntegerDivF(dest, a, b) ((dest) = (pLsrAPIntDivF(a, b))) +#define pLsrIntegerModF(dest, a, b) ((dest) = (pLsrAPIntModF(a, b))) + +#define pLsrIntegerDivT(dest, a, b) ((dest) = (pLsrAPIntDivT(a, b))) +#define pLsrIntegerModT(dest, a, b) ((dest) = (pLsrAPIntModT(a, b))) + +#define pLsrIntegerGcd pLsrAPIntGcd + +/* Comparisons */ + +#define pLsrIntegerEQ(dest, a, b) ((dest) = (pLsrAPIntEqual(a, b))) +#define pLsrIntegerNE(dest, a, b) ((dest) = (pLsrAPIntNotEqual(a, b))) +#define pLsrIntegerLT(dest, a, b) ((dest) = (pLsrAPIntLess(a, b))) +#define pLsrIntegerGT(dest, a, b) ((dest) = (pLsrAPIntGreater(a, b))) +#define pLsrIntegerLE(dest, a, b) ((dest) = (pLsrAPIntLessOrEqual(a, b))) +#define pLsrIntegerGE(dest, a, b) ((dest) = (pLsrAPIntGreaterOrEqual(a, b))) + +/* Miscellaneous */ +#define pLsrIntegerFitsInSInt32 pLsrAPIntFitsInSInt32 +#define pLsrIntegerCheckRangeSInt32 pLsrAPIntCheckRangeSInt32 + +#define pLsrIntegerHash(i) (pLsrAPIntHash(i)) + +#endif /* P_USE_TAGGED_INTEGERS */ +#endif /*_PLSR_INTEGER_H_ */ diff --git a/runtime/include/hrc/plsr-lightweight-thunk.h b/runtime/include/hrc/plsr-lightweight-thunk.h new file mode 100644 index 0000000..d106ac8 --- /dev/null +++ b/runtime/include/hrc/plsr-lightweight-thunk.h @@ -0,0 +1,568 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Unified Futures Thunk Implementation */ + +#ifndef _PLSR_THUNK_H_ +#define _PLSR_THUNK_H_ + +#ifndef PLSR_THUNK_NO_SUBSUMPTION +#define PLSR_THUNK_SUBSUMPTION +#endif + +#ifdef P_USE_PARALLEL_FUTURES +#error "Parallel futures not supported with lightweight thunks" +#endif + +#ifdef PLSR_THUNK_INTERCEPT_CUTS +#error "Cut interception not supported on lightweight thunks" + #ifndef P_USE_PILLAR + #error "Cut interception only supported on Pillar" + #endif +#endif + +/*** Types ***/ + +#define PlsrTypesMk(name, retType) \ + typedef retType PlsrThunkReturnType##name; \ + typedef
PlsrThunkReturnType##name (*PlsrThunkCode##name)(PlsrThunkB##name); \ + /* PlsrObjectU must be a prefix of this structure */ \ + typedef struct PlsrThunkSVS##name { \ + PlsrVTable vtable; \ + retType result; \ + } PlsrThunkSV##name; \ + typedef struct PlsrThunkSDS##name { \ + PlsrVTable vtable; \ + char fvs[]; \ + } PlsrThunkSD##name; \ + typedef struct PlsrThunkSES##name { \ + PlsrVTable vtable; \ + PlsrRef exn; \ + } PlsrThunkSE##name; \ + typedef union PlsrThunkUU##name { \ + PlsrThunkSD##name delay; \ + PlsrThunkSV##name value; \ + PlsrThunkSE##name exn; \ + } PlsrThunkU##name + +/* + * PlsrThunkCodeRef, pLsrThunkSRef, pLsrThunkURef + * PlsrThunkCode32, pLsrThunkS32, pLsrThunkU32 + * PlsrThunkCode64, pLsrThunkS64, pLsrThunkU64 + * PlsrThunkCodeFloat, pLsrThunkSFloat, pLsrThunkUFloat + * PlsrThunkCodeDouble, pLsrThunkSDouble, pLsrThunkUDouble + */ +PlsrTypesMk(Ref, PlsrRef); +PlsrTypesMk(32, uint32); +PlsrTypesMk(64, uint64); +PlsrTypesMk(Float, float32); +PlsrTypesMk(Double, float64); + +#define pLsrThunkRef(t) ((PlsrThunkURef*)(t)) +#define pLsrThunk32(t) ((PlsrThunkU32*)(t)) +#define pLsrThunk64(t) ((PlsrThunkU64*)(t)) +#define pLsrThunkFloat(t) ((PlsrThunkUFloat*)(t)) +#define pLsrThunkDouble(t) ((PlsrThunkUDouble*)(t)) + +#define pLsrThunkResultFieldRef(thunk) (pLsrThunkRef(thunk)->value.result) +#define pLsrThunkResultField32(thunk) (pLsrThunk32(thunk)->value.result) +#define pLsrThunkResultField64(thunk) (pLsrThunk64(thunk)->value.result) +#define pLsrThunkResultFieldFloat(thunk) (pLsrThunkFloat(thunk)->value.result) +#define pLsrThunkResultFieldDouble(thunk) (pLsrThunkDouble(thunk)->value.result) + +#define pLsrThunkExnFieldRef(thunk) (pLsrThunkRef(thunk)->exn.exn) +#define pLsrThunkExnField32(thunk) (pLsrThunk32(thunk)->exn.exn) +#define pLsrThunkExnField64(thunk) (pLsrThunk64(thunk)->exn.exn) +#define pLsrThunkExnFieldFloat(thunk) (pLsrThunkFloat(thunk)->exn.exn) +#define pLsrThunkExnFieldDouble(thunk) (pLsrThunkDouble(thunk)->exn.exn) + +#define pLsrThunkVTableFieldRef(thunk) (pLsrThunkRef(thunk)->value.vtable) +#define pLsrThunkVTableField32(thunk) (pLsrThunk32(thunk)->value.vtable) +#define pLsrThunkVTableField64(thunk) (pLsrThunk64(thunk)->value.vtable) +#define pLsrThunkVTableFieldFloat(thunk) (pLsrThunkFloat(thunk)->value.vtable) +#define pLsrThunkVTableFieldDouble(thunk) (pLsrThunkDouble(thunk)->value.vtable) + +#define pLsrThunkGetCodeRef(thunk) ((PlsrThunkCodeRef)pLsrVTableGetCustom(pLsrThunkRef(thunk)->delay.vtable)) +#define pLsrThunkGetCode32(thunk) ((PlsrThunkCode32)pLsrVTableGetCustom(pLsrThunk32(thunk)->delay.vtable)) +#define pLsrThunkGetCode64(thunk) ((PlsrThunkCode64)pLsrVTableGetCustom(pLsrThunk64(thunk)->delay.vtable)) +#define pLsrThunkGetCodeFloat(thunk) ((PlsrThunkCodeFloat)pLsrVTableGetCustom(pLsrThunkFloat(thunk)->delay.vtable)) +#define pLsrThunkGetCodeDouble(thunk) ((PlsrThunkCodeDouble)pLsrVTableGetCustom(pLsrThunkDouble(thunk)->delay.vtable)) + +#ifdef PLSR_THUNK_SUBSUMPTION +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) thunk) +#else /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) 0) +#endif /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObject32(thunk) (assert(0), (uint32) 0) +#define pLsrThunkCastToObject64(thunk) (assert(0), (uint64) 0) +#define pLsrThunkCastToObjectFloat(thunk) (assert(0), (float32) 0) +#define pLsrThunkCastToObjectDouble(thunk) (assert(0), (float64) 0) + +/* Global Thunks */ + +#define pLsrThunkStaticValueMk(name, v, vt, retType, val) \ + static PlsrThunkU##name v = \ + { .value = 
{.vtable = (vt), \ + .result = (retType) (val) } \ + } + +#ifdef PLSR_THUNK_SUBSUMPTION +/* For now leave globals boxed, since there are some mil-to-pil complications +* with doing otherwise -leaf */ + #define pLsrThunkSubsumptiveStaticValueMk pLsrThunkStaticValueMk +#else /* ! PLSR_THUNK_SUBSUMPTION */ + #define pLsrThunkSubsumptiveStaticValueMk pLsrThunkStaticValueMk +#endif /* ! PLSR_THUNK_SUBSUMPTION */ + +/* These are marked ALWAYS_IMMUTABLE, so must be initialized atomically wrt the gc */ +#define pLsrThunkStaticValueRef(v, val) \ + pLsrThunkSubsumptiveStaticValueMk(Ref, v, pLsrThunkValVTableRef, PlsrRef, val) +#define pLsrThunkStaticValue32(v, val) pLsrThunkStaticValueMk(32, v, pLsrThunkValVTable32, uint32, val) +#define pLsrThunkStaticValue64(v, val) pLsrThunkStaticValueMk(64, v, pLsrThunkValVTable64, uint64, val) +#define pLsrThunkStaticValueFloat(v, val) pLsrThunkStaticValueMk(Float, v, pLsrThunkValVTableFloat, float32, val) +#define pLsrThunkStaticValueDouble(v, val) pLsrThunkStaticValueMk(Double, v, pLsrThunkValVTableDouble, float64, val) + +/* Creation */ +#define pLsrThunkUninitCodeMk(name) \ + static PlsrThunkReturnType##name pLsrThunkUninitCode##name(PlsrThunkB##name thunk) \ + { \ + pLsrRuntimeError_("Tried to evaluate uninitialized thunk"); \ + return (PlsrThunkReturnType##name) 0; \ + } +/* pLsrThunkUninitCodeRef, pLsrThunkUninitCode32, pLsrThunkUninitCode64 + * pLsrThunkUninitCodeFloat, pLsrThunkUninitCodeDouble + */ +pLsrThunkUninitCodeMk(Ref); +pLsrThunkUninitCodeMk(32); +pLsrThunkUninitCodeMk(64); +pLsrThunkUninitCodeMk(Float); +pLsrThunkUninitCodeMk(Double); + +/* This cannot engender a yield */ +#define pLsrThunkNewMk(name, dv, vt, sz, algn) \ + do { \ + pLsrAllocAligned(PlsrThunkB##name, (dv), (vt), (sz), (algn)); \ + assert((vt)->tag==VThunkTag); \ + } while (0) + +#ifdef PLSR_ZERO_REFS +#define pLsrThunkNewRef(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Ref, dv, vt, sz, algn); \ + pLsrThunkResultFieldRef(dv) = NULL; \ + pLsrThunkExnFieldRef(dv) = NULL; \ + } while (0) +#define pLsrThunkNew32(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(32, dv, vt, sz, algn); \ + pLsrThunkExnField32(dv) = NULL; \ + } while (0) +#define pLsrThunkNew64(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(64, dv, vt, sz, algn); \ + pLsrThunkExnField64(dv) = NULL; \ + } while (0) +#define pLsrThunkNewFloat(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Float, dv, vt, sz, algn); \ + pLsrThunkExnFieldFloat(dv) = NULL; \ + } while (0) +#define pLsrThunkNewDouble(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Double, dv, vt, sz, algn); \ + pLsrThunkExnFieldDouble(dv) = NULL; \ + } while (0) +#else +#define pLsrThunkNewRef(dv, vt, sz, algn) pLsrThunkNewMk(Ref, dv, vt, sz, algn) +#define pLsrThunkNew32(dv, vt, sz, algn) pLsrThunkNewMk(32, dv, vt, sz, algn) +#define pLsrThunkNew64(dv, vt, sz, algn) pLsrThunkNewMk(64, dv, vt, sz, algn) +#define pLsrThunkNewFloat(dv, vt, sz, algn) pLsrThunkNewMk(Float, dv, vt, sz, algn) +#define pLsrThunkNewDouble(dv, vt, sz, algn) pLsrThunkNewMk(Double, dv, vt, sz, algn) +#endif + +/* Initialisation */ + +#define pLsrThunkSetInitMk(name, thunk, vtable) \ + pLsrObjectChangeVTableMandatory(thunk, vtable) + +#define pLsrThunkSetInitRef(thunk, vtable) pLsrThunkSetInitMk(Ref, thunk, vtable) +#define pLsrThunkSetInit32(thunk, vtable) pLsrThunkSetInitMk(32, thunk, vtable) +#define pLsrThunkSetInit64(thunk, vtable) pLsrThunkSetInitMk(64, thunk, vtable) +#define pLsrThunkSetInitFloat(thunk, vtable) pLsrThunkSetInitMk(Float, thunk, vtable) +#define 
pLsrThunkSetInitDouble(thunk, vtable) pLsrThunkSetInitMk(Double, thunk, vtable) + +#define pLsrThunkSetValueRef(thunk, v) \ + (pLsrWriteBarrierRefBase((PlsrThunkBRef)(thunk), pLsrThunkResultFieldRef(thunk), (v))) +/* This should not engender a yield */ +#define pLsrThunkSetValueNonWbMk(name, thunk, v) \ + (pLsrThunkResultField##name(thunk) = (v)) + +#define pLsrThunkSetValue32(thunk, v) pLsrThunkSetValueNonWbMk(32, thunk, v) +#define pLsrThunkSetValue64(thunk, v) pLsrThunkSetValueNonWbMk(64, thunk, v) +#define pLsrThunkSetValueFloat(thunk, v) pLsrThunkSetValueNonWbMk(Float, thunk, v) +#define pLsrThunkSetValueDouble(thunk, v) pLsrThunkSetValueNonWbMk(Double, thunk, v) + +#define pLsrThunkValueInitRef(thunk, v) \ + do { \ + pLsrThunkSetValueRef(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTableRef); \ + } while (0) + +#define pLsrThunkValueInit32(thunk, v) \ + do { \ + pLsrThunkSetValue32(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTable32); \ + } while (0) + +#define pLsrThunkValueInit64(thunk, v) \ + do { \ + pLsrThunkSetValue64(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTable64); \ + } while (0) + +#define pLsrThunkValueInitFloat(thunk, v) \ + do { \ + pLsrThunkSetValueFloat(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTableFloat); \ + } while (0) + +#define pLsrThunkValueInitDouble(thunk, v) \ + do { \ + pLsrThunkSetValueDouble(thunk, v); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkValVTableDouble); \ + } while (0) + +#define pLsrThunkNewValueMk(name, dv, vt, sz, algn, v) \ + do { \ + noyield { \ + pLsrThunkNew##name(dv, vt, sz, algn); \ + pLsrThunkSetValueNonWbMk(name, dv, v); \ + } \ + } while(0) + +#ifdef PLSR_THUNK_SUBSUMPTION + +#define pLsrThunkSubsumptiveNewValueMk(name, dv, vt, sz, algn, v) ((dv) = ((PlsrThunkBRef) (v))) + +#else /* ! PLSR_THUNK_SUBSUMPTION */ + +#define pLsrThunkSubsumptiveNewValueMk pLsrThunkNewValueMk + +#endif /* ! 
PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkNewValueRef(dv, vt, sz, algn, v) pLsrThunkSubsumptiveNewValueMk(Ref, dv, vt, sz, algn, ((PlsrRef) v)) +#define pLsrThunkNewValue32(dv, vt, sz, algn, v) pLsrThunkNewValueMk(32, dv, vt, sz, algn, v) +#define pLsrThunkNewValue64(dv, vt, sz, algn, v) pLsrThunkNewValueMk(64, dv, vt, sz, algn, v) +#define pLsrThunkNewValueFloat(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Float, dv, vt, sz, algn, v) +#define pLsrThunkNewValueDouble(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Double, dv, vt, sz, algn, v) + +static PlsrThunkBRef pLsrThunkNewValRef(PlsrRef v) +{ +#ifdef PLSR_THUNK_SUBSUMPTION + return (PlsrThunkBRef) v; +#else + noyield { + PlsrThunkBRef res; + pLsrAlloc(PlsrThunkBRef, res, pLsrThunkValVTableRef, sizeof(PlsrThunkURef)); + pLsrThunkSetValueRef(res, v); + return res; + } +#endif +} + +/* Projection */ +#ifdef PLSR_THUNK_SUBSUMPTION + #define pLsrThunkIsUnboxedRef(thunk) \ + (pLsrIntegerIsTagged(thunk) || (pLsrThunkVTableFieldRef(thunk)->tag != VThunkTag)) +#else + #define pLsrThunkIsUnboxedRef(thunk) (assert(pLsrThunkVTableFieldRef(thunk)->tag == VThunkTag), 0) +#endif +#define pLsrThunkIsUnboxed32(thunk) (assert(pLsrThunkVTableField32(thunk)->tag == VThunkTag), 0) +#define pLsrThunkIsUnboxed64(thunk) (assert(pLsrThunkVTableField64(thunk)->tag == VThunkTag), 0) +#define pLsrThunkIsUnboxedFloat(thunk) (assert(pLsrThunkVTableFieldFloat(thunk)->tag == VThunkTag), 0) +#define pLsrThunkIsUnboxedDouble(thunk) (assert(pLsrThunkVTableFieldDouble(thunk)->tag == VThunkTag), 0) + +#define pLsrThunkIsEvaledRef(thunk) (pLsrThunkVTableFieldRef(thunk) == pLsrThunkValVTableRef) +#define pLsrThunkIsEvaled32(thunk) (pLsrThunkVTableField32(thunk) == pLsrThunkValVTable32) +#define pLsrThunkIsEvaled64(thunk) (pLsrThunkVTableField64(thunk) == pLsrThunkValVTable64) +#define pLsrThunkIsEvaledFloat(thunk) (pLsrThunkVTableFieldFloat(thunk) == pLsrThunkValVTableFloat) +#define pLsrThunkIsEvaledDouble(thunk) (pLsrThunkVTableFieldDouble(thunk) == pLsrThunkValVTableDouble) + +#define pLsrThunkGetValRef(thunk) pLsrThunkResultFieldRef(thunk) +#define pLsrThunkGetVal32(thunk) pLsrThunkResultField32(thunk) +#define pLsrThunkGetVal64(thunk) pLsrThunkResultField64(thunk) +#define pLsrThunkGetValFloat(thunk) pLsrThunkResultFieldFloat(thunk) +#define pLsrThunkGetValDouble(thunk) pLsrThunkResultFieldDouble(thunk) + + +/* Evaluation */ + +#define pLsrThunkCallMk(name, thunk) pLsrThunkGetCode##name(thunk)(thunk) +#define pLsrThunkTailCallMk(name, thunk) TAILCALL((pLsrThunkGetCode##name(thunk))(thunk)) + +#define pLsrThunkCallDirectMk(name, code, thunk) code(thunk) +#define pLsrThunkTailCallDirectMk(name, code, thunk) TAILCALL(code(thunk)) + +#define pLsrThunkCallDirectNoCutsMk(name, code, thunk) code(thunk) +#define pLsrThunkTailCallDirectNoCutsMk(name, code, thunk) TAILCALL(code(thunk)) + +#define pLsrThunkEvalMk(name, thunk) \ + (pLsrThunkIsEvaled##name(thunk)) ? pLsrThunkGetVal##name(thunk) : pLsrThunkCall##name(thunk) +#define pLsrThunkEvalDirectMk(name, code, thunk) \ + (pLsrThunkIsEvaled##name(thunk)) ? 
pLsrThunkGetVal##name(thunk) : pLsrThunkCallDirect##name(code, thunk)
+#define pLsrThunkTailEvalMk(name, thunk)                                \
+    do {                                                                \
+        if (pLsrThunkIsEvaled##name(thunk)) {                           \
+            return pLsrThunkGetVal##name(thunk);                        \
+        } else {                                                        \
+            pLsrThunkTailCall##name(thunk);                             \
+        }                                                               \
+    } while (0)
+#define pLsrThunkTailEvalDirectMk(name, code, thunk)                    \
+    do {                                                                \
+        if (pLsrThunkIsEvaled##name(thunk)) {                           \
+            return pLsrThunkGetVal##name(thunk);                        \
+        } else {                                                        \
+            pLsrThunkTailCallDirect##name(code, thunk);                 \
+        }                                                               \
+    } while (0)
+
+#ifdef PLSR_THUNK_SUBSUMPTION
+#define pLsrThunkSubsumptiveEvalMk(name, thunk)                         \
+    (pLsrThunkIsUnboxed##name(thunk)) ? pLsrThunkCastToObject##name(thunk) : pLsrThunkEvalMk(name, thunk)
+#define pLsrThunkSubsumptiveEvalDirectMk(name, code, thunk)             \
+    (pLsrThunkIsUnboxed##name(thunk)) ? pLsrThunkCastToObject##name(thunk) : pLsrThunkEvalDirectMk(name, code, thunk)
+#define pLsrThunkSubsumptiveTailEvalMk(name, thunk)                     \
+    do {                                                                \
+        if (pLsrThunkIsUnboxed##name(thunk)) {                          \
+            return pLsrThunkCastToObject##name(thunk);                  \
+        } else {                                                        \
+            pLsrThunkTailEvalMk(name, thunk);                           \
+        }                                                               \
+    } while (0)
+#define pLsrThunkSubsumptiveTailEvalDirectMk(name, code, thunk)         \
+    do {                                                                \
+        if (pLsrThunkIsUnboxed##name(thunk)) {                          \
+            return pLsrThunkCastToObject##name(thunk);                  \
+        } else {                                                        \
+            pLsrThunkTailEvalDirectMk(name, code, thunk);               \
+        }                                                               \
+    } while (0)
+#else /* ! PLSR_THUNK_SUBSUMPTION */
+
+#define pLsrThunkSubsumptiveEvalMk pLsrThunkEvalMk
+#define pLsrThunkSubsumptiveEvalDirectMk pLsrThunkEvalDirectMk
+#define pLsrThunkSubsumptiveTailEvalMk pLsrThunkTailEvalMk
+#define pLsrThunkSubsumptiveTailEvalDirectMk pLsrThunkTailEvalDirectMk
+
+#endif /* ! PLSR_THUNK_SUBSUMPTION */
+
+#define pLsrThunkCallRef(thunk)    pLsrThunkCallMk (Ref, thunk)
+#define pLsrThunkCall32(thunk)     pLsrThunkCallMk (32, thunk)
+#define pLsrThunkCall64(thunk)     pLsrThunkCallMk (64, thunk)
+#define pLsrThunkCallFloat(thunk)  pLsrThunkCallMk (Float, thunk)
+#define pLsrThunkCallDouble(thunk) pLsrThunkCallMk (Double, thunk)
+
+#define pLsrThunkCallDirectRef(code, thunk)    pLsrThunkCallDirectMk (Ref, code, thunk)
+#define pLsrThunkCallDirect32(code, thunk)     pLsrThunkCallDirectMk (32, code, thunk)
+#define pLsrThunkCallDirect64(code, thunk)     pLsrThunkCallDirectMk (64, code, thunk)
+#define pLsrThunkCallDirectFloat(code, thunk)  pLsrThunkCallDirectMk (Float, code, thunk)
+#define pLsrThunkCallDirectDouble(code, thunk) pLsrThunkCallDirectMk (Double, code, thunk)
+
+#define pLsrThunkTailCallRef(thunk)    pLsrThunkTailCallMk (Ref, thunk)
+#define pLsrThunkTailCall32(thunk)     pLsrThunkTailCallMk (32, thunk)
+#define pLsrThunkTailCall64(thunk)     pLsrThunkTailCallMk (64, thunk)
+#define pLsrThunkTailCallFloat(thunk)  pLsrThunkTailCallMk (Float, thunk)
+#define pLsrThunkTailCallDouble(thunk) pLsrThunkTailCallMk (Double, thunk)
+
+#define pLsrThunkTailCallDirectRef(code, thunk)    pLsrThunkTailCallDirectMk (Ref, code, thunk)
+#define pLsrThunkTailCallDirect32(code, thunk)     pLsrThunkTailCallDirectMk (32, code, thunk)
+#define pLsrThunkTailCallDirect64(code, thunk)     pLsrThunkTailCallDirectMk (64, code, thunk)
+#define pLsrThunkTailCallDirectFloat(code, thunk)  pLsrThunkTailCallDirectMk (Float, code, thunk)
+#define pLsrThunkTailCallDirectDouble(code, thunk) pLsrThunkTailCallDirectMk (Double, code, thunk)
+
+#define pLsrThunkEvalRef(thunk)    pLsrThunkSubsumptiveEvalMk (Ref, thunk)
+#define pLsrThunkEval32(thunk)     pLsrThunkEvalMk (32, thunk)
+#define pLsrThunkEval64(thunk)     pLsrThunkEvalMk (64, thunk)
+#define pLsrThunkEvalFloat(thunk)  pLsrThunkEvalMk (Float, thunk)
+#define 
pLsrThunkEvalDouble(thunk) pLsrThunkEvalMk (Double, thunk) + +#define pLsrThunkEvalDirectRef(code, thunk) pLsrThunkSubsumptiveEvalDirectMk (Ref, code, thunk) +#define pLsrThunkEvalDirect32(code, thunk) pLsrThunkEvalDirectMk (32, code, thunk) +#define pLsrThunkEvalDirect64(code, thunk) pLsrThunkEvalDirectMk (64, code, thunk) +#define pLsrThunkEvalDirectFloat(code, thunk) pLsrThunkEvalDirectMk (Float, code, thunk) +#define pLsrThunkEvalDirectDouble(code, thunk) pLsrThunkEvalDirectMk (Double, code, thunk) + +#define pLsrThunkTailEvalRef(thunk) pLsrThunkSubsumptiveTailEvalMk (Ref, thunk) +#define pLsrThunkTailEval32(thunk) pLsrThunkTailEvalMk (32, thunk) +#define pLsrThunkTailEval64(thunk) pLsrThunkTailEvalMk (64, thunk) +#define pLsrThunkTailEvalFloat(thunk) pLsrThunkTailEvalMk (Float, thunk) +#define pLsrThunkTailEvalDouble(thunk) pLsrThunkTailEvalMk (Double, thunk) + +#define pLsrThunkTailEvalDirectRef(code, thunk) pLsrThunkSubsumptiveTailEvalDirectMk (Ref, code, thunk) +#define pLsrThunkTailEvalDirect32(code, thunk) pLsrThunkTailEvalDirectMk (32, code, thunk) +#define pLsrThunkTailEvalDirect64(code, thunk) pLsrThunkTailEvalDirectMk (64, code, thunk) +#define pLsrThunkTailEvalDirectFloat(code, thunk) pLsrThunkTailEvalDirectMk (Float, code, thunk) +#define pLsrThunkTailEvalDirectDouble(code, thunk) pLsrThunkTailEvalDirectMk (Double, code, thunk) + +#define pLsrThunkReturnRef(thnk, val) \ + do { \ + pLsrThunkSetValueRef(thnk, val); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableRef); \ + return val; \ + } while (0) + +#define pLsrThunkReturnNonRefMk(name, thnk, val) \ + do { \ + pLsrThunkSetValue##name(thnk, val); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTable##name); \ + return val; \ + } while (0) + +#define pLsrThunkReturn32(thunk, val) pLsrThunkReturnNonRefMk(32, thunk, val) +#define pLsrThunkReturn64(thunk, val) pLsrThunkReturnNonRefMk(64, thunk, val) +#define pLsrThunkReturnFloat(thunk, val) pLsrThunkReturnNonRefMk(Float, thunk, val) +#define pLsrThunkReturnDouble(thunk, val) pLsrThunkReturnNonRefMk(Double, thunk, val) + +PlsrThunkReturnTypeRef pLsrThunkDoCut(PlsrThunkBRef t) { + pLsrRuntimeError_("Thunk recuts not supported with lightweight thunks"); + return 0; +#if 0 + PilContinuation0 c = (PilContinuation0) pLsrThunkRef(t)->result; + pilCutTo0(c); + return 0; +#endif +} + +pLsrVTableStaticWithCustom(pLsrThunkCutVTable_, VThunkTag, "*thunk value (cut)*", 0, pLsrThunkDoCut); +#define pLsrThunkCutVTable (&pLsrThunkCutVTable_) + +#define pLsrThunkCutMk(name, thunk, cont) \ + do { \ + assert(sizeof(PlsrThunkU##name) >= sizeof(PlsrThunkURef)); \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkCutVTable); \ + /* pLsrThunkRef(thunk)->exn = (PlsrRef) (exn);*/ \ + cut to cont; \ + } while (0) + +#define pLsrThunkCutRef(thunk, cont) pLsrThunkCutMk(Ref, thunk, cont) +#define pLsrThunkCut32(thunk, cont) pLsrThunkCutMk(32, thunk, cont) +#define pLsrThunkCut64(thunk, cont) pLsrThunkCutMk(64, thunk, cont) +#define pLsrThunkCutFloat(thunk, cont) pLsrThunkCutMk(Float, thunk, cont) +#define pLsrThunkCutDouble(thunk, cont) pLsrThunkCutMk(Double, thunk, cont) + +/* Printing */ + +static void pLsrThunkPrintRef(PlsrThunkBRef t) +{ + pLsrValuePrint((PlsrObjectB)pLsrThunkEvalRef(t)); +} + +/* VTables for black holed thunks */ + +pLsrVTableStatic(pLsrThunkEvalVTableRef_, VThunkTag, "*evaled thunk (ref)*", pLsrThunkPaddingRef); +#define pLsrThunkEvalVTableRef (&pLsrThunkEvalVTableRef_) + +pLsrVTableStatic(pLsrThunkEvalVTable32_, VThunkTag, "*evaled thunk (32)*", 
pLsrThunkPadding32); +#define pLsrThunkEvalVTable32 (&pLsrThunkEvalVTable32_) + +pLsrVTableStatic(pLsrThunkEvalVTable64_, VThunkTag, "*evaled thunk (64)*", pLsrThunkPadding64); +#define pLsrThunkEvalVTable64 (&pLsrThunkEvalVTable64_) + +pLsrVTableStatic(pLsrThunkEvalVTableFloat_, VThunkTag, "*evaled thunk (float)*", pLsrThunkPaddingFloat); +#define pLsrThunkEvalVTableFloat (&pLsrThunkEvalVTableFloat_) + +pLsrVTableStatic(pLsrThunkEvalVTableDouble_, VThunkTag, "*evaled thunk (double)*", pLsrThunkPaddingDouble); +#define pLsrThunkEvalVTableDouble (&pLsrThunkEvalVTableDouble_) + +#ifdef PLSR_THUNK_SYNCHRONIZE +#define pLsrThunkClaimDo(name, thunk) \ + do { \ + PlsrVTable pLsrThunkClaimDoVT = pLsrThunkVTableField##name(thunk); \ + if (!(pLsrVTableGetCustom(pLsrThunkClaimDoVT) && \ + pLsrObjectCmpAndSetVTableMandatory(thunk, pLsrThunkClaimDoVT, pLsrThunkEvalVTable##name))) { \ + iFlcSynchWaitEqualVoidS(thunk, 0, (void*) pLsrThunkValVTable##name); \ + return pLsrThunkGetVal##name(thunk); \ + } \ + } while (0) +#else +#define pLsrThunkClaimDo(name, thunk) \ + do { \ + if (pLsrThunkVTableField##name(thunk) == pLsrThunkEvalVTable##name) { \ + pLsrRuntimeError_("Black Hole"); \ + } \ + pLsrObjectChangeVTableMandatory(thunk, pLsrThunkEvalVTable##name); \ + } while (0) +#endif + +#define pLsrThunkClaimRef(thunk) pLsrThunkClaimDo(Ref, thunk) +#define pLsrThunkClaim32(thunk) pLsrThunkClaimDo(32, thunk) +#define pLsrThunkClaim64(thunk) pLsrThunkClaimDo(64, thunk) +#define pLsrThunkClaimFloat(thunk) pLsrThunkClaimDo(Float, thunk) +#define pLsrThunkClaimDouble(thunk) pLsrThunkClaimDo(Double, thunk) + +#ifdef PLSR_THUNK_SYNCHRONIZE +#define pLsrThunkBlackHoleDo(name, thunk) \ +do { \ + iFlcSynchWaitEqualUIntp(thunk, 0, pLsrThunkValVTable##name); \ + return pLsrThunkGetVal##name(thunk); \ + } while (0) +#else +#define pLsrThunkBlackHoleDo(name, thunk) \ + do { \ + pLsrRuntimeError_("Black Hole"); \ + } while (0) +#endif +#define pLsrThunkBlackHoleRef(thunk) pLsrThunkBlackHoleDo(Ref, thunk) +#define pLsrThunkBlackHole32(thunk) pLsrThunkBlackHoleDo(32, thunk) +#define pLsrThunkBlackHole64(thunk) pLsrThunkBlackHoleDo(64, thunk) +#define pLsrThunkBlackHoleFloat(thunk) pLsrThunkBlackHoleDo(Float, thunk) +#define pLsrThunkBlackHoleDouble(thunk) pLsrThunkBlackHoleDo(Double, thunk) + +#define pLsrThunkZeroFV(zero) + + +/*** Check Object Model ***/ + +/* Generated code defines: + * pLsrThunkFixedSizeRef + * pLsrThunkFixedSize64 + * pLsrThunkFixedSize32 + * pLsrThunkFixedSizeFloat + * pLsrThunkFixedSizeDouble + * pLsrThunkResultOffsetRef + * pLsrThunkResultOffset64 + * pLsrThunkResultOffset32 + * pLsrThunkResultOffsetFloat + * pLsrThunkResultOffsetDouble + */ + +static void pLsrThunkCheck() +{ + /*printf("Thunk check: %d/%d, %d/%d, %d/%d\n", pLsrThunkFixedSizeRef, + sizeof(PlsrThunkURef), pLsrThunkFixedSize32, sizeof(PlsrThunkU32), + pLsrThunkFixedSize64, sizeof(PlsrThunkU64));*/ + if (pLsrThunkFixedSizeRef != sizeof(PlsrThunkURef)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize32 != sizeof(PlsrThunkU32)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize64 != sizeof(PlsrThunkU64)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeFloat != sizeof(PlsrThunkUFloat)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeDouble != sizeof(PlsrThunkUDouble)) + pLsrRuntimeError("Bad thunk object model!\n"); + + if (pLsrThunkResultOffsetRef != ((unsigned)(&pLsrThunkResultFieldRef(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + 
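+    /* The pLsrThunkFixedSize* and pLsrThunkResultOffset* constants checked here come from
+     * the generated code noted above; each test compares that layout against the C unions
+     * declared in this header.  Note that &pLsrThunkResultFieldRef(0) takes the address of
+     * the result field at a null thunk base, i.e. it is in effect
+     * offsetof(PlsrThunkURef, value.result), and likewise for the other instantiations. */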
if (pLsrThunkResultOffset32 != ((unsigned)(&pLsrThunkResultField32(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffset64 != ((unsigned)(&pLsrThunkResultField64(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetFloat != ((unsigned)(&pLsrThunkResultFieldFloat(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetDouble != ((unsigned)(&pLsrThunkResultFieldDouble(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + +} + +#endif /* !_PLSR_THUNK_H_ */ diff --git a/runtime/include/hrc/plsr-main.h b/runtime/include/hrc/plsr-main.h new file mode 100755 index 0000000..336236e --- /dev/null +++ b/runtime/include/hrc/plsr-main.h @@ -0,0 +1,164 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_MAIN_H_ +#define _PLSR_MAIN_H_ + +#ifdef P_USE_PILLAR +# define MAIN pillar_main +#else +# define MAIN main +#endif + +/* C functions used for Runtime */ +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif /* P_USE_PILLAR */ + + + +static void pLsrRuntimeInitialize() +{ + pLsrGCInitialize(); + pLsrNumericInitialize(pLsrGmpMemLimitParam); +} + +static void pLsrRegisterRuntimeGlobals() +{ + pLsrGCRegisterGlobals(); + pLsrFinalizerRegisterGlobals(); + pLsrWpoRegisterGlobals(); + pLsrNumericRegisterGlobals(); + pLsrValueRegisterGlobals(); +} + +static void pLsrRegisterRuntimeVTables() +{ + pLsrGCRegisterVTables(); + pLsrFinalizerRegisterVTables(); + pLsrWpoRegisterVTables(); + pLsrNumericRegisterVTables(); + pLsrValueRegisterVTables(); +} + +static void pLsrCheckRuntimeAssertions() +{ + pLsrFinalizerCheckAssertions(); + pLsrWpoCheckAssertions(); + pLsrNumericCheckAssertions(); + pLsrObjectCheckModel(); + pLsrThunkCheck(); + pLsrValueCheck(); +} + +#ifdef P_USE_PILLAR +#pragma pillar_managed(off) +static void pLsrRuntimeReportRoots(PrtRseCallback rse, void* env) +{ + pLsrFinalizerReportRoots(rse, env); +} +#pragma pillar_managed(on) +#endif + +static void pLsrFuturesStart () +{ + uintp sizeInBytes = pLsrStackSizeWorker * 1024 * 1024; + uintp digitsB10 = (uintp) log10(sizeInBytes); + char *fmt = "stacksize=%u"; + char *buf = pLsrAllocC(strlen(fmt) + digitsB10 + 1); + if (!sprintf(buf, fmt, sizeInBytes)) { + pLsrRuntimeError("Unable to set stack size"); + } +#ifdef P_USE_PARALLEL_FUTURES + ptkFutureSystemSetOption(buf); + ptkFutureSystemStart(0); +#endif + pLsrFreeC(buf); +} + +static void pLsrIHRInitialize() { + if (pLsrIHRThreadCountParam > 0) ihrSetNCapabilities(pLsrIHRThreadCountParam); +} + +static void __pmain(); + + +static void pLsrRun() +{ + pLsrEventsTransition("Enter", "Main"); + __pmain(); + pLsrFinalizerShutdown (1); + pLsrEventsTransition("Exit", "Main"); + fflush(stdout); + +#ifdef PLSR_INSTRUMENT_ALLOCATION + printf("plsr: Number objects allocated: %I64u\n", pLsrNumObjectsAllocated); + printf("plsr: Number bytes allocated: %I64u\n", pLsrNumBytesAllocated); + printf("plsr: Number padding bytes allocated: %I64u\n", pLsrNumPaddingBytesAllocated); + printf("plsr: Number unmanaged objects allocated: %I64u\n", pLsrNumObjectsAllocatedUnmanaged); + printf("plsr: Number unmanaged bytes allocated: %I64u\n", pLsrNumBytesAllocatedUnmanaged); + printf("plsr: Number unmanaged objects freed: %I64u\n", pLsrNumObjectsFreedUnmanaged); +#endif /* PLSR_INSTRUMENT_ALLOCATION */ +#ifdef PLSR_INSTRUMENT_VTB_ALC + { + PlsrVTable cur = pLsrAllVTables; + printf("plsr: vtable 
allocation stats:\n"); + while(cur) { + printf(" %s (%p): Number objects allocated: %I64u\n", + cur->name, cur, cur->numObjects); + printf(" %s (%p): Number bytes allocated: %I64u\n", + cur->name, cur, cur->numBytes); + printf(" %s (%p): Number padding bytes allocated: %I64u\n", + cur->name, cur, cur->padding*cur->numObjects); + cur = cur->next; + } + } +#endif /* PLSR_INSTRUMENT_VTB_ALC */ + + pLsrEventsShutdown(); + pLsrExit(0); + + return; +} + +int MAIN(int _argc, const char** _argv) +{ + int argc; + const char** argv; + +#ifdef P_USE_MCRT + mcrtStart(main, _argc, _argv); +#endif + setlocale(LC_CTYPE, ""); + pLsrEventsInit(); + pLsrEventsTransition("Enter", "Startup"); + pLsrDisableErrorBox(); + pilCheck(); + pLsrCheckRuntimeAssertions(); + pLsrParseOptions(_argc, _argv, &argc, &argv); + pLsrGcInit(pLsrInitHeapParam, pLsrMaxHeapParam); + pLsrRegisterRuntimeVTables(); + pLsrRegisterRuntimeGlobals(); + pLsrRuntimeInitialize(); + pLsrIHRInitialize(); + pLsrFuturesStart (); + pLsrFinalizerStart (); + pLsrEventsTransition("Exit", "Startup"); +#ifdef __pillar2c__ + prtSetPcallStackSize(pLsrStackSizeMain * 1024 * 1024); + pcall pLsrRun(); +#else + pLsrRun(); +#endif + return 0; +} + +#endif /* !_PLSR_MAIN_H_ */ diff --git a/runtime/include/hrc/plsr-numeric.h b/runtime/include/hrc/plsr-numeric.h new file mode 100755 index 0000000..e490ec0 --- /dev/null +++ b/runtime/include/hrc/plsr-numeric.h @@ -0,0 +1,65 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Arbitrary precision numbers */ + +#ifndef _PLSR_NUMERIC_H_ +#define _PLSR_NUMERIC_H_ + +/********************************************************************** + * Arbitrary precision integers + */ + +#include "hrc/plsr-integer.h" + +/********************************************************************** + * Arbitrary precision rationals + */ + +#include "hrc/plsr-rational.h" + +/********************************************************************** + * Some miscellaneous floating point stuff + */ + +static char* pLsrCStringFromFloat32(float32 flt) +{ + char* str = pLsrAllocC(30); + sprintf(str, "%f", flt); + return str; +} + +static float32 pLsrFloat32FromCString(char* str) +{ + float32 res = 0.0f; + sscanf(str, "%f", &res); + return res; +} + +/********************************************************************** + * GC registration functions + */ + +static void pLsrNumericRegisterVTables() +{ + pLsrIntegerRegisterVTables(); + pLsrRationalRegisterVTables(); +} + +static void pLsrNumericRegisterGlobals() +{ + pLsrIntegerRegisterGlobals(); + pLsrRationalRegisterGlobals(); +} + +static void pLsrNumericCheckAssertions() +{ + pLsrRationalCheckAssertions(); +} + +static void pLsrNumericInitialize(uintp memLimit) +{ + pLsrAPIntInitialize(memLimit); + pLsrAPRatInitialize(); +} +#endif /* !_PLSR_NUMERIC_H_ */ diff --git a/runtime/include/hrc/plsr-objects.h b/runtime/include/hrc/plsr-objects.h new file mode 100755 index 0000000..8da395b --- /dev/null +++ b/runtime/include/hrc/plsr-objects.h @@ -0,0 +1,189 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Heap Objects and the Object Model */ + +#ifndef _PLSR_OBJECTS_H_ +#define _PLSR_OBJECTS_H_ + +/********************************************************************** + * Refs + */ + +#ifdef P_USE_PILLAR +typedef ref PlsrRef; +#else /* !P_USE_PILLAR */ +typedef void* PlsrRef; +#endif /* !P_USE_PILLAR */ + +/********************************************************************** + * VTables + */ + +enum PlsrValueTagE { + VNoneTag, + VRatTag, + VNameTag, + 
VFloatTag, + VDoubleTag, + VArrayTag, + VArrayIdxTag, + VSumTag, + VFunctionTag, + VSetTag, + VTypeTag, + VPtrTag, + VThunkTag +}; + +struct PlsrVTableS { + char reserve[P_VTABLE_RESERVE]; + const enum PlsrValueTagE tag; + uintp custom; +#if (defined(PLSR_INSTRUMENT_VTB_ALC) || defined(DEBUG)) + char* name; +#endif +#ifdef PLSR_INSTRUMENT_VTB_ALC + struct PlsrVTableS* next; + uint64 numObjects; + uint64 numBytes; +#endif /* PLSR_INSTRUMENT_VTB_ALC */ +#if (defined(PLSR_INSTRUMENT_ALLOCATION) || defined(PLSR_INSTRUMENT_VTB_ALC)) + uint64 padding; /* How much padding does this object contain? */ +#endif + +}; + +typedef struct PlsrVTableS* PlsrVTable; + +#define pLsrVTableStaticTagField(tg) .tag = (tg), +#define pLsrVTableStaticCustomField(c) .custom = ((uintp) (c)), +#if (defined(PLSR_INSTRUMENT_VTB_ALC) || defined(DEBUG)) +#define pLsrVTableStaticNameField(nm) .name = (nm), +#else +#define pLsrVTableStaticNameField(nm) +#endif +#ifdef PLSR_INSTRUMENT_VTB_ALC +#define pLsrVTableStaticNextField(n) .next = (n), +#define pLsrVTableStaticNumObjectsField(n) .numObjects = (n), +#define pLsrVTableStaticNumBytesField(n) .numBytes = (n), +#else +#define pLsrVTableStaticNextField(n) +#define pLsrVTableStaticNumObjectsField(n) +#define pLsrVTableStaticNumBytesField(n) +#endif +#if (defined(PLSR_INSTRUMENT_ALLOCATION) || defined(PLSR_INSTRUMENT_VTB_ALC)) +#define pLsrVTableStaticPaddingField(n) .padding = (n), +#else +#define pLsrVTableStaticPaddingField(n) +#endif + +#define pLsrVTableStaticWithCustom(vt, tg, nm, p, c) \ + pil_aligned(16) static struct PlsrVTableS vt = \ + { pLsrVTableStaticTagField(tg) \ + pLsrVTableStaticCustomField(c) \ + pLsrVTableStaticNameField(nm) \ + pLsrVTableStaticNextField(0) \ + pLsrVTableStaticNumObjectsField(0) \ + pLsrVTableStaticNumBytesField(0) \ + pLsrVTableStaticPaddingField(p)} + +#define pLsrVTableStatic(vt, tg, nm, p) \ + pLsrVTableStaticWithCustom(vt, tg, nm, p, 0) + +#define pLsrVTableGetTag(vt) ((vt)->tag) + +#define pLsrVTableGetCustom(vt) ((vt)->custom) + +/* This vtable should not be used with accurate GC, as it does not + * determine its GC info unambiguously. + */ +pLsrVTableStatic(pLsrVTableNone_, VNoneTag, "*none*", 0); +#define pLsrVTableNone (&pLsrVTableNone_) + +/* VTables for thunks */ + +/* When using accurate GC, these vtable is only for thunks with no + * free variables, initialized atomically at allocation time. 
+ */ +pLsrVTableStatic(pLsrThunkValVTableRef_, VThunkTag, "*thunk value (ref)*", pLsrThunkPaddingRef); +#define pLsrThunkValVTableRef (&pLsrThunkValVTableRef_) + +pLsrVTableStatic(pLsrThunkValVTable32_, VThunkTag, "*thunk value (32)*", pLsrThunkPadding32); +#define pLsrThunkValVTable32 (&pLsrThunkValVTable32_) + +pLsrVTableStatic(pLsrThunkValVTable64_, VThunkTag, "*thunk value (64)*", pLsrThunkPadding64); +#define pLsrThunkValVTable64 (&pLsrThunkValVTable64_) + +pLsrVTableStatic(pLsrThunkValVTableFloat_, VThunkTag, "*thunk value (float)*", pLsrThunkPaddingFloat); +#define pLsrThunkValVTableFloat (&pLsrThunkValVTableFloat_) + +pLsrVTableStatic(pLsrThunkValVTableDouble_, VThunkTag, "*thunk value (double)*", pLsrThunkPaddingDouble); +#define pLsrThunkValVTableDouble (&pLsrThunkValVTableDouble_) + + + +/********************************************************************** + * Objects + */ + +typedef struct { + PlsrVTable vtable; +} PlsrObjectU; + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrObjectB; +#else /* !P_USE_PILLAR */ +typedef PlsrObjectU* PlsrObjectB; +#endif /* !P_USE_PILLAR */ + +#define pLsrObjectGetVTable(obj) \ + ((PlsrVTable)((long)(((PlsrObjectU*)(obj))->vtable) & (0xFFffFFfc))) + +/* Generated code defines: + * pLsrObjectFieldsBase + */ + +#define pLsrObjectGetKind(obj) (pLsrVTableGetTag(pLsrObjectGetVTable(obj))) +/* Note that both of these need the pointer type for t */ +#define pLsrObjectField(obj, off, t) (*(t)((char*)(obj) + (off))) +#define pLsrObjectExtra(obj, off, t, es, i) (*(t)((char*)(obj) + (off) + (es * i))) + +#ifdef P_USE_PILLAR +typedef PlsrRef PlsrThunkBRef; +typedef PlsrRef PlsrThunkB32; +typedef PlsrRef PlsrThunkB64; +typedef PlsrRef PlsrThunkBFloat; +typedef PlsrRef PlsrThunkBDouble; +typedef PlsrRef PlsrPAny; +#else /* !P_USE_PILLAR */ +typedef struct PlsrThunkSRef* PlsrThunkBRef; +typedef struct PlsrThunkS32* PlsrThunkB32; +typedef struct PlsrThunkS64* PlsrThunkB64; +typedef struct PlsrThunkSFloat* PlsrThunkBFloat; +typedef struct PlsrThunkSDouble* PlsrThunkBDouble; +typedef PlsrObjectU* PlsrPAny; +#endif /* !P_USE_PILLAR */ + +static void pLsrValuePrint(PlsrObjectB); +static PlsrRef pLsrThunkEvalRef(PlsrThunkBRef); + +#define pLsrObjectEval(dest, obj) \ + do { \ + if (pLsrVTableGetTag(pLsrObjectGetVTable(obj)) == VThunkTag) { \ + (dest) = (PlsrPAny) pLsrThunkEvalRef((PlsrThunkBRef)obj); \ + } else { \ + (dest) = (PlsrPAny)obj; \ + } \ + } while (0) + +static void pLsrObjectCheckModel() +{ + if (pLsrObjectFieldsBase < sizeof(PlsrObjectU)) + pLsrRuntimeError("Bad object model!\n"); +} + +#define pLsrMemzeroArray(dest, fsz, esz, c) \ + memset(((char*)dest) + (fsz),0,(esz)*(c)) + +#endif /* !_PLSR_OBJECTS_H_ */ diff --git a/runtime/include/hrc/plsr-params.h b/runtime/include/hrc/plsr-params.h new file mode 100755 index 0000000..5f9890f --- /dev/null +++ b/runtime/include/hrc/plsr-params.h @@ -0,0 +1,236 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Parse Commandline Parameters */ + +#ifndef _PLSR_PARAMS_H_ +#define _PLSR_PARAMS_H_ + +typedef enum { + PPBool, + PPInt, + PPGc, + PPRT, + PFutures, +#ifdef P_USE_PILLAR + PPPGc, +#endif // P_USE_PILLAR + PPEnd, +} PlsrPParamType; + +typedef struct pLsrPParamEntryS { + PlsrPParamType t; + char* name; + int len; + int* loc; + char* usage; +} PlsrPParamEntry; + +static char pLsrPPStart[] = "@PPiler"; +static uintp pLsrPPStartLen = 7; +static char pLsrPPEnd[] = "--"; +static uintp pLsrPPEndLen = 2; + +static uintp pLsrIHRThreadCountParam = 0; +static sintp pLsrInitHeapParam = -1; /* MB */ 
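+/* The runtime options below are consumed from the front of argv, bracketed by the
+ * pLsrPPStart ("@PPiler") and pLsrPPEnd ("--") markers above; everything after "--"
+ * is left for the program itself.  A purely illustrative invocation (program name and
+ * values are made up) using the option names from pLsroptions might look like:
+ *
+ *     ./a.out @PPiler initHeap 64 maxHeap 512 gc verbose -- <program arguments>
+ *
+ * where initHeap/maxHeap are in megabytes and "gc verbose" is forwarded to the
+ * collector as -verbosegc by pLsrdoPParamGc. */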
+static uintp pLsrMaxHeapParam = 0; /* MB, 0 => unlimited */ +static uintp pLsrStackSizeMain = PLSR_STACK_SIZE_MAIN; +static uintp pLsrStackSizeWorker = PLSR_STACK_SIZE_WORKER; +static uintp pLsrGmpMemLimitParam = 0; + +static PlsrPParamEntry pLsroptions[] = { + {PPInt, "gmpMaxMem", 0, &pLsrGmpMemLimitParam, "Memory limit for gmp integer allocation"}, + {PPInt, "ihrThreads", 0, &pLsrIHRThreadCountParam, "How many haskell thread capabilities?"}, + {PPInt, "initHeap", 0, &pLsrInitHeapParam, "Initial heap size (megabytes)"}, + {PPInt, "maxHeap", 0, &pLsrMaxHeapParam, "Max heap size (megabytes), 0 => unlimited"}, + {PPInt, "stackMain", 0, &pLsrStackSizeMain, "Stack size for main threads (megabytes)"}, + {PPInt, "stackWorker", 0, &pLsrStackSizeWorker, "Stack size for worker threads (megabytes)"}, + {PPGc, "gc", 0, NULL, "Pass option to gc"}, + {PPRT, "prt", 0, NULL, "Pass option to prt"}, + {PFutures, "futures", 0, NULL, "Pass option to the futures package"}, +#ifdef P_USE_PILLAR + {PPPGc, "pgc", 0, NULL, "Pass option to pgc"}, +#endif // P_USR_PILLAR + {PPEnd, NULL, 0, NULL, NULL} /*Signal the end */ +}; + +static void pLsrinitPParamTable() +{ + uintp i = 0; + while(pLsroptions[i].name != NULL) { + pLsroptions[i].len = strlen(pLsroptions[i].name); + i++; + } + return; +} + +static void pLsrdoPParamUsage(const char* argv[]) +{ + uintp i = 0; + fprintf(stderr, "usage: %s @PPiler [opt]* -- \n", argv[0]); + fprintf(stderr, " where opt is one of\n"); + + while(pLsroptions[i].t != PPEnd) { + switch(pLsroptions[i].t) { + case PPInt: + fprintf(stderr, + " %s n (default=%d)\n", + pLsroptions[i].name, + *(pLsroptions[i].loc)); + break; + case PPBool: + fprintf(stderr, + " %s\n", + pLsroptions[i].name); + break; + case PPGc: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; + case PPRT: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; + case PFutures: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; +#ifdef P_USE_PILLAR + case PPPGc: + fprintf(stderr, + " %s \n", + pLsroptions[i].name); + break; +#endif // P_USR_PILLAR + case PPEnd: + assert(0); + break; + } + i++; + } + pLsrRuntimeError("Bad @PPiler option usage"); +} + +static int pLsrdoPParamInt(int* res, int idx, int argc, const char* argv[]) +{ + if (idx >= argc) + pLsrdoPParamUsage(argv); + if (sscanf(argv[idx], "%i", res) <= 0) + pLsrdoPParamUsage(argv); + return idx+1; +} + +static int pLsrdoPParamGc(int idx, int argc, const char* argv[]) +{ + if (idx >= argc) + pLsrdoPParamUsage(argv); + if (strcmp(argv[idx], "verbose") == 0) + pLsrGcOption("-verbosegc", ""); + else + pLsrGcOption("-gc", argv[idx]); + return idx+1; +} + +static int pLsrdoPParam(int idx, int argc, const char* argv[]) +{ + void __cdecl prtSetOption(const char *optionString); + uintp j; + for(j=0; pLsroptions[j].t != PPEnd; j++) { + /* XXX NG: this is not strictly correct */ + if (0 == strncmp(argv[idx], pLsroptions[j].name, pLsroptions[j].len)) { + idx++; + switch (pLsroptions[j].t) { + case PPInt: + idx = pLsrdoPParamInt(pLsroptions[j].loc, idx, argc, argv); + break; + case PPBool: + *(pLsroptions[j].loc) = 1; + break; + case PPGc: + idx = pLsrdoPParamGc(idx, argc, argv); + break; + case PPRT: + if (idx >= argc) + pLsrdoPParamUsage(argv); +#ifdef P_USE_PILLAR + prtSetOption(argv[idx]); +#else /* !P_USE_PILLAR */ + fprintf(stderr, + "prt option: %s ignored (not using pillar)\n", + argv[idx]); +#endif + idx++; + break; + case PFutures: + if (idx >= argc) + pLsrdoPParamUsage(argv); + ptkFutureSystemSetOption(argv[idx]); + idx++; + break; 
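+            /* As with the PPRT and PFutures cases above, the PPPGc case below consumes exactly
+             * one following argument and passes it to the collector unchanged via pgcSetOption. */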
+#ifdef P_USE_PILLAR + case PPPGc: + if (idx >= argc) + pLsrdoPParamUsage(argv); + pgcSetOption(argv[idx]); + idx++; + break; +#endif // P_USR_PILLAR + case PPEnd: + assert(0); + /* Impossible */ + break; + } + return idx; + } + } + /* If we get here, it was an unknown param */ + pLsrdoPParamUsage(argv); + return 0; +} + +static int pLsrdoPParamList(int idx, int argc, const char* argv[]) +{ + while(idx < argc) { + if (0 == strncmp(argv[idx], pLsrPPEnd, pLsrPPEndLen)) { + return idx+1; + } + idx = pLsrdoPParam(idx, argc, argv); + } + /* If we get here, then there was no PPEnd symbol */ + pLsrdoPParamUsage(argv); + return 0; +} + +static int pLsrdoPParamSegment(int idx, int argc, const char* argv[]) +{ + if (idx < argc) { + if (0 == strncmp(argv[idx], pLsrPPStart, pLsrPPStartLen)) { + return pLsrdoPParamList(idx+1, argc, argv); + } + else { + return idx; + } + } + return idx; +} + +static void pLsrParseOptions(int _argc, const char* _argv[], + int* argc, const char*** argv) +{ + /* For now, assume that all the PPiler opts are first. + * Later we can generalize + */ + int idx; + int count; + pLsrinitPParamTable(); + idx = pLsrdoPParamSegment(1, _argc, _argv); + count = idx -1; + *argc = pargc = _argc - count; /* consumed args */ + *argv = pargv = _argv + count; /* Advance to last consumed arg */ + *argv[0] = _argv[0]; /* Keep command name around in place of last consumed */ +} + +#endif /* !_PLSR_PARAMS_H_ */ diff --git a/runtime/include/hrc/plsr-prims-ghc-longlong.h b/runtime/include/hrc/plsr-prims-ghc-longlong.h new file mode 100644 index 0000000..eabdc7c --- /dev/null +++ b/runtime/include/hrc/plsr-prims-ghc-longlong.h @@ -0,0 +1,51 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_GHC_LONGLONG_H_ +#define _PLSR_PRIMS_GHC_LONGLONG_H_ + +/* get type definitions from pil.h */ +#include "hrc/pil.h" + +#define hs_gtWord64(a,b) ((a)> (b)) +#define hs_geWord64(a,b) ((a)>=(b)) +#define hs_eqWord64(a,b) ((a)==(b)) +#define hs_neWord64(a,b) ((a)!=(b)) +#define hs_ltWord64(a,b) ((a)< (b)) +#define hs_leWord64(a,b) ((a)<=(b)) +#define hs_gtInt64(a,b) ((a)> (b)) +#define hs_geInt64(a,b) ((a)>=(b)) +#define hs_eqInt64(a,b) ((a)==(b)) +#define hs_neInt64(a,b) ((a)!=(b)) +#define hs_ltInt64(a,b) ((a)< (b)) +#define hs_leInt64(a,b) ((a)<=(b)) + +#define hs_remWord64(a,b) ((a)% (b)) +#define hs_quotWord64(a,b) ((a)/ (b)) + +#define hs_remInt64(a,b) ((a)% (b)) +#define hs_quotInt64(a,b) ((a)/ (b)) +#define hs_negateInt64(a) (-(a)) +#define hs_plusInt64(a,b) ((a)+ (b)) +#define hs_minusInt64(a,b) ((a)- (b)) +#define hs_timesInt64(a,b) ((a)* (b)) + +#define hs_and64(a,b) ((a)& (b)) +#define hs_or64(a,b) ((a)| (b)) +#define hs_xor64(a,b) ((a)^ (b)) +#define hs_not64(a) ( ~(a)) + +#define hs_uncheckedShiftL64(a,b) ((a)<< (b)) +#define hs_uncheckedShiftRL64(a,b) ((a)>> (b)) +#define hs_uncheckedIShiftL64(a,b) ((a)<< (b)) +#define hs_uncheckedIShiftRA64(a,b) ((a)>> (b)) +#define hs_uncheckedIShiftRL64(a,b) ((sint64)((uint64)(a)>>(b))) + +#define hs_intToInt64(i) ((sint64) (i)) +#define hs_int64ToInt(i) ((sintp) (i)) +#define hs_int64ToWord64(i) ((uint64) (i)) +#define hs_wordToWord64(w) ((uint64) (w)) +#define hs_word64ToWord(w) ((uintp) (w)) +#define hs_word64ToInt64(w) ((sint64) (w)) + +#endif diff --git a/runtime/include/hrc/plsr-prims-ghc.h b/runtime/include/hrc/plsr-prims-ghc.h new file mode 100644 index 0000000..5713740 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-ghc.h @@ -0,0 +1,554 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef 
_PLSR_PRIMS_GHC_H_ +#define _PLSR_PRIMS_GHC_H_ + +/* XXX NG: This stuff doesn't work under C */ +#ifdef P_USE_PILLAR + +/* get type definitions from pil.h */ +#include "hrc/pil.h" +#include "hrc/plsr-value.h" +#include "hrc/plsr-gc.h" +#include "hrc/plsr-util.h" + +/* C functions used for Runtime */ +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#ifndef PLSR_LINUX +#include +#include +#endif // !PLSR_LINUX + +#include "hrc/ghc/float.h" +#include "hrc/ghc/Globals.h" +#include "hrc/ghc/TTY.h" +#include "hrc/ghc/thread.h" + +void pLsrGetProgArgv (void* argc, void* argv) +{ + if (argc) { *((sintp*)argc) = pargc; } + if (argv) { *((void**)argv) = pargv; } +} + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif /* P_USE_PILLAR */ + +/*** Address Stuff ***/ + +/* pointer read */ +#define pLsrPrimGHCIndexOffAddrzh(p,i,r,t) ((r)*(((t*)(p))+(i))) +#define pLsrPrimGHCIndexFloatOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,float,float) +#define pLsrPrimGHCIndexDoubleOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,double,double) +#define pLsrPrimGHCIndexAddrOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,void*,void*) +#define pLsrPrimGHCIndexIntOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sintp) +#define pLsrPrimGHCIndexUIntOffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uintp) +#define pLsrPrimGHCIndexUInt8OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uint8) +#define pLsrPrimGHCIndexUInt16OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uint16) +#define pLsrPrimGHCIndexUInt32OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uintp,uint32) +#define pLsrPrimGHCIndexUInt64OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,uint64,uint64) +#define pLsrPrimGHCIndexInt8OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sint8) +#define pLsrPrimGHCIndexInt16OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sint16) +#define pLsrPrimGHCIndexInt32OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sintp,sint32) +#define pLsrPrimGHCIndexInt64OffAddrzh(p,i) pLsrPrimGHCIndexOffAddrzh(p,i,sint64,sint64) +/* pointer write */ +#define pLsrPrimGHCWriteOffAddrzh(p,i,v,t) { *(((t*)(p))+(i))=(t)(v); } +#define pLsrPrimGHCWriteFloatOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,float) +#define pLsrPrimGHCWriteDoubleOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,double) +#define pLsrPrimGHCWriteAddrOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,void*) +#define pLsrPrimGHCWriteIntOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sintp) +#define pLsrPrimGHCWriteUIntOffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uintp) +#define pLsrPrimGHCWriteUInt8OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint8) +#define pLsrPrimGHCWriteUInt16OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint16) +#define pLsrPrimGHCWriteUInt32OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint32) +#define pLsrPrimGHCWriteUInt64OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,uint64) +#define pLsrPrimGHCWriteInt8OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint8) +#define pLsrPrimGHCWriteInt16OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint16) +#define pLsrPrimGHCWriteInt32OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint32) +#define pLsrPrimGHCWriteInt64OffAddrzh(p,i,v) pLsrPrimGHCWriteOffAddrzh(p,i,v,sint64) +/* pointer arithmetic */ +#define pLsrPrimGHCNullAddrzh() ((void*)0) +#define pLsrPrimGHCPlusAddrzh(p,i) ((void*)((char*)(p)+(i))) +#define pLsrPrimGHCMinusAddrzh(p,q) ((sintp)((char*)(p)-(char*)(q))) +#define pLsrPrimGHCRemAddrzh(p,i) (((sintp)(p)%(i))) 
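+/* Illustrative expansions of the accessors above (assuming sintp/uintp are the native
+ * word-sized integer types from pil.h):
+ *
+ *   pLsrPrimGHCIndexUInt16OffAddrzh(p,i)   =>  ((uintp)*(((uint16*)(p))+(i)))
+ *       read the i'th 16-bit element at p and widen it to a word
+ *   pLsrPrimGHCWriteInt32OffAddrzh(p,i,v)  =>  { *(((sint32*)(p))+(i)) = (sint32)(v); }
+ *       store v as the i'th 32-bit element at p
+ *   pLsrPrimGHCPlusAddrzh(p,i)             =>  ((void*)((char*)(p)+(i)))
+ *       address arithmetic is in bytes, not elements
+ */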
+#define pLsrPrimGHCAddr2Intzh(p) ((sintp)(p)) +#define pLsrPrimGHCInt2Addrzh(i) ((void*)(i)) +#define pLsrPrimGHCGtAddrzh(p,q) ((p)>(q)) +#define pLsrPrimGHCGeAddrzh(p,q) ((p)>=(q)) +#define pLsrPrimGHCLtAddrzh(p,q) ((p)<(q)) +#define pLsrPrimGHCLeAddrzh(p,q) ((p)<=(q)) +#define pLsrPrimCastToAddrzh(p) ((void*)(p)) +#define pLsrPrimCastFromAddrzh(p) ((PlsrPAny*)(p)) +/* bit operations */ +#define pLsrPrimGHCPopCntzh(w) ((uintp)(__builtin_popcount((unsigned)(w)))) +#define pLsrPrimGHCPopCnt64zh(w) ((uintp)(__builtin_popcountll((uint64)(w)))) +/* byte array */ +#define pLsrPrimGHCByteArrayContentszh(p) ((void*)(&pLsrPArrayOElt((PlsrObjectB)(p), 0))) + +#ifdef P_USE_PILLAR +static void performMajorGC () { pgc_force_gc(); } +#endif // P_USE_PILLAR + +/*** Thread stuff ***/ + +typedef enum {IhrTsRunning, IhrTsCompleted, IhrTsKilled} IhrThreadStatus; + +typedef struct IhrThreadS { + PlsrVTable vtable; + PrtTaskHandle prtThread; + IhrThreadStatus status; + bool maskAsyncExn, uninteruptable; +} IhrThreadU; + +#define ihrThreadSize sizeof(IhrThreadU) +#define ihrThreadPadding \ + (sizeof(IhrThreadU) - sizeof(PlsrVTable) - sizeof(PrtTaskHandle) - sizeof(IhrThreadStatus) - 2*sizeof(bool)) +pLsrVTableStatic (ihrVTableThread, VNoneTag, "*haskell thread*", ihrThreadPadding); + +#ifdef P_USE_PILLAR +typedef ref IhrThreadId; +#define ihrThreadB(t) ((IhrThreadU*)(t)) +#else +typedef IhrThreadU* IhrThreadId; +#define ihrThreadB(t) (t) +#endif + +typedef struct IhrTlsS { + IhrThreadId thread; + //PilContinuation(PlsrObjectB) exnHandler; + PilContinuation0 exnHandler; + PlsrObjectB exn; +} IhrTlsU; + +typedef IhrTlsU* IhrTlsP; + +unsigned ihrTlsOffset; + +#define ihrPrtTlsToTls(tls) (*((IhrTlsP*)(((char*)(tls))+ihrTlsOffset))) + +static void __cdecl ihrTlsRse(PrtProvidedTlsHandle tls_, struct PrtRseInfo* rse) +{ + IhrTlsP tls = ihrPrtTlsToTls(tls_); + if (tls) { + rse->callback(rse->env, &tls->thread, PrtGcTagDefault, NULL); + rse->callback(rse->env, &tls->exn, PrtGcTagDefault, NULL); + } +} + +static void ihrTlsInit() +{ + ihrTlsOffset = ptkGetNextTlsOffset(sizeof(IhrTlsP)); + prtRegisterTlsEnumerator(ihrTlsRse); +} + +static IhrTlsP ihrTlsGet() +{ + IhrTlsP tls = ihrPrtTlsToTls(prtGetTls()); + return tls; +} + +static void ihrTlsSet(IhrTlsP tls) +{ + ihrPrtTlsToTls(prtGetTls()) = tls; +} + +/*** Exceptions ***/ + +//static inline PilContinuation(PlsrObjectB) ihrExceptionHandlerGet() +static inline PilContinuation0 ihrExceptionHandlerGet() +{ + return ihrTlsGet()->exnHandler; +} + +static inline PlsrObjectB ihrExceptionExnGet() +{ + PlsrObjectB exn = ihrTlsGet()->exn; + assert(exn); + return exn; +} + +//static inline void ihrExceptionHandlerSet(PilContinuation(PlsrObjectB) c) +static inline void ihrExceptionHandlerSet(PilContinuation0 c) +{ + ihrTlsGet()->exnHandler = c; +} + +static inline void ihrExceptionExnSet(PlsrObjectB c) +{ + ihrTlsGet()->exn = c; +} + +/*** Delay/wait operations ***/ + +static void ihrDelay(sintp d) +{ + uintp s; + if (d<0) return; + s = (d+999)/1000; + prtSleep(s); +} + +static void ihrWaitRead(sintp f) +{ + /* NG: The GHC runtime blocks the given thread until the given file descriptor + * is ready to be read from. Ultimately it uses select to test this. + * Since we don't multiplex Haskell level threads onto OS threads, it seems + * fine to just do nothing. The thread will then go ahead and try to read. + * XXX: If we start multiplexing Haskell threads, then we need to implement this + * properly. 
+ */ +} + +static void ihrWaitWrite(sintp f) +{ + /* NG: The GHC runtime blocks the given thread until the given file descriptor + * is ready to be written to. Ultimately it uses select to test this. + * Since we don't multiplex Haskell level threads onto OS threads, it seems + * fine to just do nothing. The thread will then go ahead and try to write. + * XXX: If we start multiplexing Haskell threads, then we need to implement this + * properly. + */ +} + +typedef struct IhrPairS { + PlsrVTable vtable; + sintp fst; + sintp snd; +}* IhrPair; + +#define ihrPairPadding \ + (sizeof(struct IhrPairS) - sizeof(PlsrVTable) - 2*sizeof(sintp)) +pLsrVTableStatic (ihrVTablePair, VNoneTag, "*IHR pair*", ihrPairPadding); +#define ihrPairSize (sizeof(struct IhrPairS)) + +#ifndef PLSR_LINUX + +/* NG: The GHC runtime blocks the given thread until the request read/write/action + * is done. Since we don't multiplex Haskell level threads onto OS threads, it + * seems fine to just do the operation and the return. + * XXX: If we start multiplexing Haskell threads, then we need to implement this + * properly. + */ + +/* XXX NG: These are non-threaded implementations. Threaded ones should barf. */ + +static IhrPair ihrAsyncRead(sintp f, sintp isSock, sintp num, void* buf) +{ + IhrPair res; + int len; + DWORD ec = 0; + + if (isSock) { + len = recv(f, buf, num, 0); + if (len==SOCKET_ERROR) + ec = WSAGetLastError(); + } else { + len = read(f, buf, num); + /* XXX NG: GHC does some special processing in certain circumstances, + * but I'm not sure we can do the same, so I'm ignoring that. + */ + if (len==-1) ec = errno; + } + + pLsrAlloc(IhrPair, res, &ihrVTablePair, ihrPairSize); + res->fst = len; + res->snd = ec; + return res; +} + +static IhrPair ihrAsyncWrite(sintp f, sintp isSock, sintp num, void* buf) +{ + IhrPair res; + int len; + DWORD ec = 0; + + if (isSock) { + len = send(f, buf, num,0); + if (len==SOCKET_ERROR) + ec = WSAGetLastError(); + } else { + len = write(f, buf, num); + /* XXX NG: GHC does some special processing in certain circumstances, + * but I'm not sure we can do the same, so I'm ignoring that. 
+ */ + if (len==-1) { + ec = errno; + if (ec==EINVAL && GetLastError()==ERROR_NO_DATA) ec = EPIPE; + } + } + + pLsrAlloc(IhrPair, res, &ihrVTablePair, ihrPairSize); + res->fst = len; + res->snd = ec; + return res; +} + +typedef sintp (*IhrProc)(void*); + +static sintp ihrAsyncDoProc(void* f_, void* a) +{ + IhrProc f = (IhrProc)f_; + if (f) + return f(a); + else + return 1; +} + +#endif // !PLSR_LINUX + +/*** Concurency ***/ + +#define ihrThreadMask(t) (ihrThreadB(t)->maskAsyncExn=1, ihrThreadB(t)->uninteruptable=1) +#define ihrThreadUnmask(t) (ihrThreadB(t)->maskAsyncExn=0, ihrThreadB(t)->uninteruptable=0) + +#define ihrThreadAllocInit(t, pt) \ + do { \ + pLsrAlloc(IhrThreadId, (t), &ihrVTableThread, ihrThreadSize); \ + ihrThreadB(t)->prtThread = (pt); \ + ihrThreadB(t)->status = IhrTsRunning; \ + ihrThreadMask(t); \ + } while(0) + +static void ihrThreadWrapper0(void (*f)()) +{ + IhrTlsU tls; + PlsrObjectB exn; + PilContinuation0 cv; + //PilContinuation(PlsrObjectB) cv; + pilContinuationLocal(cvl); + ihrThreadAllocInit(tls.thread, prtGetTaskHandle()); + pilContinuationMake(cv, cvl, c); + tls.exnHandler = c; + tls.exn = NULL; + ihrTlsSet(&tls); + ihrThreadUnmask(tls.thread); + f() pilCutToC(c); + ihrThreadMask(tls.thread); + ihrThreadB(tls.thread)->status = IhrTsCompleted; + return; + pilContinuation0(cvl, c) + //pilContinuation(cvl, c, exn) + ihrThreadB(tls.thread)->status = IhrTsKilled; + return; +} + +static void ihrThreadWrapper1(void (*f)(PlsrObjectB), PlsrObjectB a) +{ + IhrTlsU tls; + PlsrObjectB exn; + PilContinuation0 cv; + //PilContinuation(PlsrObjectB) cv; + pilContinuationLocal(cvl); + ihrThreadAllocInit(tls.thread, prtGetTaskHandle()); + pilContinuationMake(cv, cvl, c); + tls.exnHandler = c; + tls.exn = NULL; + ihrTlsSet(&tls); + ihrThreadUnmask(tls.thread); + f(a) pilCutToC(c); + ihrThreadMask(tls.thread); + ihrThreadB(tls.thread)->status = IhrTsCompleted; + return; + pilContinuation0(cvl, c) + //pilContinuation(cvl, c, exn) + ihrThreadB(tls.thread)->status = IhrTsKilled; + return; +} + +/* We make this a separate function because some of the thunk evals are macros, + * and we need to annotate the cuts below in ihrForkedThreadMain + */ +static void ihrForkedThreadRun(PlsrObjectB thnk) +{ + pLsrThunkEvalRef(thnk); +} + +static void ihrForkedThreadMain(IhrThreadId thrd, PlsrObjectB thnk) +{ + IhrTlsU tls; + PlsrObjectB exn; + PilContinuation0 cv; + //PilContinuation(PlsrObjectB) cv; + pilContinuationLocal(cvl); + ihrThreadB(thrd)->prtThread = prtGetTaskHandle(); + tls.thread = thrd; + pilContinuationMake(cv, cvl, c); + tls.exnHandler = c; + tls.exn = NULL; + ihrTlsSet(&tls); + ihrThreadUnmask(tls.thread); + ihrForkedThreadRun(thnk) pilCutToC(c); + ihrThreadMask(tls.thread); + ihrThreadB(tls.thread)->status = IhrTsCompleted; + return; + pilContinuation0(cvl, c) + //pilContinuation(cvl, c, exn) + ihrThreadB(tls.thread)->status = IhrTsKilled; + return; +} + +static void __cdecl ihrForkOnRse(PrtCodeAddress func, + void *arguments, + struct PrtRseInfo *rse, + PrtCimSpecificDataType opaque) { + rse->callback(rse->env,((void**)arguments)+1,PrtGcTagDefault,0); +} + +static IhrThreadId ihrForkOn(sintp p, PlsrObjectB thnk) +{ + IhrThreadId thrd; + ihrThreadAllocInit(thrd, NULL); + noyield { + void* args[2] = {thrd, NULL}; + // This write barrier promotes thnk to the public space. + pLsrWriteBarrierRef(args[1], thnk); + // This statement keeps thnk alive past the write barrier so that + // any GC caused by the write barrier doesn't eliminate it. 
+ args[1] = thnk; + prtPcall((PrtCodeAddress)ihrForkedThreadMain, args, 2, p, ihrForkOnRse); + } + return thrd; +} + +static IhrThreadId ihrFork(PlsrObjectB t) +{ + return ihrForkOn(PRT_NO_PROC_AFFINITY, t); +} + +static void ihrKillThread(IhrThreadId t, PlsrObjectB exn) +{ + pLsrRuntimeError("ihrKillThread unimplemented"); +} + +static void ihrYield() +{ + prtYield(); +} + +static IhrThreadId ihrMyThreadId() +{ + return ihrTlsGet()->thread; +} + +static void ihrLabelThread(IhrThreadId t, void* l) +{ + pLsrRuntimeError("ihrLabelThread unimplemented"); +} + +static sintp ihrIsCurrentThreadBound() +{ + return 0; +} + +static void ihrNoDuplicate() +{ + /* This GHC primitive ensures that no two threads are evaluating the same thunk at the same time. + * Since we currently ensure this all the time, this is a no operation. + * If this changes, then we need to implement this operation. + */ +} + +static sintp ihrThreadStatus(IhrThreadId t) +{ + //pLsrRuntimeError("ihrThreadStatus unimplemented"); + return 0; +} + +/*** Weak Pointers ***/ + +/* a no-op just to keep object alive */ +#define pLsrPrimGHCTouchzh(p) do {volatile ref pLsrPrimGHCTouchzh_tmp = (ref)(uintp)(p);} while (0) + +/* Run the Haskell finaliser - needs to wrap the thread for exceptions, etc. */ +void ihrRunHaskellFinaliser(PlsrObjectB f) +{ + ihrThreadWrapper1(pLsrWpoRunFinalizer, f); +} + +/*** Float and Double Encoding/Decoding ***/ + +typedef struct { + PlsrVTable vtable; + uintp tag; + sintp man_sign; + uintp man_high; + uintp man_low; + sintp exp; +} PlsrDecodedDoubleU; +typedef PlsrDecodedDoubleU *PlsrDecodedDoubleB; +#define pLsrDecodedDoublePadding \ + (sizeof(PlsrDecodedDoubleU) - sizeof(PlsrVTable) - sizeof(uintp) - sizeof(sintp) - 2*sizeof(uintp) - sizeof(sintp)) +pLsrVTableStatic (pLsrVTableDecodedDouble, VNoneTag, "*decoded double*", pLsrDecodedDoublePadding); +PlsrRef pLsrPrimGHCDecodeDouble2Intzh (double v) +{ + PlsrDecodedDoubleB d; + uintp h, l; + sintp s, e; + __decodeDouble_2Int (&s, &h, &l, &e, v); + pLsrAlloc (PlsrDecodedDoubleB, d, &pLsrVTableDecodedDouble, sizeof (PlsrDecodedDoubleU)); + d->tag = 0; + d->man_sign = s; + d->man_high = h; + d->man_low = l; + d->exp = e; + return (PlsrRef)d; +} + + +typedef struct { + PlsrVTable vtable; + uintp tag; + sintp man; + sintp exp; +} PlsrDecodedFloatU; +#define pLsrDecodedFloatPadding \ + (sizeof(PlsrDecodedFloatU) - sizeof(PlsrVTable) - sizeof(uintp) - 2*sizeof(sintp)) +typedef PlsrDecodedFloatU *PlsrDecodedFloatB; +pLsrVTableStatic (pLsrVTableDecodedFloat, VNoneTag, "*decoded float*", pLsrDecodedFloatPadding); +PlsrRef pLsrPrimGHCDecodeFloatzh (float v) +{ + PlsrDecodedFloatB f; + sintp m, e; + __decodeFloat_Int (&m, &e, v); + pLsrAlloc (PlsrDecodedFloatB, f, &pLsrVTableDecodedFloat, sizeof (PlsrDecodedFloatU)); + f->tag = 0; + f->man = m; + f->exp = e; + return (PlsrRef)f; +} +#define pLsrDecodedDoubleSize (sizeof(PlsrDecodedDoubleU)) +#define pLsrDecodedFloatSize (sizeof(PlsrDecodedFloatU)) + +/*** Initialisation ***/ + +static void pLsrPrimGHCRegisterVTables() +{ + static PgcIsRef ihrThreadRefs[ihrThreadSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef ihrPairRefs[ihrPairSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrDecodedDoubleRefs[pLsrDecodedDoubleSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrDecodedFloatRefs[pLsrDecodedFloatSize/P_WORD_SIZE] = { 0, }; + + pLsrVTableRegister(&ihrVTableThread, 4, ihrThreadSize, ihrThreadRefs, 0, 0, 0, PGC_ALWAYS_MUTABLE, 0); + pLsrVTableRegister(&ihrVTablePair, 4, ihrPairSize, ihrPairRefs, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 
0); + pLsrVTableRegister(&pLsrVTableDecodedDouble, 4, pLsrDecodedDoubleSize, pLsrDecodedDoubleRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(&pLsrVTableDecodedFloat, 4, pLsrDecodedFloatSize, pLsrDecodedFloatRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); +} + +static void ihrInit() +{ + pLsrPrimGHCRegisterVTables(); + ihrTlsInit(); + ihrGlobalInit(); +} + +#endif // P_USE_PILLAR + +#endif // !_PLSR_PRIMS_GHC_H_ diff --git a/runtime/include/hrc/plsr-prims-prims.h b/runtime/include/hrc/plsr-prims-prims.h new file mode 100644 index 0000000..71b3c51 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-prims.h @@ -0,0 +1,665 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_PRIMS_H_ +#define _PLSR_PRIMS_PRIMS_H_ + +/********************************************************************** + * Unboxed Machine Integers and Floating-Point + */ + +#define binArith(t, oper, dest, a, b) \ + ((dest) = ((t) ((t) a) oper ((t) b))) + +#define binArithPrefix(t, oper, dest, a, b) \ + ((dest) = ((t) (oper ((t) a, (t) b)))) + +#define unArith(t, oper, dest, a) \ + ((dest) = ((t) (oper ((t) a)))) + +#define binPred(t, oper, dest, a, b) \ + ((dest) = ((PlsrBoolean) (((t) a) oper ((t) b)))) + +#define pLsrPrimNumericConv(dest, tTo, a) \ + ((dest) = (tTo) (a)) + +/* SInt8 ops */ +#define pLsrPrimPSInt8Plus(dest, a, b) binArith(sint8, +, dest, a, b) +#define pLsrPrimPSInt8Minus(dest, a, b) binArith(sint8, -, dest, a, b) +#define pLsrPrimPSInt8Times(dest, a, b) binArith(sint8, *, dest, a, b) +#define pLsrPrimPSInt8DivT(dest, a, b) binArith(sint8, /, dest, a, b) +#define pLsrPrimPSInt8ModT(dest, a, b) binArith(sint8, %, dest, a, b) +#define pLsrPrimPSInt8Negate(dest, a) unArith(sint8, -, dest, a) + +#define pLsrPrimPSInt8EQ(dest, a, b) binPred(sint8, ==, dest, a, b) +#define pLsrPrimPSInt8NE(dest, a, b) binPred(sint8, !=, dest, a, b) +#define pLsrPrimPSInt8LT(dest, a, b) binPred(sint8, <, dest, a, b) +#define pLsrPrimPSInt8LE(dest, a, b) binPred(sint8, <=, dest, a, b) + +#define pLsrPrimPSInt8FromSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt8FromRational(dest, a) pLsrSInt8FromRational(dest, a) +#define pLsrPrimPSInt8FromInteger(dest, a) pLsrSInt8FromInteger(dest, a) + +#define pLsrPrimPSInt8CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt16CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt32CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPSInt64CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPUInt8CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPUInt16CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define pLsrPrimPUInt32CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a) +#define 
pLsrPrimPUInt64CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a)
+#define pLsrPrimPFloat32CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a)
+#define pLsrPrimPFloat64CastSInt8(dest, a) pLsrPrimNumericConv(dest, sint8, a)
+#define pLsrPrimPIntegerCastSInt8(dest, a) pLsrIntegerCastToSInt8(dest, a)
+
+/* SInt16 ops */
+#define pLsrPrimPSInt16Plus(dest, a, b) binArith(sint16, +, dest, a, b)
+#define pLsrPrimPSInt16Minus(dest, a, b) binArith(sint16, -, dest, a, b)
+#define pLsrPrimPSInt16Times(dest, a, b) binArith(sint16, *, dest, a, b)
+#define pLsrPrimPSInt16DivT(dest, a, b) binArith(sint16, /, dest, a, b)
+#define pLsrPrimPSInt16ModT(dest, a, b) binArith(sint16, %, dest, a, b)
+#define pLsrPrimPSInt16Negate(dest, a) unArith(sint16, -, dest, a)
+
+#define pLsrPrimPSInt16EQ(dest, a, b) binPred(sint16, ==, dest, a, b)
+#define pLsrPrimPSInt16NE(dest, a, b) binPred(sint16, !=, dest, a, b)
+#define pLsrPrimPSInt16LT(dest, a, b) binPred(sint16, <, dest, a, b)
+#define pLsrPrimPSInt16LE(dest, a, b) binPred(sint16, <=, dest, a, b)
+
+#define pLsrPrimPSInt16FromSInt8(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16FromRational(dest, a) pLsrSInt16FromRational(dest, a)
+#define pLsrPrimPSInt16FromInteger(dest, a) pLsrSInt16FromInteger(dest, a)
+
+#define pLsrPrimPSInt8CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt16CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt32CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPSInt64CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt8CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt16CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt32CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPUInt64CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPFloat32CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPFloat64CastSInt16(dest, a) pLsrPrimNumericConv(dest, sint16, a)
+#define pLsrPrimPIntegerCastSInt16(dest, a) pLsrIntegerCastToSInt16(dest, a)
+
+/* SInt32 ops */
+#define pLsrPrimPSInt32Plus(dest, a, b) binArith(sint32, +, dest, a, b)
+#define pLsrPrimPSInt32Minus(dest, a, b) binArith(sint32, -, dest, a, b)
+#define pLsrPrimPSInt32Times(dest, a, b) binArith(sint32, *, dest, a, b)
+#define pLsrPrimPSInt32DivT(dest, a, b) binArith(sint32, /, dest, a, b)
+#define pLsrPrimPSInt32ModT(dest, a, b) binArith(sint32, %, dest, a, b)
+#define pLsrPrimPSInt32DivModT(dest1, dest2, a, b) pLsrSInt32DivModT(dest1, dest2, a, b)
+#define pLsrPrimPSInt32DivModE(dest1, dest2, a, b) pLsrSInt32DivModE(dest1, dest2, a, b)
+#define pLsrPrimPSInt32DivModF(dest1, dest2, a, 
b) +#define pLsrPrimPSInt32Negate(dest, a) unArith(sint32, -, dest, a) + +#define pLsrSInt32DivModT(dest1, dest2, a, b) \ + do { \ + div_t q = div (a, b); \ + dest1 = q.quot; \ + dest2 = q.rem; \ + } while (0) + +#define pLsrPrimPSInt32EQ(dest, a, b) binPred(sint32, ==, dest, a, b) +#define pLsrPrimPSInt32NE(dest, a, b) binPred(sint32, !=, dest, a, b) +#define pLsrPrimPSInt32LT(dest, a, b) binPred(sint32, <, dest, a, b) +#define pLsrPrimPSInt32LE(dest, a, b) binPred(sint32, <=, dest, a, b) + +#define pLsrPrimPSInt32FromSInt8(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32FromRational(dest, a) pLsrSInt32FromRational(dest, a) +#define pLsrPrimPSInt32FromInteger(dest, a) pLsrSInt32FromInteger(dest, a) + +#define pLsrPrimPSInt8CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt16CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt32CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPSInt64CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt8CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt16CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt32CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPUInt64CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPFloat32CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPFloat64CastSInt32(dest, a) pLsrPrimNumericConv(dest, sint32, a) +#define pLsrPrimPIntegerCastSInt32(dest, a) pLsrIntegerCastToSInt32(dest, a) + +/* SInt64 ops */ +#define pLsrPrimPSInt64Plus(dest, a, b) binArith(sint64, +, dest, a, b) +#define pLsrPrimPSInt64Minus(dest, a, b) binArith(sint64, -, dest, a, b) +#define pLsrPrimPSInt64Times(dest, a, b) binArith(sint64, *, dest, a, b) +#define pLsrPrimPSInt64DivT(dest, a, b) binArith(sint64, /, dest, a, b) +#define pLsrPrimPSInt64ModT(dest, a, b) binArith(sint64, %, dest, a, b) +#define pLsrPrimPSInt64DivModT(dest1, dest2, a, b) pLsrSInt64DivModT(dest1, dest2, a, b) +#define pLsrPrimPSInt64DivModE(dest1, dest2, a, b) pLsrSInt64DivModE(dest1, dest2, a, b) +#define pLsrPrimPSInt64DivModF(dest1, dest2, a, b) pLsrSInt64DivModF(dest1, dest2, a, b) +#define pLsrPrimPSInt64Negate(dest, a) unArith(sint64, -, dest, a) + +#define pLsrSInt64DivModT(dest1, dest2, a, b) \ + do { \ + lldiv_t q = lldiv (a, b); \ + dest1 = q.quot; \ + dest2 = q.rem; \ + } while (0) + +#define pLsrPrimPSInt64EQ(dest, a, b) binPred(sint64, ==, dest, a, b) +#define pLsrPrimPSInt64NE(dest, a, b) binPred(sint64, !=, dest, a, b) +#define pLsrPrimPSInt64LT(dest, a, b) binPred(sint64, <, dest, a, b) +#define pLsrPrimPSInt64LE(dest, a, b) binPred(sint64, <=, dest, a, b) + +#define pLsrPrimPSInt64FromSInt8(dest, a) 
pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromSInt16(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromSInt32(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt8(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt16(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt32(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromUInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromFloat32(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromFloat64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64FromRational(dest, a) pLsrSInt64FromRational(dest, a) +#define pLsrPrimPSInt64FromInteger(dest, a) pLsrSInt64FromInteger(dest, a) + +#define pLsrPrimPSInt8CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt16CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt32CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPSInt64CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt8CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt16CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt32CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPUInt64CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPFloat32CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPFloat64CastSInt64(dest, a) pLsrPrimNumericConv(dest, sint64, a) +#define pLsrPrimPIntegerCastSInt64(dest, a) pLsrIntegerCastToSInt64(dest, a) + +/* UInt8 ops */ +#define pLsrPrimPUInt8Plus(dest, a, b) binArith(uint8, +, dest, a, b) +#define pLsrPrimPUInt8Minus(dest, a, b) binArith(uint8, -, dest, a, b) +#define pLsrPrimPUInt8Times(dest, a, b) binArith(uint8, *, dest, a, b) +#define pLsrPrimPUInt8DivT(dest, a, b) binArith(uint8, /, dest, a, b) +#define pLsrPrimPUInt8ModT(dest, a, b) binArith(uint8, %, dest, a, b) +#define pLsrPrimPUInt8Negate(dest, a) unArith(uint8, -, dest, a) + +#define pLsrPrimPUInt8EQ(dest, a, b) binPred(uint8, ==, dest, a, b) +#define pLsrPrimPUInt8NE(dest, a, b) binPred(uint8, !=, dest, a, b) +#define pLsrPrimPUInt8LT(dest, a, b) binPred(uint8, <, dest, a, b) +#define pLsrPrimPUInt8LE(dest, a, b) binPred(uint8, <=, dest, a, b) + +#define pLsrPrimPUInt8FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8FromRational(dest, a) pLsrUInt8FromRational(dest, a) +#define pLsrPrimPUInt8FromInteger(dest, a) pLsrUInt8FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) 
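+/* Illustrative expansion (d and x are hypothetical variables): the FromT and
+ * CastT conversion wrappers in this file reduce to pLsrPrimNumericConv, i.e.
+ * a plain (possibly truncating) C cast, e.g.
+ *   pLsrPrimPUInt8FromSInt32(d, x)  =>  ((d) = (uint8) (x))
+ * Only the Rational and Integer variants dispatch to separate pLsrRational /
+ * pLsrInteger helpers rather than a C cast. */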
+#define pLsrPrimPSInt16CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPSInt32CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPSInt64CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt8CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt16CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt32CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPUInt64CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPFloat32CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPFloat64CastUInt8(dest, a) pLsrPrimNumericConv(dest, uint8, a) +#define pLsrPrimPIntegerCastUInt8(dest, a) pLsrIntegerCastToUInt8(dest, a) + +/* UInt16 ops */ +#define pLsrPrimPUInt16Plus(dest, a, b) binArith(uint16, +, dest, a, b) +#define pLsrPrimPUInt16Minus(dest, a, b) binArith(uint16, -, dest, a, b) +#define pLsrPrimPUInt16Times(dest, a, b) binArith(uint16, *, dest, a, b) +#define pLsrPrimPUInt16DivT(dest, a, b) binArith(uint16, /, dest, a, b) +#define pLsrPrimPUInt16ModT(dest, a, b) binArith(uint16, %, dest, a, b) +#define pLsrPrimPUInt16Negate(dest, a) unArith(uint16, -, dest, a) + +#define pLsrPrimPUInt16EQ(dest, a, b) binPred(uint16, ==, dest, a, b) +#define pLsrPrimPUInt16NE(dest, a, b) binPred(uint16, !=, dest, a, b) +#define pLsrPrimPUInt16LT(dest, a, b) binPred(uint16, <, dest, a, b) +#define pLsrPrimPUInt16LE(dest, a, b) binPred(uint16, <=, dest, a, b) + +#define pLsrPrimPUInt16FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16FromRational(dest, a) pLsrUInt16FromRational(dest, a) +#define pLsrPrimPUInt16FromInteger(dest, a) pLsrUInt16FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPSInt16CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPSInt32CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPSInt64CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt8CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt16CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt32CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPUInt64CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPFloat32CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPFloat64CastUInt16(dest, a) pLsrPrimNumericConv(dest, uint16, a) +#define pLsrPrimPIntegerCastUInt16(dest, a) pLsrIntegerCastToUInt16(dest, a) + +/* UInt32 ops */ +#define pLsrPrimPUInt32Plus(dest, a, b) binArith(uint32, +, dest, a, b) +#define pLsrPrimPUInt32Minus(dest, a, b) binArith(uint32, -, dest, a, b) 
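+/* Illustrative expansion (d, x and y are hypothetical variables): the
+ * arithmetic and comparison wrappers are thin macros over C operators,
+ * via binArith/binPred from the top of this header, e.g.
+ *   pLsrPrimPUInt16Plus(d, x, y) => ((d) = ((uint16) ((uint16) x) + ((uint16) y)))
+ *   pLsrPrimPUInt16LE(d, x, y)   => ((d) = ((PlsrBoolean) (((uint16) x) <= ((uint16) y)))) */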
+#define pLsrPrimPUInt32Times(dest, a, b) binArith(uint32, *, dest, a, b) +#define pLsrPrimPUInt32DivT(dest, a, b) binArith(uint32, /, dest, a, b) +#define pLsrPrimPUInt32ModT(dest, a, b) binArith(uint32, %, dest, a, b) +#define pLsrPrimPUInt32DivModT(dest1, dest2, a, b) pLsrUInt32DivModT(dest1, dest2, a, b) +#define pLsrPrimPUInt32DivModE(dest1, dest2, a, b) pLsrUInt32DivModE(dest1, dest2, a, b) +#define pLsrPrimPUInt32DivModF(dest1, dest2, a, b) pLsrUInt32DivModF(dest1, dest2, a, b) +#define pLsrPrimPUInt32Negate(dest, a) unArith(uint32, -, dest, a) +#define pLsrPrimPUInt32Max(dest, a, b) binArithPrefix(uint32, max, dest, a, b) +#define pLsrPrimPUInt32Min(dest, a, b) binArithPrefix(uint32, min, dest, a, b) + +#define pLsrUInt32DivModT(dest1, dest2, a, b) \ + do { \ + dest1 = (a)/(b); \ + dest2 = (a)%(b); \ + } while (0) + +#define pLsrPrimPUInt32EQ(dest, a, b) binPred(uint32, ==, dest, a, b) +#define pLsrPrimPUInt32NE(dest, a, b) binPred(uint32, !=, dest, a, b) +#define pLsrPrimPUInt32LT(dest, a, b) binPred(uint32, <, dest, a, b) +#define pLsrPrimPUInt32LE(dest, a, b) binPred(uint32, <=, dest, a, b) + +#define pLsrPrimPUInt32FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32FromRational(dest, a) pLsrUInt32FromRational(dest, a) +#define pLsrPrimPUInt32FromInteger(dest, a) pLsrUInt32FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPSInt16CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPSInt32CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPSInt64CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt8CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt16CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt32CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPUInt64CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPFloat32CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPFloat64CastUInt32(dest, a) pLsrPrimNumericConv(dest, uint32, a) +#define pLsrPrimPIntegerCastUInt32(dest, a) pLsrIntegerCastToUInt32(dest, a) + +#define pLsrPrimPUInt32BNot(dest, b) ((dest) = ~(b)) +#define pLsrPrimPUInt32BOr(dest, a, b) binArith(uint32, |, dest, a, b) +#define pLsrPrimPSInt32BOr(dest, a, b) binArith(sint32, |, dest, a, b) +#define pLsrPrimPUInt32BAnd(dest, a, b) binArith(uint32, &, dest, a, b) +#define pLsrPrimPSInt32BAnd(dest, a, b) binArith(sint32, &, dest, a, b) +#define pLsrPrimPUInt32BXor(dest, a, b) binArith(uint32, ^, dest, a, b) +#define pLsrPrimPSInt32BXor(dest, a, b) binArith(sint32, ^, dest, a, b) +#define pLsrPrimPSInt32BShiftL(dest, a, b) binArith(sint32, <<, dest, a, b) +/*Note: 
implementation defined on negative values */ +#define pLsrPrimPSInt32BShiftR(dest, a, b) binArith(sint32, >>, dest, a, b) +#define pLsrPrimPUInt32BShiftL(dest, a, b) binArith(uint32, <<, dest, a, b) +#define pLsrPrimPSInt32BShiftR(dest, a, b) binArith(sint32, >>, dest, a, b) +#define pLsrPrimPUInt32BShiftR(dest, a, b) binArith(uint32, >>, dest, a, b) + +/* UInt64 ops */ +#define pLsrPrimPUInt64Plus(dest, a, b) binArith(uint64, +, dest, a, b) +#define pLsrPrimPUInt64Minus(dest, a, b) binArith(uint64, -, dest, a, b) +#define pLsrPrimPUInt64Times(dest, a, b) binArith(uint64, *, dest, a, b) +#define pLsrPrimPUInt64DivT(dest, a, b) binArith(uint64, /, dest, a, b) +#define pLsrPrimPUInt64ModT(dest, a, b) binArith(uint64, %, dest, a, b) +#define pLsrPrimPUInt64Negate(dest, a) unArith(uint64, -, dest, a) + +#define pLsrPrimPUInt64EQ(dest, a, b) binPred(uint64, ==, dest, a, b) +#define pLsrPrimPUInt64NE(dest, a, b) binPred(uint64, !=, dest, a, b) +#define pLsrPrimPUInt64LT(dest, a, b) binPred(uint64, <, dest, a, b) +#define pLsrPrimPUInt64LE(dest, a, b) binPred(uint64, <=, dest, a, b) + +#define pLsrPrimPUInt64FromSInt8(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromSInt16(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromSInt32(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromSInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt8(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt16(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt32(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromFloat32(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromFloat64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64FromRational(dest, a) pLsrUInt64FromRational(dest, a) +#define pLsrPrimPUInt64FromInteger(dest, a) pLsrUInt64FromInteger(dest, a) + +#define pLsrPrimPSInt8CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPSInt16CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPSInt32CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPSInt64CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt8CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt16CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt32CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPUInt64CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPFloat32CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPFloat64CastUInt64(dest, a) pLsrPrimNumericConv(dest, uint64, a) +#define pLsrPrimPIntegerCastUInt64(dest, a) pLsrIntegerCastToUInt64(dest, a) + +#define pLsrPrimPUInt64BNot(dest, b) ((dest) = ~(b)) +#define pLsrPrimPUInt64BOr(dest, a, b) binArith(uint64, |, dest, a, b) +#define pLsrPrimPSInt64BOr(dest, a, b) binArith(sint64, |, dest, a, b) +#define pLsrPrimPUInt64BAnd(dest, a, b) binArith(uint64, &, dest, a, b) +#define pLsrPrimPSInt64BAnd(dest, a, b) binArith(sint64, &, dest, a, b) +#define pLsrPrimPUInt64BXor(dest, a, b) binArith(uint64, ^, dest, a, b) +#define pLsrPrimPSInt64BXor(dest, a, b) binArith(sint64, ^, dest, a, b) +#define pLsrPrimPSInt64BShiftL(dest, a, b) binArith(sint64, <<, dest, a, b) +/*Note: implementation defined on 
negative values */ +#define pLsrPrimPSInt64BShiftR(dest, a, b) binArith(sint64, >>, dest, a, b) +#define pLsrPrimPUInt64BShiftL(dest, a, b) binArith(uint64, <<, dest, a, b) +#define pLsrPrimPSInt64BShiftR(dest, a, b) binArith(sint64, >>, dest, a, b) +#define pLsrPrimPUInt64BShiftR(dest, a, b) binArith(uint64, >>, dest, a, b) + +/* Float32 ops */ +#define pLsrPrimPFloat32Plus(dest, a, b) binArith(float32, +, dest, a, b) +#define pLsrPrimPFloat32Minus(dest, a, b) binArith(float32, -, dest, a, b) +#define pLsrPrimPFloat32Times(dest, a, b) binArith(float32, *, dest, a, b) +#define pLsrPrimPFloat32Divide(dest, a, b) binArith(float32, /, dest, a, b) +#if (defined(__pillar__) && !defined(__pillar2c__)) +#warning "Using double precision math for floats to work around pillar bug" +#define pLsrPrimPFloat32Max(dest, a, b) binArithPrefix(float32, fmax, dest, a, b) +#define pLsrPrimPFloat32Min(dest, a, b) binArithPrefix(float32, fmin, dest, a, b) +#define pLsrPrimPFloat32ModT(dest, a, b) binArithPrefix(float32, fmod, dest, a, b) +#else +#define pLsrPrimPFloat32Max(dest, a, b) binArithPrefix(float32, fmaxf, dest, a, b) +#define pLsrPrimPFloat32Min(dest, a, b) binArithPrefix(float32, fminf, dest, a, b) +#define pLsrPrimPFloat32ModT(dest, a, b) binArithPrefix(float32, fmodf, dest, a, b) +#endif +#define pLsrPrimPFloat32Negate(dest, a) unArith(float32, -, dest, a) + +#define pLsrPrimPFloat32EQ(dest, a, b) binPred(float32, ==, dest, a, b) +#define pLsrPrimPFloat32NE(dest, a, b) binPred(float32, !=, dest, a, b) +#define pLsrPrimPFloat32LT(dest, a, b) binPred(float32, <, dest, a, b) +#define pLsrPrimPFloat32LE(dest, a, b) binPred(float32, <=, dest, a, b) + +#define pLsrPrimPFloat32FromSInt8(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromSInt16(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromSInt32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromSInt64(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt8(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt16(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromUInt64(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromFloat64(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32FromRational(dest, a) pLsrFloat32FromRational(dest, a) +#define pLsrPrimPFloat32FromInteger(dest, a) pLsrFloat32FromInteger(dest, a) + +#define pLsrPrimPSInt8CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPSInt16CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPSInt32CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPSInt64CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt8CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt16CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt32CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPUInt64CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat32CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPFloat64CastFloat32(dest, a) pLsrPrimNumericConv(dest, float32, a) +#define pLsrPrimPIntegerCastFloat32(dest, a) 
pLsrIntegerCastToFloat32(dest, a) + +/* XXX NG: need to find a header file for this */ +float64 __cdecl trunc(float64); + +#if (defined(__pillar__) && !defined(__pillar2c__)) +#warning "Using double precision math for floats to work around pillar bug" + +#define pLsrPrimPFloat32ACos(dest, a) ((dest) = (float32)acos((float32)(a))) +#define pLsrPrimPFloat32ASin(dest, a) ((dest) = (float32)asin((float32)(a))) +#define pLsrPrimPFloat32ATan(dest, a) ((dest) = (float32)atan((float32)(a))) +#define pLsrPrimPFloat32TanH(dest, a) ((dest) = (float32)tanh((float32)(a))) +#define pLsrPrimPFloat32CosH(dest, a) ((dest) = (float32)cosh((float32)(a))) +#define pLsrPrimPFloat32SinH(dest, a) ((dest) = (float32)sinh((float32)(a))) +#define pLsrPrimPFloat32Ceil(dest, a) ((dest) = (float32)ceil((float32)(a))) +#define pLsrPrimPFloat32Cos(dest, a) ((dest) = (float32)cos((float32)(a))) +#define pLsrPrimPFloat32Exp(dest, a) ((dest) = (float32)exp((float32)(a))) +#define pLsrPrimPFloat32Floor(dest, a) ((dest) = (float32)floor((float32)(a))) +#define pLsrPrimPFloat32Ln(dest, a) ((dest) = (float32)log((float32)(a))) +#define pLsrPrimPFloat32Rcp(dest, a) ((dest) = (((float32)1.0)/((float32) a))) +#define pLsrPrimPFloat32Sin(dest, a) ((dest) = (float32)sin((float32)(a))) +#define pLsrPrimPFloat32Sqrt(dest, a) ((dest) = (float32)sqrt((float32)(a))) +#define pLsrPrimPFloat32Tan(dest, a) ((dest) = (float32)tan((float32)(a))) +#define pLsrPrimPFloat32Trunc(dest, a) ((dest) = (float32)trunc((float32)(a))) +#define pLsrPrimPFloat32Pow(dest, a, b) ((dest) = (float32)pow((float32)(a), (float32)(b))) +#else +#define pLsrPrimPFloat32ACos(dest, a) ((dest) = acosf((float32)(a))) +#define pLsrPrimPFloat32ATan(dest, a) ((dest) = atanf((float32)(a))) +#define pLsrPrimPFloat32ASin(dest, a) ((dest) = asinf((float32)(a))) +#define pLsrPrimPFloat32TanH(dest, a) ((dest) = tanhf((float32)(a))) +#define pLsrPrimPFloat32CosH(dest, a) ((dest) = coshf((float32)(a))) +#define pLsrPrimPFloat32SinH(dest, a) ((dest) = sinhf((float32)(a))) +#define pLsrPrimPFloat32Ceil(dest, a) ((dest) = ceilf((float32)(a))) +#define pLsrPrimPFloat32Cos(dest, a) ((dest) = cosf((float32)(a))) +#define pLsrPrimPFloat32Exp(dest, a) ((dest) = expf((float32)(a))) +#define pLsrPrimPFloat32Floor(dest, a) ((dest) = floorf((float32)(a))) +#define pLsrPrimPFloat32Ln(dest, a) ((dest) = logf((float32)(a))) +#define pLsrPrimPFloat32Rcp(dest, a) ((dest) = (((float32)1.0)/((float32) a))) +#define pLsrPrimPFloat32Sin(dest, a) ((dest) = sinf((float32)(a))) +#define pLsrPrimPFloat32Sqrt(dest, a) ((dest) = sqrtf((float32)(a))) +#define pLsrPrimPFloat32Tan(dest, a) ((dest) = tanf((float32)(a))) +#define pLsrPrimPFloat32Trunc(dest, a) ((dest) = truncf((float32)(a))) +#define pLsrPrimPFloat32Pow(dest, a, b) ((dest) = powf((float32)(a), (float32)(b))) +#endif + +/* Float64 ops */ +#define pLsrPrimPFloat64Plus(dest, a, b) binArith(float64, +, dest, a, b) +#define pLsrPrimPFloat64Minus(dest, a, b) binArith(float64, -, dest, a, b) +#define pLsrPrimPFloat64Times(dest, a, b) binArith(float64, *, dest, a, b) +#define pLsrPrimPFloat64Divide(dest, a, b) binArith(float64, /, dest, a, b) +#define pLsrPrimPFloat64Max(dest, a, b) binArithPrefix(float64, fmax, dest, a, b) +#define pLsrPrimPFloat64Min(dest, a, b) binArithPrefix(float64, fmin, dest, a, b) +#define pLsrPrimPFloat64ModT(dest, a, b) binArithPrefix(float64, fmod, dest, a, b) +#define pLsrPrimPFloat64Negate(dest, a) unArith(float64, -, dest, a) + +#define pLsrPrimPFloat64EQ(dest, a, b) binPred(float64, ==, dest, a, b) +#define 
pLsrPrimPFloat64NE(dest, a, b) binPred(float64, !=, dest, a, b) +#define pLsrPrimPFloat64LT(dest, a, b) binPred(float64, <, dest, a, b) +#define pLsrPrimPFloat64LE(dest, a, b) binPred(float64, <=, dest, a, b) + +#define pLsrPrimPFloat64FromSInt8(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromSInt16(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromSInt32(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromSInt64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt8(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt16(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt32(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromUInt64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromFloat32(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64FromRational(dest, a) pLsrFloat64FromRational(dest, a) +#define pLsrPrimPFloat64FromInteger(dest, a) pLsrFloat64FromInteger(dest, a) + +#define pLsrPrimPSInt8CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPSInt16CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPSInt32CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPSInt64CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt8CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt16CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt32CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPUInt64CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat32CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPFloat64CastFloat64(dest, a) pLsrPrimNumericConv(dest, float64, a) +#define pLsrPrimPIntegerCastFloat64(dest, a) pLsrIntegerCastToFloat64(dest, a) + +#define pLsrPrimPFloat64ACos(dest, a) ((dest) = acos((float64)(a))) +#define pLsrPrimPFloat64ATan(dest, a) ((dest) = atan((float64)(a))) +#define pLsrPrimPFloat64ASin(dest, a) ((dest) = asin((float64)(a))) +#define pLsrPrimPFloat64TanH(dest, a) ((dest) = tanh((float64)(a))) +#define pLsrPrimPFloat64CosH(dest, a) ((dest) = cosh((float64)(a))) +#define pLsrPrimPFloat64SinH(dest, a) ((dest) = sinh((float64)(a))) +#define pLsrPrimPFloat64Ceil(dest, a) ((dest) = ceil((float64)(a))) +#define pLsrPrimPFloat64Cos(dest, a) ((dest) = cos((float64)(a))) +#define pLsrPrimPFloat64Floor(dest, a) ((dest) = floor((float64)(a))) +#define pLsrPrimPFloat64Exp(dest, a) ((dest) = exp((float64)(a))) +#define pLsrPrimPFloat64Ln(dest, a) ((dest) = log((float64)(a))) +#define pLsrPrimPFloat64Rcp(dest, a) ((dest) = (((float64)1.0)/((float64) a))) +#define pLsrPrimPFloat64Sin(dest, a) ((dest) = sin((float64)(a))) +#define pLsrPrimPFloat64Sqrt(dest, a) ((dest) = sqrt((float64)(a))) +#define pLsrPrimPFloat64Tan(dest, a) ((dest) = tan((float64)(a))) +#define pLsrPrimPFloat64Trunc(dest, a) ((dest) = trunc((float64)(a))) +#define pLsrPrimPFloat64Pow(dest, a, b) ((dest) = pow((float64)(a), (float64)(b))) + +/* Arbitrary precision integer ops */ +#define pLsrPrimPIntegerPlus(dest, a, b) pLsrIntegerPlus(dest, a, b) +#define pLsrPrimPIntegerMinus(dest, a, b) pLsrIntegerMinus(dest, a, b) +#define pLsrPrimPIntegerTimes(dest, a, b) pLsrIntegerTimes(dest, 
a, b) +#define pLsrPrimPIntegerDivT(dest, a, b) pLsrIntegerDivT(dest, a, b) +#define pLsrPrimPIntegerDivE(dest, a, b) pLsrIntegerDivE(dest, a, b) +#define pLsrPrimPIntegerDivF(dest, a, b) pLsrIntegerDivF(dest, a, b) +#define pLsrPrimPIntegerModT(dest, a, b) pLsrIntegerModT(dest, a, b) +#define pLsrPrimPIntegerModE(dest, a, b) pLsrIntegerModE(dest, a, b) +#define pLsrPrimPIntegerModF(dest, a, b) pLsrIntegerModF(dest, a, b) +#define pLsrPrimPIntegerDivModT(dest1, dest2, a, b) pLsrIntegerDivModT(dest1, dest2, a, b) +#define pLsrPrimPIntegerDivModE(dest1, dest2, a, b) pLsrIntegerDivModE(dest1, dest2, a, b) +#define pLsrPrimPIntegerDivModF(dest1, dest2, a, b) pLsrIntegerDivModF(dest1, dest2, a, b) +#define pLsrPrimPIntegerNegate(dest, a) pLsrIntegerNegate(dest, a) + +#define pLsrPrimPIntegerEQ(dest, a, b) pLsrIntegerEQ(dest, a, b) +#define pLsrPrimPIntegerNE(dest, a, b) pLsrIntegerNE(dest, a, b) +#define pLsrPrimPIntegerLT(dest, a, b) pLsrIntegerLT(dest, a, b) +#define pLsrPrimPIntegerLE(dest, a, b) pLsrIntegerLE(dest, a, b) + +#define pLsrPrimPIntegerFromSInt8(dest, a) pLsrIntegerFromSInt8(dest, a) +#define pLsrPrimPIntegerFromSInt16(dest, a) pLsrIntegerFromSInt16(dest, a) +#define pLsrPrimPIntegerFromSInt32(dest, a) pLsrIntegerFromSInt32(dest, a) +#define pLsrPrimPIntegerFromSInt64(dest, a) pLsrIntegerFromSInt64(dest, a) +#define pLsrPrimPIntegerFromUInt8(dest, a) pLsrIntegerFromUInt8(dest, a) +#define pLsrPrimPIntegerFromUInt16(dest, a) pLsrIntegerFromUInt16(dest, a) +#define pLsrPrimPIntegerFromUInt32(dest, a) pLsrIntegerFromUInt32(dest, a) +#define pLsrPrimPIntegerFromUInt64(dest, a) pLsrIntegerFromUInt64(dest, a) +#define pLsrPrimPIntegerFromFloat32(dest, a) pLsrIntegerFromFloat32(dest, a) +#define pLsrPrimPIntegerFromFloat64(dest, a) pLsrIntegerFromFloat64(dest, a) +#define pLsrPrimPIntegerFromRational(dest, a) pLsrIntegerFromRational(dest, a) +#define pLsrPrimPIntegerFromInteger(dest, a) pLsrIntegerFromInteger(dest, a) + +#define pLsrPrimPSInt8CastInteger(dest, a) pLsrIntegerFromSInt8(dest, a) +#define pLsrPrimPSInt16CastInteger(dest, a) pLsrIntegerFromSInt16(dest, a) +#define pLsrPrimPSInt32CastInteger(dest, a) pLsrIntegerFromSInt32(dest, a) +#define pLsrPrimPSInt64CastInteger(dest, a) pLsrIntegerFromSInt64(dest, a) +#define pLsrPrimPUInt8CastInteger(dest, a) pLsrIntegerFromUInt8(dest, a) +#define pLsrPrimPUInt16CastInteger(dest, a) pLsrIntegerFromUInt16(dest, a) +#define pLsrPrimPUInt32CastInteger(dest, a) pLsrIntegerFromUInt32(dest, a) +#define pLsrPrimPUInt64CastInteger(dest, a) pLsrIntegerFromUInt64(dest, a) +#define pLsrPrimPIntegerCastInteger(dest, a) pLsrIntegerFromInteger(dest, a) + +#define pLsrPrimPIntegerBNot(dest, a) pLsrIntegerBNot(dest, a) +#define pLsrPrimPIntegerBAnd(dest, a, b) pLsrIntegerBAnd(dest, a, b) +#define pLsrPrimPIntegerBOr(dest, a, b) pLsrIntegerBOr(dest, a, b) +#define pLsrPrimPIntegerBShiftL(dest, a, b) pLsrIntegerBShiftL(dest, a, b) +#define pLsrPrimPIntegerBShiftR(dest, a, b) pLsrIntegerBShiftR(dest, a, b) +#define pLsrPrimPIntegerBXor(dest, a, b) pLsrIntegerBXor(dest, a, b) + +/* Rational ops */ +#define pLsrPrimPRationalPlus(dest, a, b) pLsrRationalPlus(dest, a, b) +#define pLsrPrimPRationalMinus(dest, a, b) pLsrRationalMinus(dest, a, b) +#define pLsrPrimPRationalTimes(dest, a, b) pLsrRationalTimes(dest, a, b) +#define pLsrPrimPRationalDivT(dest, a, b) pLsrRationalDivT(dest, a, b) +#define pLsrPrimPRationalDivE(dest, a, b) pLsrRationalDivE(dest, a, b) +#define pLsrPrimPRationalDivF(dest, a, b) pLsrRationalDivF(dest, a, b) +#define 
pLsrPrimPRationalModT(dest, a, b) pLsrRationalModT(dest, a, b) +#define pLsrPrimPRationalModE(dest, a, b) pLsrRationalModE(dest, a, b) +#define pLsrPrimPRationalModF(dest, a, b) pLsrRationalModF(dest, a, b) +#define pLsrPrimPRationalDivModT(dest1, dest2, a, b) pLsrRationalDivModT(dest1, dest2, a, b) +#define pLsrPrimPRationalDivModE(dest1, dest2, a, b) pLsrRationalDivModE(dest1, dest2, a, b) +#define pLsrPrimPRationalDivModF(dest1, dest2, a, b) pLsrRationalDivModF(dest1, dest2, a, b) +#define pLsrPrimPRationalDivide(dest, a) pLsrRationalDivide(dest, a) +#define pLsrPrimPRationalNegate(dest, a) pLsrRationalNegate(dest, a) + +#define pLsrPrimPRationalEQ(dest, a, b) pLsrRationalEQ(dest, a, b) +#define pLsrPrimPRationalNE(dest, a, b) pLsrRationalNE(dest, a, b) +#define pLsrPrimPRationalLT(dest, a, b) pLsrRationalLT(dest, a, b) +#define pLsrPrimPRationalLE(dest, a, b) pLsrRationalLE(dest, a, b) + +#define pLsrPrimPRationalFromSInt8(dest, a) pLsrRationalFromSInt8(dest, a) +#define pLsrPrimPRationalFromSInt16(dest, a) pLsrRationalFromSInt16(dest, a) +#define pLsrPrimPRationalFromSInt32(dest, a) pLsrRationalFromSInt32(dest, a) +#define pLsrPrimPRationalFromSInt64(dest, a) pLsrRationalFromSInt64(dest, a) +#define pLsrPrimPRationalFromUInt8(dest, a) pLsrRationalFromUInt8(dest, a) +#define pLsrPrimPRationalFromUInt16(dest, a) pLsrRationalFromUInt16(dest, a) +#define pLsrPrimPRationalFromUInt32(dest, a) pLsrRationalFromUInt32(dest, a) +#define pLsrPrimPRationalFromUInt64(dest, a) pLsrRationalFromUInt64(dest, a) +#define pLsrPrimPRationalFromFloat32(dest, a) pLsrRationalFromFloat32(dest, a) +#define pLsrPrimPRationalFromFloat64(dest, a) pLsrRationalFromFloat64(dest, a) +#define pLsrPrimPRationalFromRational(dest, a) pLsrRationalFromRational(dest, a) +#define pLsrPrimPRationalFromInteger(dest, a) pLsrRationalFromInteger(dest, a) + +/********************************************************************** + * Names + */ + +#define pLsrPrimPNameGetString(dest, n) (dest = pLsrPNameGetString(n)) +#define pLsrPrimPNameGetHash(dest, n) (dest = pLsrPNameGetHash(n)) + +/********************************************************************** + * CString + */ + +#define pLsrPrimPCStringAllocate(dest, len) \ + do { \ + dest = (char*) pLsrAllocC(len+1); \ + dest[len] = '\0'; \ + } while(0) + +#define pLsrPrimPCStringDeallocate(str) (pLsrFreeC(str)) +#define pLsrPrimPCStringGetLen(dest, str) ((dest) = strlen(str)) +#define pLsrPrimPCStringGetChar(dest, str, idx) ((dest) = (str)[idx]) +#define pLsrPrimPCStringSetChar(str, idx, c) ((str)[idx] = (c)) + +/********************************************************************** + * Pointer Equality + */ + +#define pLsrPrimPPtrEq(dest, p1, p2) (dest = ((void*)p1)==((void*)p2)) + +/********************************************************************** + * Conditional MOV + */ +#define pLsrPrimPCondMov(dest, b, u, v) ((dest) = ((b) ? 
(u) : (v))) + +/********************************************************************** + * Booleans + */ + +#define pLsrPrimPBooleanNot(dest, b0) ((dest) = !(b0)) +#define pLsrPrimPBooleanAnd(dest, b0, b1) ((dest) = (b0) && (b1)) +#define pLsrPrimPBooleanOr(dest, b0, b1) ((dest) = (b0) || (b1)) +#define pLsrPrimPBooleanXOr(dest, b0, b1) ((dest) = (b0) ^ (b1)) +#define pLsrPrimPBooleanEq(dest, b0, b1) ((dest) = (b0) == (b1)) + + +#endif /* _PLSR_PRIMS_PRIMS_H_ */ diff --git a/runtime/include/hrc/plsr-prims-runtime.h b/runtime/include/hrc/plsr-prims-runtime.h new file mode 100644 index 0000000..bc145ad --- /dev/null +++ b/runtime/include/hrc/plsr-prims-runtime.h @@ -0,0 +1,855 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_RUNTIME_H_ +#define _PLSR_PRIMS_RUNTIME_H_ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#ifdef PLSR_LINUX +#include +#endif + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif + +/********************************************************************** + * IO + */ + +/* IO Hacks */ + +#define BUF_SZ (1<<16) + +struct PlsrBufList { + char buf[BUF_SZ]; + struct PlsrBufList* next; +}; + +static char* pLsrReadLine() +{ + struct PlsrBufList* l = 0; + char* res; + uintp fullBufs = -1, len, cur, lastAmt, cont; + do { + struct PlsrBufList* p = l; + char* lastRead; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fgets(l->buf, BUF_SZ, stdin); + assert(lastRead); + lastAmt = strlen(l->buf); + cont = l->buf[lastAmt-1]!='\n'; + } while (cont); + len = fullBufs * (BUF_SZ-1) + lastAmt; + res = (char*)pLsrAllocC(len+1); + cur = len; + while(l) { + struct PlsrBufList* n = l->next; + lastAmt = strlen(l->buf); + cur -= lastAmt; + strncpy(res+cur, l->buf, lastAmt); + pLsrFreeC(l); + l = n; + } + assert(cur==0); + return res; +} + +static char* pLsrReadFile(char* fn) +{ + FILE* f = fopen(fn, "r"); + struct PlsrBufList* l = 0; + size_t lastRead; + uintp fullBufs = -1, len, cur; + char* res; + assert(f); + do { + struct PlsrBufList* p = l; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fread(l->buf, 1, BUF_SZ, f); + } while (lastRead == BUF_SZ); + cur = fullBufs * BUF_SZ; + len = cur + lastRead; + res = (char*)pLsrAllocC(len+1); + res[len] = '\0'; + while(l) { + struct PlsrBufList* n = l->next; + strncpy(res+cur, l->buf, lastRead); + pLsrFreeC(l); + l = n; + cur -= BUF_SZ; + lastRead = BUF_SZ; + } + fclose(f); + assert(cur==0); + return res; +} + +static void pLsrWriteFile(char* fn, uintp n, char* s) +{ + FILE* f = fopen(fn, "wb"); + assert(f); + fwrite(s, 1, n, f); + fclose(f); +} + +static char* pLsrPrimRReadAllStdin() +{ + struct PlsrBufList* l = 0; + size_t lastRead; + uintp fullBufs = -1, len, cur; + char* res; + FILE* f; + f = stdin; + assert(f); + do { + struct PlsrBufList* p = l; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fread(l->buf, 1, BUF_SZ, f); + } while (lastRead == BUF_SZ); + cur = fullBufs * BUF_SZ; + len = cur + lastRead; + res = (char*)pLsrAllocC(len+1); + res[len] = '\0'; + while(l) { + struct PlsrBufList* n = l->next; + strncpy(res+cur, l->buf, lastRead); + pLsrFreeC(l); + l = n; + cur -= BUF_SZ; + lastRead = BUF_SZ; + } + return res; +} + +static uintp pLsrPrimNativeOpenOut(char* fn) +{ + FILE* f = fopen(fn, "wb"); + assert(f); + return (uintp)f; +} + +static void 
pLsrPrimNativeOutputByte(uintp fi, char b) +{ + FILE* f = (FILE*)fi; + fwrite(&b, 1, 1, f); +} + +static void pLsrPrimNativeCloseOut(uintp fi) +{ + FILE* f = (FILE*)fi; + fclose(f); +} + +static uintp pLsrPrimNativeOpenIn(char* fn) +{ + FILE* f = fopen(fn, "r"); + assert(f); + return (uintp)f; +} + +static uintp pLsrPrimNativeInputByte(uintp fi) +{ + FILE* f = (FILE*)fi; + int c = fgetc(f); + return (uintp)c; +} + +static int isDelimiter (char c, char *delimiters) +{ + int i = 0; + for (; delimiters[i]; i++) { + if (c == delimiters[i]) { + return 1; + } + } + return 0; +} + +static char* pLsrPrimNativeInputString(uintp fi, char *delimiters) +{ + // TODO WL: add max_len argument to the function + char* res = pLsrAllocC(256); + FILE* f = (FILE *)fi; + assert(f); + + char c; + // filter out all spaces and returns + while (!feof(f)) { + c = fgetc(f); + if (c == 0x0a || c == 0x0d || isDelimiter(c, delimiters)) + continue; + else + break; + } + + int i = 0; + while (!feof(f)) { + if (c == 0x0a || c == 0x0d || isDelimiter(c, delimiters)) { + res[i] = 0; + return res; + } else { + res[i++] = c; + } + assert(i<256); + c = fgetc(f); + } + + res[i] = 0; + return res; +} + +static char* pLsrPrimNativeInputAll(uintp fi) +{ + struct PlsrBufList* l = 0; + size_t lastRead; + uintp fullBufs = -1, len, cur; + char* res; + FILE* f = (FILE *)fi; + assert(f); + do { + struct PlsrBufList* p = l; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fread(l->buf, 1, BUF_SZ, f); + } while (lastRead == BUF_SZ); + cur = fullBufs * BUF_SZ; + len = cur + lastRead; + res = (char*)pLsrAllocC(len+1); + res[len] = '\0'; + while(l) { + struct PlsrBufList* n = l->next; + strncpy(res+cur, l->buf, lastRead); + pLsrFreeC(l); + l = n; + cur -= BUF_SZ; + lastRead = BUF_SZ; + } + return res; +} + +static uintp pLsrPrimNativeIsEOF(uintp fi) +{ + FILE* f = (FILE*)fi; + return (uintp)feof(f); +} + +static void pLsrPrimNativeCloseIn(uintp fi) +{ + FILE* f = (FILE*)fi; + fclose(f); +} + +static char* pLsrPrimNativeInputLine(uintp fi) +{ + struct PlsrBufList* l = 0; + FILE* f = (FILE*)fi; + char* res; + uintp fullBufs = -1, len, cur, lastAmt, cont; + assert(f); + do { + struct PlsrBufList* p = l; + char* lastRead; + fullBufs++; + l = (struct PlsrBufList*)pLsrAllocC(sizeof(struct PlsrBufList)); + l->next = p; + lastRead = fgets(l->buf, BUF_SZ, f); + assert(lastRead); + lastAmt = strlen(l->buf); + cont = l->buf[lastAmt-1]!='\n'; + } while (cont); + len = fullBufs * (BUF_SZ-1) + lastAmt; + res = (char*)pLsrAllocC(len+1); + cur = len; + while(l) { + struct PlsrBufList* n = l->next; + lastAmt = strlen(l->buf); + cur -= lastAmt; + strncpy(res+cur, l->buf, lastAmt); + pLsrFreeC(l); + l = n; + } + assert(cur==0); + return res; +} + +/* The natives */ + +static PlsrRational pLsrPrimROpenOut(PlsrPAny a) +{ + char* filename = pLsrPStringToCString(a); + uintp f = pLsrPrimNativeOpenOut(filename); + PlsrRational r; + pLsrFreeC(filename); + pLsrRationalFromUIntp(r, f); + return r; +} + +#define pLsrPrimROpenOutT pLsrPrimROpenOut + +static PlsrRational pLsrPrimRGetStdout() +{ + PlsrRational r; + pLsrRationalFromUIntp(r, (uintp)stdout); + return r; +} + +#define pLsrPrimRGetStdoutT pLsrPrimRGetStdout + +static void pLsrPrimROutputByte(PlsrRational a, PlsrRational b) +{ + uintp ai; + uintp bi; + + pLsrUIntpFromRational(ai, a); + pLsrUIntpFromRational(bi, b); + pLsrPrimNativeOutputByte(ai, (char)bi); +} + +#define pLsrPrimROutputByteT pLsrPrimROutputByte + +static void pLsrPrimRCloseOut(PlsrRational 
a) +{ + uintp ai; + pLsrUIntpFromRational(ai, a); + pLsrPrimNativeCloseOut(ai); +} + +#define pLsrPrimRCloseOutT pLsrPrimRCloseOut + +static PlsrRational pLsrPrimROpenIn(PlsrPAny a) +{ + char* filename = pLsrPStringToCString(a); + uintp f = pLsrPrimNativeOpenIn(filename); + PlsrRational r; + pLsrFreeC(filename); + pLsrRationalFromUIntp(r, f); + return r; +} + +#define pLsrPrimROpenInT pLsrPrimROpenIn + +static PlsrRational pLsrPrimRGetStdin() +{ + PlsrRational r; + pLsrRationalFromUIntp(r, (uintp)stdin); + return r; +} + +#define pLsrPrimRGetStdinT pLsrPrimRGetStdin + +static PlsrRational pLsrPrimRInputByte(PlsrRational a) +{ + uintp f; + uintp c; + PlsrRational r; + pLsrUIntpFromRational(f, a); + c = pLsrPrimNativeInputByte(f); + pLsrRationalFromUIntp(r, c); + return r; +} + +#define pLsrPrimRInputByteT pLsrPrimRInputByte + +static PlsrPAny pLsrPrimRInputString(PlsrRational a, PlsrPAny b) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *delimiters = pLsrPStringToCString(b); + char *s = pLsrPrimNativeInputString(f, delimiters); + pLsrFreeC(delimiters); + PlsrPAny res = pLsrCStringToPString(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRInputStringT(PlsrRational a, PlsrPAny b) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *delimiters = pLsrPStringToCString(b); + char *s = pLsrPrimNativeInputString(f, delimiters); + pLsrFreeC(delimiters); + PlsrPAny res = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRInputAll(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *s = pLsrPrimNativeInputAll(f); + PlsrPAny res = pLsrCStringToPString(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRInputAllT(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + char *s = pLsrPrimNativeInputAll(f); + PlsrPAny res = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return res; +} + +static PlsrBoolean pLsrPrimRIsEOF(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + uintp c = pLsrPrimNativeIsEOF(f); + return toPlsrBoolean(c); +} + +#define pLsrPrimRIsEOFT pLsrPrimRIsEOF + +static void pLsrPrimRCloseIn(PlsrRational a) +{ + uintp f; + pLsrUIntpFromRational(f, a); + pLsrPrimNativeCloseIn(f); +} + +#define pLsrPrimRCloseInT pLsrPrimRCloseIn + +/********************************************************************** + * Core Natives + */ + +static float pLsrPrimRFloatMk(PlsrRational a, PlsrRational b) +{ + sint32 m; + sint32 e; + double r, p; + pLsrSInt32FromRational(m, a); + pLsrSInt32FromRational(e, b); + r = m; + p = 10; + if (e<0) e=-e, p=0.1; + while (e-->0) r*=p; + return r; +} + +#define pLsrPrimRFloatMkT pLsrPrimRFloatMk + +static void pLsrPrimRWriteln(PlsrPAny a) +{ + pLsrValuePrint(a); + printf("\n"); +} + +#define pLsrPrimRWritelnT pLsrPrimRWriteln + +static PlsrPAny pLsrPrimRReadln() +{ + char* s = pLsrPrimNativeInputLine((uintp)stdin); + PlsrPAny res = pLsrCStringToPString(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRReadlnT() +{ + char* s = pLsrPrimNativeInputLine((uintp)stdin); + PlsrPAny res = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return res; +} + +static PlsrPAny pLsrPrimRAssert(PlsrPAny a) +{ + pLsrRuntimeError("PAssert unimplemented"); + return 0; +} + +#define pLsrPrimRAssertT pLsrPrimRAssert + +static PlsrPAny pLsrPrimRError(PlsrPAny a) +{ + printf("Fatal P error: "); + pLsrValuePrint(a); + printf("\nExiting...\n"); + pLsrExit(-1); + return 0; +} + +#define pLsrPrimRErrorT pLsrPrimRError + +static void pLsrPrimRDebug(PlsrPAny a) +{ + printf("P Debug: "); + 
pLsrValuePrint(a); + printf("\n"); +} + +#define pLsrPrimRDebugT pLsrPrimRDebug + +/********************************************************************** + * Command Line support + */ + +uintp pargc; +const char **pargv; + +static PlsrPAny pLsrPrimRCommandLineT() +{ + PlsrPAny res = pLsrPArrayONew(pargc); + for (uintp i = 0; i < pargc; i++) { + PlsrPAny ps = pLsrCStringToPStringT(pargv[i]); + PlsrThunkBRef pst = pLsrThunkNewValRef((PlsrRef) ps); + pLsrWriteBarrierRefBase(res, pLsrPArrayOElt(res, i), (PlsrObjectB)pst); + } + return res; +} + +static PlsrPAny pLsrPrimRCommandLine() +{ + PlsrPAny res = pLsrPArrayONew (pargc); + for (uintp i = 0; i < pargc; i++) { + PlsrPAny ps = pLsrCStringToPString(pargv[i]); + pLsrWriteBarrierRefBase(res, pLsrPArrayOElt(res, i), ps); + } + return res; +} + +#define pLsrCommandLineCount() (pargc) +#define pLsrCommandLineGet(i) ((char*)pargv[i]) + +/********************************************************************** + * Conversions + */ + +static uintp pLsrString2Nat(char *str) +{ + return (uintp)atoi(str); +} + +static PlsrRational pLsrStringToRat(char *str) +{ + return pLsrRationalFromCString(str); +} + +static float pLsrStringToFloat(char *str) +{ + return atof(str); +} + + +static PlsrRational pLsrPrimRStringToNat(PlsrPAny a) +{ + char* str = pLsrPStringToCString(a); + uintp i = pLsrString2Nat (str); + pLsrFreeC(str); + PlsrRational r; + pLsrRationalFromUIntp(r, i); + return r; +} + +#define pLsrPrimRStringToNatT pLsrPrimRStringToNat + +static PlsrRational pLsrPrimRStringToRat(PlsrPAny a) +{ + char* str = pLsrPStringToCString(a); + PlsrRational r = pLsrStringToRat (str); + pLsrFreeC(str); + return r; +} + +#define pLsrPrimRStringToRatT pLsrPrimRStringToRat + +static PlsrPAny pLsrPrimRRatToString(PlsrRational r) +{ + char* s = pLsrCStringFromRational(r); + PlsrPAny o = pLsrCStringToPString(s); + pLsrFreeC(s); + return o; +} + +static PlsrPAny pLsrPrimRRatToStringT(PlsrRational r) +{ + char* s = pLsrCStringFromRational(r); + PlsrPAny o = pLsrCStringToPStringT(s); + pLsrFreeC(s); + return o; +} + +static float pLsrPrimRStringToFloat(PlsrPAny a) +{ + char* str = pLsrPStringToCString(a); + float f = pLsrStringToFloat (str); + pLsrFreeC(str); + return f; +} + +#define pLsrPrimRStringToFloatT pLsrPrimRStringToFloat + +static PlsrPAny pLsrPrimRFloatToString(float f, PlsrRational p) +{ + uintp pi; + pLsrUIntpFromRational(pi, p); + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPString(str); +} + +static PlsrPAny pLsrPrimRFloatToStringT(float f, PlsrRational p) +{ + uintp pi; + pLsrUIntpFromRational(pi, p); + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPStringT(str); +} + +/* These are for the rats as unsafe integers hack */ +static PlsrPAny pLsrPrimRFloat2StringI(float f, sintp p) +{ + uintp pi = (uintp) p; + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPString(str); +} + +static PlsrPAny pLsrPrimRFloat2StringIT(float f, sintp p) +{ + uintp pi = (uintp) p; + char str[100]; + sprintf(str, "%.*f", pi, f); + return pLsrCStringToPStringT(str); +} + +#define pLsrPrimRRatNumerator pLsrRatNumerator +#define pLsrPrimRRatNumeratorT pLsrRatNumeratorT + +#define pLsrPrimRRatDenominator pLsrRatDenominator +#define pLsrPrimRRatDenominatorT pLsrRatDenominatorT + + +/********************************************************************** + * Equality and unification + */ + +static PlsrBoolean pLsrPrimREqualSlow(PlsrPAny v1, PlsrPAny v2) +{ + PlsrVTable vt1; + PlsrVTable vt2; + uintp i; + vt1 = 
pLsrObjectGetVTable(v1); + vt2 = pLsrObjectGetVTable(v2); + if (pLsrVTableGetTag(vt1) != pLsrVTableGetTag(vt2)) return 0; + switch(pLsrVTableGetTag(vt1)) { + case VRatTag: { + PlsrBoolean b; + PlsrRational r1, r2; + pLsrRationalFromPRat(r1, v1); + pLsrRationalFromPRat(r2, v2); + pLsrRationalEQ(b, r1, r2); + return b; + } + case VNameTag: + return pLsrPNameGetTag(v1) == pLsrPNameGetTag(v2); + case VFloatTag: + return pLsrPFloatGet(v1) == pLsrPFloatGet(v2); + case VArrayTag: + if (pLsrPArrayOGetLen(v1) != pLsrPArrayOGetLen(v2)) return 0; + for(i=0; i + +typedef __m256 PlsrVector256F32; +typedef __m256d PlsrVector256F64; +typedef __m256i PlsrVector256B32; +typedef __m256i PlsrVector256Ref; + +typedef __m256 PlsrVectorMask256Fs32; +typedef __m256 PlsrVectorMask256Fs64; + +#ifdef TWOVEC64 +typedef struct two__m128i { + __m128i low,high; +} two__m128i; + +typedef two__m128i PlsrVector256B64; +#elif defined ONEVEC64 +typedef __m256i PlsrVector256B64; +#else +typedef struct PlsrVector256B64 { + __int64 values[4]; +} PlsrVector256B64; +#endif + +#if 0 +#define pLsrPrimV256PointwiseFloat32LT(dest, a, b) ((dest) = _mm256_cmp_ps((a),(b),_CMP_LT_OS)) +#else +#define pLsrPrimV256CompareFloat32LT(dest, a, b) ((dest) = _mm256_cmp_ps((a),(b),_CMP_LT_OS)) +#define pLsrPrimV256CompareFloat64LT(dest, a, b) ((dest) = _mm256_cmp_pd((a),(b),_CMP_LT_OS)) +#define pLsrPrimV256CompareFloat32LE(dest, a, b) ((dest) = _mm256_cmp_ps((a),(b),_CMP_LE_OS)) +#define pLsrPrimV256CompareFloat64LE(dest, a, b) ((dest) = _mm256_cmp_pd((a),(b),_CMP_LE_OS)) +#endif +#define pLsrPrimV256PointwiseFloat32Exp(dest, a) ((dest) = _mm256_exp_ps((a))) +#define pLsrPrimV256PointwiseFloat32Ln(dest, a) ((dest) = _mm256_log_ps((a))) +#define pLsrPrimV256PointwiseFloat64Pow(dest, a, b) ((dest) = _mm256_pow_pd((a), (b))) +#define pLsrPrimV256DataBlendF32(dest, mask, a, b) ((dest) = _mm256_blendv_ps((b),(a),(mask))) +#define pLsrPrimV256DataBlendB32(dest, mask, a, b) ((dest) = _mm256_blendv_ps((b),(a),(mask))) +#define pLsrPrimV256DataBlendF64(dest, mask, a, b) ((dest) = _mm256_blendv_pd((b),(a),(mask))) +#define pLsrPrimV256DataBlendB64(dest, mask, a, b) ((dest) = _mm256_blendv_pd((b),(a),(mask))) + + +// ***************************************************************************** +// Conversions +// ***************************************************************************** + +#define pLsrPrimV256256ConvertFloat32FromUInt32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) +#define pLsrPrimV256256ConvertUInt32FromFloat32(dest, a) ((dest) = _mm256_cvtps_epi32(a)) + +#define pLsrPrimV256128ConvertFloat64FromFloat32(dest, a) ((dest) = _mm256_cvtps_pd(a)) + +// vec128 -> vec256 +#define pLsrPrimV256U32ToF64(dest, a) ((dest) = _mm256__cvtepi32_pd (_mm_movpi64_epi64(a))) +#define pLsrPrimV256F64ToU32(dest, a) ((dest) = _mm256__movepi64_pi64 (_mm256_cvtpd_epi32(a))) + +#define pLsrPrimV256256CastSInt32ToFloat32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) +#define pLsrPrimV256128CastSInt32ToFloat64(dest, a) ((dest) = _mm256_cvtepi32_pd(a)) +#define pLsrPrimV256256CastFloat32ToSInt32(dest, a) ((dest) = _mm256_cvttps_epi32(a)) +#define pLsrPrimV256256CastSInt32ToUInt32(dest, a) ((dest) = a) +#define pLsrPrimV256256CastUInt32ToSInt32(dest, a) ((dest) = a) + +#define pLsrPrimV256256ConvertUInt32FromSInt32(dest, a) ((dest) = a) +#define pLsrPrimV256256ConvertSInt32FromUInt32(dest, a) ((dest) = a) + + + +#ifdef TWOVEC64 +#elif defined ONEVEC64 +#else +#define pLsrPrimV128256CastSInt64ToFloat32(dest, a) (dest) = _mm_set_ps((a).values[3], (a).values[2], 
(a).values[1], (a).values[0]) +#define pLsrPrimV256128CastFloat32ToSInt64(dest, a) \ + do { \ + float in[4]; \ + _mm_store_ps(in,a); \ + dest.values[0] = in[0]; \ + dest.values[1] = in[1]; \ + dest.values[2] = in[2]; \ + dest.values[3] = in[3]; \ + } while(0) + +#endif + +// ***************************************************************************** +// Data operations on the more abstract B32/F32/F64 types +// ***************************************************************************** + +#define pLsrPrimV256DataVectorB32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV256DataVectorB64(dest, c0, c1, c2, c3) \ + pLsrPrimV256UInt64Const(dest, c0, c1, c2, c3) + +#define pLsrPrimV256DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV256DataVectorF64(dest, c0, c1, c2, c3) \ + pLsrPrimV256Float64Const(dest, c0, c1, c2, c3) + +#define pLsrPrimV256DataBroadcastB32(dest, c0) pLsrPrimV256UInt32Lift(dest, c0) +#define pLsrPrimV256DataBroadcastB64(dest, c0) pLsrPrimV256UInt64Lift(dest, c0) +#define pLsrPrimV256DataBroadcastF32(dest, c0) pLsrPrimV256Float32Lift(dest, c0) +#define pLsrPrimV256DataBroadcastF64(dest, c0) pLsrPrimV256Float64Lift(dest, c0) + + // Subscripting operations (chose between the lower or higher part) + + +#define pLsrPrimV256DataSub0B32(dest, a) pLsrPrimV256DataSubB32L(dest, 0, a) +#define pLsrPrimV256DataSub1B32(dest, a) pLsrPrimV256DataSubB32L(dest, 1, a) +#define pLsrPrimV256DataSub2B32(dest, a) pLsrPrimV256DataSubB32L(dest, 2, a) +#define pLsrPrimV256DataSub3B32(dest, a) pLsrPrimV256DataSubB32L(dest, 3, a) +#define pLsrPrimV256DataSub4B32(dest, a) pLsrPrimV256DataSubB32H(dest, 0, a) +#define pLsrPrimV256DataSub5B32(dest, a) pLsrPrimV256DataSubB32H(dest, 1, a) +#define pLsrPrimV256DataSub6B32(dest, a) pLsrPrimV256DataSubB32H(dest, 2, a) +#define pLsrPrimV256DataSub7B32(dest, a) pLsrPrimV256DataSubB32H(dest, 3, a) + +#define pLsrPrimV256DataSub0F32(dest, a) pLsrPrimV256DataSubF32L(dest, 0, a) +#define pLsrPrimV256DataSub1F32(dest, a) pLsrPrimV256DataSubF32L(dest, 1, a) +#define pLsrPrimV256DataSub2F32(dest, a) pLsrPrimV256DataSubF32L(dest, 2, a) +#define pLsrPrimV256DataSub3F32(dest, a) pLsrPrimV256DataSubF32L(dest, 3, a) +#define pLsrPrimV256DataSub4F32(dest, a) pLsrPrimV256DataSubF32H(dest, 0, a) +#define pLsrPrimV256DataSub5F32(dest, a) pLsrPrimV256DataSubF32H(dest, 1, a) +#define pLsrPrimV256DataSub6F32(dest, a) pLsrPrimV256DataSubF32H(dest, 2, a) +#define pLsrPrimV256DataSub7F32(dest, a) pLsrPrimV256DataSubF32H(dest, 3, a) + +#define pLsrPrimV256DataSubB32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32L_help(sub, a)) +static inline int pLsrPrimV256DataSubB32L_help(int sub, __m256i a) { + __m128i loA = _mm256_extractf128_si256(a, 0); + return (_mm_extract_epi32(loA, sub)); +} +#define pLsrPrimV256DataSubB32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32H_help(sub, a)) +static inline int pLsrPrimV256DataSubB32H_help(int sub, __m256i a) { + __m128i hiA = _mm256_extractf128_si256(a, 1); + return (_mm_extract_epi32(hiA, sub)); +} + +#define pLsrPrimV256DataSubF32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32H_help(sub, a)) +static inline float pLsrPrimV256DataSubF32H_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 1); + //__m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + // return *out; + float out; + 
_MM_EXTRACT_FLOAT(out, hiA, sub); + return out; + +} + +#define pLsrPrimV256DataSubF32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32L_help(sub, a)) +static inline float pLsrPrimV256DataSubF32L_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 0); + // __m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + float out; + _MM_EXTRACT_FLOAT(out, hiA, sub); + return out; +} + +#define pLsrPrimV256DataSub0F64(dest, a) pLsrPrimV256DataSubF64(dest, 0, a) +#define pLsrPrimV256DataSub1F64(dest, a) pLsrPrimV256DataSubF64(dest, 1, a) +#define pLsrPrimV256DataSub2F64(dest, a) pLsrPrimV256DataSubF64(dest, 2, a) +#define pLsrPrimV256DataSub3F64(dest, a) pLsrPrimV256DataSubF64(dest, 3, a) + +#define pLsrPrimV256DataSubF64(dest, sub, a) ((dest) = ((double*)&a)[sub]) + +// loads and stores + +#define pLsrPrimV256B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + + +#define pLsrPrimV256B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV128Float32LoadIndexed(dest, arr, off, vi) \ + do { \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx2; 
\ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(vi, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(vi, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(vi, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(vi, 3); \ + (dest) = _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + } while (0) + +#define pLsrPrimV128Float32LoadIndexed64(dest, arr, offset, vi) \ + do { \ + (dest) = _mm_set_ps( *((float32*)((char*)arr + offset) + vi.values[3]), \ + *((float32*)((char*)arr + offset) + vi.values[2]), \ + *((float32*)((char*)arr + offset) + vi.values[1]), \ + *((float32*)((char*)arr + offset) + vi.values[0])); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F32LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_castps128_ps256(pLsrPrimV256Float32LoadVectorHelp_lo); \ + pLsrPrimV256Float32LoadVectorHelp_res = 
_mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F64LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float64LoadVectorHelp_idxV; \ + __m256d pLsrPrimV256Float64LoadVectorHelp_res; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float64LoadVectorHelp_idxV = vi; \ + pLsrPrimV256Float64LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float64LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float64LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float64LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float64LoadVectorHelp_res = \ + _mm256_set_pd(*((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx3), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx2), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx1), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx0)); \ + (dest) = pLsrPrimV256Float64LoadVectorHelp_res; \ + } while (0) + + +/* scalar array, scalar offset, vector of indices */ +/* For some insane reason, the only way to load out a float from an __m128 register + * is as an integer containing the binary representation of the float. -leaf */ +#define pLsrPrimV256F32StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float32StoreVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float32StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + 
pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F64StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float64StoreVectorHelp_idxV; \ + __m128d pLsrPrimV256Float64StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float64StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float64StoreVectorHelp_idxV = vi; \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 0); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 1); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 1); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 2); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 3); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + } while (0) + + +#define pLsrPrimV256B32LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 0) { \ + pLsrPrimV256Float32LoadVector(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadVectorStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) \ + +#define pLsrPrimV256B32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F64LoadVectorStrided(dest, arr, offset, idx, stride) +#define 
pLsrPrimV256RefLoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256RefLoadVectorStrided(dest, arr, offset, idx, stride) + + +// ***************************************************************************** +// Type-specific arithemetic and memory operations +// ***************************************************************************** + +// ================ +// UInt32 - 256-bit +// ================ + +// ***************************************************************************** +// Arithmetic +// ***************************************************************************** + +#define pLsrPrimV256PointwiseUInt32Plus(dest, a, b) vector256PointwiseUInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Minus(dest, a, b) vector256PointwiseUInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Times(dest, a, b) vector256PointwiseUInt32Times(dest, a, b) + +#define pLsrPrimV256PointwiseUInt64Plus(dest, a, b) vector256PointwiseUInt64Plus(dest, a, b) +#define pLsrPrimV256PointwiseUInt64Minus(dest, a, b) vector256PointwiseUInt64Minus(dest, a, b) +#define pLsrPrimV256PointwiseUInt64Times(dest, a, b) vector256PointwiseUInt64Times(dest, a, b) + +#define pLsrPrimV256PointwiseSInt32Plus(dest, a, b) vector256PointwiseSInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Minus(dest, a, b) vector256PointwiseSInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Times(dest, a, b) vector256PointwiseSInt32Times(dest, a, b) + +#define pLsrPrimV256PointwiseSInt64Plus(dest, a, b) vector256PointwiseSInt64Plus(dest, a, b) +#define pLsrPrimV256PointwiseSInt64Minus(dest, a, b) vector256PointwiseSInt64Minus(dest, a, b) +#define pLsrPrimV256PointwiseSInt64Times(dest, a, b) vector256PointwiseSInt64Times(dest, a, b) + +#define integer256via128(dest, a, b, op) \ + do { \ + __m256i integer256via128_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i integer256via128_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i integer256via128_loA = _mm256_extractf128_si256(a, 0); \ + __m128i integer256via128_hiB = _mm256_extractf128_si256(b, 1); \ + __m128i integer256via128_loB = _mm256_extractf128_si256(b, 0); \ + __m128i integer256via128_loTemp; \ + __m128i integer256via128_hiTemp; \ + op(integer256via128_loTemp, integer256via128_loA, integer256via128_loB); \ + op(integer256via128_hiTemp, integer256via128_hiA, integer256via128_hiB); \ + integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_hiTemp, 1); \ + integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_loTemp, 0); \ + (dest) = integer256via128_dest; \ +} while (0) + +#define vector256PointwiseUInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Plus) +#define vector256PointwiseUInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Minus) +#define vector256PointwiseUInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Times) + +#ifdef TWOVEC64 + +#define vector256PointwiseSInt64Times(dest, a, b) \ + do { \ + __declspec(align(16)) __int64 valsa[4], valsb[8]; \ + _mm_store_si128((void *)&valsa[0],(a).low); \ + _mm_store_si128((void *)&valsa[2],(a).high); \ + _mm_store_si128((void *)&valsb[0],(b).low); \ + _mm_store_si128((void *)&valsb[2],(b).high); \ + unsigned i; \ + for(i = 0; i < 4; ++i) { \ + valsa[i] *= valsb[i]; \ + } \ + (dest).low = _mm_load_si128((void *)&valsa[0]); \ + (dest).high = _mm_load_si128((void *)&valsa[2]); \ + } while (0) + +#define vector256PointwiseUInt64Plus(dest, a, b) \ + do { \ + 
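+        /* TWOVEC64: a 256-bit integer vector is represented as two __m128i halves, so the low and high halves are added independently. */ \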
(dest).low = _mm_add_epi64((a).low, (b).low); \ + (dest).high = _mm_add_epi64((a).high, (b).high); \ + } while (0) + +#define vector256PointwiseUInt64Minus(dest, a, b) \ + do { \ + (dest).low = _mm_sub_epi64((a).low, (b).low); \ + (dest).high = _mm_sub_epi64((a).high, (b).high); \ + } while (0) + +#define vector256PointwiseSInt64Plus(dest, a, b) \ + do { \ + (dest).low = _mm_add_epi64((a).low, (b).low); \ + (dest).high = _mm_add_epi64((a).high, (b).high); \ + } while (0) + +#define vector256PointwiseSInt64Minus(dest, a, b) \ + do { \ + (dest).low = _mm_sub_epi64((a).low, (b).low); \ + (dest).high = _mm_sub_epi64((a).high, (b).high); \ + } while (0) +#elif defined ONEVEC64 +#else +#define vector256PointwiseSInt64Times(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] * b.values[0]; \ + dest.values[1] = a.values[1] * b.values[1]; \ + dest.values[2] = a.values[2] * b.values[2]; \ + dest.values[3] = a.values[3] * b.values[3]; \ + } while (0) + +#define vector256PointwiseSInt64Plus(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] + b.values[0]; \ + dest.values[1] = a.values[1] + b.values[1]; \ + dest.values[2] = a.values[2] + b.values[2]; \ + dest.values[3] = a.values[3] + b.values[3]; \ + } while (0) + +#define vector256PointwiseSInt64Minus(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] - b.values[0]; \ + dest.values[1] = a.values[1] - b.values[1]; \ + dest.values[2] = a.values[2] - b.values[2]; \ + dest.values[3] = a.values[3] - b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt64DivT(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] / b.values[0]; \ + dest.values[1] = a.values[1] / b.values[1]; \ + dest.values[2] = a.values[2] / b.values[2]; \ + dest.values[3] = a.values[3] / b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt64ModT(dest, a, b) \ + do { \ + dest.values[0] = a.values[0] % b.values[0]; \ + dest.values[1] = a.values[1] % b.values[1]; \ + dest.values[2] = a.values[2] % b.values[2]; \ + dest.values[3] = a.values[3] % b.values[3]; \ + } while (0) + + +#define vector256PointwiseUInt64Times(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] * (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] * (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] * (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] * (uint64_t)b.values[3]; \ + } while (0) + +#define vectorV256PointwiseUInt64Plus(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] + (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] + (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] + (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] + (uint64_t)b.values[3]; \ + } while (0) + +#define vectorV256PointwiseUInt64Minus(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] - (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] - (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] - (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] - (uint64_t)b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseUInt64DivT(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] / (uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] / (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] / (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] / (uint64_t)b.values[3]; \ + } while (0) + +#define pLsrPrimV256PointwiseUInt64ModT(dest, a, b) \ + do { \ + dest.values[0] = (uint64_t)a.values[0] % 
(uint64_t)b.values[0]; \ + dest.values[1] = (uint64_t)a.values[1] % (uint64_t)b.values[1]; \ + dest.values[2] = (uint64_t)a.values[2] % (uint64_t)b.values[2]; \ + dest.values[3] = (uint64_t)a.values[3] % (uint64_t)b.values[3]; \ + } while (0) + +#endif + +#define vector256PointwiseSInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Plus) +#define vector256PointwiseSInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Minus) +#define vector256PointwiseSInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Times) + + +// ***************************************************************************** +// Comparison +// ***************************************************************************** + +#if 0 +#define pLsrPrimV256PointwiseSInt32LT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32LT) +#define pLsrPrimV256PointwiseSInt32EQ(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32EQ) +#define pLsrPrimV256PointwiseSInt32GT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32GT) +#else +#define pLsrPrimV256CompareSInt32LT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128CompareSInt32LT) +#define pLsrPrimV256CompareSInt32EQ(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128CompareSInt32EQ) +#define pLsrPrimV256CompareSInt32GT(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128CompareSInt32GT) +#endif + +// ***************************************************************************** +// Reductions (horizontals) +// ***************************************************************************** + +#define pLsrPrimV256ReduceAUInt32Plus(dest, init, a) vector256reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Times(dest, init, a) vector256reduceAUInt32Times(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Max(dest, init, a) vector256reduceAUInt32Max(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Min(dest, init, a) vector256reduceAUInt32Min(dest, init, a) + +#define pLsrPrimV256ReduceASInt32Plus(dest, init, a) vector256reduceASInt32Plus(dest, init, a) +//#define pLsrPrimV256ReduceASInt32Times(dest, init, a) vector256reduceAUInt32Times(dest, init, a) +//#define pLsrPrimV256ReduceASInt32Max(dest, init, a) vector256reduceAUInt32Max(dest, init, a) +//#define pLsrPrimV256ReduceASInt32Min(dest, init, a) vector256reduceAUInt32Min(dest, init, a) + +#define vector256reduceAUInt32Plus(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Plus_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Plus_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Plus_aRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_aRes, init, vector256reduceAUInt32Plus_hiA); \ + uint32 vector256reduceAUInt32Plus_bRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_bRes, init, vector256reduceAUInt32Plus_loA); \ + (dest) = (vector256reduceAUInt32Plus_aRes + vector256reduceAUInt32Plus_bRes); \ + } while (0) + +#define vector256reduceAUInt32Times(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Times_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Times_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Times_aRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_aRes, init, vector256reduceAUInt32Times_hiA); \ + uint32 vector256reduceAUInt32Times_bRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_bRes, init, 
vector256reduceAUInt32Times_loA); \ + (dest) = (vector256reduceAUInt32Times_aRes * vector256reduceAUInt32Times_bRes); \ + } while (0) + +#define vector256reduceAUInt32Max(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Max_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Max_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Max_aRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_aRes, init, vector256reduceAUInt32Max_hiA); \ + uint32 vector256reduceAUInt32Max_bRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_bRes, init, vector256reduceAUInt32Max_loA); \ + (dest) = (max(vector256reduceAUInt32Max_aRes, vector256reduceAUInt32Max_bRes)); \ + } while (0) + +#define vector256reduceAUInt32Min(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Min_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Min_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Min_aRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_aRes, init, vector256reduceAUInt32Min_hiA); \ + uint32 vector256reduceAUInt32Min_bRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_bRes, init, vector256reduceAUInt32Min_loA); \ + (dest) = (min(vector256reduceAUInt32Min_aRes, vector256reduceAUInt32Min_bRes)); \ + } while (0) + +#define vector256reduceASInt32Plus(dest, init, a) \ + do { \ + __m128i vector256reduceASInt32Plus_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceASInt32Plus_loA = _mm256_extractf128_si256(a, 0); \ + sint32 vector256reduceASInt32Plus_aRes; \ + pLsrPrimV128ReduceASInt32Plus(vector256reduceASInt32Plus_aRes, init, vector256reduceASInt32Plus_hiA); \ + sint32 vector256reduceASInt32Plus_bRes; \ + pLsrPrimV128ReduceASInt32Plus(vector256reduceASInt32Plus_bRes, init, vector256reduceASInt32Plus_loA); \ + (dest) = (vector256reduceASInt32Plus_aRes + vector256reduceASInt32Plus_bRes); \ + } while (0) + + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256UInt32Lift(dest, a) vector256UInt32Lift(dest, a) +#define pLsrPrimV256UInt64Lift(dest, a) vector256UInt64Lift(dest, a) +#define pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) +#define pLsrPrimV256UInt64Const(dest, c0, c1, c2, c3) \ + vector256UInt64Const(dest, c0, c1, c2, c3) + +#define vector256UInt32Lift(dest, a) \ + do { \ + __m256i vector256UInt32Lift_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Lift_lo = _mm_set1_epi32(a); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 1); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 0); \ + (dest) = vector256UInt32Lift_dest; \ + } while (0) + +#define pLsrPrimV256PointwiseUInt32BAnd(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32BAnd) +#define pLsrPrimV256PointwiseUInt32BShiftL(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32BShiftL) +#define pLsrPrimV256PointwiseUInt32BShiftR(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32BShiftR) +//#define pLsrPrimV256PointwiseUInt32BShiftR(dest, a, b) (dest) = _mm256_srlv_epi32((a),(b)) + +#ifdef TWOVEC64 + +#if 0 +#define intToM64(a) __m64{a} +#if 0 +inline __m64 intToM64(__int64 a) { + __m64 ret; + ret.__m = a; + 
return ret; +} +#endif +#else +#define intToM64 _mm_cvtsi64_m64 +//#define intToM64 _m_from_int64 +#endif + +#if 0 +#define vector128UInt64Const(dest, c0, c1) \ + do { \ + dest.m128i_u64[0] = c0; \ + dest.m128i_u64[1] = c1; \ + } while (0) +#endif + +#if 0 +#define vector256UInt64Lift(dest, a) vector256UInt64Const(dest, a, a, a, a) +#else +#define vector256UInt64Lift(dest, a) \ + do { \ + (dest).low = _mm_set1_epi64(intToM64(a)); \ + (dest).high = _mm_set1_epi64(intToM64(a)); \ + } while (0) +#endif + +#if 0 +#define vector256UInt64Const(dest, c0, c1, c2, c3) \ + do { \ + vector128UInt64Const((dest).low, c0, c1); \ + vector128UInt64Const((dest).high, c2, c3); \ + } while (0) +#else +#define vector256UInt64Const(dest, c0, c1, c2, c3) \ + do { \ + (dest).low = _mm_set_epi64(intToM64(c0), intToM64(c1)); \ + (dest).high = _mm_set_epi64(intToM64(c2), intToM64(c3)); \ + } while (0) +#endif + +#elif defined ONEVEC64 +#else +#define vector256UInt64Lift(dest, a) \ + do { \ + (dest).values[0] = a; \ + (dest).values[1] = a; \ + (dest).values[2] = a; \ + (dest).values[3] = a; \ + } while (0) + +#define vector256UInt64Const(dest, c0, c1, c2, c3) \ + do { \ + (dest).values[0] = c0; \ + (dest).values[1] = c1; \ + (dest).values[2] = c2; \ + (dest).values[3] = c3; \ + } while (0) +#endif + +#if 1 +#define vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + (dest) = _mm256_set_epi32(c7, c6, c5, c4, c3, c2, c1, c0) +#else +#define vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + do { \ + __m256i vector256UInt32Const_out = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Const_lo = _mm_set_epi32(c3, c2, c1, c0); \ + __m128i vector256UInt32Const_hi = _mm_set_epi32(c7, c6, c5, c4); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_hi, 1); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_lo, 0); \ + (dest) = vector256UInt32Const_out; \ + } while (0) +#endif + +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) pLsrPrimV256UInt32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV256UInt32Store(arr, off, idx, v) pLsrPrimV256UInt32StoreHelp (arr, off, idx, v) +#define pLsrPrimV256RefStore(arr, off, idx, v) pLsrPrimV256UInt32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV256UInt32LoadHelp(dest, arr, off, idx) \ + do { \ + __m256i* pLsrPrimV256UInt32LoadHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + (dest) = _mm256_loadu_si256(pLsrPrimV256UInt32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV256UInt32StoreHelp(arr, off, idx, v) \ + do { \ + __m256i* pLsrPrimV256UInt32StoreHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + _mm256_storeu_si256(pLsrPrimV256UInt32StoreHelp_marr, v); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256B32LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256B32LoadVectorHelp_idxV; \ + __m128i pLsrPrimV256B32LoadVectorHelp_lo; \ + __m128i pLsrPrimV256B32LoadVectorHelp_hi; \ + __m256i pLsrPrimV256B32LoadVectorHelp_res; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256B32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256B32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256B32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256B32LoadVectorHelp_idx1 = 
(uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256B32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256B32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256B32LoadVectorHelp_lo = \ + _mm_set_epi32(*((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx3), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx2), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx1), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx0)); \ + pLsrPrimV256B32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256B32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256B32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256B32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256B32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256B32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256B32LoadVectorHelp_hi = \ + _mm_set_epi32(*((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx3), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx2), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx1), \ + *((sint32*)((char*)arr + off) + pLsrPrimV256B32LoadVectorHelp_idx0)); \ + pLsrPrimV256B32LoadVectorHelp_res =_mm256_insertf128_si256(pLsrPrimV256B32LoadVectorHelp_res, pLsrPrimV256B32LoadVectorHelp_lo, 0); \ + pLsrPrimV256B32LoadVectorHelp_res =_mm256_insertf128_si256(pLsrPrimV256B32LoadVectorHelp_res, pLsrPrimV256B32LoadVectorHelp_hi, 1); \ + (dest) = pLsrPrimV256B32LoadVectorHelp_res; \ + } while (0) + +// ===== +// Float +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256 pLsrPrimV256Float32Zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat32Plus(dest, a, b) ((dest) = _mm256_add_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Minus(dest, a, b) ((dest) = _mm256_sub_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Times(dest, a, b) ((dest) = _mm256_mul_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Divide(dest, a, b) ((dest) = _mm256_div_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Negate(dest, a) ((dest) = _mm256_sub_ps(pLsrPrimV256Float32Zero, a)) +#define pLsrPrimV256PointwiseFloat32Max(dest, a, b) ((dest) = _mm256_max_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Sqrt(dest, a) ((dest) = _mm256_sqrt_ps(a)) + +// ***************************************************************************** +// Reductions +// ***************************************************************************** + +#define pLsrPrimV256ReduceAFloat32Plus(dest, init, a) reduce256AFloat32Plus(dest, init, a) + +#define reduce256AFloat32Plus(dest, init, a) \ + do { \ + __m256 reduceAFloat32Plus_a_ = a; \ + __m128 rf32_hi = _mm256_extractf128_ps(reduceAFloat32Plus_a_, 1); \ + __m256 rf32_tmp = _mm256_insertf128_ps(reduceAFloat32Plus_a_, rf32_hi, 0); \ + __m256 reduceAFloat32Plus_tmp = _mm256_hadd_ps(reduceAFloat32Plus_a_, rf32_tmp); \ + __m256 reduceAFloat32Plus_tmp2 = _mm256_hadd_ps(reduceAFloat32Plus_tmp, reduceAFloat32Plus_tmp); \ + __m256 reduceAFloat32Plus_tmp3 = _mm256_hadd_ps(reduceAFloat32Plus_tmp2, reduceAFloat32Plus_tmp2); \ + float 
reduceAFloat32Plus_dest_; \ + pLsrPrimV256DataSubF32L(reduceAFloat32Plus_dest_, 0, reduceAFloat32Plus_tmp3); \ + (dest) = (reduceAFloat32Plus_dest_ + (init)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + do { \ + float* pLsrPrimV256Float32Load_marr = ((float*)((char*)arr+off) + idx); \ + __m256 float32load = _mm256_loadu_ps(pLsrPrimV256Float32Load_marr); \ + (dest) = float32load; \ + } while (0) + +#define pLsrPrimV256Float32LoadVector(dest, arr, off, idx) \ + pLsrPrimV256Float32LoadVectorHelp (dest, arr, off, idx) + +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + do { \ + float* pLsrPrimV256Float32Store_marr = ((float*)((char*)arr+off) + idx); \ + _mm256_storeu_ps(pLsrPrimV256Float32Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm256_set_ps(*((float32*)((char*)arr + off) + idx + 7*stride), \ + *((float32*)((char*)arr + off) + idx + 6*stride), \ + *((float32*)((char*)arr + off) + idx + 5*stride), \ + *((float32*)((char*)arr + off) + idx + 4*stride), \ + *((float32*)((char*)arr + off) + idx + 3*stride), \ + *((float32*)((char*)arr + off) + idx + 2*stride), \ + *((float32*)((char*)arr + off) + idx + stride), \ + *((float32*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128 pLsrPrimV256Float32StoreStrided_srcV; \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 0); \ + *((uint32*)((char*)arr + off) + idx + 0) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 2*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 3*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 1); \ + *((uint32*)((char*)arr + off) + idx + 4*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + 5*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 6*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 7*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + } while (0) + +#define pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + ((dest) = _mm256_set_ps(c7, c6, c5, c4, c3, c2, c1, c0)) + +#define pLsrPrimV256Float32Lift(dest, a) \ + do { \ + float temp = a; \ + (dest) = _mm256_broadcast_ss((float *)(&temp)); \ + } while(0) + + +#define pLsrPrimV256Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_ptrV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr0; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr1; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr2; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr3; \ + \ + 
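+        /* Note: 'arr' here is itself a vector of eight 32-bit pointers; each lane is extracted and dereferenced at byte offset off + idx to gather the eight floats. */ \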
pLsrPrimV256Float32LoadVectorHelp_ptrV = _mm256_extractf128_si256(arr, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_ptrV =_mm256_extractf128_si256(arr, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_lo, \ + 0); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +// ===== +// Double +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256d pLsrPrimV256Float64Zero = {0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat64Plus(dest, a, b) ((dest) = _mm256_add_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Minus(dest, a, b) ((dest) = _mm256_sub_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Times(dest, a, b) ((dest) = _mm256_mul_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Divide(dest, a, b) ((dest) = _mm256_div_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Negate(dest, a) ((dest) = _mm256_sub_pd(pLsrPrimV256Float64Zero, a)) +#define pLsrPrimV256PointwiseFloat64Max(dest, a, b) ((dest) = _mm256_max_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Sqrt(dest, a) ((dest) = _mm256_sqrt_pd(a)) + +#define pLsrPrimV256PointwiseSInt32DivT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32DivT_a; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_b; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32DivT_res; \ + \ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 0); \ + 
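+        /* Divide each 128-bit half with the V128 primitive, then recombine the two halves below. */ \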
pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rlo, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b);\ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rhi, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32DivT_rlo); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32DivT_res, \ + pLsrPrimV256PointwiseSInt32DivT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32DivT_res; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt32ModT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32ModT_a; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_b; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32ModT_res; \ + \ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 0); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rlo, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b);\ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rhi, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32ModT_rlo); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32ModT_res, \ + pLsrPrimV256PointwiseSInt32ModT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32ModT_res; \ + } while (0) + +// ***************************************************************************** +// Reductions +// ***************************************************************************** + +#define pLsrPrimV256ReduceAFloat64Plus(dest, init, a) \ + do { \ + __m256d reduceAFloat64Plus_p_ = a; \ + __m128d reduceAFloat64Plus_p_lo = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 0); \ + __m128d reduceAFloat64Plus_p_hi = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 1); \ + __m128d reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_lo, reduceAFloat64Plus_p_hi); \ + reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_tmp, reduceAFloat64Plus_p_tmp); \ + double reduceAFloat64Plus_dest_; \ + pLsrPrimV256DataSubF64(reduceAFloat64Plus_dest_, 0, reduceAFloat64Plus_p_tmp); \ + (dest) = (reduceAFloat64Plus_dest_ + (init)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float64Load(dest, arr, off, idx) \ + do { \ + double* pLsrPrimV256Float64Load_marr = ((double*)((char*)arr+off) + idx); \ + __m256d float64load = _mm256_loadu_pd(pLsrPrimV256Float64Load_marr); \ + (dest) = float64load; \ + } while (0) + +#define pLsrPrimV256Float64Store(arr, off, idx, v) \ + do { \ + double* pLsrPrimV256Float64Store_marr = ((double*)((char*)arr+off) + idx); \ + _mm256_storeu_pd(pLsrPrimV256Float64Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = 
_mm256_set_pd(*((float64*)((char*)arr + off) + idx + 3*stride), \ + *((float64*)((char*)arr + off) + idx + 2*stride), \ + *((float64*)((char*)arr + off) + idx + stride), \ + *((float64*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128d pLsrPrimV256Float64StoreStrided_srcV; \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 0), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + stride), pLsrPrimV256Float64StoreStrided_srcV); \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 1); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 2*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + 3*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + } while (0) + +#define pLsrPrimV256Float64Const(dest, c0, c1, c2, c3) \ + ((dest) = _mm256_set_pd(c3, c2, c1, c0)) + +#define pLsrPrimV256Float64Lift(dest, a) \ + do { \ + double temp = a; \ + (dest) = _mm256_broadcast_sd((double *)(&temp)); \ + } while(0) + +// ================ +// Ref - 256-bit +// ================ + +#define pLsrPrimV256RefLoad(dest, arr, off, idx) pLsrPrimV256UInt32Load(dest, arr, off, idx) + +#endif _PLSR_PRIMS_VECTOR_AVX_H_ diff --git a/runtime/include/hrc/plsr-prims-vector-mic.h b/runtime/include/hrc/plsr-prims-vector-mic.h new file mode 100644 index 0000000..8add559 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-vector-mic.h @@ -0,0 +1,1633 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_VECTOR_MIC_H_ +#define _PLSR_PRIMS_VECTOR_MIC_H_ + +#ifdef P_USE_PILLAR +#ifdef LINUX +# pragma pillar_push_cc(__pcdecl) +# define to __to__ +#else +# pragma pillar_managed(off) +# define to __to__ +#endif +#endif /* P_USE_PILLAR */ + +#ifdef P_USE_PILLAR +#define __ICL_INTRINCC __pcdecl +#endif /* P_USE_PILLAR */ +#include <immintrin.h> +#undef __cdecl // deep in the include for immintrin.h, there is a "#define __cdecl". This undef gets rid of the effects of this.
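+/* Conventions below (matching the AVX header above): 'dest' is an lvalue out-parameter, element addresses are formed as (char*)arr + off plus an element index idx, the *LoadVS / *StoreVS variants dispatch at run time between contiguous (stride == 1) and strided accesses, and HAS_ALIGNMENT selects aligned loads/stores in place of the loadunpack/packstore pairs. */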
+ +#ifdef P_USE_PILLAR +#ifdef LINUX +# undef to +# pragma pillar_pop_cc +#else +# undef to +# pragma pillar_managed(on) +#endif +#endif /* P_USE_PILLAR */ + +//#define __pillar2c__ 1 + +typedef __m512 PlsrVector512F32; +typedef __m512i PlsrVector512B64; +typedef __m512d PlsrVector512F64; +typedef __m512i PlsrVector512Ref; + +typedef __mmask16 PlsrVectorMask512Fs32; +typedef __mmask8 PlsrVectorMask512Fs64; + +#define pLsrPrimV512F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV512F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV512F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512UInt64StoreStrided(arr, offset, idx, stride, v ); \ + } \ + } while (0) + +#define pLsrPrimV512RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV512RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV512RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV512RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV512RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV512512ConvertFloat32FromUInt32(dest, a) ((dest) = _mm512_castsi512_ps(a)) +#define pLsrPrimV512512ConvertUInt32FromFloat32(dest, a) ((dest) = _mm512_castps_si512(a)) +#define pLsrPrimV512256ConvertFloat64FromFloat32(dest, a) ((dest) = _mm512_cvtpslo_pd(a)) +#define pLsrPrimV512512CastSInt32ToFloat32(dest, a) ((dest) = _mm512_castsi512_ps(a)) +#define pLsrPrimV512512ConvertUInt32FromSInt32(dest, a) ((dest) = a) +#define pLsrPrimV512512ConvertSInt32FromUInt32(dest, a) ((dest) = a) +#define pLsrPrimV512512ConvertUInt64FromSInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512ConvertSInt64FromUInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512CastUInt64ToSInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512CastSInt64ToUInt64(dest, a) ((dest) = a) +#define pLsrPrimV512512CastSInt64ToFloat64(dest, a) ((dest) = _mm512_castsi512_pd(a)) + +#define pLsrPrimV512DataVectorB64(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV512UInt64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV512Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) \ + ((dest) = _mm512_setr_ps(c0, c1, c2, c3, c4, 
c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15)) + +#define pLsrPrimV512DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) \ + pLsrPrimV512Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15) + +#define pLsrPrimV512DataVectorF64(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV512Float64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#if 1 +#define pLsrPrimV512Float32Lift(dest, a) ((dest) = _mm512_set1_ps(a)) +#else +#define pLsrPrimV512Float32Lift(dest, a) \ + do { \ + float ftemp = a; \ + (dest) = _mm512_extload_ps((void const *)(&ftemp),_MM_UPCONV_PS_NONE,_MM_BROADCAST_1X16,0); \ + } while (0) +#endif + +#define pLsrPrimV512DataBroadcastB64(dest, c0) pLsrPrimV512UInt64Lift(dest, c0) +//#define pLsrPrimV512DataBroadcastB32(dest, c0) pLsrPrimV512UInt32Lift(dest, c0) +#define pLsrPrimV512DataBroadcastF32(dest, c0) pLsrPrimV512Float32Lift(dest, c0) +#define pLsrPrimV512DataBroadcastF64(dest, c0) pLsrPrimV512Float64Lift(dest, c0) + +#define pLsrPrimV512F64LoadVI(dest, arr, off, vi) \ + (dest) = _mm512_i32logather_pd(vi, ((char*)arr + off), sizeof(float64)) + +#if 0 + do { \ + __int64 indices[8]; \ + _mm512_extstore_epi64(&indices, vi, _MM_DOWNCONV_PD_NONE, 0); \ + (dest) = _mm512_set_pd(*((float64*)((char*)arr + off) + indices[7]), \ + *((float64*)((char*)arr + off) + indices[6]), \ + *((float64*)((char*)arr + off) + indices[5]), \ + *((float64*)((char*)arr + off) + indices[4]), \ + *((float64*)((char*)arr + off) + indices[3]), \ + *((float64*)((char*)arr + off) + indices[2]), \ + *((float64*)((char*)arr + off) + indices[1]), \ + *((float64*)((char*)arr + off) + indices[0])); \ + } while (0) +#endif + +#define pLsrPrimV512PointwiseUInt64BAnd(dest, a, b) (dest) = _mm512_and_epi64((a),(b)) +#define pLsrPrimV512PointwiseUInt64BOr(dest, a, b) (dest) = _mm512_or_epi64((a),(b)) + +//#define pLsrPrimV512PointwiseUInt64BShiftR(dest, a, b) (dest) = _mm512_srlv_epi64((a),(b)) +#define pLsrPrimV512PointwiseUInt64BShiftR(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 valsa[8], valsb[8]; \ + _mm512_store_epi64(valsa,(a)); \ + _mm512_store_epi64(valsb, (b)); \ + unsigned i; \ + for(i = 0; i < 8; ++i) { \ + valsa[i] >>= valsb[i]; \ + } \ + (dest) = _mm512_load_epi64(valsa); \ + } while (0) + +#define pLsrPrimV512PointwiseUInt64BShiftL(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 valsa[8], valsb[8]; \ + _mm512_store_epi64(valsa,(a)); \ + _mm512_store_epi64(valsb, (b)); \ + unsigned i; \ + for(i = 0; i < 8; ++i) { \ + valsa[i] <<= valsb[i]; \ + } \ + (dest) = _mm512_load_epi64(valsa); \ + } while (0) + +// This is brain dead. Unpack the vector. Multiply each one and repack. 
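+// The same store / scalar-loop / reload fallback appears in the 64-bit shift macros above, presumably because no packed 64-bit multiply or variable shift is available on this target.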
+#define pLsrPrimV512PointwiseSInt64Times(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 valsa[8], valsb[8]; \ + _mm512_store_epi64(valsa,(a)); \ + _mm512_store_epi64(valsb,(b)); \ + unsigned i; \ + for(i = 0; i < 8; ++i) { \ + valsa[i] *= valsb[i]; \ + } \ + (dest) = _mm512_load_epi64(valsa); \ + } while (0) + +#define pLsrPrimV512PointwiseSInt64Plus(dest, a, b) vector512PointwiseSInt64Plus(dest, a, b) +#define vector512PointwiseSInt64Plus(dest, a, b) (dest) = _mm512_add_epi64((a),(b)) + +#define pLsrPrimV512PointwiseSInt64Minus(dest, a, b) vector512PointwiseSInt64Minus(dest, a, b) +#define vector512PointwiseSInt64Minus(dest, a, b) (dest) = _mm512_sub_epi64((a),(b)) + +#if 0 +#define pLsrPrimV512PointwiseFloat32LT(dest, a, b) \ + do { \ + __declspec(align(64)) float32 avec[16], bvec[16]; \ + __declspec(align(64)) uint32_t resvec[16]; \ + _mm512_store_ps(avec,(a)); \ + _mm512_store_ps(bvec,(b)); \ + unsigned i; \ + for(i=0;i<16;++i) { \ + if(avec[i] < bvec[i]) resvec[i] = 1; else resvec[i] = 0; \ + } \ + (dest) = _mm512_load_epi32(resvec); \ + } while(0) + +#define pLsrPrimV512PointwiseSInt64LT(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(avec[i] < bvec[i]) avec[i] = 1; else avec[i] = 0; \ + } \ + (dest) = _mm512_load_epi64(avec); \ + } while(0) + +#define pLsrPrimV512PointwiseSInt64EQ(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(avec[i] == bvec[i]) avec[i] = 1; else avec[i] = 0; \ + } \ + (dest) = _mm512_load_epi64(avec); \ + } while(0) + +#define pLsrPrimV512PointwiseCondMov(dest, mask, a, b) \ + do { \ + __mmask8 realmask=0; \ + __declspec(align(64)) __int64 mvec[8]; \ + _mm512_store_epi64(mvec,(mask)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(mvec[i]) realmask |= (1 << i); \ + } \ + (dest) = _mm512_mask_blend_epi64((realmask),(a),(b)); \ + } while (0) +#else +#define pLsrPrimV512CompareFloat32LT(dest, a, b) (dest) = _mm512_cmplt_ps_mask((a),(b)) +#define pLsrPrimV512DataBlendB32(dest, mask, a, b) (dest) = _mm512_mask_blend_epi32((mask),(a),(b)) +#define pLsrPrimV512DataBlendB64(dest, mask, a, b) (dest) = _mm512_mask_blend_epi64((mask),(a),(b)) +#define pLsrPrimV512DataBlendF32(dest, mask, a, b) (dest) = _mm512_mask_blend_ps((mask),(a),(b)) +#define pLsrPrimV512DataBlendF64(dest, mask, a, b) (dest) = _mm512_mask_blend_pd((mask),(a),(b)) + +//#define pLsrPrimV512CompareSInt64LT(dest, a, b) (dest) = _mm512_cmplt_epi64_mask((a),(b)) +//#define pLsrPrimV512CompareSInt64EQ(dest, a, b) (dest) = _mm512_cmpeq_epi64_mask((a),(b)) +#define pLsrPrimV512CompareSInt64LT(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + (dest) = 0; \ + for(i=0;i<8;++i) { \ + if(avec[i] < bvec[i]) (dest) |= (1 << i); \ + } \ + } while(0) + +#define pLsrPrimV512CompareSInt64EQ(dest, a, b) \ + do { \ + __declspec(align(64)) __int64 avec[8], bvec[8]; \ + _mm512_store_epi64(avec,(a)); \ + _mm512_store_epi64(bvec,(b)); \ + unsigned i; \ + (dest) = 0; \ + for(i=0;i<8;++i) { \ + if(avec[i] == bvec[i]) (dest) |= (1 << i); \ + } \ + } while(0) + +#endif + +#define pLsrPrimV512UInt64Lift(dest, a) vector512UInt64Lift(dest, a) +#define pLsrPrimV512UInt64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + vector512UInt64Const(dest, 
c0, c1, c2, c3, c4, c5, c6, c7) + +#define vector512UInt64Lift(dest, a) \ + do { \ + uint64_t tmp = a; \ + (dest) = _mm512_extload_epi64((void const *)(&tmp),_MM_UPCONV_EPI64_NONE,_MM_BROADCAST_1X8,0); \ + } while(0) + +#define vector512UInt64Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + (dest) = _mm512_setr_epi64(c0, c1, c2, c3, c4, c5, c6, c7) + + + +#define pLsrPrimV512PointwiseFloat64Plus(dest, a, b) ((dest) = _mm512_add_pd(a, b)) +#define pLsrPrimV512PointwiseFloat64Minus(dest, a, b) ((dest) = _mm512_sub_pd(a, b)) +#define pLsrPrimV512PointwiseFloat64Times(dest, a, b) ((dest) = _mm512_mul_pd(a, b)) + +#define pLsrPrimV512PointwiseFloat32Plus(dest, a, b) ((dest) = _mm512_add_ps(a, b)) +#define pLsrPrimV512PointwiseFloat32Minus(dest, a, b) ((dest) = _mm512_sub_ps(a, b)) +#define pLsrPrimV512PointwiseFloat32Times(dest, a, b) ((dest) = _mm512_mul_ps(a, b)) +#define pLsrPrimV512PointwiseFloat32Sqrt(dest, a) ((dest) = _mm512_sqrt_ps(a)) +#define pLsrPrimV512PointwiseFloat32Divide(dest, a, b) ((dest) = _mm512_div_ps(a, b)) + +#define pLsrPrimV512PointwiseSInt64DivT(dest, a, b) \ + (dest) = _mm512_div_epi64((a),(b)) + +#define pLsrPrimV512PointwiseSInt64ModT(dest, a, b) \ + (dest) = _mm512_rem_epi64((a),(b)) + +#define pLsrPrimV512ReduceAFloat64Plus(dest, init, a) \ + (dest) = (init) + _mm512_reduce_add_pd(a) + +#define pLsrPrimV512ReduceAFloat32Plus(dest, init, a) \ + (dest) = (init) + _mm512_reduce_add_ps(a) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512Float64Load(dest, arr, off, idx) \ + (dest) = _mm512_load_pd((void const *)((double*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV512Float64Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_pd((dest),(void *)((double*)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_pd((dest),(char *)((double*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512Float32Load(dest, arr, off, idx) \ + (dest) = _mm512_load_ps((void const *)((float*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV512Float32Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_ps((dest),(void *)((float*)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_ps((dest),(char *)((float*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512Float64Store(arr, off, idx, v) \ + _mm512_store_pd((void const *)((double*)((char*)arr+off) + idx), v) +#define pLsrPrimV512Float32Store(arr, off, idx, v) \ + _mm512_store_ps((void const *)((float*)((char*)arr+off) + idx), v) +#define pLsrPrimV512UInt64Store(arr, off, idx, v) \ + _mm512_store_epi64((void const *)((uint64_t*)((char*)arr+off) + idx), v) +#else +#define pLsrPrimV512Float64Store(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_pd((void *)((double*)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_pd((char *)((double*)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#define pLsrPrimV512Float32Store(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_ps((void *)((float*)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_ps((char *)((float*)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#define pLsrPrimV512UInt64Store(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_epi64((void *)((uint64_t*)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_epi64((char *)((uint64_t*)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512UInt64Load(dest, arr, off, idx) \ + (dest) = _mm512_load_epi64((void const *)((uint64_t*)((char*)arr+off) + 
idx)) +#else +#define pLsrPrimV512UInt64Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_epi64((dest),(void *)((uint64_t*)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_epi64((dest),(char *)((uint64_t*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512RefLoad(dest, arr, off, idx) \ + (dest) = _mm512_load_epi64((void const *)((void**)((char*)arr+off) + idx)) +#else +#define pLsrPrimV512RefLoad(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_loadunpacklo_epi64((dest),(void *)((void**)((char*)arr+off) + idx)); \ + (dest) = _mm512_loadunpackhi_epi64((dest),(char *)((void**)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV512RefStore(arr, off, idx, v) \ + _mm512_store_epi64((void const *)((void**)((char*)arr+off) + idx), v) +#else +#define pLsrPrimV512RefStore(arr, off, idx, v) \ + do { \ + _mm512_packstorelo_epi64((void *)((void**)((char*)arr+off) + idx),(v)); \ + _mm512_packstorehi_epi64((char *)((void**)((char*)arr+off) + idx) + 64,(v)); \ + } while (0) +#endif + + + +#define pLsrPrimV512Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_pd(*((float64*)((char*)arr + off) + idx + 7*stride), \ + *((float64*)((char*)arr + off) + idx + 6*stride), \ + *((float64*)((char*)arr + off) + idx + 5*stride), \ + *((float64*)((char*)arr + off) + idx + 4*stride), \ + *((float64*)((char*)arr + off) + idx + 3*stride), \ + *((float64*)((char*)arr + off) + idx + 2*stride), \ + *((float64*)((char*)arr + off) + idx + 1*stride), \ + *((float64*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_ps(*((float32*)((char*)arr + off) + idx + 15*stride), \ + *((float32*)((char*)arr + off) + idx + 14*stride), \ + *((float32*)((char*)arr + off) + idx + 13*stride), \ + *((float32*)((char*)arr + off) + idx + 12*stride), \ + *((float32*)((char*)arr + off) + idx + 11*stride), \ + *((float32*)((char*)arr + off) + idx + 10*stride), \ + *((float32*)((char*)arr + off) + idx + 9*stride), \ + *((float32*)((char*)arr + off) + idx + 8*stride), \ + *((float32*)((char*)arr + off) + idx + 7*stride), \ + *((float32*)((char*)arr + off) + idx + 6*stride), \ + *((float32*)((char*)arr + off) + idx + 5*stride), \ + *((float32*)((char*)arr + off) + idx + 4*stride), \ + *((float32*)((char*)arr + off) + idx + 3*stride), \ + *((float32*)((char*)arr + off) + idx + 2*stride), \ + *((float32*)((char*)arr + off) + idx + 1*stride), \ + *((float32*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512Float64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) double ftemp[8]; \ + _mm512_store_pd(ftemp, src); \ + *((float64*)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((float64*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((float64*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((float64*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((float64*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((float64*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((float64*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((float64*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#define pLsrPrimV512UInt64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) uint64_t ftemp[8]; \ + _mm512_store_epi64(ftemp, src); \ + *((uint64_t*)((char*)arr + off) + idx + 
0*stride) = ftemp[0]; \ + *((uint64_t*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((uint64_t*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((uint64_t*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((uint64_t*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((uint64_t*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((uint64_t*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((uint64_t*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#define pLsrPrimV512Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) float ftemp[16]; \ + _mm512_store_ps(ftemp, src); \ + *((float32*)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((float32*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((float32*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((float32*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((float32*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((float32*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((float32*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((float32*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + *((float32*)((char*)arr + off) + idx + 8*stride) = ftemp[8]; \ + *((float32*)((char*)arr + off) + idx + 9*stride) = ftemp[9]; \ + *((float32*)((char*)arr + off) + idx + 10*stride) = ftemp[10]; \ + *((float32*)((char*)arr + off) + idx + 11*stride) = ftemp[11]; \ + *((float32*)((char*)arr + off) + idx + 12*stride) = ftemp[12]; \ + *((float32*)((char*)arr + off) + idx + 13*stride) = ftemp[13]; \ + *((float32*)((char*)arr + off) + idx + 14*stride) = ftemp[14]; \ + *((float32*)((char*)arr + off) + idx + 15*stride) = ftemp[15]; \ + } while (0) + +#define pLsrPrimV512UInt64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_epi64(*((uint64_t*)((char*)arr + off) + idx + 7*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 6*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 5*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 4*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 3*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 2*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 1*stride), \ + *((uint64_t*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512RefLoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_epi64(*((void**)((char*)arr + off) + idx + 7*stride), \ + *((void**)((char*)arr + off) + idx + 6*stride), \ + *((void**)((char*)arr + off) + idx + 5*stride), \ + *((void**)((char*)arr + off) + idx + 4*stride), \ + *((void**)((char*)arr + off) + idx + 3*stride), \ + *((void**)((char*)arr + off) + idx + 2*stride), \ + *((void**)((char*)arr + off) + idx + 1*stride), \ + *((void**)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV512ReftoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) void * ftemp[8]; \ + _mm512_store_epi64(ftemp, src); \ + *((void**)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((void**)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((void**)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((void**)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((void**)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((void**)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((void**)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((void**)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#define pLsrPrimV512Float64Const(dest, c0, c1, c2, c3, 
c4, c5, c6, c7) \ + ((dest) = _mm512_set_pd(c7, c6, c5, c4, c3, c2, c1, c0)) + + +#define pLsrPrimV256512CastSInt64ToFloat32(dest, a) \ + do { \ + __declspec(align(64)) __int64 the_ints[8]; \ + _mm512_store_epi64(the_ints, a); \ + (dest) = _mm512_set_ps(0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + (float)the_ints[7], \ + (float)the_ints[6], \ + (float)the_ints[5], \ + (float)the_ints[4], \ + (float)the_ints[3], \ + (float)the_ints[2], \ + (float)the_ints[1], \ + (float)the_ints[0]); \ + } while (0) + +#define pLsrPrimV512ReduceASInt64Plus(dest, init, a) \ + do { \ + __declspec(align(64)) __int64 vi64[8]; \ + _mm512_store_epi64(vi64, a); \ + (dest) = (init); \ + unsigned i; \ + for(i=0;i<8;++i) (dest) += vi64[i]; \ + } while(0) + +// ============================================================================================== + + + +#ifndef HAS_256 + +typedef __m512 PlsrVector256F32; +typedef __m512i PlsrVector256B32; +typedef __m512d PlsrVector256F64; +typedef __m512i PlsrVector256Ref; + +#define lower256of512 ((__mmask16)0xff) +#define higher256of512 ((__mmask16)0xff00) +#define lower256of512double ((__mmask8)0xf) +#define higher256of512double ((__mmask8)0xf0) + +#define pLsrPrimV256PointwiseFloat32Ln(dest, a) ((dest) = _mm512_mask_log_ps(dest, lower256of512, a)) +#define pLsrPrimV256PointwiseFloat32Exp(dest, a) ((dest) = _mm512_mask_exp_ps(dest, lower256of512, a)) +#define pLsrPrimV256PointwiseFloat32Negate(dest, a) ((dest) = _mm512_mask_mul_ps(dest, lower256of512, a, _mm512_set1_ps(-1.0))) + +#define pLsrPrimV512256CastFloat32ToSInt64(dest, a) \ + do { \ + __declspec(align(64)) float32 vi32[16]; \ + __declspec(align(64)) __int64 vi64[8]; \ + _mm512_store_ps(vi32, a); \ + unsigned i; \ + for(i=0;i<8;i++) vi64[i] = vi32[i]; \ + (dest) = _mm512_load_epi64(vi64); \ + } while (0) + +//#define pLsrPrimV512256ConvertSInt64FromSInt32(dest, a) ((dest) = _mm512_cvtepi32_epi64(a)) +#define pLsrPrimV512256ConvertSInt64FromSInt32(dest, a) \ + do { \ + __declspec(align(64)) __int32 vi32[16]; \ + __declspec(align(64)) __int64 vi64[8]; \ + _mm512_store_epi32(vi32, a); \ + unsigned i; \ + for(i=0;i<8;i++) vi64[i] = vi32[i]; \ + (dest) = _mm512_load_epi64(vi64); \ + } while (0) + +#define pLsrPrimV256DataConcatF32(dest, lower, upper) \ + do { \ + __m512 res = lower; \ + res = _mm512_mask_permute4f128_ps(res, higher256of512, upper, _MM_PERM_BABA);\ + (dest) = res; \ + } while (0) + +/* +#define pLsrPrimV256DataConcatF32(dest, lower, upper) \ + do { \ + __m512 res = lower; \ + res = _mm512_mask_alignr_epi32(res, higher256of512, upper, res, 8);\ + (dest) = res; \ + } while (0) +#define pLsrPrimV256DataConcatF32(dest, lower, upper) \ + do { \ + __m512 res = lower; \ + __declspec(align(64)) double ftemp[8]; \ + _mm512_mask_store_pd(&ftemp,lower256of512double,upper); \ + _mm512_mask_extload_pd(res,higher256of512double,&ftemp,_MM_UPCONV_PD_NONE,_MM_BROADCAST_4X8,0); \ + (dest) = res; \ + } while (0) +*/ + +#if 0 +#define pLsrPrimV256PointwiseCondMov(dest, mask, a, b) \ + do { \ + __mmask16 realmask=0; \ + __declspec(align(64)) __int64 mvec[8]; \ + _mm512_store_epi64(mvec,(mask)); \ + unsigned i; \ + for(i=0;i<8;++i) { \ + if(mvec[i]) realmask |= (1 << i); \ + } \ + (dest) = _mm512_mask_blend_ps((realmask),(a),(b)); \ + } while (0) +#else +#define pLsrPrimV256DataBlendB32(dest, mask, a, b) \ + do { \ + __mmask16 realmask=mask; \ + (dest) = _mm512_mask_blend_epi32((realmask),(a),(b)); \ + } while (0) +#define pLsrPrimV256DataBlendF32(dest, mask, a, b) \ + do { \ + __mmask16 
realmask=mask; \ + (dest) = _mm512_mask_blend_ps((realmask),(a),(b)); \ + } while (0) +#endif + +#define pLsrPrimV256DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV512Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) + +#define pLsrPrimV256B32LoadVI64(dest, arr, off, vi) \ + do { \ + __declspec(align(64)) __int64 vi64[8]; \ + __declspec(align(64)) __int32 vi32[16]; \ + _mm512_store_epi64(vi64, vi); \ + unsigned i; \ + for(i=0;i<8;++i) vi32[i] = vi64[i]; \ + __m512i indices32bit; \ + indices32bit = _mm512_load_epi32(vi32); \ + _mm512_mask_i32gather_epi32(dest,lower256of512,indices32bit,((char*)arr + off),1); \ + } while (0) + +#define pLsrPrimV256F32LoadVI64(dest, arr, off, vi) \ + do { \ + __declspec(align(64)) __int64 vi64[8]; \ + __declspec(align(64)) __int32 vi32[16]; \ + _mm512_store_epi64(vi64, vi); \ + unsigned i; \ + for(i=0;i<8;++i) vi32[i] = vi64[i]; \ + __m512i indices32bit; \ + indices32bit = _mm512_load_epi32(vi32); \ + _mm512_mask_i32gather_ps(dest,lower256of512,indices32bit,((char*)arr + off),1); \ + } while (0) + +#define pLsrPrimV256F32LoadVI(dest, arr, off, vi) _mm512_mask_i32gather_ps(dest,lower256of512,vi,((char*)arr + off),1); + +#define pLsrPrimV256PointwiseFloat32Plus(dest, a, b) ((dest) = _mm512_mask_add_ps( (dest), lower256of512, (a), (b))) +#define pLsrPrimV256PointwiseFloat32Minus(dest, a, b) ((dest) = _mm512_mask_sub_ps( (dest), lower256of512, (a), (b))) +#define pLsrPrimV256PointwiseFloat32Times(dest, a, b) ((dest) = _mm512_mask_mul_ps( (dest), lower256of512, (a), (b))) +#define pLsrPrimV256PointwiseFloat32Divide(dest, a, b) ((dest) = _mm512_mask_mul_ps( (dest), lower256of512, (a), _mm512_mask_rcp23_ps( (b), lower256of512, (b)))) +#define pLsrPrimV256PointwiseFloat32Sqrt(dest, a) ((dest) = _mm512_mask_sqrt_ps( (dest), lower256of512, (a))) + +#define pLsrPrimV256ReduceAFloat32Plus(dest, init, a) reduce256AFloat32Plus(dest, init, a) + +#define reduce256AFloat32Plus(dest, init, a) \ + (dest) = (_mm512_mask_reduce_add_ps(lower256of512,a) + (init)); + +#define pLsrPrimV256Float32Lift(dest, a) \ + do { \ + float ftemp = a; \ + (dest) = _mm512_mask_extload_ps((dest), lower256of512, (void const *)(&ftemp),_MM_UPCONV_PS_NONE,_MM_BROADCAST_1X16,0); \ + } while (0) + +#define pLsrPrimV256DataBroadcastF32(dest, c0) pLsrPrimV512Float32Lift(dest, c0) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + (dest) = _mm512_mask_load_ps((dest), lower256of512, (void const *)((float*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_mask_loadunpacklo_ps((dest),lower256of512,(void *)((float*)((char*)arr+off) + idx)); \ + (dest) = _mm512_mask_loadunpackhi_ps((dest),lower256of512,(char *)((float*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#define STRIDE_VIA_GATHER + +#ifdef STRIDE_VIA_GATHER +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m512i indices, vstride; \ + indices = _mm512_set_epi32(0,0,0,0,0,0,0,0,7,6,5,4,3,2,1,0); \ + vstride = _mm512_set1_epi32(stride); \ + indices = _mm512_mask_mullo_epi32(indices,lower256of512,indices,vstride); \ + _mm512_mask_i32scatter_ps((float*)((char*)arr + off) + idx,lower256of512,indices,(src),_MM_SCALE_1); \ + } while (0) +#else +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) float ftemp[16]; \ + _mm512_mask_store_ps(ftemp, lower256of512, src); \ + *((float*)((char*)arr + off) + idx + 
0*stride) = ftemp[0]; \ + *((float*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((float*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((float*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((float*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((float*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((float*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((float*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) +#endif + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + _mm512_mask_store_ps((void const *)((float*)((char*)arr+off) + idx), lower256of512, v) +#else +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + do { \ + _mm512_mask_packstorelo_ps((void *)((float*)((char*)arr+off) + idx),lower256of512,v); \ + _mm512_mask_packstorehi_ps((char *)((float*)((char*)arr+off) + idx) + 64,lower256of512,v); \ + } while (0) +#endif + +#if 0 + if(stride <= 8 && stride/2%==0) { \ + (dest) = _mm512_mask_i32gather_ps((dest),lower256of512,,(float*)((char*)arr + off) + idx,stride); \ + } else { \ + (dest) = _mm512_mask_i32gather_ps((dest),lower256of512,,(float*)((char*)arr + off) + idx,_MM_SCALE_1); \ + } +#endif + +#ifdef STRIDE_VIA_GATHER +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + __m512i indices, vstride; \ + indices = _mm512_set_epi32(0,0,0,0,0,0,0,0,7,6,5,4,3,2,1,0); \ + vstride = _mm512_set1_epi32(stride); \ + indices = _mm512_mask_mullo_epi32(indices,lower256of512,indices,vstride); \ + (dest) = _mm512_mask_i32gather_ps((dest),lower256of512,indices,(float*)((char*)arr + off) + idx,_MM_SCALE_1); \ + } while (0) +#else +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_ps(0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + 0.0, \ + *((float*)((char*)arr + off) + idx + 7*stride), \ + *((float*)((char*)arr + off) + idx + 6*stride), \ + *((float*)((char*)arr + off) + idx + 5*stride), \ + *((float*)((char*)arr + off) + idx + 4*stride), \ + *((float*)((char*)arr + off) + idx + 3*stride), \ + *((float*)((char*)arr + off) + idx + 2*stride), \ + *((float*)((char*)arr + off) + idx + 1*stride), \ + *((float*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) +#endif + +#define pLsrPrimV256F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV256F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) \ + (dest) = _mm512_mask_load_epi32((dest), lower256of512, (void const *)((uint32_t*)((char*)arr+off) + idx)) +#else +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) \ + do { \ + (dest) = _mm512_mask_loadunpacklo_epi32((dest),lower256of512,(void *)((uint32_t*)((char*)arr+off) + idx)); \ + (dest) = _mm512_mask_loadunpackhi_epi32((dest),lower256of512,(char *)((uint32_t*)((char*)arr+off) + idx) + 64); \ + } while (0) +#endif + +#define pLsrPrimV256UInt32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm512_set_epi32(0, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0, \ + 0, \ + *((uint32_t*)((char*)arr + off) + idx + 7*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 
6*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 5*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 4*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 3*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 2*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 1*stride), \ + *((uint32_t*)((char*)arr + off) + idx + 0*stride)); \ + } while (0) + +#define pLsrPrimV256B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV256UInt32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __declspec(align(64)) uint32_t ftemp[16]; \ + _mm512_mask_store_epi32(ftemp, lower256of512, src); \ + *((uint32_t*)((char*)arr + off) + idx + 0*stride) = ftemp[0]; \ + *((uint32_t*)((char*)arr + off) + idx + 1*stride) = ftemp[1]; \ + *((uint32_t*)((char*)arr + off) + idx + 2*stride) = ftemp[2]; \ + *((uint32_t*)((char*)arr + off) + idx + 3*stride) = ftemp[3]; \ + *((uint32_t*)((char*)arr + off) + idx + 4*stride) = ftemp[4]; \ + *((uint32_t*)((char*)arr + off) + idx + 5*stride) = ftemp[5]; \ + *((uint32_t*)((char*)arr + off) + idx + 6*stride) = ftemp[6]; \ + *((uint32_t*)((char*)arr + off) + idx + 7*stride) = ftemp[7]; \ + } while (0) + +#ifdef HAS_ALIGNMENT +#define pLsrPrimV256UInt32Store(arr, off, idx, v) \ + _mm512_mask_store_epi32((void const *)((uint32_t*)((char*)arr+off) + idx), lower256of512, v) +#else +#define pLsrPrimV256UInt32Store(arr, off, idx, v) \ + do { \ + _mm512_mask_packstorelo_epi32((void *)((uint32_t*)((char*)arr+off) + idx),lower256of512,v); \ + _mm512_mask_packstorehi_epi32((char *)((uint32_t*)((char*)arr+off) + idx) + 64,lower256of512,v); \ + } while (0) +#endif + +#define pLsrPrimV256B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#else // HAS_256 + +typedef __m256 PlsrVector256F32; +typedef __m256i PlsrVector256B32; +typedef __m256d PlsrVector256F64; +typedef __m256i PlsrVector256Ref; + +// ***************************************************************************** +// Conversions +// ***************************************************************************** + +#define pLsrPrimV256256ConvertFloat32FromUInt32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) +#define pLsrPrimV256256ConvertUInt32FromFloat32(dest, a) ((dest) = _mm256_cvtps_epi32(a)) +#define pLsrPrimV256128ConvertFloat64FromFloat32(dest, a) ((dest) = _mm256_cvtps_pd(a)) + +// vec128 -> vec256 +#define pLsrPrimV256U32ToF64(dest, a) ((dest) = _mm256__cvtepi32_pd (_mm_movpi64_epi64(a))) +#define pLsrPrimV256F64ToU32(dest, a) ((dest) = _mm256__movepi64_pi64 (_mm256_cvtpd_epi32(a))) +#define pLsrPrimV256256CastSInt32ToFloat32(dest, a) ((dest) = _mm256_cvtepi32_ps(a)) + +// ***************************************************************************** +// Data operations on the more abstract B32/F32/F64 types +// ***************************************************************************** + +#define pLsrPrimV256DataVectorB32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define pLsrPrimV256DataVectorF32(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + + // Subscripting operations (chose between the lower or higher part) + + +#define 
pLsrPrimV256DataSub0B32(dest, a) pLsrPrimV256DataSubB32L(dest, 0, a) +#define pLsrPrimV256DataSub1B32(dest, a) pLsrPrimV256DataSubB32L(dest, 1, a) +#define pLsrPrimV256DataSub2B32(dest, a) pLsrPrimV256DataSubB32L(dest, 2, a) +#define pLsrPrimV256DataSub3B32(dest, a) pLsrPrimV256DataSubB32L(dest, 3, a) +#define pLsrPrimV256DataSub4B32(dest, a) pLsrPrimV256DataSubB32H(dest, 0, a) +#define pLsrPrimV256DataSub5B32(dest, a) pLsrPrimV256DataSubB32H(dest, 1, a) +#define pLsrPrimV256DataSub6B32(dest, a) pLsrPrimV256DataSubB32H(dest, 2, a) +#define pLsrPrimV256DataSub7B32(dest, a) pLsrPrimV256DataSubB32H(dest, 3, a) + +#define pLsrPrimV256DataSub0F32(dest, a) pLsrPrimV256DataSubF32L(dest, 0, a) +#define pLsrPrimV256DataSub1F32(dest, a) pLsrPrimV256DataSubF32L(dest, 1, a) +#define pLsrPrimV256DataSub2F32(dest, a) pLsrPrimV256DataSubF32L(dest, 2, a) +#define pLsrPrimV256DataSub3F32(dest, a) pLsrPrimV256DataSubF32L(dest, 3, a) +#define pLsrPrimV256DataSub4F32(dest, a) pLsrPrimV256DataSubF32H(dest, 0, a) +#define pLsrPrimV256DataSub5F32(dest, a) pLsrPrimV256DataSubF32H(dest, 1, a) +#define pLsrPrimV256DataSub6F32(dest, a) pLsrPrimV256DataSubF32H(dest, 2, a) +#define pLsrPrimV256DataSub7F32(dest, a) pLsrPrimV256DataSubF32H(dest, 3, a) + +#define pLsrPrimV256DataSubB32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32L_help(sub, a)) +static inline int pLsrPrimV256DataSubB32L_help(int sub, __m256i a) { + __m128i loA = _mm256_extractf128_si256(a, 0); + return (_mm_extract_epi32(loA, sub)); +} +#define pLsrPrimV256DataSubB32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubB32H_help(sub, a)) +static inline int pLsrPrimV256DataSubB32H_help(int sub, __m256i a) { + __m128i hiA = _mm256_extractf128_si256(a, 1); + return (_mm_extract_epi32(hiA, sub)); +} + +#define pLsrPrimV256DataSubF32H(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32H_help(sub, a)) +static inline float pLsrPrimV256DataSubF32H_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 1); + //__m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + // return *out; + float out; + _MM_EXTRACT_FLOAT(out, hiA, sub); + return out; + +} + +#define pLsrPrimV256DataSubF32L(dest, sub, a) ((dest) = pLsrPrimV256DataSubF32L_help(sub, a)) +static inline float pLsrPrimV256DataSubF32L_help(int sub, __m256 a) { + // float* out = (float*)malloc(sizeof(float)); + __m128 hiA = _mm256_extractf128_ps(a, 0); + // __m128 outA = mm_extract_ps(out, hiA, sub); + // _mm_move_ss(out, outA); + float out; + _MM_EXTRACT_FLOAT(out, hiA, sub); + return out; +} + +#define pLsrPrimV256DataSub0F64(dest, a) pLsrPrimV256DataSubF64(dest, 0, a) +#define pLsrPrimV256DataSub1F64(dest, a) pLsrPrimV256DataSubF64(dest, 1, a) +#define pLsrPrimV256DataSub2F64(dest, a) pLsrPrimV256DataSubF64(dest, 2, a) +#define pLsrPrimV256DataSub3F64(dest, a) pLsrPrimV256DataSubF64(dest, 3, a) + +#define pLsrPrimV256DataSubF64(dest, sub, a) ((dest) = ((double*)&a)[sub]) + +// loads and stores + +#define pLsrPrimV256B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256UInt64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F32StoreVS(arr, offset, idx, stride, v) \ + do { \ 
+ if (stride == 1) { \ + pLsrPrimV256Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV256RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV256RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + + +#define pLsrPrimV256B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV256RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV256RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV256F64LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float64LoadVectorHelp_idxV; \ + __m256d pLsrPrimV256Float64LoadVectorHelp_res; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float64LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float64LoadVectorHelp_idxV = vi; \ + pLsrPrimV256Float64LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float64LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float64LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float64LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float64LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float64LoadVectorHelp_res = \ + _mm256_set_pd(*((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx3), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx2), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx1), \ + *((float64*)((char*)arr + off) + pLsrPrimV256Float64LoadVectorHelp_idx0)); \ + (dest) = pLsrPrimV256Float64LoadVectorHelp_res; \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F32LoadVI(dest, arr, off, vi) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + uint32 
pLsrPrimV256Float32LoadVectorHelp_idx0; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx1; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx2; \ + uint32 pLsrPrimV256Float32LoadVectorHelp_idx3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_idxV = _mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx0 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_idx1 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_idx2 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_idx3 = (uint32)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_idxV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx3), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx2), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx1), \ + *((float32*)((char*)arr + off) + pLsrPrimV256Float32LoadVectorHelp_idx0)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_castps128_ps256(pLsrPrimV256Float32LoadVectorHelp_lo); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ + +/* scalar array, scalar offset, vector of indices */ +/* For some insane reason, the only way to load out a float from an __m128 register + * is as an integer containing the binary representation of the float. 
-leaf */ +#define pLsrPrimV256F32StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float32StoreVectorHelp_idxV; \ + __m128 pLsrPrimV256Float32StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float32StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 0); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + pLsrPrimV256Float32StoreVectorHelp_idxV =_mm256_extractf128_si256(vi, 1); \ + pLsrPrimV256Float32StoreVectorHelp_srcV =_mm256_extractf128_ps(src, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 0); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 0); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 1); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 1); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 2); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 2); \ + pLsrPrimV256Float32StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float32StoreVectorHelp_idxV, 3); \ + *((uint32*)((char*)arr + off) + pLsrPrimV256Float32StoreVectorHelp_idx) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreVectorHelp_srcV, 3); \ + } while (0) + +/* scalar array, scalar offset, vector of indices */ +#define pLsrPrimV256F64StoreVI(arr, off, vi, src) \ + do { \ + __m128i pLsrPrimV256Float64StoreVectorHelp_idxV; \ + __m128d pLsrPrimV256Float64StoreVectorHelp_srcV; \ + uint32 pLsrPrimV256Float64StoreVectorHelp_idx; \ + \ + pLsrPrimV256Float64StoreVectorHelp_idxV = vi; \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 0); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 1); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), 
pLsrPrimV256Float64StoreVectorHelp_srcV); \ + \ + pLsrPrimV256Float64StoreVectorHelp_srcV =_mm256_extractf128_pd(src, 1); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 2); \ + _mm_storel_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + pLsrPrimV256Float64StoreVectorHelp_idx = (uint32)_mm_extract_epi32(pLsrPrimV256Float64StoreVectorHelp_idxV, 3); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + pLsrPrimV256Float64StoreVectorHelp_idx), pLsrPrimV256Float64StoreVectorHelp_srcV); \ + } while (0) + + +#define pLsrPrimV256B32LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 0) { \ + pLsrPrimV256Float32LoadVector(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV256Float32LoadVectorStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) \ + +#define pLsrPrimV256B32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256B64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256F64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256F64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV256RefLoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV256RefLoadVectorStrided(dest, arr, offset, idx, stride) + + +// ***************************************************************************** +// Type-specific arithemetic and memory operations +// ***************************************************************************** + +// ================ +// UInt32 - 256-bit +// ================ + +// ***************************************************************************** +// Arithmetic +// ***************************************************************************** + +#define pLsrPrimV256PointwiseUInt32Plus(dest, a, b) vector256PointwiseUInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Minus(dest, a, b) vector256PointwiseUInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseUInt32Times(dest, a, b) vector256PointwiseUInt32Times(dest, a, b) + +#define pLsrPrimV256PointwiseSInt32Plus(dest, a, b) vector256PointwiseSInt32Plus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Minus(dest, a, b) vector256PointwiseSInt32Minus(dest, a, b) +#define pLsrPrimV256PointwiseSInt32Times(dest, a, b) vector256PointwiseSInt32Times(dest, a, b) + +#define integer256via128(dest, a, b, op) \ + do { \ + __m256i integer256via128_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i integer256via128_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i integer256via128_loA = _mm256_extractf128_si256(a, 0); \ + __m128i integer256via128_hiB = _mm256_extractf128_si256(b, 1); \ + __m128i integer256via128_loB = _mm256_extractf128_si256(b, 0); \ + __m128i integer256via128_loTemp; \ + __m128i integer256via128_hiTemp; \ + op(integer256via128_loTemp, integer256via128_loA, integer256via128_loB); \ + op(integer256via128_hiTemp, integer256via128_hiA, integer256via128_hiB); \ + 
integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_hiTemp, 1); \ + integer256via128_dest = _mm256_insertf128_si256(integer256via128_dest, integer256via128_loTemp, 0); \ + (dest) = integer256via128_dest; \ +} while (0) + +#define vector256PointwiseUInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Plus) +#define vector256PointwiseUInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Minus) +#define vector256PointwiseUInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseUInt32Times) + +#define vector256PointwiseSInt32Plus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Plus) +#define vector256PointwiseSInt32Minus(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Minus) +#define vector256PointwiseSInt32Times(dest, a, b) integer256via128(dest, a, b, pLsrPrimV128PointwiseSInt32Times) + +// ***************************************************************************** +// Reductions (horizontals) +// ***************************************************************************** + +#define pLsrPrimV256ReduceAUInt32Plus(dest, init, a) vector256reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Times(dest, init, a) vector256reduceAUInt32Times(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Max(dest, init, a) vector256reduceAUInt32Max(dest, init, a) +#define pLsrPrimV256ReduceAUInt32Min(dest, init, a) vector256reduceAUInt32Min(dest, init, a) + +#define vector256reduceAUInt32Plus(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Plus_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Plus_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Plus_aRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_aRes, init, vector256reduceAUInt32Plus_hiA); \ + uint32 vector256reduceAUInt32Plus_bRes; \ + pLsrPrimV128ReduceAUInt32Plus(vector256reduceAUInt32Plus_bRes, init, vector256reduceAUInt32Plus_loA); \ + (dest) = (vector256reduceAUInt32Plus_aRes + vector256reduceAUInt32Plus_bRes); \ + } while (0) + +#define vector256reduceAUInt32Times(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Times_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Times_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Times_aRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_aRes, init, vector256reduceAUInt32Times_hiA); \ + uint32 vector256reduceAUInt32Times_bRes; \ + pLsrPrimV128ReduceAUInt32Times(vector256reduceAUInt32Times_bRes, init, vector256reduceAUInt32Times_loA); \ + (dest) = (vector256reduceAUInt32Times_aRes * vector256reduceAUInt32Times_bRes); \ + } while (0) + +#define vector256reduceAUInt32Max(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Max_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Max_loA = _mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Max_aRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_aRes, init, vector256reduceAUInt32Max_hiA); \ + uint32 vector256reduceAUInt32Max_bRes; \ + pLsrPrimV128ReduceAUInt32Max(vector256reduceAUInt32Max_bRes, init, vector256reduceAUInt32Max_loA); \ + (dest) = (max(vector256reduceAUInt32Max_aRes, vector256reduceAUInt32Max_bRes)); \ + } while (0) + +#define vector256reduceAUInt32Min(dest, init, a) \ + do { \ + __m128i vector256reduceAUInt32Min_hiA = _mm256_extractf128_si256(a, 1); \ + __m128i vector256reduceAUInt32Min_loA = 
_mm256_extractf128_si256(a, 0); \ + uint32 vector256reduceAUInt32Min_aRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_aRes, init, vector256reduceAUInt32Min_hiA); \ + uint32 vector256reduceAUInt32Min_bRes; \ + pLsrPrimV128ReduceAUInt32Min(vector256reduceAUInt32Min_bRes, init, vector256reduceAUInt32Min_loA); \ + (dest) = (min(vector256reduceAUInt32Min_aRes, vector256reduceAUInt32Min_bRes)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256UInt32Lift(dest, a) vector256UInt32Lift(dest, a) +#define pLsrPrimV256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) + +#define vector256UInt32Lift(dest, a) \ + do { \ + __m256i vector256UInt32Lift_dest = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Lift_lo = _mm_set1_epi32(a); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 1); \ + vector256UInt32Lift_dest = _mm256_insertf128_si256(vector256UInt32Lift_dest, vector256UInt32Lift_lo, 0); \ + (dest) = vector256UInt32Lift_dest; \ + } while (0) + +#define vector256UInt32Const(dest, c0, c1, c2, c3, c4, c5, c6, c7) \ + do { \ + __m256i vector256UInt32Const_out = {0, 0, 0, 0, 0, 0, 0, 0}; \ + __m128i vector256UInt32Const_lo = _mm_set_epi32(c3, c2, c1, c0); \ + __m128i vector256UInt32Const_hi = _mm_set_epi32(c7, c6, c5, c4); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_hi, 1); \ + vector256UInt32Const_out = _mm256_insertf128_si256(vector256UInt32Const_out, vector256UInt32Const_lo, 0); \ + (dest) = vector256UInt32Const_out; \ + } while (0) + +#define pLsrPrimV256UInt32Load(dest, arr, off, idx) pLsrPrimV256UInt32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV256UInt32Store(arr, off, idx, v) pLsrPrimV256UInt32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV256UInt32LoadHelp(dest, arr, off, idx) \ + do { \ + __m256i* pLsrPrimV256UInt32LoadHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + (dest) = _mm256_loadu_si256(pLsrPrimV256UInt32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV256UInt32StoreHelp(arr, off, idx, v) \ + do { \ + __m256i* pLsrPrimV256UInt32StoreHelp_marr = (__m256i*)((uint32*)((char*)arr+off) + idx); \ + _mm256_storeu_si256(pLsrPrimV256UInt32StoreHelp_marr, v); \ + } while (0) + +// ===== +// Float +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256 pLsrPrimV256Float32Zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat32Plus(dest, a, b) ((dest) = _mm256_add_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Minus(dest, a, b) ((dest) = _mm256_sub_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Times(dest, a, b) ((dest) = _mm256_mul_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Divide(dest, a, b) ((dest) = _mm256_div_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Negate(dest, a) ((dest) = _mm256_sub_ps(pLsrPrimV256Float32Zero, a)) +#define pLsrPrimV256PointwiseFloat32Max(dest, a, b) ((dest) = _mm256_max_ps(a, b)) +#define pLsrPrimV256PointwiseFloat32Sqrt(dest, a) ((dest) = _mm256_sqrt_ps(a)) + +// ***************************************************************************** +// Reductions +// 
***************************************************************************** + +#define pLsrPrimV256ReduceAFloat32Plus(dest, init, a) reduce256AFloat32Plus(dest, init, a) + +#define reduce256AFloat32Plus(dest, init, a) \ + do { \ + __m256 reduceAFloat32Plus_a_ = a; \ + __m128 rf32_hi = _mm256_extractf128_ps(reduceAFloat32Plus_a_, 1); \ + __m256 rf32_tmp = _mm256_insertf128_ps(reduceAFloat32Plus_a_, rf32_hi, 0); \ + __m256 reduceAFloat32Plus_tmp = _mm256_hadd_ps(reduceAFloat32Plus_a_, rf32_tmp); \ + __m256 reduceAFloat32Plus_tmp2 = _mm256_hadd_ps(reduceAFloat32Plus_tmp, reduceAFloat32Plus_tmp); \ + __m256 reduceAFloat32Plus_tmp3 = _mm256_hadd_ps(reduceAFloat32Plus_tmp2, reduceAFloat32Plus_tmp2); \ + float reduceAFloat32Plus_dest_; \ + pLsrPrimV256DataSubF32L(reduceAFloat32Plus_dest_, 0, reduceAFloat32Plus_tmp3); \ + (dest) = (reduceAFloat32Plus_dest_ + (init)); \ + } while (0) + +// ***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float32Load(dest, arr, off, idx) \ + do { \ + float* pLsrPrimV256Float32Load_marr = ((float*)((char*)arr+off) + idx); \ + __m256 float32load = _mm256_loadu_ps(pLsrPrimV256Float32Load_marr); \ + (dest) = float32load; \ + } while (0) + +#define pLsrPrimV256Float32LoadVector(dest, arr, off, idx) \ + pLsrPrimV256Float32LoadVectorHelp (dest, arr, off, idx) + +#define pLsrPrimV256Float32Store(arr, off, idx, v) \ + do { \ + float* pLsrPrimV256Float32Store_marr = ((float*)((char*)arr+off) + idx); \ + _mm256_storeu_ps(pLsrPrimV256Float32Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm256_set_ps(*((float32*)((char*)arr + off) + idx + 7*stride), \ + *((float32*)((char*)arr + off) + idx + 6*stride), \ + *((float32*)((char*)arr + off) + idx + 5*stride), \ + *((float32*)((char*)arr + off) + idx + 4*stride), \ + *((float32*)((char*)arr + off) + idx + 3*stride), \ + *((float32*)((char*)arr + off) + idx + 2*stride), \ + *((float32*)((char*)arr + off) + idx + stride), \ + *((float32*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float32StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128 pLsrPrimV256Float32StoreStrided_srcV; \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 0); \ + *((uint32*)((char*)arr + off) + idx + 0) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 2*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 3*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + \ + pLsrPrimV256Float32StoreStrided_srcV =_mm256_extractf128_ps(src, 1); \ + *((uint32*)((char*)arr + off) + idx + 4*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 0); \ + *((uint32*)((char*)arr + off) + idx + 5*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 1); \ + *((uint32*)((char*)arr + off) + idx + 6*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 2); \ + *((uint32*)((char*)arr + off) + idx + 7*stride) = \ + (uint32) _mm_extract_ps(pLsrPrimV256Float32StoreStrided_srcV, 3); \ + } while (0) + +#define pLsrPrimV256Float32Const(dest, c0, c1, c2, c3, c4, 
c5, c6, c7) \ + ((dest) = _mm256_set_ps(c7, c6, c5, c4, c3, c2, c1, c0)) + +#define pLsrPrimV256Float32Lift(dest, a) \ + do { \ + float temp = a; \ + (dest) = _mm256_broadcast_ss((float *)(&temp)); \ + } while(0) + + +#define pLsrPrimV256Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + __m128i pLsrPrimV256Float32LoadVectorHelp_ptrV; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_lo; \ + __m128 pLsrPrimV256Float32LoadVectorHelp_hi; \ + __m256 pLsrPrimV256Float32LoadVectorHelp_res; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr0; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr1; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr2; \ + float32* pLsrPrimV256Float32LoadVectorHelp_ptr3; \ + \ + pLsrPrimV256Float32LoadVectorHelp_ptrV = _mm256_extractf128_si256(arr, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_lo = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_ptrV =_mm256_extractf128_si256(arr, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 0); \ + pLsrPrimV256Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 1); \ + pLsrPrimV256Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 2); \ + pLsrPrimV256Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(pLsrPrimV256Float32LoadVectorHelp_ptrV, 3); \ + pLsrPrimV256Float32LoadVectorHelp_hi = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr0 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV256Float32LoadVectorHelp_ptr3 + off) + idx)); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_hi, \ + 1); \ + pLsrPrimV256Float32LoadVectorHelp_res = _mm256_insertf128_ps(pLsrPrimV256Float32LoadVectorHelp_res, \ + pLsrPrimV256Float32LoadVectorHelp_lo, \ + 0); \ + (dest) = pLsrPrimV256Float32LoadVectorHelp_res; \ + } while (0) + +// ===== +// Double +// ===== + +// ***************************************************************************** +// Arithmetic operations +// ***************************************************************************** + +static __m256d pLsrPrimV256Float64Zero = {0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV256PointwiseFloat64Plus(dest, a, b) ((dest) = _mm256_add_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Minus(dest, a, b) ((dest) = _mm256_sub_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Times(dest, a, b) ((dest) = _mm256_mul_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Divide(dest, a, b) ((dest) = _mm256_div_pd(a, b)) +#define 
pLsrPrimV256PointwiseFloat64Negate(dest, a) ((dest) = _mm256_sub_pd(pLsrPrimV256Float64Zero, a)) +#define pLsrPrimV256PointwiseFloat64Max(dest, a, b) ((dest) = _mm256_max_pd(a, b)) +#define pLsrPrimV256PointwiseFloat64Sqrt(dest, a) ((dest) = _mm256_sqrt_pd(a)) + +#define pLsrPrimV256PointwiseSInt32DivT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32DivT_a; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_b; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32DivT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32DivT_res; \ + \ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 0); \ + pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rlo, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b);\ + pLsrPrimV256PointwiseSInt32DivT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32DivT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32DivT(pLsrPrimV256PointwiseSInt32DivT_rhi, pLsrPrimV256PointwiseSInt32DivT_a, pLsrPrimV256PointwiseSInt32DivT_b); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32DivT_rlo); \ + pLsrPrimV256PointwiseSInt32DivT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32DivT_res, \ + pLsrPrimV256PointwiseSInt32DivT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32DivT_res; \ + } while (0) + +#define pLsrPrimV256PointwiseSInt32ModT(dest, a, b) \ + do { \ + __m128i pLsrPrimV256PointwiseSInt32ModT_a; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_b; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rlo; \ + __m128i pLsrPrimV256PointwiseSInt32ModT_rhi; \ + __m256i pLsrPrimV256PointwiseSInt32ModT_res; \ + \ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 0); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 0); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rlo, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b);\ + pLsrPrimV256PointwiseSInt32ModT_a = _mm256_extractf128_si256(a, 1); \ + pLsrPrimV256PointwiseSInt32ModT_b = _mm256_extractf128_si256(b, 1); \ + pLsrPrimV128PointwiseSInt32ModT(pLsrPrimV256PointwiseSInt32ModT_rhi, pLsrPrimV256PointwiseSInt32ModT_a, pLsrPrimV256PointwiseSInt32ModT_b); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_castsi128_si256(pLsrPrimV256PointwiseSInt32ModT_rlo); \ + pLsrPrimV256PointwiseSInt32ModT_res = _mm256_insertf128_si256(pLsrPrimV256PointwiseSInt32ModT_res, \ + pLsrPrimV256PointwiseSInt32ModT_rhi, \ + 1); \ + (dest) = pLsrPrimV256PointwiseSInt32ModT_res; \ + } while (0) + +// ***************************************************************************** +// Reductions +// ***************************************************************************** + +#define pLsrPrimV256ReduceAFloat64Plus(dest, init, a) \ + do { \ + __m256d reduceAFloat64Plus_p_ = a; \ + __m128d reduceAFloat64Plus_p_lo = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 0); \ + __m128d reduceAFloat64Plus_p_hi = _mm256_extractf128_pd(reduceAFloat64Plus_p_, 1); \ + __m128d reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_lo, reduceAFloat64Plus_p_hi); \ + reduceAFloat64Plus_p_tmp = _mm_hadd_pd(reduceAFloat64Plus_p_tmp, reduceAFloat64Plus_p_tmp); \ + double reduceAFloat64Plus_dest_; \ + pLsrPrimV256DataSubF64(reduceAFloat64Plus_dest_, 0, reduceAFloat64Plus_p_tmp); \ + (dest) = (reduceAFloat64Plus_dest_ + (init)); \ + } while (0) + +// 
***************************************************************************** +// Data operations +// ***************************************************************************** + +#define pLsrPrimV256Float64Load(dest, arr, off, idx) \ + do { \ + double* pLsrPrimV256Float64Load_marr = ((double*)((char*)arr+off) + idx); \ + __m256d float64load = _mm256_loadu_pd(pLsrPrimV256Float64Load_marr); \ + (dest) = float64load; \ + } while (0) + +#define pLsrPrimV256Float64Store(arr, off, idx, v) \ + do { \ + double* pLsrPrimV256Float64Store_marr = ((double*)((char*)arr+off) + idx); \ + _mm256_storeu_pd(pLsrPrimV256Float64Store_marr, v); \ + } while (0) + + +#define pLsrPrimV256Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + (dest) = _mm256_set_pd(*((float64*)((char*)arr + off) + idx + 3*stride), \ + *((float64*)((char*)arr + off) + idx + 2*stride), \ + *((float64*)((char*)arr + off) + idx + stride), \ + *((float64*)((char*)arr + off) + idx + 0)); \ + } while (0) + +#define pLsrPrimV256Float64StoreStrided(arr, off, idx, stride, src) \ + do { \ + __m128d pLsrPrimV256Float64StoreStrided_srcV; \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 0); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 0), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + stride), pLsrPrimV256Float64StoreStrided_srcV); \ + \ + pLsrPrimV256Float64StoreStrided_srcV =_mm256_extractf128_pd(src, 1); \ + _mm_storel_pd(((float64*)((char*)arr + off) + idx + 2*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + _mm_storeh_pd(((float64*)((char*)arr + off) + idx + 3*stride), pLsrPrimV256Float64StoreStrided_srcV); \ + } while (0) + +#define pLsrPrimV256Float64Lift(dest, a) \ + do { \ + double temp = a; \ + (dest) = _mm256_broadcast_sd((double *)(&temp)); \ + } while(0) + +// ================ +// Ref - 256-bit +// ================ + +#define pLsrPrimV256RefLoad(dest, arr, off, idx) pLsrPrimV256UInt32Load(dest, arr, off, idx) + +#endif + +#endif _PLSR_PRIMS_VECTOR_MIC_H_ diff --git a/runtime/include/hrc/plsr-prims-vector-sse.h b/runtime/include/hrc/plsr-prims-vector-sse.h new file mode 100644 index 0000000..2263dd1 --- /dev/null +++ b/runtime/include/hrc/plsr-prims-vector-sse.h @@ -0,0 +1,1224 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_VECTOR_SSE_H_ +#define _PLSR_PRIMS_VECTOR_SSE_H_ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif /* P_USE_PILLAR */ + +#ifdef PLSR_VECTOR_USE_ASM +// define the vector types +/* typedef float __m64 __attribute__ ((__vector_size__ (8))); +typedef int __m128i __attribute__ ((__vector_size__ (16))); +typedef double __m128d __attribute__ ((__vector_size__ (16))); +typedef float __m128 __attribute__ ((__vector_size__ (16))); */ + +/* +struct floatVec { + float elems[4]; +}; +typedef struct floatVec __m128; + +struct floatVecShort { + float elems[2]; +}; +typedef struct floatVecShort __m64; + +struct doubleVec { + double elems[2]; +}; +typedef struct doubleVec __m128d; */ + +typedef union __m128i { + __int8 m128i_i8[16]; + __int16 m128i_i16[8]; + __int32 m128i_i32[4]; + __int64 m128i_i64[2]; + unsigned __int8 m128i_u8[16]; + unsigned __int16 m128i_u16[8]; + unsigned __int32 m128i_u32[4]; + unsigned __int64 m128i_u64[2]; +} __m128i; + +typedef struct __m128r { + ref m128i_u32[4]; +} __m128r; + +typedef struct __m128d { + double m128d_f64[2]; +} __m128d; + 
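+/* When PLSR_VECTOR_USE_ASM is defined the compiler's intrinsic headers are not
+ * included, so __m128i/__m128d/__m128/__m64 are declared by hand as unions and
+ * structs over their lane arrays (MSVC-style member names).  Lanes can then be
+ * read directly; a small sketch, assuming a __m128i value v:
+ *
+ *     uint32 lane0 = v.m128i_u32[0];   // lowest 32-bit lane of v
+ */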
+typedef union __m128 { + float m128_f32[4]; + unsigned __int64 m128_u64[2]; + __int8 m128_i8[16]; + __int16 m128_i16[8]; + __int32 m128_i32[4]; + __int64 m128_i64[2]; + unsigned __int8 m128_u8[16]; + unsigned __int16 m128_u16[8]; + unsigned __int32 m128_u32[4]; + } __m128; + +typedef union __m64 +{ + unsigned __int64 m64_u64; + float m64_f32[2]; + __int8 m64_i8[8]; + __int16 m64_i16[4]; + __int32 m64_i32[2]; + __int64 m64_i64; + unsigned __int8 m64_u8[8]; + unsigned __int16 m64_u16[4]; + unsigned __int32 m64_u32[2]; + } __m64; + +#else +#ifdef P_USE_PILLAR +#define __ICL_INTRINCC __pcdecl +#endif /* P_USE_PILLAR */ +#include +#endif + +typedef __m64 PlsrVector64F32; + +typedef __m128 PlsrVector128F32; +typedef __m128i PlsrVector128B32; +typedef __m128d PlsrVector128F64; +typedef __m128i PlsrVector128Ref; + +typedef __m128 PlsrVectorMask128Fs32; +typedef __m128 PlsrVectorMask128Fs64; + +// ***************************************************************************** +// Comparison +// ***************************************************************************** +#if 0 +#define pLsrPrimV128PointwiseSInt32LT(dest, a, b) ((dest) = _mm_cmplt_epi32((a),(b))) +#define pLsrPrimV128PointwiseSInt32EQ(dest, a, b) ((dest) = _mm_cmpeq_epi32((a),(b))) +#define pLsrPrimV128PointwiseSInt32GT(dest, a, b) ((dest) = _mm_cmpgt_epi32((a),(b))) +#else +#define pLsrPrimV128CompareSInt32LT(dest, a, b) ((dest) = _mm_cmplt_epi32((a),(b))) +#define pLsrPrimV128CompareSInt32EQ(dest, a, b) ((dest) = _mm_cmpeq_epi32((a),(b))) +#define pLsrPrimV128CompareSInt32GT(dest, a, b) ((dest) = _mm_cmpgt_epi32((a),(b))) +#endif + +// ***************************************************************************** +// Conversions & Casts +// ***************************************************************************** + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128128ConvertFloat32FromUInt32(dest, a) vector128128ConvertFloat32FromUInt32(dest, a) +#define pLsrPrimV128128ConvertUInt32FromFloat32(dest, a) vector128128ConvertUInt32FromFloat32(dest, a) + +#define vector128128ConvertFloat32FromUInt32(dest, a) \ + do { \ + __asm__ ("cvtdq2ps %1, %0;" \ + : "=x"(dest) \ + : "xm"(a)); \ + } while (0) + +#define vector128128ConvertUInt32FromFloat32(dest, a) \ + do { \ + __asm__ ("cvtps2dq %1, %0;" \ + : "=x"(dest) \ + : "xm"(a)); \ + } while (0) + +#else + +#define pLsrPrimV128128ConvertFloat32FromUInt32(dest, a) ((dest) = _mm_cvtepi32_ps(a)) +#define pLsrPrimV128128ConvertUInt32FromFloat32(dest, a) ((dest) = _mm_cvtps_epi32(a)) +// vec64 -> vec128 +#define pLsrPrimV128128ConvertFloat64FromUInt32(dest, a) ((dest) = _mm_cvtepi32_pd (_mm_movpi64_epi64(a))) +#define pLsrPrimV128128ConvertUInt32FromFloat64(dest, a) ((dest) = _mm_movepi64_pi64 (_mm_cvtpd_epi32(a))) + +#define pLsrPrimV128128CastSInt32ToFloat32(dest, a) ((dest) = _mm_cvtepi32_ps(a)) + +#endif + +#define pLsrPrimV128128ConvertUInt32FromSInt32(dest, a) ((dest) = a) +#define pLsrPrimV128128ConvertSInt32FromUInt32(dest, a) ((dest) = a) + +// ***************************************************************************** +// Data operations on the more abstract B32/F32/F64 types +// ***************************************************************************** + +#define pLsrPrimV128DataVectorB32(dest, c0, c1, c2, c3) pLsrPrimV128UInt32Const(dest, c0, c1, c2, c3) +#define pLsrPrimV128DataVectorF32(dest, c0, c1, c2, c3) pLsrPrimV128Float32Const(dest, c0, c1, c2, c3) +#define pLsrPrimV128DataVectorF64(dest, c0, c1) pLsrPrimV128Float64Const(dest, c0, c1) + +#define 
pLsrPrimV128DataBroadcastB32(dest, c0) pLsrPrimV128UInt32Lift(dest, c0) +#define pLsrPrimV128DataBroadcastF32(dest, c0) pLsrPrimV128Float32Lift(dest, c0) +#define pLsrPrimV128DataBroadcastF64(dest, c0) pLsrPrimV128Float64Lift(dest, c0) + +// Subscripting operations + +#define pLsrPrimV128DataSub0B32(dest, a) pLsrPrimV128DataSubB32(dest, 0, a) +#define pLsrPrimV128DataSub1B32(dest, a) pLsrPrimV128DataSubB32(dest, 1, a) +#define pLsrPrimV128DataSub2B32(dest, a) pLsrPrimV128DataSubB32(dest, 2, a) +#define pLsrPrimV128DataSub3B32(dest, a) pLsrPrimV128DataSubB32(dest, 3, a) + +/* +static inline int pLsrPrimV128DataSubB32(int index, __m128i a) { + // _mm_extract_epi32 (SSE >= 4.1) +} */ + +#define pLsrPrimV128DataSubB32(dest, n, a) \ + do { \ + int pLsrPrimV128DataSubB32_x; \ + __asm__ ("pshufd %1, %2, %2;\n\t" \ + "movd %2, %0;\n\t" \ + : "=r"(pLsrPrimV128DataSubB32_x) \ + : "i"(n), "x"(a)); \ + (dest) = pLsrPrimV128DataSubB32_x; \ + } while (0) + + +#define pLsrPrimV128DataSub0F32(dest, a) pLsrPrimV128DataSubF32(dest, 0, a) +#define pLsrPrimV128DataSub1F32(dest, a) pLsrPrimV128DataSubF32(dest, 1, a) +#define pLsrPrimV128DataSub2F32(dest, a) pLsrPrimV128DataSubF32(dest, 2, a) +#define pLsrPrimV128DataSub3F32(dest, a) pLsrPrimV128DataSubF32(dest, 3, a) + +#define pLsrPrimV128DataSubF32(dest, sub, a) ((dest) = ((float*)&a)[sub]) + +#define pLsrPrimV128DataSub0F64(dest, a) pLsrPrimV128DataSubF64(dest, 0, a) +#define pLsrPrimV128DataSub1F64(dest, a) pLsrPrimV128DataSubF64(dest, 1, a) + +#define pLsrPrimV128DataSubF64(dest, sub, a) ((dest) = ((double*)&a)[sub]) + +// loads and stores + +#define pLsrPrimV128B32StoreF(arr, offset, v) pLsrPrimV128UInt32StoreF(arr, offset, v) +#define pLsrPrimV128B64StoreF(arr, offset, v) pLsrPrimV128UInt64StoreF(arr, offset, v) +#define pLsrPrimV128F32StoreF(arr, offset, v) pLsrPrimV128Float32StoreF(arr, offset, v) +#define pLsrPrimV128F64StoreF(arr, offset, v) pLsrPrimV128Float64StoreF(arr, offset, v) +#define pLsrPrimV128RefStoreF(arr, offset, v) pLsrPrimV128RefStoreF(arr, offset, v) + +#define pLsrPrimV128B32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128UInt32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV128B64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128UInt64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV128F32StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float32Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128Float32StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV128F64StoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float64Store(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128Float64StoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) +#define pLsrPrimV128RefStoreVS(arr, offset, idx, stride, v) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128RefStore(arr, offset, idx, v); \ + } else { \ + pLsrPrimV128RefStoreStrided(arr, offset, idx, stride, v); \ + } \ + } while (0) + +#define pLsrPrimV128B32StoreVI(arr, offset, vi, v) pLsrPrimV128UInt32StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128B64StoreVI(arr, offset, vi, v) pLsrPrimV128UInt64StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128F32StoreVI(arr, offset, vi, v) 
pLsrPrimV128Float32StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128F64StoreVI(arr, offset, vi, v) pLsrPrimV128Float64StoreIndexed(arr, offset, vi, v) +#define pLsrPrimV128RefStoreVI(arr, offset, vi, v) pLsrPrimV128RefStoreIndexed(arr, offset, vi, v) + +#define pLsrPrimV128B32StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128UInt32StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128B64StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128UInt64StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128F32StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128Float32StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128F64StoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128Float64StoreVectorStrided(arr, offset, idx, stride, v) +#define pLsrPrimV128RefStoreVVS(arr, offset, idx, stride, v) \ + pLsrPrimV128RefStoreVectorStrided(arr, offset, idx, stride, v) + +#define pLsrPrimV128B32StoreVVI(arr, offset, vi, v) pLsrPrimV128UInt32StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128B64StoreVVI(arr, offset, vi, v) pLsrPrimV128UInt64StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128F32StoreVVI(arr, offset, vi, v) pLsrPrimV128Float32StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128F64StoreVVI(arr, offset, vi, v) pLsrPrimV128Float64StoreVectorIndexed(arr, offset, vi, v) +#define pLsrPrimV128RefStoreVVI(arr, offset, vi, v) pLsrPrimV128RefStoreVectorIndexed(arr, offset, vi, v) + +#define pLsrPrimV128B32LoadF(dest, arr, offset) pLsrPrimV128UInt32LoadF(dest, arr, offset) +#define pLsrPrimV128B64LoadF(dest, arr, offset) pLsrPrimV128UInt64LoadF(dest, arr, offset) +#define pLsrPrimV128F32LoadF(dest, arr, offset) pLsrPrimV128Float32LoadF(dest, arr, offset) +#define pLsrPrimV128F64LoadF(dest, arr, offset) pLsrPrimV128Float64LoadF(dest, arr, offset) +#define pLsrPrimV128RefLoadF(dest, arr, offset) pLsrPrimV128RefLoadF(dest, arr, offset) + +#define pLsrPrimV128B32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128UInt32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128B64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128UInt64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128UInt64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128F32LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float32Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128Float32LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128F64LoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128Float64Load(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128Float64LoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) +#define pLsrPrimV128RefLoadVS(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 1) { \ + pLsrPrimV128RefLoad(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128RefLoadStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) + +#define pLsrPrimV128B32LoadVI(dest, arr, offset, vi) pLsrPrimV128UInt32LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128B64LoadVI(dest, arr, offset, vi) pLsrPrimV128UInt64LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F32LoadVI(dest, arr, offset, vi) pLsrPrimV128Float32LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F32LoadVI64(dest, arr, offset, vi) 
pLsrPrimV128Float32LoadIndexed64(dest, arr, offset, vi) +#define pLsrPrimV128F64LoadVI(dest, arr, offset, vi) pLsrPrimV128Float64LoadIndexed(dest, arr, offset, vi) +#define pLsrPrimV128RefLoadVI(dest, arr, offset, vi) pLsrPrimV128RefLoadIndexed(dest, arr, offset, vi) + +#define pLsrPrimV128B32LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128B64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128F32LoadVectorStrided(dest, arr, offset, idx, stride) \ + do { \ + if (stride == 0) { \ + pLsrPrimV128Float32LoadVector(dest, arr, offset, idx); \ + } else { \ + pLsrPrimV128Float32LoadVectorStrided(dest, arr, offset, idx, stride); \ + } \ + } while (0) \ + +#define pLsrPrimV128F64LoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128Float64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128RefLoadVectorStrided(dest, arr, offset, idx, stride) \ + pLsrPrimV128RefLoadVectorStrided(dest, arr, offset, idx, stride) + +#define pLsrPrimV128B32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128B64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128UInt64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128F32LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128F32LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128F64LoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128F64LoadVectorStrided(dest, arr, offset, idx, stride) +#define pLsrPrimV128RefLoadVVS(dest, arr, offset, idx, stride) \ + pLsrPrimV128RefLoadVectorStrided(dest, arr, offset, idx, stride) + +#define pLsrPrimV128B32LoadVVI(dest, arr, offset, vi) pLsrPrimV128UInt32LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128B64LoadVVI(dest, arr, offset, vi) pLsrPrimV128UInt64LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F32LoadVVI(dest, arr, offset, vi) pLsrPrimV128Float32LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128F64LoadVVI(dest, arr, offset, vi) pLsrPrimV128Float64LoadVectorIndexed(dest, arr, offset, vi) +#define pLsrPrimV128RefLoadVVI(dest, arr, offset, vi) pLsrPrimV128RefLoadVectorIndexed(dest, arr, offset, vi) + +// ***************************************************************************** +// Type-specific arithemetic and memory operations +// ***************************************************************************** + +// ================ +// UInt32 - 128-bit +// ================ + +// Arithmetic + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128PointwiseUInt32Plus(dest, a, b) vector128PointwiseUInt32Plus(dest, a, b) +#define pLsrPrimV128PointwiseUInt32Minus(dest, a, b) vector128PointwiseUInt32Minus(dest, a, b) +#define pLsrPrimV128PointwiseUInt32Times(dest, a, b) vector128PointwiseUInt32Times(dest, a, b) + +#define vector128PointwiseUInt32Plus(dest, a, b) \ + do { \ + __m128i vector128PointwiseUInt32Plus_c = b; \ + __asm__ ("paddd %2, %0;\n\t" \ + : "=x"(vector128PointwiseUInt32Plus_c) \ + : "0"(vector128PointwiseUInt32Plus_c), "x"(a)); \ + (dest) = vector128PointwiseUInt32Plus_c; \ + } while (0) + +#define vector128PointwiseUInt32Minus(dest, a, b) \ + do { \ + __m128i vector128PointwiseUInt32Minus_c = b; \ + __asm__ ("psubd %2, %0;\n\t" \ + : "=x"(vector128PointwiseUInt32Minus_c) \ + : "0"(vector128PointwiseUInt32Minus_c), "x"(a)); \ + (dest) = 
vector128PointwiseUInt32Minus_c; \ + } while (0) + +#define vector128PointwiseUInt32Times(dest, a, b) \ + do { \ + __m128i vector128PointwiseUInt32Times_c; \ + __asm__ ("movdqu %1, %%xmm0;\n\t" \ + "movdqu %2, %%xmm1;\n\t" \ + "psrldq $0x04, %%xmm0;\n\t" /* c */ \ + "psrldq $0x04, %%xmm1;\n\t" /* d */ \ + "pmuludq %%xmm0, %%xmm1;\n\t" /* xmm1 = c*d */ \ + "pmuludq %1, %2;\n\t" /* v = a*b */ \ + "pshufd $0x08, %2, %0;\n\t" \ + "pshufd $0x08, %%xmm1, %%xmm1;\n\t" \ + "punpckldq %%xmm1, %0;\n\t" \ + : "=x"(vector128PointwiseUInt32Times_c) \ + : "x"(a), "x"(b) \ + : "xmm0", "xmm1"); \ + (dest) = vector128PointwiseUInt32Times_c; \ + } while (0) + +#else + +#define pLsrPrimV128PointwiseUInt32Plus(dest, a, b) ((dest) = _mm_add_epi32(a, b)) +#define pLsrPrimV128PointwiseUInt32Minus(dest, a, b) ((dest) = _mm_sub_epi32(a, b)) +#define pLsrPrimV128PointwiseUInt32Times(dest, a, b) u32_mul(dest, a, b) // _mm_mullo_epi32(a, b)) <- SSE 4.1 +#define pLsrPrimV128PointwiseUInt32Max(a, b) (_mm_max_epi32(a, b)) //not supported <- SSE 4.1 + +#define pLsrPrimV128PointwiseSInt32Plus(dest, a, b) ((dest) = _mm_add_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Minus(dest, a, b) ((dest) = _mm_sub_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Times(dest, a, b) ((dest) = _mm_mullo_epi32(a, b)) //<- SSE 4.1 + +#define pLsrPrimV128PointwiseUInt32BAnd(dest, a, b) ((dest) = _mm_and_si128((a), (b))) + +/* +#define pLsrPrimV128PointwiseUInt32BShiftR(dest, a, b) \ + do { \ + int pLsrPrimV128PointwiseUInt32BShiftR_count = _mm_extract_epi32(b, 0); \ + (dest) = _mm_srli_epi32((a), pLsrPrimV128PointwiseUInt32BShiftR_count); \ + } while (0) + +#define pLsrPrimV128PointwiseUInt32BShiftL(dest, a, b) \ + do { \ + int pLsrPrimV128PointwiseUInt32BShiftL_count = _mm_extract_epi32(b, 0); \ + (dest) = _mm_slli_epi32((a), pLsrPrimV128PointwiseUInt32BShiftL_count); \ + } while (0) +*/ + +#define pLsrPrimV128PointwiseUInt32BShiftR(dest, a, b) \ + do { \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R0 = _mm_extract_epi32(a, 0) >> _mm_extract_epi32(b, 0) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R1 = _mm_extract_epi32(a, 1) >> _mm_extract_epi32(b, 1) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R2 = _mm_extract_epi32(a, 2) >> _mm_extract_epi32(b, 2) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftR_R3 = _mm_extract_epi32(a, 3) >> _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseUInt32BShiftR_R3, \ + pLsrPrimV128PointwiseUInt32BShiftR_R2, \ + pLsrPrimV128PointwiseUInt32BShiftR_R1, \ + pLsrPrimV128PointwiseUInt32BShiftR_R0); \ + } while (0) + +#define pLsrPrimV128PointwiseUInt32BShiftL(dest, a, b) \ + do { \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R0 = _mm_extract_epi32(a, 0) << _mm_extract_epi32(b, 0) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R1 = _mm_extract_epi32(a, 1) << _mm_extract_epi32(b, 1) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R2 = _mm_extract_epi32(a, 2) << _mm_extract_epi32(b, 2) ; \ + uint32 pLsrPrimV128PointwiseUInt32BShiftL_R3 = _mm_extract_epi32(a, 3) << _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseUInt32BShiftL_R3, \ + pLsrPrimV128PointwiseUInt32BShiftL_R2, \ + pLsrPrimV128PointwiseUInt32BShiftL_R1, \ + pLsrPrimV128PointwiseUInt32BShiftL_R0); \ + } while (0) + +#define pLsrPrimV128PointwiseSInt32DivT(dest, a, b) \ + do { \ + sint32 pLsrPrimV128PointwiseSInt32DivT_R0 = _mm_extract_epi32(a, 0) / _mm_extract_epi32(b, 0) ; \ + sint32 pLsrPrimV128PointwiseSInt32DivT_R1 = _mm_extract_epi32(a, 1) / _mm_extract_epi32(b, 1) ; \ + sint32 
pLsrPrimV128PointwiseSInt32DivT_R2 = _mm_extract_epi32(a, 2) / _mm_extract_epi32(b, 2) ; \ + sint32 pLsrPrimV128PointwiseSInt32DivT_R3 = _mm_extract_epi32(a, 3) / _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseSInt32DivT_R3, \ + pLsrPrimV128PointwiseSInt32DivT_R2, \ + pLsrPrimV128PointwiseSInt32DivT_R1, \ + pLsrPrimV128PointwiseSInt32DivT_R0); \ + } while (0) + +#define pLsrPrimV128PointwiseSInt32ModT(dest, a, b) \ + do { \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R0 = _mm_extract_epi32(a, 0) % _mm_extract_epi32(b, 0) ; \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R1 = _mm_extract_epi32(a, 1) % _mm_extract_epi32(b, 1) ; \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R2 = _mm_extract_epi32(a, 2) % _mm_extract_epi32(b, 2) ; \ + sint32 pLsrPrimV128PointwiseSInt32ModT_R3 = _mm_extract_epi32(a, 3) % _mm_extract_epi32(b, 3) ; \ + (dest) = _mm_set_epi32(pLsrPrimV128PointwiseSInt32ModT_R3, \ + pLsrPrimV128PointwiseSInt32ModT_R2, \ + pLsrPrimV128PointwiseSInt32ModT_R1, \ + pLsrPrimV128PointwiseSInt32ModT_R0); \ + } while (0) + + +// Multiplication routine for SSE < 4.1 +#define u32_mul(u32_mul_dest, u32_mul_a, u32_mul_b) \ + do { \ + __m128i u32_mul_c = _mm_srli_si128(u32_mul_a, 4); \ + __m128i u32_mul_d = _mm_srli_si128(u32_mul_b, 4); \ + __m128i u32_mul_u = _mm_mul_epu32(u32_mul_a, u32_mul_b); \ + __m128i u32_mul_v = _mm_mul_epu32(u32_mul_c, u32_mul_d); \ + __m128i u32_mul_r1 = _mm_shuffle_epi32(u32_mul_u, 8); \ + __m128i u32_mul_r2 = _mm_shuffle_epi32(u32_mul_v, 8); \ + (u32_mul_dest) = _mm_unpacklo_epi32(u32_mul_r1, u32_mul_r2); \ + } while (0) + +#endif + +// Reductions (horizontals) + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128ReduceAUInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV128ReduceASInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) + +#define reduceAUInt32Plus(dest, init, a) \ + do { \ + uint32 reduceAUInt32Plus_c; \ + __asm__("phaddd %1, %1;\n\t" \ + "phaddd %1, %1;\n\t" \ + "movd %1, %0;\n\t" \ + : "=r"(reduceAUInt32Plus_c) \ + : "x"(a)); \ + (dest) = (reduceAUInt32Plus_c + init); \ + } while (0) + +#else + +#define pLsrPrimV128ReduceAUInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) +#define pLsrPrimV128ReduceAUInt32Times(dest, init, a) reduceAUInt32Times(dest, init, a) +#define pLsrPrimV128ReduceAUInt32Max(dest, init, a) reduceAUInt32Max(dest, init, a) +#define pLsrPrimV128ReduceAUInt32Min(dest, init, a) reduceAUInt32Min(dest, init, a) + +#define pLsrPrimV128ReduceASInt32Plus(dest, init, a) reduceAUInt32Plus(dest, init, a) + +#define reduceAUInt32Plus(reduceAUInt32Plus_dest, reduceAUInt32Plus_init, reduceAUInt32Plus_a) \ + do { \ + /* probably not a great implementation */ \ + __m128i reduceAUInt32Plus_tmp = _mm_hadd_epi32(reduceAUInt32Plus_a, reduceAUInt32Plus_a); \ + __m128i reduceAUInt32Plus_tmp2 = _mm_hadd_epi32(reduceAUInt32Plus_tmp, reduceAUInt32Plus_tmp); \ + uint32 reduceAUInt32Plus_tmp3; \ + pLsrPrimV128DataSub0B32(reduceAUInt32Plus_tmp3, reduceAUInt32Plus_tmp2); \ + (reduceAUInt32Plus_dest) = (reduceAUInt32Plus_tmp3 + reduceAUInt32Plus_init); \ + } while (0) + +#define reduceAUInt32Times(dest, init, a) \ + do { \ + __m128i reduceAUInt32Times_atmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)); \ + __m128i reduceAUInt32Times_b; \ + pLsrPrimV128PointwiseUInt32Times(reduceAUInt32Times_b, reduceAUInt32Times_atmp, a); \ + __m128i reduceAUInt32Times_btmp = _mm_shuffle_epi32(reduceAUInt32Times_b, _MM_SHUFFLE(0, 0, 0, 1)); \ + __m128i reduceAUInt32Times_c; \ + 
pLsrPrimV128PointwiseUInt32Times(reduceAUInt32Times_c, reduceAUInt32Times_btmp, reduceAUInt32Times_b); \
+    uint32 reduceAUInt32Times_r; \
+    pLsrPrimV128DataSub0B32(reduceAUInt32Times_r, reduceAUInt32Times_c); \
+    (dest) = (reduceAUInt32Times_r * (init)); \
+  } while (0)
+
+//inline int max(int a, int b) {
+//    return a > b ? a : b;
+//}
+
+#define reduceAUInt32Max(dest, init, a) \
+  do { \
+    __m128i reduceAUInt32Max_atmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128i reduceAUInt32Max_b = _mm_max_epi32(reduceAUInt32Max_atmp, a); /* note is SSE >= 4.1 */ \
+    __m128i reduceAUInt32Max_btmp = _mm_shuffle_epi32(reduceAUInt32Max_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128i reduceAUInt32Max_c = _mm_max_epi32(reduceAUInt32Max_btmp, reduceAUInt32Max_b); \
+    uint32 reduceAUInt32Max_r; \
+    pLsrPrimV128DataSub0B32(reduceAUInt32Max_r, reduceAUInt32Max_c); \
+    (dest) = (max(reduceAUInt32Max_r, (init))); \
+  } while (0) \
+
+#define reduceAUInt32Min(dest, init, a) \
+  do { \
+    __m128i reduceAUInt32Min_atmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128i reduceAUInt32Min_b = _mm_min_epi32(reduceAUInt32Min_atmp, a); /* note is SSE >= 4.1 */ \
+    __m128i reduceAUInt32Min_btmp = _mm_shuffle_epi32(reduceAUInt32Min_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128i reduceAUInt32Min_c = _mm_min_epi32(reduceAUInt32Min_btmp, reduceAUInt32Min_b); \
+    uint32 reduceAUInt32Min_r; \
+    pLsrPrimV128DataSub0B32(reduceAUInt32Min_r, reduceAUInt32Min_c); \
+    (dest) = (min(reduceAUInt32Min_r, (init))); \
+  } while (0)
+
+#endif
+
+// Data operations
+
+#ifdef PLSR_VECTOR_USE_ASM
+
+#define pLsrPrimV128UInt32Lift(dest, a) vector128UInt32Lift(dest, a)
+#define pLsrPrimV128UInt32Const(dest, c0, c1, c2, c3) vector128UInt32Const(dest, c0, c1, c2, c3)
+#define pLsrPrimV128UInt32Load(dest, arr, off, idx) vector128UInt32LoadHelp (dest, arr, off, idx)
+#define pLsrPrimV128UInt32Store(arr, off, idx, v) vector128UInt32StoreHelp (arr, off, idx, v)
+
+#define vector128UInt32Lift(dest, a) \
+  do { \
+    __m128i vector128UInt32Lift_b; \
+    __asm__ ("movd %1, %0;\n\t" \
+             "pshufd $0x0, %0, %0;\n\t" \
+             : "=x"(vector128UInt32Lift_b) \
+             : "r"(a)); \
+    (dest) = vector128UInt32Lift_b; \
+  } while (0)
+
+  // SSE 4.1
+  //__m128i dest
+  // __asm__ ("pinsrd $0x0, %1, %0;\n\t"
+  //          "pinsrd $0x1, %2, %0;\n\t"
+  //          "pinsrd $0x2, %3, %0;\n\t"
+  //          "pinsrd $0x3, %4, %0;\n\t"
+  //          : "=x"(dest)
+  //          : "rm"(c0), "rm"(c1), "rm"(c2), "rm"(c3));
+
+#define vector128UInt32Const(dest, c0, c1, c2, c3) \
+  do { \
+    int vector128UInt32Const_c[4]; \
+    __m128i vector128UInt32Const_r; \
+    vector128UInt32Const_c[0] = c0; \
+    vector128UInt32Const_c[1] = c1; \
+    vector128UInt32Const_c[2] = c2; \
+    vector128UInt32Const_c[3] = c3; \
+    __asm__("movdqu (%1), %0;" \
+            : "=x"(vector128UInt32Const_r) \
+            : "r"(vector128UInt32Const_c)); \
+    dest = vector128UInt32Const_r; \
+  } while (0)
+
+#define vector128UInt32LoadHelp(dest, arr, off, idx) \
+  do { \
+    __m128i* vector128UInt32LoadHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \
+    __m128i vector128UInt32LoadHelp_r; \
+    __asm__("movdqu (%1), %0;\n" : "=x"(vector128UInt32LoadHelp_r) : "r"(vector128UInt32LoadHelp_marr)); \
+    (dest) = vector128UInt32LoadHelp_r; \
+  } while (0)
+
+#define vector128UInt32StoreHelp(arr, off, idx, v) \
+  do { \
+    __m128i* vector128UInt32StoreHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \
+    __asm__("movdqu %1, (%0);\n" : : "r"(vector128UInt32StoreHelp_marr), "x"(v)); \
+  } while (0)
+
+#define pLsrPrimV128UInt32LoadStrided(dest, arr, off, idx, stride) \
+  do { \
+    pLsrRuntimeError("Strided operations 
unimplemented"); \ + } while (0) + +#define pLsrPrimV128UInt32StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#else + +#define pLsrPrimV128UInt32Lift(dest, a) ((dest) = _mm_set1_epi32(a)) +#define pLsrPrimV128UInt32Const(dest, c0, c1, c2, c3) ((dest) = _mm_set_epi32(c3, c2, c1, c0)) + +#define pLsrPrimV128UInt32Load(dest, arr, off, idx) pLsrPrimV128UInt32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV128UInt32Store(arr, off, idx, v) pLsrPrimV128UInt32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV128UInt32LoadHelp(dest, arr, off, idx) \ + do { \ + __m128i* pLsrPrimV128UInt32LoadHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \ + (dest) = _mm_loadu_si128(pLsrPrimV128UInt32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV128UInt32StoreHelp(arr, off, idx, v) \ + do { \ + __m128i* pLsrPrimV128UInt32StoreHelp_marr = (__m128i*)((uint32*)((char*)arr+off) + idx); \ + _mm_storeu_si128(pLsrPrimV128UInt32StoreHelp_marr, v); \ + } while (0) + +#define pLsrPrimV128UInt32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128UInt32StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#endif + +// ================ +// UInt32 - 64-bit +// ================ + +// Arihmetic operations + +#ifdef PLSR_VECTOR_USE_ASM + + +#else + +#define pLsrPrimV64PointwiseUInt32Plus(dest, a, b) ((dest) = _mm_add_pi32(a, b)) +#define pLsrPrimV64PointwiseUInt32Minus(dest, a, b) ((dest) = _mm_sub_pi32(a, b)) +#define pLsrPrimV64PointwiseUInt32Times(dest, a, b) (u32_mul_64(dest, a, b)) //_mm_mullo_pi32(a, b)) + + +// Multiplication route for SSE < 4.1 +#define u32_mul_64(dest, a, b) \ + do { \ + __m64 u32_mul_64_c = _mm_srli_si64(a, 4); \ + __m64 u32_mul_64_d = _mm_srli_si64(b, 4); \ + __m64 u32_mul_64_u = _mm_mul_su32(a, b); \ + __m64 u32_mul_64_v = _mm_mul_su32(u32_mul_64_c, u32_mul_64_d); \ + (dest) = _mm_unpacklo_pi32(u32_mul_64_u, u32_mul_64_v); \ + } while (0) + +#endif + +// Data operations + +#ifdef PLSR_VECTOR_USE_ASM + + + +#else + +#define pLsrPrimV64UInt32Lift(dest, a) ((dest) = _mm_set1_pi32(a)) +#define pLsrPrimV64UInt32Const(dest, c0, c1) ((dest) = _mm_set_pi32(c1, c0)) + +#endif + +// ================ +// SInt32 - 128-bit +// ================ + +#ifdef PLSR_VECTOR_USE_ASM + + + +#else + +#define pLsrPrimV128PointwiseSInt32Plus(dest, a, b) ((dest) = _mm_add_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Minus(dest, a, b) ((dest) = _mm_sub_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Times(dest, a, b) ((dest) = _mm_mullo_epi32(a, b)) +#define pLsrPrimV128PointwiseSInt32Lift(dest, a) ((dest) = _mm_set1_epi32(a)) +#define pLsrPrimV128PointwiseSInt32Const(dest, c0, c1, c2, c3) ((dest) = _mm_set_epi32(c3, c2, c1, c0)) + +#define pLsrPrimV64PointwiseSInt32Plus(dest, a, b) ((dest) = _mm_add_pi32(a, b)) +#define pLsrPrimV64PointwiseSInt32Minus(dest, a, b) ((dest) = _mm_sub_pi32(a, b)) +#define pLsrPrimV64PointwiseSInt32Times(dest, a, b) ((dest) = _mm_mullo_pi32(a, b)) +#define pLsrPrimV64SInt32Lift(dest, a) ((dest) = _mm_set1_pi32(a)) +#define pLsrPrimV64SInt32Const(dest, c0, c1) ((dest) = _mm_set_pi32(c1, c0)) + +#endif + +// ===== +// Float +// ===== + +#ifdef PLSR_VECTOR_USE_ASM + +static __m128 pLsrPrimV128Float32Zero = {0.0, 0.0, 0.0, 0.0}; +#define pLsrPrimV128PointwiseFloat32Plus(dest, a, b) vector128PointwiseFloat32Plus(dest, a, b) +#define 
pLsrPrimV128PointwiseFloat32Minus(dest, a, b) vector128PointwiseFloat32Minus(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Times(dest, a, b) vector128PointwiseFloat32Times(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Divide(dest, a, b) vector128PointwiseFloat32Divide(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Negate(dest, a) vector128PointwiseFloat32Minus(dest, pLsrPrimV128Float32Zero, a)
+#define pLsrPrimV128PointwiseFloat32Max(dest, a, b) vector128PointwiseFloat32Max(dest, a, b)
+#define pLsrPrimV128PointwiseFloat32Min(dest, a, b) vector128PointwiseFloat32Min(dest, a, b)
+
+#define vector128PointwiseFloat32Plus(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Plus_a = a; \
+    __m128 vector128PointwiseFloat32Plus_b = b; \
+    __asm__ ("addps %1, %0;" : "+x"(vector128PointwiseFloat32Plus_b) : "x"(vector128PointwiseFloat32Plus_a)); \
+    (dest) = vector128PointwiseFloat32Plus_b; \
+  } while (0)
+
+/* subps %1, %0 computes %0 = %0 - %1, so the minuend a must be the read-write operand */
+#define vector128PointwiseFloat32Minus(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Minus_a = a; \
+    __m128 vector128PointwiseFloat32Minus_b = b; \
+    __asm__ ("subps %1, %0;" : "+x"(vector128PointwiseFloat32Minus_a) : "x"(vector128PointwiseFloat32Minus_b)); \
+    (dest) = vector128PointwiseFloat32Minus_a; \
+  } while (0)
+
+#define vector128PointwiseFloat32Times(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Times_a = a; \
+    __m128 vector128PointwiseFloat32Times_b = b; \
+    __asm__ ("mulps %1, %0;" : "+x"(vector128PointwiseFloat32Times_b) : "x"(vector128PointwiseFloat32Times_a)); \
+    (dest) = vector128PointwiseFloat32Times_b; \
+  } while (0)
+
+/* divps %1, %0 computes %0 = %0 / %1, so the dividend a must be the read-write operand */
+#define vector128PointwiseFloat32Divide(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Div_a = a; \
+    __m128 vector128PointwiseFloat32Div_b = b; \
+    __asm__ ("divps %1, %0;" : "+x"(vector128PointwiseFloat32Div_a) : "x"(vector128PointwiseFloat32Div_b)); \
+    (dest) = vector128PointwiseFloat32Div_a; \
+  } while (0)
+
+#define vector128PointwiseFloat32Max(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Max_a = a; \
+    __m128 vector128PointwiseFloat32Max_b = b; \
+    __asm__ ("maxps %1, %0;" : "+x"(vector128PointwiseFloat32Max_b) : "x"(vector128PointwiseFloat32Max_a)); \
+    (dest) = vector128PointwiseFloat32Max_b; \
+  } while (0)
+
+#define vector128PointwiseFloat32Min(dest, a, b) \
+  do { \
+    __m128 vector128PointwiseFloat32Min_a = a; \
+    __m128 vector128PointwiseFloat32Min_b = b; \
+    __asm__ ("minps %1, %0;" : "+x"(vector128PointwiseFloat32Min_b) : "x"(vector128PointwiseFloat32Min_a)); \
+    (dest) = vector128PointwiseFloat32Min_b; \
+  } while (0)
+
+#else
+
+static __m128 pLsrPrimV128Float32Zero = {0.0, 0.0, 0.0, 0.0};
+#define pLsrPrimV128PointwiseFloat32Plus(dest, a, b) ((dest) = _mm_add_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Minus(dest, a, b) ((dest) = _mm_sub_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Times(dest, a, b) ((dest) = _mm_mul_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Divide(dest, a, b) ((dest) = _mm_div_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Negate(dest, a) ((dest) = _mm_sub_ps(pLsrPrimV128Float32Zero, a))
+#define pLsrPrimV128PointwiseFloat32Max(dest, a, b) ((dest) = _mm_max_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Min(dest, a, b) ((dest) = _mm_min_ps(a, b))
+#define pLsrPrimV128PointwiseFloat32Sqrt(dest, a) ((dest) = _mm_sqrt_ps(a))
+
+#endif
+
+// Reductions (horizontals)
+
+// =====
+// Float
+// =====
+
+#ifdef PLSR_VECTOR_USE_ASM
+
+#define pLsrPrimV128ReduceAFloat32Plus(dest, init, a) reduceAFloat32Plus(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Max(dest, init, a) reduceAFloat32Max(dest, init, a)
+
+#define reduceAFloat32Plus(dest, init, a) \
+  do { \
+    __m128 rAFloat32Plus_a_ = a; \
+    float rAFloat32Plus_dest_; \
+    __asm__("haddps %1, %1;\n\t" \
+            "pextrd $0x0, %1, %0;\n\t" \
+            : "=r"(rAFloat32Plus_dest_) \
+            : "x"(rAFloat32Plus_a_)); \
+    (dest) = (rAFloat32Plus_dest_ + (init)); \
+  } while (0)
+
+#define reduceAFloat32Max(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Max_a_ = a; \
+    float reduceAFloat32Max_dest_; \
+    __asm__ ("shuhfps $0x0e, %1, %1;\n\t" \
+             "max_ps %1, %1;\n\t" \
+             "shufps $0x01, %1, %1;\n\t" \
+             "max_ps %1, %1;\n\t" \
+             "pextrd $0x0, %1, %0; \n\t" \
+             : "=r"(reduceAFloat32Max_dest_) \
+             : "x"(reduceAFloat32Max_a_)); \
+    (dest) = (max(reduceAFloat32Max_dest_, init)); \
+  } while (0)
+
+#else
+
+#define pLsrPrimV128ReduceAFloat32Plus(dest, init, a) reduceAFloat32Plus(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Times(dest, init, a) reduceAFloat32Times(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Max(dest, init, a) reduceAFloat32Max(dest, init, a)
+#define pLsrPrimV128ReduceAFloat32Min(dest, init, a) reduceAFloat32Min(dest, init, a)
+
+#define reduceAFloat32Plus(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Plus_a_ = a; \
+    __m128 reduceAFloat32Plus_tmp = _mm_hadd_ps(reduceAFloat32Plus_a_, reduceAFloat32Plus_a_); \
+    __m128 reduceAFloat32Plus_tmp2 = _mm_hadd_ps(reduceAFloat32Plus_tmp, reduceAFloat32Plus_tmp); \
+    float reduceAFloat32Plus_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Plus_dest_, 0, reduceAFloat32Plus_tmp2); \
+    (dest) = (reduceAFloat32Plus_dest_ + (init)); \
+  } while (0)
+
+#define reduceAFloat32Times(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Times_a_ = a; \
+    __m128 reduceAFloat32Times_atmp = _mm_shuffle_ps(reduceAFloat32Times_a_, reduceAFloat32Times_a_, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128 reduceAFloat32Times_b; \
+    pLsrPrimV128PointwiseFloat32Times(reduceAFloat32Times_b, reduceAFloat32Times_atmp, reduceAFloat32Times_a_); \
+    __m128 reduceAFloat32Times_btmp = _mm_shuffle_ps(reduceAFloat32Times_b, reduceAFloat32Times_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128 reduceAFloat32Times_c; \
+    pLsrPrimV128PointwiseFloat32Times(reduceAFloat32Times_c, reduceAFloat32Times_btmp, reduceAFloat32Times_b); \
+    float reduceAFloat32Times_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Times_dest_, 0, reduceAFloat32Times_c); \
+    (dest) = (reduceAFloat32Times_dest_ * (init)); \
+  } while (0)
+
+#define reduceAFloat32Max(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Max_a_ = a; \
+    __m128 reduceAFloat32Max_atmp = _mm_shuffle_ps(reduceAFloat32Max_a_, reduceAFloat32Max_a_, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128 reduceAFloat32Max_b = _mm_max_ps(reduceAFloat32Max_atmp, reduceAFloat32Max_a_); \
+    __m128 reduceAFloat32Max_btmp = _mm_shuffle_ps(reduceAFloat32Max_b, reduceAFloat32Max_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128 reduceAFloat32Max_c = _mm_max_ps(reduceAFloat32Max_btmp, reduceAFloat32Max_b); \
+    float reduceAFloat32Max_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Max_dest_, 0, reduceAFloat32Max_c); \
+    (dest) = max(reduceAFloat32Max_dest_, (init)); \
+  } while (0)
+
+#define reduceAFloat32Min(dest, init, a) \
+  do { \
+    __m128 reduceAFloat32Min_a_ = a; \
+    __m128 reduceAFloat32Min_atmp = _mm_shuffle_ps(reduceAFloat32Min_a_, reduceAFloat32Min_a_, _MM_SHUFFLE(0, 0, 3, 2)); \
+    __m128 reduceAFloat32Min_b = _mm_min_ps(reduceAFloat32Min_atmp, reduceAFloat32Min_a_); \
+    __m128 reduceAFloat32Min_btmp = _mm_shuffle_ps(reduceAFloat32Min_b, reduceAFloat32Min_b, _MM_SHUFFLE(0, 0, 0, 1)); \
+    __m128 reduceAFloat32Min_c = _mm_min_ps(reduceAFloat32Min_btmp, reduceAFloat32Min_b); \
+    float reduceAFloat32Min_dest_; \
+    pLsrPrimV128DataSubF32(reduceAFloat32Min_dest_, 0, reduceAFloat32Min_c); \
+    (dest) = min(reduceAFloat32Min_dest_, (init)); \
+  } while (0)
+
+#endif
+
+// =====
+// Double
+// =====
+
+
+#ifdef PLSR_VECTOR_USE_ASM
+#else
+
+#define pLsrPrimV128ReduceAFloat64Plus(dest, init, a) reduceAFloat64Plus(dest, init, a)
+#define pLsrPrimV128ReduceAFloat64Times(dest, init, a) reduceAFloat64Times(dest, init, a)
+#define pLsrPrimV128ReduceAFloat64Max(dest, init, a) reduceAFloat64Max(dest, init, a)
+#define pLsrPrimV128ReduceAFloat64Min(dest, init, a) reduceAFloat64Min(dest, init, a)
+
+#define reduceAFloat64Plus(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Plus_a_ = a; \
+    __m128d reduceAFloat64Plus_tmp = _mm_hadd_pd(reduceAFloat64Plus_a_, reduceAFloat64Plus_a_); \
+    double reduceAFloat64Plus_dest_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Plus_dest_, 0, reduceAFloat64Plus_tmp); \
+    (dest) = (reduceAFloat64Plus_dest_ + (init)); \
+  } while (0)
+
+#define reduceAFloat64Times(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Times_a_ = a; \
+    double reduceAFloat64Times_dest0_; \
+    double reduceAFloat64Times_dest1_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Times_dest0_, 0, reduceAFloat64Times_a_); \
+    pLsrPrimV128DataSubF64(reduceAFloat64Times_dest1_, 1, reduceAFloat64Times_a_); \
+    (dest) = reduceAFloat64Times_dest0_ * reduceAFloat64Times_dest1_ * (init); \
+  } while (0)
+
+#define reduceAFloat64Max(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Max_a_ = a; \
+    double reduceAFloat64Max_dest0_; \
+    double reduceAFloat64Max_dest1_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Max_dest0_, 0, reduceAFloat64Max_a_); \
+    pLsrPrimV128DataSubF64(reduceAFloat64Max_dest1_, 1, reduceAFloat64Max_a_); \
+    (dest) = max(max(reduceAFloat64Max_dest0_, reduceAFloat64Max_dest1_), (init)); \
+  } while (0)
+
+#define reduceAFloat64Min(dest, init, a) \
+  do { \
+    __m128d reduceAFloat64Min_a_ = a; \
+    double reduceAFloat64Min_dest0_; \
+    double reduceAFloat64Min_dest1_; \
+    pLsrPrimV128DataSubF64(reduceAFloat64Min_dest0_, 0, reduceAFloat64Min_a_); \
+    pLsrPrimV128DataSubF64(reduceAFloat64Min_dest1_, 1, reduceAFloat64Min_a_); \
+    (dest) = min(min(reduceAFloat64Min_dest0_, reduceAFloat64Min_dest1_), (init)); \
+  } while (0)
+
+#endif
+
+
+#ifdef PLSR_VECTOR_USE_ASM
+
+#define pLsrPrimV128Float32Const(dest, c0, c1, c2, c3) vector128Float32Const(dest, c0, c1, c2, c3)
+#define pLsrPrimV128Float32Lift(dest, a) vector128Float32Lift(dest, a)
+#define pLsrPrimV128Float32Load(dest, arr, off, idx) pLsrPrimV128Float32LoadHelp (dest, arr, off, idx)
+#define pLsrPrimV128Float32Store(arr, off, idx, v) pLsrPrimV128Float32StoreHelp (arr, off, idx, v)
+
+#define pLsrPrimV128Float32LoadVector(dest, arr, off, idx) pLsrPrimV128Float32LoadVectorHelp (dest, arr, off, idx)
+
+/* Suspicious register constraint XXX -leaf */
+#define vector128Float32Lift(dest, a) \
+  do { \
+    float32 vector128Float32Lift_a_ = a; \
+    __m128 vector128Float32Lift_dest_; \
+    __asm__ ("movss %1, %0;\n\t" \
+             "shufps $0x0, %0, %0;" \
+             : "=x"(vector128Float32Lift_dest_) \
+             : "xm"(vector128Float32Lift_a_)); \
+    (dest) = vector128Float32Lift_dest_; \
+  } while (0)
+
+#define vector128Float32Const(dest, c0, c1, c2, c3) \
+  do { \
+    __m128 vector128Float32Const_dest_; \
+    float vector128Float32Const_c[4]; \
+    vector128Float32Const_c[0] = c0; \
+    vector128Float32Const_c[1] = c1; \
+    vector128Float32Const_c[2] = c2; \
+    vector128Float32Const_c[3] = c3; \
+    __asm__("movups (%1), %0;" \
+            : 
"=x"(vector128Float32Const_dest_) \ + : "r"(vector128Float32Const_c)); \ + (dest) = vector128Float32Const_dest_; \ + } while(0); + +#define pLsrPrimV128Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr; \ + float32 pLsrPrimV128Float32LoadVectorHelp_marr[4]; \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 0, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[0] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 1, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[1] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 2, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[2] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128DataSubB32(pLsrPrimV128Float32LoadVectorHelp_ptr, 3, arr); \ + pLsrPrimV128Float32LoadVectorHelp_marr[3] = \ + *((float32*) (((char*)pLsrPrimV128Float32LoadVectorHelp_ptr + off) + idx)); \ + pLsrPrimV128Float32Load(dest, &pLsrPrimV128Float32LoadVectorHelp_marr, 0, 0); \ + } while (0) + + +#define pLsrPrimV128Float32LoadHelp(dest, arr, off, idx) \ + do { \ + __m128* pLsrPrimV128Float32LoadHelp_marr = (__m128*)((float*)((char*)arr+off) + idx); \ + __m128 pLsrPrimV128Float32LoadHelp_dest_; \ + __asm__("movups (%1), %0;" \ + : "=x"(pLsrPrimV128Float32LoadHelp_dest_) \ + : "r"(pLsrPrimV128Float32LoadHelp_marr)); \ + (dest) = pLsrPrimV128Float32LoadHelp_dest_; \ + } while (0) + +#define pLsrPrimV128Float32StoreHelp(arr, off, idx, v) \ + do { \ + float* pLsrPrimV128Float32StoreHelp_marr = ((float*)((char*)arr+off) + idx); \ + __asm__("movups %1, (%0);" \ + : \ + : "r"(pLsrPrimV128Float32StoreHelp_marr), "x"(v)); \ + } while (0) + +#else + +#define pLsrPrimV128Float32Const(dest, c0, c1, c2, c3) ((dest) = _mm_set_ps (c3, c2, c1, c0)) +#define pLsrPrimV128Float32Lift(dest, a) ((dest) = _mm_set1_ps(a)) +#define pLsrPrimV128Float32Load(dest, arr, off, idx) pLsrPrimV128Float32LoadHelp (dest, arr, off, idx) +#define pLsrPrimV128Float32LoadVector(dest, arr, off, idx) \ + pLsrPrimV128Float32LoadVectorHelp (dest, arr, off, idx) +#define pLsrPrimV128Float32Store(arr, off, idx, v) pLsrPrimV128Float32StoreHelp (arr, off, idx, v) + +#define pLsrPrimV128Float32LoadHelp(dest, arr, off, idx) \ + do { \ + float32* pLsrPrimV128Float32LoadHelp_marr = (float32*)((char*)arr+off)+idx; \ + (dest) = _mm_loadu_ps(pLsrPrimV128Float32LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV128Float32StoreHelp(arr, off, idx, v) \ + do { \ + float32* pLsrPrimV128Float32StoreHelp_marr = (float32*)((char*)arr+off) + idx; \ + _mm_storeu_ps(pLsrPrimV128Float32StoreHelp_marr, v); \ + } while (0) + +#define pLsrPrimV128Float32LoadVectorHelp(dest, arr, off, idx) \ + do { \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr0; \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr1; \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr2; \ + float32* pLsrPrimV128Float32LoadVectorHelp_ptr3; \ + \ + pLsrPrimV128Float32LoadVectorHelp_ptr0 = (float*)_mm_extract_epi32(arr, 0); \ + pLsrPrimV128Float32LoadVectorHelp_ptr1 = (float*)_mm_extract_epi32(arr, 1); \ + pLsrPrimV128Float32LoadVectorHelp_ptr2 = (float*)_mm_extract_epi32(arr, 2); \ + pLsrPrimV128Float32LoadVectorHelp_ptr3 = (float*)_mm_extract_epi32(arr, 3); \ + (dest) = \ + _mm_set_ps(*(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr0 + off) + idx), \ + 
*(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr1 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr2 + off) + idx), \ + *(float32*)(((char*)pLsrPrimV128Float32LoadVectorHelp_ptr3 + off) + idx)); \ + } while (0) + +#define pLsrPrimV128Float32LoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128Float32LoadVectorStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128Float32StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#endif + +// ====== +// Double +// ====== + +#ifdef PLSR_VECTOR_USE_ASM + + + +#else + +#define pLsrPrimV128PointwiseFloat64Plus(dest, a, b) ((dest) = _mm_add_pd(a, b)) +#define pLsrPrimV128PointwiseFloat64Minus(dest, a, b) ((dest) = _mm_sub_pd(a, b)) +#define pLsrPrimV128PointwiseFloat64Times(dest, a, b) ((dest) = _mm_mul_pd(a, b)) +#define pLsrPrimV128PointwiseFloat64Divide(dest, a, b) ((dest) = _mm_div_pd(a, b)) + +#define pLsrPrimV128Float64Const(dest, c0, c1) ((dest) = _mm_set_pd(c1, c0)) +#define pLsrPrimV128Float64Lift(dest, a) ((dest) = _mm_set1_pd(a)) +#define pLsrPrimV128Float64Load(dest, arr, off, idx) pLsrPrimV128Float64LoadHelp (dest, arr, off, idx) +#define pLsrPrimV128Float64Store(arr, off, idx, v) pLsrPrimV128Float64StoreHelp (arr, off, idx, v) + +#define pLsrPrimV128Float64LoadHelp(dest, arr, off, idx) \ + do { \ + float64* pLsrPrimV128Float64LoadHelp_marr = (float64*)((char*)arr+off) + idx; \ + (dest) = _mm_loadu_pd(pLsrPrimV128Float64LoadHelp_marr); \ + } while (0) + +#define pLsrPrimV128Float64StoreHelp(arr, off, idx, v) \ + do { \ + float64* pLsrPrimV128Float64StoreHelp_marr = (float64*)((char*)arr+off) + idx; \ + _mm_storeu_pd(pLsrPrimV128Float64StoreHelp_marr, v); \ + } while (0) + +#endif + +#define pLsrPrimV128Float64LoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128Float64StoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + + + +// ================ +// Ref - 128-bit +// ================ + +#ifdef PLSR_VECTOR_USE_ASM + +#define pLsrPrimV128RefLift(dest, a) vector128RefLift(dest, a) +#define pLsrPrimV128RefConst(dest, c0, c1, c2, c3) vector128RefConst(dest, c0, c1, c2, c3) +#define pLsrPrimV128RefLoad(dest, arr, off, idx) vector128RefLoadHelp (dest, arr, off, idx) +#define pLsrPrimV128RefStore(arr, off, idx, v) vector128RefStoreHelp (arr, off, idx, v) + +#define vector128RefLift(dest, a) \ + do { \ + __m128r vector128RefLift_b; \ + __asm__ ("movd %1, %0;\n\t" \ + "pshufd $0x0, %0, %0;\n\t" \ + : "=x"(vector128RefLift_b) \ + : "r"(a)); \ + (dest) = vector128RefLift_b; \ + } while (0) + + // SSE 4.1 + //__m128r dest + // __asm__ ("pinsrd $0x0, %1, %0;\n\t" + // "pinsrd $0x1, %2, %0;\n\t" + // "pinsrd $0x2, %3, %0;\n\t" + // "pinsrd $0x3, %4, %0;\n\t" + // : "=x"(dest) + // : "rm"(c0), "rm"(c1), "rm"(c2), "rm"(c3)); + +#define vector128RefConst(dest, c0, c1, c2, c3) \ + do { \ + int vector128RefConst_c[4]; \ + __m128r vector128RefConst_r; \ + vector128RefConst_c[0] = c0; \ + vector128RefConst_c[1] = c1; \ + vector128RefConst_c[2] = c2; \ + vector128RefConst_c[3] = c3; \ + __asm__("movdqu (%1), %0;" \ + : "=x"(vector128RefConst_r) \ + : "r"(vector128RefConst_c)); \ + dest = vector128RefConst_r; \ + } 
while (0) + +#define vector128RefLoadHelp(dest, arr, off, idx) \ + do { \ + __m128r* vector128RefLoadHelp_marr = (__m128r*)((uint32*)((char*)arr+off) + idx); \ + __m128r vector128RefLoadHelp_r; \ + __asm__("movdqu (%1), %0;\n" : "=x"(vector128RefLoadHelp_r) : "r"(vector128RefLoadHelp_marr)); \ + (dest) = vector128RefLoadHelp_r; \ + } while (0) + +#define vector128RefStoreHelp(arr, off, idx, v) \ + do { \ + __m128r* vector128RefStoreHelp_marr = (__m128r*)((uint32*)((char*)arr+off) + idx); \ + __asm__("movdqu %1, (%0);\n" : : "r"(vector128RefStoreHelp_marr), "x"(v)); \ + } while (0) + +#define pLsrPrimV128RefLoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#define pLsrPrimV128RefStoreStrided(arr, off, idx, stride, v) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#else + +#define pLsrPrimV128RefLoad(dest, arr, off, idx) pLsrPrimV128UInt32Load(dest, arr, off, idx) + +#define pLsrPrimV128RefLoadStrided(dest, arr, off, idx, stride) \ + do { \ + pLsrRuntimeError("Strided operations unimplemented"); \ + } while (0) + +#endif + +#endif /* _PLSR_PRIMS_VECTOR_SSE_H_ */ diff --git a/runtime/include/hrc/plsr-prims-vector.h b/runtime/include/hrc/plsr-prims-vector.h new file mode 100644 index 0000000..e083ccd --- /dev/null +++ b/runtime/include/hrc/plsr-prims-vector.h @@ -0,0 +1,25 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_VECTOR_H_ +#define _PLSR_PRIMS_VECTOR_H_ + +#ifdef P_USE_VI_SSE +#define pLsrViWidth 128 +#include "hrc/plsr-prims-vector-sse.h" + +#elif P_USE_VI_AVX +#define pLsrViWidth 256 +#include "hrc/plsr-prims-vector-sse.h" +#include "hrc/plsr-prims-vector-avx.h" + +#elif P_USE_VI_MIC +#define pLsrViWidth 512 +#include "hrc/plsr-prims-vector-mic.h" + +#else +#define pLsrViWidth 128 + +#endif + +#endif /* _PLSR_PRIMS_VECTOR_H_ */ diff --git a/runtime/include/hrc/plsr-prims.h b/runtime/include/hrc/plsr-prims.h new file mode 100755 index 0000000..3d6c296 --- /dev/null +++ b/runtime/include/hrc/plsr-prims.h @@ -0,0 +1,12 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_PRIMS_H_ +#define _PLSR_PRIMS_H_ + +#include "hrc/plsr-prims-prims.h" +#include "hrc/plsr-prims-runtime.h" +#include "hrc/plsr-prims-vector.h" +#include "hrc/plsr-prims-ghc.h" + +#endif /* _PLSR_PRIMS_H_ */ diff --git a/runtime/include/hrc/plsr-ptk-thunk.h b/runtime/include/hrc/plsr-ptk-thunk.h new file mode 100644 index 0000000..0f1d7c7 --- /dev/null +++ b/runtime/include/hrc/plsr-ptk-thunk.h @@ -0,0 +1,570 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Unified Futures Thunk Implementation */ + +#ifndef _PLSR_THUNK_H_ +#define _PLSR_THUNK_H_ + +#ifdef PLSR_THUNK_INTERCEPT_CUTS + #ifndef P_USE_PILLAR + #error "Cut interception only supported on Pillar" + #endif +#endif + +/*** Types ***/ + +typedef enum { + PlsrThunkIndir = PtkFutureStatusUserDefined, + PlsrThunkCut +} PlsrFutureStatus; + +#ifdef PLSR_THUNK_INTERCEPT_CUTS +#define PlsrTypesMk(name, retType) \ + typedef PlsrFutureStatus PlsrThunkReturnType##name; \ + typedef PlsrThunkReturnType##name (*PlsrThunkCode##name)(PlsrThunkB##name); \ + /* PlsrObjectU must be a prefix of this structure */ \ + typedef struct PlsrThunkS##name { \ + PlsrVTable vtable; \ + PtkFutureBuffer future; \ + PilContinuation0 cutCont; \ + retType result; \ + char fvs[]; \ + } PlsrThunkU##name +#else /* !PLSR_THUNK_INTERCEPT_CUTS */ +#define PlsrTypesMk(name, retType) \ + typedef 
PlsrFutureStatus PlsrThunkReturnType##name; \ + typedef PlsrThunkReturnType##name (*PlsrThunkCode##name)(PlsrThunkB##name); \ + /* PlsrObjectU must be a prefix of this structure */ \ + typedef struct PlsrThunkS##name { \ + PlsrVTable vtable; \ + PtkFutureBuffer future; \ + retType result; \ + char fvs[]; \ + } PlsrThunkU##name +#endif /* !PLSR_THUNK_INTERCEPT_CUTS */ + +/* + * PlsrThunkCodeRef, pLsrThunkSRef, pLsrThunkURef + * PlsrThunkCode32, pLsrThunkS32, pLsrThunkU32 + * PlsrThunkCode64, pLsrThunkS64, pLsrThunkU64 + * PlsrThunkCodeFloat, pLsrThunkSFloat, pLsrThunkUFloat + * PlsrThunkCodeDouble, pLsrThunkSDouble, pLsrThunkUDouble + */ +PlsrTypesMk(Ref, PlsrRef); +PlsrTypesMk(32, uint32); +PlsrTypesMk(64, uint64); +PlsrTypesMk(Float, float32); +PlsrTypesMk(Double, float64); + +#define pLsrThunkRef(t) ((PlsrThunkURef*)(t)) +#define pLsrThunk32(t) ((PlsrThunkU32*)(t)) +#define pLsrThunk64(t) ((PlsrThunkU64*)(t)) +#define pLsrThunkFloat(t) ((PlsrThunkUFloat*)(t)) +#define pLsrThunkDouble(t) ((PlsrThunkUDouble*)(t)) + +#define pLsrThunkResultFieldRef(thnk) (pLsrThunkRef(thnk)->result) +#define pLsrThunkResultField32(thnk) (pLsrThunk32(thnk)->result) +#define pLsrThunkResultField64(thnk) (pLsrThunk64(thnk)->result) +#define pLsrThunkResultFieldFloat(thnk) (pLsrThunkFloat(thnk)->result) +#define pLsrThunkResultFieldDouble(thnk) (pLsrThunkDouble(thnk)->result) + +#ifdef PLSR_THUNK_SUBSUMPTION +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) thunk) +#else /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObjectRef(thunk) ((PlsrRef) 0) +#endif /* !PLSR_THUNK_SUBSUMPTION */ +#define pLsrThunkCastToObject32(thunk) ((uint32) 0) +#define pLsrThunkCastToObject64(thunk) ((uint64) 0) +#define pLsrThunkCastToObjectFloat(thunk) ((float32) 0) +#define pLsrThunkCastToObjectDouble(thunk) ((float64) 0) + +/* Global Thunks */ + + #define pLsrThunkStaticValueMk(name, v, vt, retType, val) \ + static PlsrThunkU##name v = \ + { .vtable = (vt), \ + .future = ptkFutureStatic(PlsrThunkIndir, 0), \ + .result = (retType) (val) } + +/* These are marked ALWAYS_IMMUTABLE, so must be initialized atomically wrt the gc */ +#define pLsrThunkStaticValueRef(v, val) pLsrThunkStaticValueMk(Ref, v, pLsrThunkValVTableRef, PlsrRef, val) +#define pLsrThunkStaticValue32(v, val) pLsrThunkStaticValueMk(32, v, pLsrThunkValVTable32, uint32, val) +#define pLsrThunkStaticValue64(v, val) pLsrThunkStaticValueMk(64, v, pLsrThunkValVTable64, uint64, val) +#define pLsrThunkStaticValueFloat(v, val) pLsrThunkStaticValueMk(Float, v, pLsrThunkValVTableFloat, float32, val) +#define pLsrThunkStaticValueDouble(v, val) pLsrThunkStaticValueMk(Double, v, pLsrThunkValVTableDouble, float64, val) + +/* Creation */ + +/* This cannot engender a yield */ +#define pLsrThunkNewMk(name, dv, vt, sz, algn) \ + do { \ + pLsrAllocAligned(PlsrThunkB##name, (dv), (vt), (sz), (algn)); \ + assert((vt)->tag==VThunkTag); \ + } while (0) +#ifdef PLSR_ZERO_REFS +#define pLsrThunkNewRef(dv, vt, sz, algn) \ + do { \ + pLsrThunkNewMk(Ref, dv, vt, sz, algn); \ + pLsrThunkResultFieldRef(dv) = NULL; \ + } while (0) +#else +#define pLsrThunkNewRef(dv, vt, sz, algn) pLsrThunkNewMk(Ref, dv, vt, sz, algn) +#endif +#define pLsrThunkNew32(dv, vt, sz, algn) pLsrThunkNewMk(32, dv, vt, sz, algn) +#define pLsrThunkNew64(dv, vt, sz, algn) pLsrThunkNewMk(64, dv, vt, sz, algn) +#define pLsrThunkNewFloat(dv, vt, sz, algn) pLsrThunkNewMk(Float, dv, vt, sz, algn) +#define pLsrThunkNewDouble(dv, vt, sz, algn) pLsrThunkNewMk(Double, dv, vt, sz, algn) + +/* Initialisation */ + +#define 
pLsrThunkSetInitMk(name, thnk, code) \ + (ptkFutureInit((PtkRef)(thnk), \ + (unsigned)&pLsrThunk##name(0)->future, \ + (PtkFutureCodePointer)code)) + +#define pLsrThunkSetInitRef(thnk, code) pLsrThunkSetInitMk(Ref, thnk, code) +#define pLsrThunkSetInit32(thnk, code) pLsrThunkSetInitMk(32, thnk, code) +#define pLsrThunkSetInit64(thnk, code) pLsrThunkSetInitMk(64, thnk, code) +#define pLsrThunkSetInitFloat(thnk, code) pLsrThunkSetInitMk(Float, thnk, code) +#define pLsrThunkSetInitDouble(thnk, code) pLsrThunkSetInitMk(Double, thnk, code) + +#define pLsrThunkSetValueRef(thnk, v) \ + (pLsrWriteBarrierRefBase((PlsrThunkBRef)(thnk), pLsrThunkRef(thnk)->result, (v)), \ + ptkFutureSetStatus((PtkRef)(thnk), \ + (unsigned)&pLsrThunkRef(0)->future, \ + (PtkFutureStatus)PlsrThunkIndir)) +/* This should not engender a yield */ +#define pLsrThunkSetValueNonWbMk(name, thnk, v) \ + (pLsrThunk##name(thnk)->result = (v), \ + ptkFutureSetStatus((PlsrThunkB##name)(thnk), \ + (unsigned)&pLsrThunk##name(0)->future, \ + (PtkFutureStatus)PlsrThunkIndir)) + +#define pLsrThunkSetValue32(thnk, v) pLsrThunkSetValueNonWbMk(32, thnk, v) +#define pLsrThunkSetValue64(thnk, v) pLsrThunkSetValueNonWbMk(64, thnk, v) +#define pLsrThunkSetValueFloat(thnk, v) pLsrThunkSetValueNonWbMk(Float, thnk, v) +#define pLsrThunkSetValueDouble(thnk, v) pLsrThunkSetValueNonWbMk(Double, thnk, v) + +#define pLsrThunkValueInitRef(thnk, v) \ + do { \ + pLsrThunkSetValueRef(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableRef); \ + } while (0) + +#define pLsrThunkValueInit32(thnk, v) \ + do { \ + pLsrThunkSetValue32(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTable32); \ + } while (0) + +#define pLsrThunkValueInit64(thnk, v) \ + do { \ + pLsrThunkSetValue64(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTable64); \ + } while (0) + +#define pLsrThunkValueInitFloat(thnk, v) \ + do { \ + pLsrThunkSetValueFloat(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableFloat); \ + } while (0) + +#define pLsrThunkValueInitDouble(thnk, v) \ + do { \ + pLsrThunkSetValueDouble(thnk, v); \ + pLsrObjectChangeVTableMandatory(thnk, pLsrThunkValVTableDouble); \ + } while (0) + +#define pLsrThunkNewValueMk(name, dv, vt, sz, algn, v) \ + do { \ + noyield { \ + pLsrThunkNew##name(dv, vt, sz, algn); \ + pLsrThunkSetValueNonWbMk(name, dv, v); \ + } \ + } while(0) + +#define pLsrThunkNewValueRef(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Ref, dv, vt, sz, algn, ((PlsrRef) v)) +#define pLsrThunkNewValue32(dv, vt, sz, algn, v) pLsrThunkNewValueMk(32, dv, vt, sz, algn, v) +#define pLsrThunkNewValue64(dv, vt, sz, algn, v) pLsrThunkNewValueMk(64, dv, vt, sz, algn, v) +#define pLsrThunkNewValueFloat(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Float, dv, vt, sz, algn, v) +#define pLsrThunkNewValueDouble(dv, vt, sz, algn, v) pLsrThunkNewValueMk(Double, dv, vt, sz, algn, v) + +static PlsrThunkBRef pLsrThunkNewValRef(PlsrRef v) +{ + noyield { + PlsrThunkBRef res; + pLsrAllocAligned(PlsrThunkBRef, res, pLsrThunkValVTableRef, sizeof(PlsrThunkURef), sizeof(PlsrRef)); + pLsrThunkSetValueRef(res, v); + return res; + } +} + +/* Projection */ +#define pLsrThunkIsUnboxedRef(thnk) (0) +#define pLsrThunkIsUnboxed32(thnk) (0) +#define pLsrThunkIsUnboxed64(thnk) (0) +#define pLsrThunkIsUnboxedFloat(thnk) (0) +#define pLsrThunkIsUnboxedDouble(thnk) (0) + +#define pLsrThunkStatusMk(name, thnk) \ + (ptkFutureGetStatus((PlsrThunkB##name)(thnk), (unsigned)&pLsrThunk##name(0)->future)) +#define pLsrThunkStatusRef(thnk) 
pLsrThunkStatusMk(Ref, thnk) +#define pLsrThunkStatus32(thnk) pLsrThunkStatusMk(32, thnk) +#define pLsrThunkStatus64(thnk) pLsrThunkStatusMk(64, thnk) +#define pLsrThunkStatusFloat(thnk) pLsrThunkStatusMk(Float, thnk) +#define pLsrThunkStatusDouble(thnk) pLsrThunkStatusMk(Double, thnk) + +#define pLsrThunkIsEvaledRef(thnk) (pLsrThunkStatusRef(thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaled32(thnk) (pLsrThunkStatus32 (thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaled64(thnk) (pLsrThunkStatus64 (thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaledFloat(thnk) (pLsrThunkStatusFloat (thnk) == PlsrThunkIndir) +#define pLsrThunkIsEvaledDouble(thnk) (pLsrThunkStatusDouble (thnk) == PlsrThunkIndir) + +#define pLsrThunkGetValRef(thnk) pLsrThunkResultFieldRef(thnk) +#define pLsrThunkGetVal32(thnk) pLsrThunkResultField32(thnk) +#define pLsrThunkGetVal64(thnk) pLsrThunkResultField64(thnk) +#define pLsrThunkGetValFloat(thnk) pLsrThunkResultFieldFloat(thnk) +#define pLsrThunkGetValDouble(thnk) pLsrThunkResultFieldDouble(thnk) + + +/* Evaluation */ + +static PlsrRef pLsrThunkEvalStatusErrorRef(PlsrFutureStatus status) +{ + char msg[80]; + sprintf(msg, "Bad thunk status in Eval: %d", status); + pLsrRuntimeError(msg); + return 0; +} + +static PlsrRef pLsrThunkEvalSlowRef(PlsrThunkBRef thunk) +{ + for(;;) { + switch (pLsrThunkStatusRef(thunk)) { + case PtkFutureStatusUninit: + pLsrRuntimeError("Uninitialised thunk in Eval"); + return 0; +#ifdef P_USE_PARALLEL_FUTURES + case PtkFutureStatusSpawning: + case PtkFutureStatusSpawned: +#endif /* P_USE_PARALLEL_FUTURES */ + case PtkFutureStatusInit: + case PtkFutureStatusStarted: + ptkFutureWait((PtkRef)thunk, + (unsigned)&pLsrThunkRef(0)->future, + 1); +#ifdef PLSR_THUNK_INTERCEPT_CUTS + assert(pLsrThunkStatusRef(thunk) == PlsrThunkIndir || + pLsrThunkStatusRef(thunk) == PlsrThunkCut); +#else /* !PLSR_THUNK_INTERCEPT_CUTS */ + assert(pLsrThunkStatusRef(thunk) == PlsrThunkIndir); + return pLsrThunkGetValRef(thunk); +#endif /* !PLSR_THUNK_INTERCEPT_CUTS */ + break; + case PlsrThunkIndir: + return pLsrThunkGetValRef(thunk); +#ifdef PLSR_THUNK_INTERCEPT_CUTS + case PlsrThunkCut: + pilCutTo0(pLsrThunkRef(thunk)->cutCont); + return 0; +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ + default: + return pLsrThunkEvalStatusErrorRef(pLsrThunkStatusRef(thunk)); + } + } +} + + +#ifdef P_USE_PARALLEL_FUTURES +#ifdef PLSR_THUNK_INTERCEPT_CUTS +#define pLsrThunkEvalSlowNonRefMk(retType, name) \ + \ + static retType pLsrThunkEvalStatusError##name(PlsrFutureStatus status) \ + { \ + char msg[80]; \ + sprintf(msg, "Bad thunk status in Eval: %d", status); \ + pLsrRuntimeError(msg); \ + return 0; \ + } \ + \ + static retType pLsrThunkEvalSlow##name(PlsrThunkB##name thunk) \ + { \ + for(;;) { \ + switch (pLsrThunkStatus##name(thunk)) { \ + case PtkFutureStatusUninit: \ + pLsrRuntimeError("Uninitialised thunk in Eval"); \ + return 0; \ + case PtkFutureStatusSpawning: \ + case PtkFutureStatusSpawned: \ + case PtkFutureStatusInit: \ + case PtkFutureStatusStarted: \ + ptkFutureWait((PtkRef)thunk, \ + (unsigned)&pLsrThunk##name(0)->future, \ + 1); \ + assert(pLsrThunkStatus##name(thunk) == PlsrThunkIndir || \ + pLsrThunkStatus##name(thunk) == PlsrThunkCut); \ + break; \ + case PlsrThunkIndir: \ + return pLsrThunkGetVal##name(thunk); \ + case PlsrThunkCut: \ + pilCutTo0(pLsrThunk##name(thunk)->cutCont); \ + return 0; \ + default: \ + return pLsrThunkEvalStatusError##name(pLsrThunkStatus##name(thunk)); \ + } \ + } \ + } +#else /* !PLSR_THUNK_INTERCEPT_CUTS */ +#define 
pLsrThunkEvalSlowNonRefMk(retType, name) \ + \ + static retType pLsrThunkEvalStatusError##name(PlsrFutureStatus status) \ + { \ + char msg[80]; \ + sprintf(msg, "Bad thunk status in Eval: %d", status); \ + pLsrRuntimeError(msg); \ + return 0; \ + } \ + \ + static retType pLsrThunkEvalSlow##name(PlsrThunkB##name thunk) \ + { \ + for(;;) { \ + switch (pLsrThunkStatus##name(thunk)) { \ + case PtkFutureStatusUninit: \ + pLsrRuntimeError("Uninitialised thunk in Eval"); \ + return 0; \ + case PtkFutureStatusSpawning: \ + case PtkFutureStatusSpawned: \ + case PtkFutureStatusInit: \ + case PtkFutureStatusStarted: \ + ptkFutureWait((PtkRef)thunk, \ + (unsigned)&pLsrThunk##name(0)->future, \ + 1); \ + assert(pLsrThunkStatus##name(thunk) == PlsrThunkIndir); \ + break; \ + case PlsrThunkIndir: \ + return pLsrThunkGetVal##name(thunk); \ + default: \ + return pLsrThunkEvalStatusError##name(pLsrThunkStatus##name(thunk)); \ + } \ + } \ + } +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ +#else /* !P_USE_PARALLEL_FUTURES */ +#define pLsrThunkEvalSlowNonRefMk(retType, name) \ + \ + static retType pLsrThunkEvalStatusError##name(PlsrFutureStatus status) \ + { \ + char msg[80]; \ + sprintf(msg, "Bad thunk status in Eval: %d", status); \ + pLsrRuntimeError(msg); \ + return 0; \ + } \ + \ + static retType pLsrThunkEvalSlow##name(PlsrThunkB##name thunk) \ + { \ + for(;;) { \ + switch (pLsrThunkStatus##name(thunk)) { \ + case PtkFutureStatusUninit: \ + pLsrRuntimeError("Uninitialised thunk in Eval"); \ + return 0; \ + case PtkFutureStatusInit: \ + case PtkFutureStatusStarted: \ + ptkFutureWait((PtkRef)thunk, \ + (unsigned)&pLsrThunk##name(0)->future, \ + 1); \ + assert(pLsrThunkStatus##name(thunk) == PlsrThunkIndir); \ + return pLsrThunkGetVal##name(thunk); \ + case PlsrThunkIndir: \ + return pLsrThunkGetVal##name(thunk); \ + default: \ + return pLsrThunkEvalStatusError##name(pLsrThunkStatus##name(thunk)); \ + } \ + } \ + } + +#endif /* P_USE_PARALLEL_FUTURES */ + +/* + * pLsrThunkEval32 + * pLsrThunkEval64 + * pLsrThunkEvalFloat + * pLsrThunkEvalDouble + */ +pLsrThunkEvalSlowNonRefMk(uint32, 32); +pLsrThunkEvalSlowNonRefMk(uint64, 64); +pLsrThunkEvalSlowNonRefMk(float32, Float); +pLsrThunkEvalSlowNonRefMk(float64, Double); + +#define pLsrThunkEvalRef(thunk) pLsrThunkEvalSlowRef (thunk) +#define pLsrThunkEval32(thunk) pLsrThunkEvalSlow32 (thunk) +#define pLsrThunkEval64(thunk) pLsrThunkEvalSlow64 (thunk) +#define pLsrThunkEvalFloat(thunk) pLsrThunkEvalSlowFloat (thunk) +#define pLsrThunkEvalDouble(thunk) pLsrThunkEvalSlowDouble (thunk) + +#define pLsrThunkCallRef(thunk) pLsrThunkEvalSlowRef (thunk) +#define pLsrThunkCall32(thunk) pLsrThunkEvalSlow32 (thunk) +#define pLsrThunkCall64(thunk) pLsrThunkEvalSlow64 (thunk) +#define pLsrThunkCallFloat(thunk) pLsrThunkEvalSlowFloat (thunk) +#define pLsrThunkCallDouble(thunk) pLsrThunkEvalSlowDouble (thunk) + +#define pLsrThunkEvalDirectRef(code, thunk) pLsrThunkEvalSlowRef (thunk) +#define pLsrThunkEvalDirect32(code, thunk) pLsrThunkEvalSlow32 (thunk) +#define pLsrThunkEvalDirect64(code, thunk) pLsrThunkEvalSlow64 (thunk) +#define pLsrThunkEvalDirectFloat(code, thunk) pLsrThunkEvalSlowFloat (thunk) +#define pLsrThunkEvalDirectDouble(code, thunk) pLsrThunkEvalSlowDouble (thunk) + +#define pLsrThunkCallDirectRef(code, thunk) pLsrThunkEvalDirectRef(code, thunk) +#define pLsrThunkCallDirect32(code, thunk) pLsrThunkEvalDirect32 (code, thunk) +#define pLsrThunkCallDirect64(code, thunk) pLsrThunkEvalDirect64 (code, thunk) +#define pLsrThunkCallDirectFloat(code, thunk) 
pLsrThunkEvalDirectFloat (code, thunk) +#define pLsrThunkCallDirectDouble(code, thunk) pLsrThunkEvalDirectDouble (code, thunk) + +#define pLsrThunkTailEvalRef(code, thunk) TAILCALL(pLsrThunkEvalSlowRef (thunk)) +#define pLsrThunkTailEval32(code, thunk) TAILCALL(pLsrThunkEvalSlow32 (thunk)) +#define pLsrThunkTailEval64(code, thunk) TAILCALL(pLsrThunkEvalSlow64 (thunk)) +#define pLsrThunkTailEvalFloat(code, thunk) TAILCALL(pLsrThunkEvalSlowFloat (thunk)) +#define pLsrThunkTailEvalDouble(code, thunk) TAILCALL(pLsrThunkEvalSlowDouble (thunk)) + +#define pLsrThunkTailCallRef(thunk) TAILCALL(pLsrThunkEvalSlowRef (thunk)) +#define pLsrThunkTailCall32(thunk) TAILCALL(pLsrThunkEvalSlow32 (thunk)) +#define pLsrThunkTailCall64(thunk) TAILCALL(pLsrThunkEvalSlow64 (thunk)) +#define pLsrThunkTailCallFloat(thunk) TAILCALL(pLsrThunkEvalSlowFloat (thunk)) +#define pLsrThunkTailCallDouble(thunk) TAILCALL(pLsrThunkEvalSlowDouble (thunk)) + +#define pLsrThunkTailEvalDirectRef(code, thunk) TAILCALL(pLsrThunkEvalSlowRef (thunk)) +#define pLsrThunkTailEvalDirect32(code, thunk) TAILCALL(pLsrThunkEvalSlow32 (thunk)) +#define pLsrThunkTailEvalDirect64(code, thunk) TAILCALL(pLsrThunkEvalSlow64 (thunk)) +#define pLsrThunkTailEvalDirectFloat(code, thunk) TAILCALL(pLsrThunkEvalSlowFloat (thunk)) +#define pLsrThunkTailEvalDirectDouble(code, thunk) TAILCALL(pLsrThunkEvalSlowDouble (thunk)) + +#define pLsrThunkTailCallDirectRef(code, thunk) TAILCALL(pLsrThunkEvalDirectRef(code, thunk)) +#define pLsrThunkTailCallDirect32(code, thunk) TAILCALL(pLsrThunkEvalDirect32 (code, thunk)) +#define pLsrThunkTailCallDirect64(code, thunk) TAILCALL(pLsrThunkEvalDirect64 (code, thunk)) +#define pLsrThunkTailCallDirectFloat(code, thunk) TAILCALL(pLsrThunkEvalDirectFloat (code, thunk)) +#define pLsrThunkTailCallDirectDouble(code, thunk) TAILCALL(pLsrThunkEvalDirectDouble (code, thunk)) + +#define pLsrThunkReturnRef(thnk, val) \ + do { \ + pLsrWriteBarrierRefBase((thnk), pLsrThunkResultFieldRef(thnk), (val)); \ + return PlsrThunkIndir; \ + } while (0) + +#define pLsrThunkReturnNonRefMk(name, thnk, val) \ + do { \ + pLsrThunkResultField##name(thnk) = (val); \ + return PlsrThunkIndir; \ + } while (0) + +#define pLsrThunkReturn32(thnk, val) pLsrThunkReturnNonRefMk(32, thnk, val) +#define pLsrThunkReturn64(thnk, val) pLsrThunkReturnNonRefMk(64, thnk, val) +#define pLsrThunkReturnFloat(thnk, val) pLsrThunkReturnNonRefMk(Float, thnk, val) +#define pLsrThunkReturnDouble(thnk, val) pLsrThunkReturnNonRefMk(Double, thnk, val) + +#ifdef PLSR_THUNK_INTERCEPT_CUTS + +#define pLsrThunkCutMk(name, thnk, cont) \ + do { \ + pLsrThunk##name(thnk)->cutCont = (cont); \ + return PlsrThunkCut; \ + cut to cont; \ + } while (0) +#define pLsrThunkCutRef(thnk, cont) pLsrThunkCutMk(Ref, thnk, cont) +#define pLsrThunkCut32(thnk, cont) pLsrThunkCutMk(32, thnk, cont) +#define pLsrThunkCut64(thnk, cont) pLsrThunkCutMk(64, thnk, cont) +#define pLsrThunkCutFloat(thnk, cont) pLsrThunkCutMk(Float, thnk, cont) +#define pLsrThunkCutDouble(thnk, cont) pLsrThunkCutMk(Double, thnk, cont) + +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ + +/* Printing */ + +static void pLsrThunkPrintRef(PlsrThunkBRef t) +{ + pLsrValuePrint((PlsrObjectB)pLsrThunkEvalRef(t)); +} + +/* Spawning */ + +#ifdef P_USE_PARALLEL_FUTURES + +#define pLsrThunkSpawnMk(name) \ + static void pLsrThunkSpawn##name(PlsrThunkB##name thunk) \ + { \ + ptkFutureSpawn((PtkRef)thunk, (unsigned)&pLsrThunk##name(0)->future); \ + } + +/* + * pLsrThunkSpawnRef + * pLsrThunkSpawn32 + * pLsrThunkSpawn64 + * pLsrThunkSpawnFloat + * 
pLsrThunkSpawnDouble + */ +pLsrThunkSpawnMk(Ref); +pLsrThunkSpawnMk(32); +pLsrThunkSpawnMk(64); +pLsrThunkSpawnMk(Float); +pLsrThunkSpawnMk(Double); + +#endif /* P_USE_PARALLEL_FUTURES */ + +#define pLsrThunkBlackHoleRef(thunk) +#define pLsrThunkBlackHole32(thunk) +#define pLsrThunkBlackHole64(thunk) +#define pLsrThunkBlackHoleFloat(thunk) +#define pLsrThunkBlackHoleDouble(thunk) + +#define pLsrThunkClaimRef(thunk) +#define pLsrThunkClaim32(thunk) +#define pLsrThunkClaim64(thunk) +#define pLsrThunkClaimFloat(thunk) +#define pLsrThunkClaimDouble(thunk) + + +#define pLsrThunkZeroFV(zero) zero + +/*** Check Object Model ***/ + +/* Generated code defines: + * pLsrThunkFixedSizeRef + * pLsrThunkFixedSize64 + * pLsrThunkFixedSize32 + * pLsrThunkFixedSizeFloat + * pLsrThunkFixedSizeDouble + * pLsrThunkResultOffsetRef + * pLsrThunkResultOffset64 + * pLsrThunkResultOffset32 + * pLsrThunkResultOffsetFloat + * pLsrThunkResultOffsetDouble + */ + +static void pLsrThunkCheck() +{ + /*printf("Thunk check: %d/%d, %d/%d, %d/%d\n", pLsrThunkFixedSizeRef, + sizeof(PlsrThunkURef), pLsrThunkFixedSize32, sizeof(PlsrThunkU32), + pLsrThunkFixedSize64, sizeof(PlsrThunkU64));*/ + if (pLsrThunkFixedSizeRef != sizeof(PlsrThunkURef)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize32 != sizeof(PlsrThunkU32)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSize64 != sizeof(PlsrThunkU64)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeFloat != sizeof(PlsrThunkUFloat)) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkFixedSizeDouble != sizeof(PlsrThunkUDouble)) + pLsrRuntimeError("Bad thunk object model!\n"); + + if (pLsrThunkResultOffsetRef != ((unsigned)(&pLsrThunkResultFieldRef(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffset32 != ((unsigned)(&pLsrThunkResultField32(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffset64 != ((unsigned)(&pLsrThunkResultField64(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetFloat != ((unsigned)(&pLsrThunkResultFieldFloat(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); + if (pLsrThunkResultOffsetDouble != ((unsigned)(&pLsrThunkResultFieldDouble(0)))) + pLsrRuntimeError("Bad thunk object model!\n"); +} + +#endif /* !_PLSR_THUNK_H_ */ diff --git a/runtime/include/hrc/plsr-rational.h b/runtime/include/hrc/plsr-rational.h new file mode 100755 index 0000000..d2f4f86 --- /dev/null +++ b/runtime/include/hrc/plsr-rational.h @@ -0,0 +1,311 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Arbitrary precision numbers */ + +#ifndef _PLSR_RATIONAL_H_ +#define _PLSR_RATIONAL_H_ + +/********************************************************************** + * This file implements the Rational type in terms of + * an arbitrary precision rational package APRat, with associated + * operations. 
+ */ + +#include "hrc/plsr-ap-rational.h" + +static void pLsrRationalRegisterVTables() +{ + pLsrAPRatRegisterVTables(); +} + +static void pLsrRationalRegisterGlobals() { + pLsrAPRatRegisterGlobals(); +} + + +#ifdef P_USE_TAGGED_RATIONALS + +#include "hrc/plsr-tagged-int32.h" + +/* Basic constructors */ + +typedef PlsrAPRat PlsrRational; +typedef PlsrAPRatS PlsrRationalU; + +#define pLsrRationalIsTagged(a) (pLsrTaggedInt32TaggedIntIsTagged(a)) + +#define pLsrRationalUnTag(a) (pLsrTaggedInt32TaggedIntUnTag(a)) + +/* Basic queries and destructors */ + +#define pLsrRationalIsIntegral(a) \ + (pLsrRationalIsTagged(a) || pLsrAPRatIsIntegral((PlsrAPRat) a)) + +#define pLsrRationalStaticUnboxedDef pLsrAPRatStaticUnboxedDef +#define pLsrRationalStaticInit pLsrAPRatStaticInit + +#define pLsrSmallRationalFromSInt32(a) (pLsrTaggedInt32TaggedIntFromSmallInt32(PlsrRational, a)) + +#define pLsrSmallRationalMax pLsrTaggedInt32Max +#define pLsrSmallRationalMin pLsrTaggedInt32Min + + +/* Conversions */ +/* Helpers */ +static void pLsrIntegersFromRational(PlsrInteger* numO, PlsrInteger* denO, PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + /* This conversion could be optimized */ + pLsrIntegerFromSInt32(*numO, pLsrRationalUnTag(r)); + *denO = pLsrIntegerOne; + } else { + pLsrIntegersFromAPRat(numO, denO, r); + } +} + +static PlsrInteger pLsrRatNumeratorSlow(PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + PlsrInteger i; + /* This conversion could be optimized */ + pLsrIntegerFromSInt32(i, pLsrRationalUnTag(r)); + return i; + } else { + return pLsrAPRatNumerator(r); + } +} + +#define pLsrRatNumerator pLsrRatNumeratorSlow + +static PlsrInteger pLsrRatDenominatorSlow(PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + return pLsrIntegerOne; + } else { + return pLsrAPRatDenominator(r); + } +} + +#define pLsrRatDenominator pLsrRatDenominatorSlow + +/************* Runtime/Compiler API ***************/ + + +#define pLsrRationalFromSInt8 pLsrRationalFromSInt32 +#define pLsrSInt8FromRational pLsrSInt32FromRational + +#define pLsrRationalFromSInt16 pLsrRationalFromSInt32 +#define pLsrSInt16FromRational pLsrSInt32FromRational + +#define pLsrRationalFromSInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrRational, sint32, pLsrAPRatFromSInt32, dest, a) +#define pLsrSInt32FromRational(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sint32, PlsrRational, PlsrAPRat, pLsrSInt32FromAPRat, dest, a) + +#define pLsrRationalFromSIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(PlsrRational, sintp, pLsrAPRatFromSIntp, dest, a) +#define pLsrSIntpFromRational(dest, a) \ + pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(sintp, PlsrRational, PlsrAPRat, pLsrSIntpFromAPRat, dest, a) + +#define pLsrRationalFromSInt64(dest, a) \ + pLsrTaggedInt64NumConvFastTaggedIntFromSIntX(PlsrRational, sint64, pLsrAPRatFromSInt64, dest, a) +#define pLsrSInt64FromRational(dest, a) \ + pLsrTaggedInt64NumConvFastSIntXFromTaggedInt(sint64, PlsrRational, PlsrAPRat, pLsrSInt64FromAPRat, dest, a) + +#define pLsrRationalFromUInt8 pLsrRationalFromUInt32 +#define pLsrUInt8FromRational pLsrUInt32FromRational + +#define pLsrRationalFromUInt16 pLsrRationalFromUInt32 +#define pLsrUInt16FromRational pLsrUInt32FromRational + +#define pLsrRationalFromUInt32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrRational, uint32, pLsrAPRatFromUInt32, dest, a) +#define pLsrUInt32FromRational(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uint32, PlsrRational, PlsrAPRat, pLsrUInt32FromAPRat, dest, a) + 
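+/* Illustrative sketch only (the helper name and the PLSR_TAGGED_INT32_TEST
+ * guard are ours, not part of the runtime API): the conversion macros above
+ * are statement macros that assign into their first argument, so a round
+ * trip through the rational type looks like the function below.  Values in
+ * the tagged range stay on the inline fast path; anything larger is boxed
+ * through the APRat slow path. */
+#ifdef PLSR_TAGGED_INT32_TEST
+static sint32 pLsrRationalExampleRoundTripSInt32(sint32 n)
+{
+    PlsrRational r;
+    sint32 back;
+    pLsrRationalFromSInt32(r, n);    /* tag n if it fits, else allocate an APRat */
+    pLsrSInt32FromRational(back, r); /* untag, or convert the APRat back down */
+    return back;
+}
+#endif /* PLSR_TAGGED_INT32_TEST */
+ 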
+#define pLsrRationalFromUIntp(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(PlsrRational, uintp, pLsrAPRatFromUIntp, dest, a) +#define pLsrUIntpFromRational(dest, a) \ + pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(uintp, PlsrRational, PlsrAPRat, pLsrUIntpFromAPRat, dest, a) + +#define pLsrRationalFromUInt64(dest, a) \ + pLsrTaggedInt64NumConvFastTaggedIntFromUIntX(PlsrRational, uint64, pLsrAPRatFromUInt64, dest, a) +#define pLsrUInt64FromRational(dest, a) \ + pLsrTaggedInt64NumConvFastUIntXFromTaggedInt(uint64, PlsrRational, PlsrAPRat, pLsrUInt64FromAPRat, dest, a) + +#define pLsrRationalFromFloat32(dest, a) \ + pLsrTaggedInt32NumConvFastTaggedIntFromFloat32(PlsrRational, pLsrAPRatFromFloat32, dest, a) +#define pLsrFloat32FromRational(dest, a) \ + pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(float32, PlsrRational, PlsrAPRat, pLsrFloat32FromAPRat, dest, a) + +#define pLsrRationalFromFloat64(dest, a) \ + pLsrTaggedInt64NumConvFastTaggedIntFromFloat64(PlsrRational, pLsrAPRatFromFloat64, dest, a) +#define pLsrFloat64FromRational(dest, a) \ + pLsrTaggedInt64NumConvFastFloatXFromTaggedInt(float64, PlsrRational, PlsrAPRat, pLsrFloat64FromAPRat, dest, a) + +static PlsrRational pLsrRationalFromIntegerSlow(PlsrInteger i) +{ + if (pLsrIntegerFitsInSInt32(i)) { + sint32 si; + PlsrRational r; + pLsrSInt32FromInteger(si, i); + pLsrRationalFromSInt32(r, si); + return r; + } else { + return (PlsrRational) pLsrAPRatFromInteger(i); + } +} + +#define pLsrRationalFromInteger(dest, i) ((dest) = (pLsrRationalFromIntegerSlow(i))) + +static PlsrInteger pLsrIntegerFromRationalSlow(PlsrRational r) { + if (pLsrRationalIsTagged(r)) { + PlsrInteger i; + /* This conversion could be optimized */ + pLsrIntegerFromSInt32(i, pLsrRationalUnTag(r)); + return i; + } else { + return pLsrIntegerFromAPRat(r); + } +} + +#define pLsrIntegerFromRational(dest, r) ((dest) = (pLsrIntegerFromRationalSlow(r))) + +#define pLsrRationalToUInt32Checked(dest, a) \ + do { \ + if (pLsrRationalIsTagged(a)) { \ + sint32 aa = pLsrRationalUnTag(a); \ + if (aa < 0) {(dest) = 0xffffffff;} \ + else {(dest) = ((uint32) aa);} \ + } else { \ + ((dest) = pLsrAPRatToUInt32Checked(a)); \ + } \ + } while(0) + +/* pLsrCStringFromRational */ +pLsrTaggedInt32MkCStringFrom(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrCStringFromAPRat); + +/* pLsrRationalFromCString */ +pLsrTaggedInt32MkFromCString(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, \ + pLsrSInt32FromAPRat, pLsrAPRatLe, pLsrAPRatFromCString); + +/* Arithmetic */ + +#define pLsrRationalNegate(dest, r) \ + pLsrTaggedInt32TaggedIntNeg(PlsrRational, PlsrAPRat, pLsrAPRatNeg, pLsrAPRatFromSInt32, dest, r) + +/* pLsrTaggedInt32RationalAddSlow*/ +pLsrTaggedInt32MkTaggedIntAddSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatPlus); +#define pLsrRationalPlus(dest, a, b) \ + pLsrTaggedInt32TaggedIntAdd(PlsrRational, PlsrAPRat, pLsrTaggedInt32RationalAddSlow, dest, a, b) + +/* pLsrTaggedInt32RationalSubSlow*/ +pLsrTaggedInt32MkTaggedIntSubSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatMinus); +#define pLsrRationalMinus(dest, a, b) \ + pLsrTaggedInt32TaggedIntSub(PlsrRational, PlsrAPRat, pLsrTaggedInt32RationalSubSlow, dest, a, b) + +/* pLsrTaggedInt32RationalMulSlow*/ +pLsrTaggedInt32MkTaggedIntMulSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatFromSInt64, pLsrAPRatMult); +#define pLsrRationalTimes(dest, a, b) \ + pLsrTaggedInt32TaggedIntMul(PlsrRational, PlsrAPRat, pLsrTaggedInt32RationalMulSlow, dest, a, b) 
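+ 
+/* Illustrative sketch only (hypothetical helper, not part of the runtime
+ * API): the arithmetic macros above are likewise statement macros, so a
+ * compound expression such as a*a + b*b is written as a sequence of
+ * assignments.  Each step is expected to take the inline tagged fast path
+ * for small operands and to fall back to the generated *Slow helpers when
+ * an operand is boxed or the result leaves the tagged range. */
+#ifdef PLSR_TAGGED_INT32_TEST
+static PlsrRational pLsrRationalExampleSumOfSquares(PlsrRational a, PlsrRational b)
+{
+    PlsrRational aa, bb, s;
+    pLsrRationalTimes(aa, a, a); /* aa = a * a */
+    pLsrRationalTimes(bb, b, b); /* bb = b * b */
+    pLsrRationalPlus(s, aa, bb); /* s  = aa + bb */
+    return s;
+}
+#endif /* PLSR_TAGGED_INT32_TEST */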
+ +/* Comparisons */ +pLsrTaggedInt32MkTaggedIntEqSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatEq); +pLsrTaggedInt32MkTaggedIntNeSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatNe); +pLsrTaggedInt32MkTaggedIntLtSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatLt); +pLsrTaggedInt32MkTaggedIntGtSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatGt); +pLsrTaggedInt32MkTaggedIntLeSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatLe); +pLsrTaggedInt32MkTaggedIntGeSlow(Rational, PlsrRational, PlsrAPRat, pLsrAPRatFromSInt32, pLsrAPRatGe); + +#define pLsrRationalEQ(dest, a, b) pLsrTaggedInt32TaggedIntEq(pLsrTaggedInt32RationalEqSlow, dest, a, b) +#define pLsrRationalNE(dest, a, b) pLsrTaggedInt32TaggedIntNe(pLsrTaggedInt32RationalNeSlow, dest, a, b) +#define pLsrRationalLT(dest, a, b) pLsrTaggedInt32TaggedIntLt(pLsrTaggedInt32RationalLtSlow, dest, a, b) +#define pLsrRationalGT(dest, a, b) pLsrTaggedInt32TaggedIntGt(pLsrTaggedInt32RationalGtSlow, dest, a, b) +#define pLsrRationalLE(dest, a, b) pLsrTaggedInt32TaggedIntLe(pLsrTaggedInt32RationalLeSlow, dest, a, b) +#define pLsrRationalGE(dest, a, b) pLsrTaggedInt32TaggedIntGe(pLsrTaggedInt32RationalGeSlow, dest, a, b) + +/* Constants */ +#define pLsrRationalOne pLsrTaggedInt32One +#define pLsrRationalZero pLsrTaggedInt32Zero + +#define pLsrRationalCheckTaggedInt32Assertions() pLsrTaggedInt32Check(PlsrRational, PlsrAPRat) + +static void pLsrRationalCheckAssertions() { + assert(pLsrTaggedInt32Max == pLsrSmallRationalMax); + assert(pLsrTaggedInt32Min == pLsrSmallRationalMin); + pLsrRationalCheckTaggedInt32Assertions(); +} + +#else /* P_USE_TAGGED_RATIONALS*/ + +/********************************************************************** + * This file implements the Rational type in terms of + * an arbitrary precision rational package APRat, with associated + * operations. 
+ */ + +/* Basic constructors */ +typedef PlsrAPRat PlsrRational; +typedef PlsrAPRatS PlsrRationalU; + +/* Basic queries and destructors */ + +#define pLsrRationalIsIntegral pLsrAPRatIsIntegral +#define pLsrRationalStaticInit pLsrAPRatStaticInit +#define pLsrRationalStaticUnboxedDef pLsrAPRatStaticUnboxedDef + +/* Conversions */ + +#define pLsrRationalFromIntegers pLsrAPRatFromIntegers +#define pLsrIntegersFromRational pLsrIntegersFromAPRat + +#define pLsrRatNumerator pLsrAPRatNumerator +#define pLsrRatDenominator pLsrAPRatDenominator + +#define pLsrRationalFromInteger(dest, a) ((dest) = pLsrAPRatFromInteger(a)) + + +#define pLsrIntegerFromRational(dest, a) ((dest) = pLsrIntegerFromAPRat(a)) + +#define pLsrRationalFromSIntp(dest, a) ((dest) = pLsrAPRatFromSIntp(a)) +#define pLsrSIntpFromRational(dest, a) ((dest) = pLsrSIntpFromAPRat(a)) + +#define pLsrRationalFromUIntp(dest, a) ((dest) = pLsrAPRatFromUIntp(a)) +#define pLsrUIntpFromRational(dest, a) ((dest) = pLsrUIntpFromAPRat(a)) + +#define pLsrRationalFromSInt32(dest, a) ((dest) = pLsrAPRatFromSInt32(a)) +#define pLsrSInt32FromRational(dest, a) ((dest) = pLsrSInt32FromAPRat(a)) + +#define pLsrSmallRationalFromSInt32 pLsrRationalFromSInt32 + +#define pLsrRationalFromUInt32(dest, a) ((dest) = pLsrAPRatFromUInt32(a)) +#define pLsrUInt32FromRational(dest, a) ((dest) = pLsrUInt32FromAPRat(a)) + +#define pLsrRationalToUInt32Checked(dest, a) ((dest) = pLsrAPRatToUInt32Checked(a)) + +#define pLsrCStringFromRational pLsrCStringFromAPRat +#define pLsrRationalFromCString pLsrAPRatFromCString + +/* Arithmetic */ + +#define pLsrRationalNegate(dest, a) ((dest) = pLsrAPRatNeg(a)) +#define pLsrRationalPlus(dest, a, b) ((dest) = pLsrAPRatPlus(a, b)) +#define pLsrRationalMinus(dest, a, b) ((dest) = pLsrAPRatMinus(a, b)) +#define pLsrRationalTimes(dest, a, b) ((dest) = pLsrAPRatMult(a, b)) + +/* Comparisons */ +#define pLsrRationalEQ(dest, a, b) ((dest) = pLsrAPRatEq(a, b)) +#define pLsrRationalNE(dest, a, b) ((dest) = pLsrAPRatNe(a, b)) +#define pLsrRationalLT(dest, a, b) ((dest) = pLsrAPRatLt(a, b)) +#define pLsrRationalGT(dest, a, b) ((dest) = pLsrAPRatGt(a, b)) +#define pLsrRationalLE(dest, a, b) ((dest) = pLsrAPRatLe(a, b)) +#define pLsrRationalGE(dest, a, b) ((dest) = pLsrAPRatGe(a, b)) + +/* Constants */ +#define pLsrRationalOne pLsrAPRatOne +#define pLsrRationalZero pLsrAPRatZero + +static void pLsrRationalCheckAssertions() { +} + +#endif /* P_USE_TAGGED_RATIONALS*/ + +#endif /* !_PLSR_RATIONAL_H_ */ diff --git a/runtime/include/hrc/plsr-synchronization.h b/runtime/include/hrc/plsr-synchronization.h new file mode 100644 index 0000000..c9359db --- /dev/null +++ b/runtime/include/hrc/plsr-synchronization.h @@ -0,0 +1,244 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Synchronization primitives */ + +#ifndef _PLSR_SYNCHRONIZATION_H_ +#define _PLSR_SYNCHRONIZATION_H_ + +#ifdef P_USE_PILLAR + +#include "pgc/pgc.h" + +#ifdef __pillar2c__ +#define SYNCHCC __pcdecl +#else +#define SYNCHCC __cdecl +#endif + +static PrtBool SYNCHCC iFlcSyncEqual(volatile void *location, void *data) +{ + return (*(void **)location == data) ? PrtTrue : PrtFalse; +} + +static void iFlcSynchWaitNull(ref loc, uintp off) +{ + prtYieldUntilMovable(iFlcSyncEqual, &pLsrObjectField(loc, off, void**), NULL, + PrtInfiniteWaitCycles64, PrtGcTagOffset, (void*)off); +} + +static PrtBool SYNCHCC iFlcSyncNotEqual(volatile void *location, void *data) +{ + return (*(void **)location != data) ? 
PrtTrue : PrtFalse; +} + +static void iFlcSynchWaitNonNull(ref loc, uintp off) +{ + prtYieldUntilMovable(iFlcSyncNotEqual, &pLsrObjectField(loc, off, void**), NULL, + PrtInfiniteWaitCycles64, PrtGcTagOffset, (void*)off); +} + +static void iFlcSynchWaitEqualVoidS(ref loc, uintp off, void *data) +{ + prtYieldUntilMovable(iFlcSyncEqual, &pLsrObjectField(loc, off, void**), data, + PrtInfiniteWaitCycles64, PrtGcTagOffset, (void*)off); +} + +/* Note that this function should not be __pcdecl as it takes and returns refs */ +static inline ref pLsrSynchLockedCmpxchgRef(ref* loc, ref cmpValue, ref newValue) +{ + return (ref)pgc_write_ref_cas(loc,newValue,cmpValue); +} + +/* Taken from mcrt/src/mcrtthreads/mcrtia32.h */ +/* Attempts to atomically set *loc to newValue only if *loc==cmpValue. Returns the value read + * from *loc: cmpValue if successful, otherwise the contents of loc. +*/ +static inline uint32 SYNCHCC pLsrSynchLockedCmpxchgUInt32(volatile uint32 * loc, uint32 cmpValue, uint32 newValue) +{ +#ifdef PLSR_GNU_ASM + uint32 value; + __asm__ __volatile__( + "lock; cmpxchgl %3,(%1)" + : "=a"(value) + : "r"(loc), "a"(cmpValue), "d"(newValue) + : "memory"); + return value; +#else +#ifdef INTEL64 + register uint32 value; + __asm { + mov rcx, loc + mov edx, newValue + mov eax, cmpValue + lock cmpxchg dword ptr[rcx], edx + mov value, eax + } + return value; +#else // !INTEL64 + register uint32 value; + __asm { + mov ecx, loc + mov edx, newValue + mov eax, cmpValue + lock cmpxchg dword ptr[ecx], edx + mov value, eax + } + return value; +#endif // INTEL64 +#endif +} + +static inline uint64 SYNCHCC pLsrSynchLockedCmpxchgUInt64(volatile uint64 * loc, uint64 cmpValue, uint64 newValue) +{ +#ifdef PLSR_GNU_ASM + uint64 result; + __asm__ __volatile__( + "lock; cmpxchg %3,(%1)" + : "=a"(result) + : "r"(loc), "a"(cmpValue), "r"(newValue) + : "memory"); + return result; +#else +#ifdef INTEL64 + uint64 value; + __asm { + mov rcx, loc; + mov rdx, newValue; + mov rax, cmpValue; + lock cmpxchg qword ptr[rcx], rdx; + mov value, rax; + } + return value; +#else // !INTEL64 + uint32 hiCmp = (uint32) (UINT64_C(0xffffffff) & (cmpValue >> 32)); + uint32 lowCmp = (uint32) (UINT64_C(0xffffffff) & cmpValue); + + uint32 hiNew = (uint32) (UINT64_C(0xffffffff) & (newValue >> 32)); + uint32 lowNew = (uint32) (UINT64_C(0xffffffff) & newValue); + + uint32 lowRes; + uint32 hiRes; + + __asm { + mov edx, hiCmp + mov eax, lowCmp + mov ecx, hiNew + mov ebx, lowNew + mov esi, loc + lock cmpxchg8b qword ptr[esi] + mov lowRes, eax + mov hiRes, edx + } + { + uint64 res = (((uint64)hiRes)<<32)|((uint64)lowRes); + return res; + } +#endif // INTEL64 +#endif +} + +#if (P_WORD_SIZE==4) +#define pLsrSynchLockedCmpxchgUIntp pLsrSynchLockedCmpxchgUInt32 +#else +#if (P_WORD_SIZE==8) +#define pLsrSynchLockedCmpxchgUIntp pLsrSynchLockedCmpxchgUInt64 +#else +#error "Unsupported word size" +#endif /* P_WORD_SIZE=64*/ +#endif /* P_WORD_SIZE=32*/ + +static inline int SYNCHCC pLsrSynchCmpAndSetUInt32(volatile uint32 * loc, uint32 cmpValue, uint32 newValue) +{ + return (pLsrSynchLockedCmpxchgUInt32(loc, cmpValue, newValue) == cmpValue); +} + +static inline int SYNCHCC pLsrSynchCmpAndSetUInt64(volatile uint64 * loc, uint64 cmpValue, uint64 newValue) +{ + return (pLsrSynchLockedCmpxchgUInt64(loc, cmpValue, newValue) == cmpValue); +} + +static inline int SYNCHCC pLsrSynchCmpAndSetUIntp(volatile uintp * loc, uintp cmpValue, uintp newValue) +{ + return (pLsrSynchLockedCmpxchgUIntp(loc, cmpValue, newValue) == cmpValue); +} + +/* N.B. 
Timeouts are in absolute cycle time, not deltas from now. + * XXX Change this to PrtInfiniteTimeoutCycles64 when it becomes available + */ +#define pLsrSynchInfiniteTimeoutCycles64 UINT64_C(0xFFFFFFFFFFFFFFFF) + +PrtBool SYNCHCC pLsrSynchYieldPredicateNotEqualUInt32(volatile void *location, void *data) { + return ((*(volatile uint32*)location) != (uint32) data) ? PrtTrue : PrtFalse; +} + +PrtBool SYNCHCC pLsrSynchYieldPredicateNotEqualUIntp(volatile void *location, void *data) { + return ((*(volatile uintp*)location) != (uintp) data) ? PrtTrue : PrtFalse; +} + +PrtBool SYNCHCC pLsrSynchYieldPredicateEqualUInt32(volatile void *location, void *data) { + return ((*(volatile uint32*)location) == (uint32) data) ? PrtTrue : PrtFalse; +} + +PrtBool SYNCHCC pLsrSynchYieldPredicateEqualUIntp(volatile void *location, void *data) { + return ((*(volatile uintp*)location) == (uintp) data) ? PrtTrue : PrtFalse; +} + +void pLsrSynchYieldUntilNotEqualUInt32(volatile uint32*loc, uint32 value) +{ + prtYieldUntil(pLsrSynchYieldPredicateNotEqualUInt32, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +void pLsrSynchYieldUntilNotEqualUIntp(volatile uintp*loc, uintp value) +{ + prtYieldUntil(pLsrSynchYieldPredicateNotEqualUIntp, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +void pLsrSynchYieldUntilEqualUInt32(volatile uint32*loc, uint32 value) +{ + prtYieldUntil(pLsrSynchYieldPredicateEqualUInt32, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +void pLsrSynchYieldUntilEqualUIntp(volatile uintp*loc, uintp value) +{ + prtYieldUntil(pLsrSynchYieldPredicateEqualUIntp, (void*) loc, (void*) value, pLsrSynchInfiniteTimeoutCycles64); +} + +static inline uintp pLsrSynchAtomicTakeUIntp(volatile uintp*loc, uintp emptyValue) { + while (1) { + uintp contents =*loc; + if (contents != emptyValue) { + if (pLsrSynchCmpAndSetUIntp(loc, contents, emptyValue)) { + return contents; + } + } else { + pLsrSynchYieldUntilNotEqualUIntp(loc, emptyValue); + } + } +} + +static inline void SYNCHCC pLsrSynchAtomicPutUIntp(volatile uintp*loc, uintp value){ + *loc = value; +} + +/* This should probably have some spinning and exponential backoff */ +static inline uint32 pLsrSynchAtomicTakeUInt32(volatile uint32*loc, uint32 emptyValue) { + while (1) { + uint32 contents =*loc; + if (contents != emptyValue) { + if (pLsrSynchCmpAndSetUInt32(loc, contents, emptyValue)) { + return contents; + } + } else { + pLsrSynchYieldUntilNotEqualUInt32(loc, emptyValue); + } + } +} + +static inline void SYNCHCC pLsrSynchAtomicPutUInt32(volatile uint32*loc, uint32 value){ + *loc = value; +} + +#endif + +#endif /* !_PLSR_SYNCHRONIZATION_H_ */ diff --git a/runtime/include/hrc/plsr-tagged-int32.h b/runtime/include/hrc/plsr-tagged-int32.h new file mode 100755 index 0000000..c8eeb74 --- /dev/null +++ b/runtime/include/hrc/plsr-tagged-int32.h @@ -0,0 +1,1209 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +#ifndef _PLSR_TAGGED_INT32_ +#define _PLSR_TAGGED_INT32_ + +/* Defining PLSR_TAGGED_INT32_TEST will cause several non-static + * test functions to be defined which are convenient for inspecting + * the code generated by different compilers. + */ + +/* The package operations are defined in terms of three types. + * TaggedInt: the user defined tagged ptr/int type + * APType: the user defined ap (ptr) type + * TaggedInt32: the 32 bit tagged integer type. 
+ * required: sizeof(TaggedInt) == sizeof(APType) >= sizeof(int32) + * + * Implementation specific assumption 1: we assume that for values within the range + * of TaggedInt32, casting to and from TaggedInt preserves value. This is guaranteed if + * TaggedInt is a signed integer type with at least as much precision as TaggedInt32, but + * implementation defined if TaggedInt is a ptr type. + * + * Implementation specific assumption 2: we assume that casting between + * TaggedInt and APType is value preserving. C99 says that pointer + * casts are implementation defined. + * + * Implementation specific assumption 3: we assume (and check) that UINT32_MAX = 2^32-1. + * It's possible that this can be relaxed somewhat, but the proofs would need to be + * checked. + * + * Implementation specific assumption 4: the non-portable version assumes some + * implementation defined behavior, most notably two's complement representation, + * and that >> on signed integers is implemented as arithmetic right shift. + */ + +/* Commentary on integer representations and operations. + * + * Notation. + * + * Implementation defined behavior means that a given implementation + * is free to choose what to do, but must do something predictable and documented. + * + * Undefined behavior means that the meaning of the program is completely unspecified. + * The compiler is free to reject such a program, or to optimize as if the behavior + * cannot happen, or to cause a runtime error. It is *not* required to give a + * predictable/repeatable semantics to the behavior. + * + * Relying on implementation defined behavior could cause portability problems between + * compilers/platforms, but should give predictable results on a given compiler/platform. + * + * Relying on undefined behavior is very dangerous, since a given compiler/platform + * may intentionally or accidentally exploit this in such a way as to make the + * semantics completely unpredictable. For example, signed integer overflow is undefined. + * This implies (for example) that the test (x + 1) < x can be treated as false for + * signed positive integer x, since either x+1 does not overflow (in which case the + * result is false) or else x+1 does overflow (in which case the result is undefined + * and the program can behave arbitrarily). There are anecdotal reports of some + * compilers exploiting properties such as this. + * + * Representation. + * + * The c99 spec leaves signed and unsigned integer representations fairly + * implementation defined. Most notably, it allows for arbitrary padding bits, + * with essentially arbitrary values. For a given precision, it imposes + * minimum ranges of representable numbers, but not maximum ranges. It allows + * for several different encodings of signed integers. In practice, all of + * the platforms we care about will use two's complement arithmetic, with + * (semantically) no padding. So while it might be desirable in general to + * avoid relying on properties of the encoding, the result in most cases + * is well-defined for a given compiler/platform. The stdint.h header + * defines precise two's complement integer types with no padding which + * would be well-suited to this use. However, this is not available in the + * VC99 libraries which we are using. + * + * Note that the unsigned char type representation is completely well-defined. + * + * Operations. + * + * Generally speaking, most operations on unsigned integers are fairly + * completely defined. 
Operations on signed integers however are + * frequently implementation defined or undefined in particular cases. + * + * Some specific relevant cases. + * + * Conversion from signed to unsigned is well-defined. + * Conversion from unsigned to signed is implementation defined when + * the value is not representable in the new type. + * Left shift is well-defined on unsigned. + * Left shift is well-defined on positive signed for which no overflow + * occurs. Otherwise it is UNDEFINED. + * Right shift is well-defined on unsigned. + * Right shift is well-defined on positive signed, and implementation + * defined on negative signed. + * For all arithmetic operations, overflow/underflow on unsigned is + * well-defined. + * For all arithmetic operations, overflow/underflow on signed is + * UNDEFINED. + * Logical (and/or/xor) operations are well-defined on both signed + * and unsigned integers. This is less useful than it seems however, + * since the representations are implementation defined. + * + * Key points: + * UNDEFINED: left shift on signed integers + * UNDEFINED: overflow/underflow on signed integers + */ + +/* There are two implementations provided here, chosen between by the + * P_TAGGED_INT32_PORTABLE flag. All of the differences between the + * implementations are restricted to the pLsrTaggedInt32InternalXXX + * operations. Both implementations avoid relying on undefined + * properties. The portable version also avoids implementation + * defined properties, at some expense in the quality of the + * generated code. + * + * The generated code can be inspected by defining PLSR_TAGGED_INT32_TEST + * and inspecting the assembly code for the test functions defined at the + * bottom. + * + * Implementation 1 (portable). + * + * We represent signed integers x in the range -2^30 <= x < 2^30 as + * 2*x + 1. This allows them to be distinguished from pointers under + * the implementation specific assumption that pointers are always even. + * Tagging and untagging are done via well-defined arithmetic operations. + * + * Implementation 2 (non-portable). + * + * We represent signed integers x in the range -2^30 <= x < 2^30 as + * 2*x & 0x1. This allows them to be distinguished from pointers + * under various implementation specific assumptions about the + * representation of integers (all of which should be true on all + * platforms we care about). + * N.B. Even under implementation specific assumptions, x >> 1 != x/2. + */ + + +/************************************************************************/ +/********** This section defines the core internal operations ***********/ +/************************************************************************/ + +/* We begin by defining the basic internal operations on tagged 32 bit integers. 
*/ +typedef sint32 PlsrTaggedInt32; + +/* Some constants */ +#define pLsrTaggedInt32InternalMax (SINT32_MAX / 2) +#define pLsrTaggedInt32InternalMin (SINT32_MIN / 2) +#define pLsrTaggedInt32InternalTwoToThe31 0x80000000 +#define pLsrTaggedInt32InternalTwoToThe30 0x40000000 + + +#if (defined(P_TAGGED_INT32_ASSUME_SMALL) || defined(P_TAGGED_INT32_ASSERT_SMALL)) + +/************************ Tagging **********************************/ +#define pLsrTaggedInt32InternalTag(a) ((PlsrTaggedInt32) (a)) + +/************************ UnTagging ********************************/ +#define pLsrTaggedInt32InternalUnTag(a) ((PlsrTaggedInt32) a) + +/********************** Tag checking *******************************/ +#define pLsrTaggedInt32InternalIsTagged(a) 1 + +/***************** Two operand tag checking ************************/ +#define pLsrTaggedInt32InternalAreTagged(a, b) 1 + +/********************* Range checking *****************************/ +#ifdef P_TAGGED_INT32_ASSERT_SMALL +#define pLsrTaggedInt32InternalSIntInRange(a) \ + ((((a) <= pLsrTaggedInt32InternalMax) && \ + ((a) >= pLsrTaggedInt32InternalMin)) || \ + (pLsrRuntimeError_("SInt out of range"), 0)) +#define pLsrTaggedInt32InternalUIntInRange(a) \ + (((a) <= pLsrTaggedInt32InternalMax) || \ + (pLsrRuntimeError_("UInt out of range"), 0)) +#else +#define pLsrTaggedInt32InternalSIntInRange(a) 1 +#define pLsrTaggedInt32InternalUIntInRange(a) 1 +#endif /* P_TAGGED_INT32_ASSERT_SMALL */ + +#define pLsrTaggedInt32InternalSInt32InRange pLsrTaggedInt32InternalSIntInRange +#define pLsrTaggedInt32InternalUInt32InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalSInt64InRange pLsrTaggedInt32InternalSIntInRange +#define pLsrTaggedInt32InternalUInt64InRange pLsrTaggedInt32InternalUIntInRange + +/******************** Check assumptions **************************/ +#define pLsrTaggedInt32InternalCheck(APType, TaggedInt) \ + do { \ + assert(sizeof(APType) >= sizeof(PlsrTaggedInt32)); \ + assert(sizeof(TaggedInt) >= (sizeof(APType))); \ + } while (0) + +#else /* !P_TAGGED_INT32_ASSERT_SMALL && !P_TAGGED_INT32_ASSUME_SMALL*/ +#ifdef P_TAGGED_INT32_PORTABLE + + +/************************ Tagging **********************************/ +/* The tagged version of an integer x in the appropriate range is 2*x + 1. + * Note that this is guaranteed not to overflow for 32 bit integers according to the + * c99 spec. + */ +#define pLsrTaggedInt32InternalTag(a) ((((PlsrTaggedInt32) (a)) * 2) + 1) + + +/************************ UnTagging ********************************/ +/* The untagged version of a tagged integer x is (x-1)/2. + */ +#define pLsrTaggedInt32InternalUnTag(a) ((((PlsrTaggedInt32) (a)) -1 )/2) + + +/********************** Tag checking *******************************/ +/* We test if x is tagged by checking (((uint32) x) mod 2): that is, + * x is tagged iff it is odd. Note that conversion to uint32 preserves parity + * so long as UINT32_MAX + 1 is even. The Intel C compiler generates good + * code for this in some contexts but not others. + */ +#define pLsrTaggedInt32InternalIsTagged(a) (((uint32) ((PlsrTaggedInt32) (a))) % 2) + + +/***************** Two operand tag checking ************************/ +/* We test if x and y are both tagged by computing + * (((unsigned char) (((uint32) x) mod 2)) & ((unsigned char) (((uint32) y) mod 2))) + * Using bitwise and avoids a double jump and produces better code. The conversion + * to unsigned char eliminates any padding bits. 
+ */ +#define pLsrTaggedInt32InternalAreTagged(a, b) \ + (pLsrTaggedInt32InternalIsTagged(a)) && \ + (pLsrTaggedInt32InternalIsTagged(b)) + + +/********************* Range checking *****************************/ + +/* We can test any integer for being in range by comparing to the appropriate + * max and min values. However, this does not generate very good code. This is + * fully portable. + */ +#define pLsrTaggedInt32InternalSIntInRange(a) \ + (((a) <= pLsrTaggedInt32InternalMax) && ((a) >= pLsrTaggedInt32InternalMin)) +#define pLsrTaggedInt32InternalUIntInRange(a) ((a) <= pLsrTaggedInt32InternalMax) +#define pLsrTaggedInt32InternalUInt32InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalUInt64InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalSInt32InRange pLsrTaggedInt32InternalSIntInRange +#define pLsrTaggedInt32InternalSInt64InRange pLsrTaggedInt32InternalSIntInRange + +/******************** Check assumptions **************************/ +#define pLsrTaggedInt32InternalCheck(APType, TaggedInt) \ + do { \ + assert(sizeof(APType) >= sizeof(PlsrTaggedInt32)); \ + assert(sizeof(TaggedInt) >= (sizeof(APType))); \ + } while (0) + + +#else /* !P_TAGGED_INT32_PORTABLE && !P_TAGGED_INT32_ASSUME_SMALL && !P_TAGGED_INT32_ASSERT_SMALL */ + + +#define pLsrTaggedInt32InternalSetLowBit(a) ((a) | 0x1) +#define pLsrTaggedInt32InternalGetLowBit(a) ((a) & 0x1) +#define pLsrTaggedInt32InternalASR(a) ((a) >> 1) + + +/************************ Tagging **********************************/ +/* The tagged version of an integer x in the appropriate range is 2*x | 0x1. + * Under reasonable assumptions about representation, this sets the low bit + * only. + */ +#define pLsrTaggedInt32InternalTag(a) \ + (pLsrTaggedInt32InternalSetLowBit(((PlsrTaggedInt32) (a)) * 2)) + + +/************************ UnTagging ********************************/ +/* The untagged version of a tagged integer x is x >> 1. Under reasonable + * assumptions about representation and the implementation of >>, this + * does an arithmetic shift right. + */ +#define pLsrTaggedInt32InternalUnTag(a) \ + (pLsrTaggedInt32InternalASR((PlsrTaggedInt32) (a))) + + +/********************** Tag checking *******************************/ +/* An integer is tagged if the low bit is set. */ +#define pLsrTaggedInt32InternalIsTagged(a) \ + (pLsrTaggedInt32InternalGetLowBit((PlsrTaggedInt32) (a))) + + +/***************** Two operand tag checking ************************/ +/* We test if x and y are both tagged by checking that both low bits are set */ +#define pLsrTaggedInt32InternalAreTagged(a, b) \ + (pLsrTaggedInt32InternalGetLowBit((((uint32) ((PlsrTaggedInt32) (a))) & \ + ((uint32) ((PlsrTaggedInt32) (b)))))) + + +/********************* Range checking *****************************/ + +/* We can test any integer for being in range by comparing to the appropriate + * max and min values. However, this does not generate very good code. This is + * fully portable. 
+ */ +#define pLsrTaggedInt32InternalSIntInRange(a) \ + (((a) <= pLsrTaggedInt32InternalMax) && ((a) >= pLsrTaggedInt32InternalMin)) +#define pLsrTaggedInt32InternalUIntInRange(a) ((a) <= pLsrTaggedInt32InternalMax) + +/* Hard to do better than this for uints */ +#define pLsrTaggedInt32InternalUInt32InRange pLsrTaggedInt32InternalUIntInRange +#define pLsrTaggedInt32InternalUInt64InRange pLsrTaggedInt32InternalUIntInRange + +/* For signed integer types with known precision, we can define + * a test which generates better code as follows, under the implementation + * specific assumptions that 1) UINT32_MAX = 2^32-1, and + * 2) SINT32_MAX = 2^32-1 and 3) SINT32_MAX = -2^32 (actually, any power of + * two). + * + * We test if x is in the representable range (that is -2^30 <= x < 2^30) + * by casting it to uint32, adding 2^30 and testing against 2^31. See the + * proof below for a formal justification of this in terms of the c99 spec, + * but informally what this is doing is as follows: + * The conversion to an unsigned int preserves the bit layout in two's + * complement. For negative numbers, x is representable if the leading + * two bits are 11, and for positive numbers, x is representable if + * the leading two bits are 00. Otherwise, the number is not representable. + * So we wish to tell whether or not the leading bits differ. We do this + * by adding 2^30 to the number. If the leading bits are 11, it overflows + * and the leading bits are 00. If the leading bits are 00, the leading + * bits become 01. In both cases, the number is now less than 2^31. If + * the bits differ, there is no overflow, and the leading bit will be 1. + * Therefore the number will be greater than or equal to 2^31. + * + * This explanation assumes two's complement representation with a particular + * bit order: however the proof below shows that this method is in + * fact fully general under very limited assumptions about the range of the + * respective types. These assumptions could be eliminated if we had + * access to the stdint.h types. + */ +#define pLsrTaggedInt32InternalSInt32InRange(a) \ + ((((uint32) (a)) + pLsrTaggedInt32InternalTwoToThe30) < pLsrTaggedInt32InternalTwoToThe31) +#define pLsrTaggedInt32InternalSInt64InRange(a) \ + ((((uint64) (a)) + pLsrTaggedInt32InternalTwoToThe30) < pLsrTaggedInt32InternalTwoToThe31) + +/* Proof of correctness for range checking. + * + * According the c99 spec, converting from a signed to an unsigned type of the + * same size: + * 1) preserves the value if the value is representable (positive) + * 2) is the result of repeatedly adding one more than the maximum value of + * the new type to the old value + * So, assuming a is a negative sint32, then + * ((uint32) a) = (UINT32_MAX + 1) + a + * For this proof, I assume that (UINT32_MAX + 1) = 2^32. Strictly speaking, + * I'm not sure this is guaranteed - I believe the spec permits additional + * range. In practice, this isn't a concern. I believe this proof could be + * generalized to relax this assumption somewhat. The proof goes through + * fine with any power of two for UINT32_MAX. + * + * For a 32 bit signed int k, -2^31 <= k < 2^31 + * Let S' = ((uint32) k) + 2^30 + * and let S = S' mod 2^32 + * Case 1: k is positive. Then: + * - ((uint32) k) = k, so + * - S = (k + 2^30) mod 2^32 + * = k + 2^30 (since k + 2^30 < 2^32) + * + * Suppose S < 2^31. Then + * S = k + 2^30 < 2^31 + * k + 2^30 < 2^30 + 2^30 + * k < 2^30 + * Suppose S >= 2^31. 
Then + * S = k + 2^30 >= 2^31 + * k + 2^30 >= 2^30 + 2^30 + * k >= 2^30 + * + * Case 2: k is negative (-1 >= k >= -2^31). Then: + * - By the c99 spec (assuming UINT32_MAX = 2^32-1) + * ((uint32) k) = 2^32 + k + * - So, S' = 2^32 + k + 2^30 + * + * Suppose k >= -2^30. Note that + * S = S' mod 2^32 + * = (2^32 + k + 2^30) mod 2^32 + * = (k + 2^30) mod 2^32 + * But note that 0 <= k + 2^30 < 2^30, so + * = (k + 2^30) + * And so + * S < 2^30 + * Finally, by contrapositive: + * S >= 2^30 => k < -2^30 + * So clearly + * S >= 2^31 => k < -2^30 + * + * Suppose k < -2^30. Note that + * S = S' mod 2^32 + * = (2^32 + k + 2^30) mod 2^32 + * But note that + * -2^31 + 2^30 <= k + 2^30 < -2^30 + 2^30, i.e. + * - 2^30 <= k + 2^30 < 0 + * So + * 2^32 - 2^30 <= S' < 2^32 + 0 + * 2^31 + 2^30 <= S' < 2^32 + * So + * S = S' mod 2^32 = S' >= 2^31 + 2^30 + * Finally, by contrapositive, + * S < 2^31 + 2^30 => k >= -2^30 + * So clearly + * S < 2^31 => k >= -2^30 + * + * So we have: + * S < 2^31 => k < 2^30 (k positive) + * S < 2^31 => k >= -2^30 (k negative) + * And + * S >= 2^31 => k >= 2^30 (k positive) + * S >= 2^31 => k < -2^30 (k negative) + * + * So S < 2^31 => -2^30 <= k < 2^30 + * + */ + +/******************** Check assumptions **************************/ +#define pLsrTaggedInt32InternalCheck(APType, TaggedInt) \ + do { \ + assert(sizeof(APType) >= sizeof(PlsrTaggedInt32)); \ + assert(sizeof(TaggedInt) >= (sizeof(APType))); \ + assert(UINT32_MAX == 0xFFFFFFFF); \ + assert(SINT32_MIN == 0x80000000); \ + assert(SINT32_MAX == 0x7FFFFFFF); \ + } while (0) + +#endif /* P_TAGGED_INT32_PORTABLE */ +#endif /* P_TAGGED_INT32_ASSUME_SMALL || P_TAGGED_INT32_ASSERT_SMALL */ + + +/************************************************************************/ +/******************* This is the main implementation ********************/ +/************************************************************************/ + +#define pLsrTaggedInt32Check pLsrTaggedInt32InternalCheck + +/*********************** Conversions ************************************/ + +#define pLsrTaggedInt32Max (pLsrTaggedInt32InternalMax) +#define pLsrTaggedInt32Min (pLsrTaggedInt32InternalMin) +#define pLsrTaggedInt32Zero (pLsrTaggedInt32InternalTag(0)) +#define pLsrTaggedInt32One (pLsrTaggedInt32InternalTag(1)) +#define pLsrTaggedInt32MinusOne (pLsrTaggedInt32InternalTag(-1)) + + +#define pLsrTaggedInt32TaggedIntIsTagged(a) (pLsrTaggedInt32InternalIsTagged(a)) +#define pLsrTaggedInt32TaggedIntUnTag(a) \ + (assert(pLsrTaggedInt32InternalIsTagged(a)), (pLsrTaggedInt32InternalUnTag(a))) +#define pLsrTaggedInt32TaggedIntFromSmallInt32(TaggedInt, a) \ + ((TaggedInt) (pLsrTaggedInt32InternalTag(a))) + +/* The NumConv macros are statement macros. 
*/ + +/* Convert a value of type sintx to the tagged int type */ +#define pLsrTaggedInt32NumConvFastTaggedIntFromSIntX(TaggedInt, SIntX, apFromSIntX, dest, a) \ + do { \ + if (sizeof(SIntX) <= sizeof(sint32)) { \ + if (pLsrTaggedInt32InternalSInt32InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromSIntX(a)));} \ + } else if (sizeof(SIntX) == sizeof(sint64)) { \ + if (pLsrTaggedInt32InternalSInt64InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromSIntX(a)));} \ + } else { \ + if (pLsrTaggedInt32InternalSIntInRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromSIntX(a)));} \ + } \ + } while (0) + +/* Convert a value of the tagged int type to type sintx */ +#define pLsrTaggedInt32NumConvFastSIntXFromTaggedInt(SIntX, TaggedInt, APType, sIntXFromAP, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged(a)) \ + {(dest) = ((SIntX) (pLsrTaggedInt32InternalUnTag(a)));} \ + else \ + {(dest) = ((sIntXFromAP((APType) (a))));} \ + } while (0) + +/* Convert a value of type uintx to the tagged int type */ +#define pLsrTaggedInt32NumConvFastTaggedIntFromUIntX(TaggedInt, UIntX, apFromUIntX, dest, a) \ + do { \ + if (sizeof(UIntX) <= sizeof(uint32)) { \ + if (pLsrTaggedInt32InternalUInt32InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromUIntX(a)));} \ + } else if (sizeof(UIntX) == sizeof(uint64)) { \ + if (pLsrTaggedInt32InternalUInt64InRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromUIntX(a)));} \ + } else { \ + if (pLsrTaggedInt32InternalUIntInRange(a)) \ + {(dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag(a)));} \ + else \ + {(dest) = ((TaggedInt) (apFromUIntX(a)));} \ + } \ + } while (0) + +/* Convert a value of the tagged int type to type uintx */ +#define pLsrTaggedInt32NumConvFastUIntXFromTaggedInt(UIntX, TaggedInt, APType, uIntXFromAP, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged(a)) \ + {(dest) = ((UIntX) (pLsrTaggedInt32InternalUnTag(a)));} \ + else \ + {(dest) = ((uIntXFromAP((APType) (a))));} \ + } while (0) + +/* Convert a value of type floatx to the tagged int type */ +/* This might be a bit of a hack. Find a better way to do this? 
-leaf */ +#define pLsrTaggedInt32NumConvFastTaggedIntFromFloat32(TaggedInt, apFromFloat32, dest, a) \ + do { \ + if ((((float32)(sint32)a) == ((float32) a)) && \ + (pLsrTaggedInt32InternalSInt32InRange((sint32)a))) { \ + (dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag((sint32)a))); \ + } else { \ + (dest) = ((TaggedInt) (apFromFloat32(a))); \ + } \ + } while (0) + +#define pLsrTaggedInt32NumConvFastTaggedIntFromFloat64(TaggedInt, apFromFloat64, dest, a) \ + do { \ + if ((((float64)(sint32)a) == ((float64) a)) && \ + (pLsrTaggedInt32InternalSInt32InRange((sint32)a))) { \ + (dest) = ((TaggedInt) (pLsrTaggedInt32InternalTag((sint32)a))); \ + } else { \ + (dest) = ((TaggedInt) (apFromFloat64(a))); \ + } \ + } while (0) + +/* Convert a value of the tagged int type to type floatX */ +#define pLsrTaggedInt32NumConvFastFloatXFromTaggedInt(FloatX, TaggedInt, APType, floatXFromAP, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged(a)) \ + {(dest) = ((FloatX) (pLsrTaggedInt32InternalUnTag(a)));} \ + else \ + {(dest) = ((floatXFromAP((APType) (a))));} \ + } while (0) + +#define pLsrTaggedInt32MkCStringFrom(TName, TaggedInt, APType, apFromSInt32, cStringFromAP) \ + static char* pLsrCStringFrom##TName (TaggedInt a) { \ + { \ + if(pLsrTaggedInt32InternalIsTagged(a)) { \ + return cStringFromAP(apFromSInt32(pLsrTaggedInt32InternalUnTag(a))); \ + } else { \ + return cStringFromAP((APType) a); \ + } \ + } \ + } \ + +#define pLsrTaggedInt32MkFromCString(TName, TaggedInt, APType, apFromSInt32, sint32FromAP, apLessOrEqual, apFromCString) \ + static TaggedInt pLsr##TName##FromCString (char* s) { \ + { \ + APType ap = apFromCString(s); \ + APType max = apFromSInt32(pLsrTaggedInt32InternalMax); \ + APType min = apFromSInt32(pLsrTaggedInt32InternalMin); \ + if (apLessOrEqual (min, ap) && apLessOrEqual(ap, max)) { \ + return (TaggedInt) pLsrTaggedInt32InternalTag(sint32FromAP(ap)); \ + } else { \ + return (TaggedInt) ap; \ + } \ + } \ + } \ + +/**************************** Bitwise *********************************/ + +/* Can't make a tagged value non-taggable */ +#define pLsrTaggedInt32TaggedIntBNot(TaggedInt, APType, apBNot, apFromSInt32, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged((PlsrTaggedInt32) (a))) { \ + sint32 pLsrTaggedInt32TaggedIntBNot_tmp = \ + ~((sint32) (pLsrTaggedInt32InternalUnTag(a))); \ + (dest) = ((TaggedInt) pLsrTaggedInt32InternalTag(pLsrTaggedInt32TaggedIntBNot_tmp)); \ + } else { \ + (dest) = ((TaggedInt) (apBNot((APType) a))); \ + } \ + } while(0) + +/* Most of the binary bitwise operations can't overflow into the high bits + * and hence can use these versions. 
*/ +#define pLsrTaggedInt32InternalBitwiseSlowPathReturnNO(TaggedInt, APType, apFromSInt32, apBin, a, b) \ + do { \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA; \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = (APType)b; \ + } \ + return (TaggedInt) apBin(pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA, \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB); \ + } while(0) + +#define pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, OpName, TaggedInt, APType, apFromSInt32, apBin, binOp) \ + static TaggedInt pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 tmp = \ + binOp(((sint32) (pLsrTaggedInt32InternalUnTag(a))), \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + return (TaggedInt) pLsrTaggedInt32InternalTag(tmp); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturnNO(TaggedInt, APType, apFromSInt32, apBin, a, b); \ + } \ + } + + +#define pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, binOp, binOpSlow, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 pLsrTaggedInt32MkTaggedIntBitwiseBinNO_tmp = \ + binOp(((sint32) (pLsrTaggedInt32InternalUnTag(a))), \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + (dest) = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32MkTaggedIntBitwiseBinNO_tmp)); \ + } else { \ + (dest) = binOpSlow(a, b); \ + } \ + } while(0) + +/* Currently just use the slow version for things which might overflow into + * the high bits. 
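+ * (The shifts are the case in point: pLsrTaggedInt32TaggedIntBShiftL and
+ * pLsrTaggedInt32TaggedIntBShiftR below always defer to their slow
+ * variants, which convert both operands to the arbitrary-precision type
+ * and do the shift there, rather than attempting a tagged fast path.)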
+ */ + +#define pLsrTaggedInt32InternalBitwiseSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b) \ + do { \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA; \ + APType pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB = (APType)b; \ + } \ + return (TaggedInt) apBin(pLsrTaggedInt32InternalBitwiseSlowPathReturn_aA, \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn_bB); \ + } while(0) + +#define pLsrTaggedInt32MkTaggedIntBitwiseBinSlow(TName, OpName, TaggedInt, APType, apFromSInt32, apBin, binOp) \ + static TaggedInt pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + pLsrTaggedInt32InternalBitwiseSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b); \ + } + +#define pLsrTaggedInt32MkTaggedIntBitwiseBin(TaggedInt, APType, binOp, binOpSlow, dest, a, b) \ + do { \ + (dest) = binOpSlow(a, b); \ + } while(0) + + +#define pLsrTaggedInt32TaggedIntBAndOp(a, b) (a & b) +#define pLsrTaggedInt32MkTaggedIntBAndSlow(TName, TaggedInt, APType, apFromSInt32, apBAnd) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, BAnd, TaggedInt, APType, apFromSInt32, \ + apBAnd, pLsrTaggedInt32TaggedIntBAndOp) +#define pLsrTaggedInt32TaggedIntBAnd(TaggedInt, APType, bAndSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, pLsrTaggedInt32TaggedIntBAndOp, \ + bAndSlow, dest, aa, bb) + +#define pLsrTaggedInt32TaggedIntBOrOp(a, b) (a | b) +#define pLsrTaggedInt32MkTaggedIntBOrSlow(TName, TaggedInt, APType, apFromSInt32, apBOr) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, BOr, TaggedInt, APType, apFromSInt32, apBOr, \ + pLsrTaggedInt32TaggedIntBOrOp) +#define pLsrTaggedInt32TaggedIntBOr(TaggedInt, APType, bOrSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, pLsrTaggedInt32TaggedIntBOrOp, bOrSlow, \ + dest, aa, bb) + +/* Can overflow */ +#define pLsrTaggedInt32TaggedIntBShiftLOp(a, b) (assert(0))) +#define pLsrTaggedInt32MkTaggedIntBShiftLSlow(TName, TaggedInt, APType, apFromSInt32, apBShiftL) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlow(TName, BShiftL, TaggedInt, APType, apFromSInt32, apBShiftL, \ + pLsrTaggedInt32TaggedIntBShiftLOp) +#define pLsrTaggedInt32TaggedIntBShiftL(TaggedInt, APType, bShiftLSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBin(TaggedInt, APType, \ + pLsrTaggedInt32TaggedIntBShiftLOp, bShiftLSlow, dest, aa, bb) + +/* a >> b is only defined for 0 <= b <= 31 */ +#define pLsrTaggedInt32TaggedIntBShiftROp(a, b) ((b >= 0 && b < 32) ? (a >> b) : \ + ((b < 0 && b > -32) ? (a << -b) : \ + ((a < 0) ? 
-1 : 0))) +#define pLsrTaggedInt32MkTaggedIntBShiftRSlow(TName, TaggedInt, APType, apFromSInt32, apBShiftR) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlow(TName, BShiftR, TaggedInt, APType, apFromSInt32, apBShiftR, \ + pLsrTaggedInt32TaggedIntBShiftROp) +#define pLsrTaggedInt32TaggedIntBShiftR(TaggedInt, APType, bShiftRSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBin(TaggedInt, APType, \ + pLsrTaggedInt32TaggedIntBShiftROp, bShiftRSlow, dest, aa, bb) + +#define pLsrTaggedInt32TaggedIntBXorOp(a, b) (a ^ b) +#define pLsrTaggedInt32MkTaggedIntBXorSlow(TName, TaggedInt, APType, apFromSInt32, apBXor) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinSlowNO(TName, BXor, TaggedInt, APType, apFromSInt32, apBXor, \ + pLsrTaggedInt32TaggedIntBXorOp) +#define pLsrTaggedInt32TaggedIntBXor(TaggedInt, APType, bXorSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntBitwiseBinNO(TaggedInt, APType, \ + pLsrTaggedInt32TaggedIntBXorOp, bXorSlow, dest, aa, bb) + +/**************************** Arithmetic *********************************/ + +#define pLsrTaggedInt32TaggedIntNeg(TaggedInt, APType, apNeg, apFromSInt32, dest, a) \ + do { \ + if (pLsrTaggedInt32InternalIsTagged((PlsrTaggedInt32) (a))) { \ + sint32 pLsrTaggedInt32TaggedIntNeg_tmp = \ + -((sint32) (pLsrTaggedInt32InternalUnTag(a))); \ + if ((pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32TaggedIntNeg_tmp))) \ + {(dest) = ((TaggedInt) pLsrTaggedInt32InternalTag(pLsrTaggedInt32TaggedIntNeg_tmp));} \ + else \ + {(dest) = ((TaggedInt) apFromSInt32(pLsrTaggedInt32TaggedIntNeg_tmp));} \ + } else { \ + (dest) = ((TaggedInt) (apNeg((APType) a))); \ + } \ + } while(0) + +#define pLsrTaggedInt32InternalArithSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b) \ + do { \ + APType pLsrTaggedInt32InternalArithSlowPathReturn_aA; \ + APType pLsrTaggedInt32InternalArithSlowPathReturn_bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + pLsrTaggedInt32InternalArithSlowPathReturn_aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + pLsrTaggedInt32InternalArithSlowPathReturn_aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + pLsrTaggedInt32InternalArithSlowPathReturn_bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + pLsrTaggedInt32InternalArithSlowPathReturn_bB = (APType)b; \ + } \ + return (TaggedInt) apBin(pLsrTaggedInt32InternalArithSlowPathReturn_aA, \ + pLsrTaggedInt32InternalArithSlowPathReturn_bB); \ + } while(0) + +/* Addition and subtraction */ +#define pLsrTaggedInt32MkTaggedIntAddSubSlow(TName, OpName, TaggedInt, APType, apFromSInt32, apBin, binOp) \ + static TaggedInt pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 tmp = \ + (((sint32) (pLsrTaggedInt32InternalUnTag(a))) binOp \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + if (pLsrTaggedInt32InternalSInt32InRange(tmp)) { \ + return (TaggedInt) pLsrTaggedInt32InternalTag(tmp); \ + } \ + else { \ + return (TaggedInt) apFromSInt32(tmp); \ + } \ + } else { \ + pLsrTaggedInt32InternalArithSlowPathReturn(TaggedInt, APType, apFromSInt32, apBin, a, b); \ + } \ + } + +#define pLsrTaggedInt32MkTaggedIntAddSub(TaggedInt, APType, binOp, binOpSlow, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 pLsrTaggedInt32MkTaggedIntAddSub_tmp = \ + (((sint32) (pLsrTaggedInt32InternalUnTag(a))) binOp \ + ((sint32) (pLsrTaggedInt32InternalUnTag(b)))); \ + if (pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32MkTaggedIntAddSub_tmp)) { \ + 
(dest) = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32MkTaggedIntAddSub_tmp)); \
+            continue; \
+        } \
+    } \
+    (dest) = binOpSlow(a, b); \
+    } while (0)
+
+#define pLsrTaggedInt32MkTaggedIntAddSlow(TName, TaggedInt, APType, apFromSInt32, apAdd) \
+    pLsrTaggedInt32MkTaggedIntAddSubSlow(TName, Add, TaggedInt, APType, apFromSInt32, apAdd, +)
+#define pLsrTaggedInt32TaggedIntAdd(TaggedInt, APType, addSlow, dest, aa, bb) \
+    pLsrTaggedInt32MkTaggedIntAddSub(TaggedInt, APType, +, addSlow, dest, aa, bb)
+
+#define pLsrTaggedInt32MkTaggedIntSubSlow(TName, TaggedInt, APType, apFromSInt32, apSub) \
+    pLsrTaggedInt32MkTaggedIntAddSubSlow(TName, Sub, TaggedInt, APType, apFromSInt32, apSub, -)
+#define pLsrTaggedInt32TaggedIntSub(TaggedInt, APType, subSlow, dest, aa, bb) \
+    pLsrTaggedInt32MkTaggedIntAddSub(TaggedInt, APType, -, subSlow, dest, aa, bb)
+
+
+/* Multiplication */
+#define pLsrTaggedInt32MkTaggedIntMulSlow(TName, TaggedInt, APType, apFromSInt32, apFromSInt64, apMul) \
+    static TaggedInt pLsrTaggedInt32##TName##MulSlow (TaggedInt a, TaggedInt b) { \
+        if(pLsrTaggedInt32InternalAreTagged(a, b)) { \
+            sint64 tmp = \
+                (((sint64)(sint32) (pLsrTaggedInt32InternalUnTag(a))) * \
+                 ((sint64)(sint32) (pLsrTaggedInt32InternalUnTag(b)))); \
+            if (pLsrTaggedInt32InternalSInt64InRange(tmp)) { \
+                return (TaggedInt) pLsrTaggedInt32InternalTag(tmp); \
+            } else { \
+                return (TaggedInt) apFromSInt64(tmp); \
+            } \
+        } else { \
+            pLsrTaggedInt32InternalArithSlowPathReturn(TaggedInt, APType, apFromSInt32, apMul, a, b); \
+        } \
+    }
+
+/* We use inline assembly here to work around an icl limitation. The compiler
+ * fails to turn add64((int64) a, b) into imul (a, b) when b is a small
+ * constant. Using assembly here probably eliminates some optimization
+ * opportunities.
+ * We could use a more customized asm sequence, but defining this via
+ * a saturating multiply has the advantage of playing relatively nicely with
+ * the various versions of tagging/untagging/range checking. Note that this
+ * is not a "proper" saturating multiply, since a negative result saturates
+ * to SINT32_MAX. We're really just using SINT32_MAX as an error result here.
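+ *
+ * For example (illustrative): with untagged operands a = b = 100000 the
+ * 32-bit product overflows, the overflow flag is set, and the destination
+ * is forced to SINT32_MAX; that value fails the
+ * pLsrTaggedInt32InternalSInt32InRange check in pLsrTaggedInt32TaggedIntMul,
+ * so the operation falls back to mulSlow, which redoes the multiply at 64
+ * bits and boxes the result via apFromSInt64 if it no longer fits the
+ * tagged range.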
+ */ +#ifdef PLSR_GNU_ASM +#ifdef __KNC__ +/* avoid cmovo for KNC */ +#define pLsrTaggedInt32InternalSInt32MulSaturating(D, A, B) \ + __asm__("imul %1, %0;jno 1f;mov %2, %0;1:" \ + : "=r"(D) \ + : "rm"(B), "rm"(SINT32_MAX), "0"(A)); +#else +/* D, A, and B should be variables here */ +/* This produces significantly better code */ +#define pLsrTaggedInt32InternalSInt32MulSaturating(D, A, B) \ + __asm__("imul %1, %0; cmovo %2, %0" \ + : "=r"(D) \ + : "rm"(B), "rm"(SINT32_MAX), "0"(A)); +#endif /* __KNC__ */ +#else /* PLSR_GNU_ASM */ +#ifdef PLSR_MS_ASM +#define pLsrTaggedInt32InternalSInt32MulSaturating(D, A, B) \ + __asm { __asm mov eax, A \ + __asm mov ecx, SINT32_MAX \ + __asm imul eax, B \ + __asm cmovo eax, ecx \ + __asm mov D, eax \ + }; +#else /* PLSR_MS_ASM */ +#error "Inline assembly unsupported" +#endif /* PLSR_MS_ASM */ +#endif /* PLSR_GNU_ASM */ + +#define pLsrTaggedInt32TaggedIntMul(TaggedInt, APType, mulSlow, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 pLsrTaggedInt32TaggedIntMul_a = pLsrTaggedInt32InternalUnTag(a); \ + sint32 pLsrTaggedInt32TaggedIntMul_b = pLsrTaggedInt32InternalUnTag(b); \ + sint32 pLsrTaggedInt32TaggedIntMul_tmp; \ + pLsrTaggedInt32InternalSInt32MulSaturating(pLsrTaggedInt32TaggedIntMul_tmp, \ + pLsrTaggedInt32TaggedIntMul_a, \ + pLsrTaggedInt32TaggedIntMul_b);\ + /* pLsrTaggedInt32TaggedIntMul_tmp = (((sint64) pLsrTaggedInt32TaggedIntMul_a) * */ \ + /* ((sint64) pLsrTaggedInt32TaggedIntMul_b));*/ \ + if (pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32TaggedIntMul_tmp)) { \ + (dest) = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32TaggedIntMul_tmp)); \ + continue; \ + } \ + } \ + (dest) = mulSlow(a, b); \ + } while(0) + +/* Division */ + +typedef enum { + PlsrTaggedInt32InternalDivKindT, + PlsrTaggedInt32InternalDivKindE, + PlsrTaggedInt32InternalDivKindF +} PlsrTaggedInt32InternalDivKind; + +#define pLsrTaggedInt32InternalMkTaggedIntDivModSlow(Kind, kind, TName, TaggedInt, APType, apFromSInt32, apDivMod) \ + static void pLsrTaggedInt32##TName##DivMod##Kind##Slow (TaggedInt* q, TaggedInt* r, TaggedInt a, TaggedInt b) \ + { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 ai = pLsrTaggedInt32InternalUnTag(a); \ + sint32 bi = pLsrTaggedInt32InternalUnTag(b); \ + sint32 qi = ai/bi; \ + sint32 ri = ai%bi; \ + if (kind == PlsrTaggedInt32InternalDivKindF) { \ + if (((ri > 0) - (ri < 0)) == -((bi > 0) - (bi < 0))) { \ + qi--; \ + ri += bi; \ + } \ + } else if (kind == PlsrTaggedInt32InternalDivKindE) { \ + if (ri < 0) { \ + if (bi > 0) { \ + qi--; \ + ri += bi; \ + } else { \ + qi++; \ + ri -= bi; \ + } \ + } \ + } \ + /* |ai % bi| < |bi| */ \ + *r = (TaggedInt) (pLsrTaggedInt32InternalTag(ri)); \ + /* plsrTaggedInt32InternalMin/-1 > plsrTaggedInt32InternalMax */ \ + if (pLsrTaggedInt32InternalSInt32InRange(qi)) { \ + *q = (TaggedInt) (pLsrTaggedInt32InternalTag(qi)); \ + } else { \ + *q = (TaggedInt) (apFromSInt32(qi)); \ + } \ + } else { \ + APType aA; \ + APType bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + aA = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + bB = \ + apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + bB = (APType)b; \ + } \ + apDivMod((APType*) q, (APType*) r, aA, bB); \ + } \ + } + + +#define pLsrTaggedInt32InternalTaggedIntDivMod(kind, TaggedInt, APType, apFromSInt32, divModSlow, q, r, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 
pLsrTaggedInt32InternalTaggedIntDivMod_ai = pLsrTaggedInt32InternalUnTag(a); \ + sint32 pLsrTaggedInt32InternalTaggedIntDivMod_bi = pLsrTaggedInt32InternalUnTag(b); \ + sint32 pLsrTaggedInt32InternalTaggedIntDivMod_qi = \ + pLsrTaggedInt32InternalTaggedIntDivMod_ai / pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + sint32 pLsrTaggedInt32InternalTaggedIntDivMod_ri = \ + pLsrTaggedInt32InternalTaggedIntDivMod_ai % pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + \ + if (kind == PlsrTaggedInt32InternalDivKindF) { \ + \ + if ( \ + ((pLsrTaggedInt32InternalTaggedIntDivMod_ri > 0) - \ + (pLsrTaggedInt32InternalTaggedIntDivMod_ri < 0)) \ + == \ + -((pLsrTaggedInt32InternalTaggedIntDivMod_bi > 0) - \ + (pLsrTaggedInt32InternalTaggedIntDivMod_bi < 0)) \ + ) \ + { \ + pLsrTaggedInt32InternalTaggedIntDivMod_qi--; \ + pLsrTaggedInt32InternalTaggedIntDivMod_ri += pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + } \ + } else if (kind == PlsrTaggedInt32InternalDivKindE) { \ + if (pLsrTaggedInt32InternalTaggedIntDivMod_ri < 0) { \ + if (pLsrTaggedInt32InternalTaggedIntDivMod_bi > 0) \ + { \ + pLsrTaggedInt32InternalTaggedIntDivMod_qi--; \ + pLsrTaggedInt32InternalTaggedIntDivMod_ri += pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + } \ + else { \ + pLsrTaggedInt32InternalTaggedIntDivMod_qi++; \ + pLsrTaggedInt32InternalTaggedIntDivMod_ri -= pLsrTaggedInt32InternalTaggedIntDivMod_bi; \ + } \ + } \ + } \ + \ + /* |ai % bi| < |bi| */ \ + r = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32InternalTaggedIntDivMod_ri)); \ + \ + /* plsrTaggedInt32InternalMin/-1 > plsrTaggedInt32InternalMax */ \ + if (pLsrTaggedInt32InternalSInt32InRange(pLsrTaggedInt32InternalTaggedIntDivMod_qi)) { \ + q = (TaggedInt) (pLsrTaggedInt32InternalTag(pLsrTaggedInt32InternalTaggedIntDivMod_qi)); \ + } else { \ + q = (TaggedInt) (apFromSInt32(pLsrTaggedInt32InternalTaggedIntDivMod_qi)); \ + } \ + \ + } else { \ + TaggedInt PlsrTaggedInt32InternalTaggedIntDivMod_qq = NULL; \ + TaggedInt PlsrTaggedInt32InternalTaggedIntDivMod_rr = NULL; \ + divModSlow(&PlsrTaggedInt32InternalTaggedIntDivMod_qq, &PlsrTaggedInt32InternalTaggedIntDivMod_rr, a, b); \ + q = PlsrTaggedInt32InternalTaggedIntDivMod_qq; \ + r = PlsrTaggedInt32InternalTaggedIntDivMod_rr; \ + } \ + } while(0) + +#define pLsrTaggedInt32InternalTaggedIntDiv(kind, TaggedInt, APType, apFromSInt32, divModSlow, q, a, b) \ + do { \ + TaggedInt pLsrTaggedInt32InternalTaggedIntDiv_r = NULL; \ + pLsrTaggedInt32InternalTaggedIntDivMod(kind, TaggedInt, APType, apFromSInt32, divModSlow, q, \ + pLsrTaggedInt32InternalTaggedIntDiv_r, a, b); \ + } while (0) + +#define pLsrTaggedInt32InternalTaggedIntMod(kind, TaggedInt, APType, apFromSInt32, divModSlow, r, a, b) \ + do { \ + TaggedInt pLsrTaggedInt32InternalTaggedIntMod_q = NULL; \ + pLsrTaggedInt32InternalTaggedIntDivMod(kind, TaggedInt, APType, apFromSInt32, \ + divModSlow, pLsrTaggedInt32InternalTaggedIntMod_q, r, a, b); \ + } while (0) + +#define pLsrTaggedInt32MkTaggedIntDivModTSlow(TName, TaggedInt, APType, apFromSInt32, apDivModT) \ + pLsrTaggedInt32InternalMkTaggedIntDivModSlow(T, PlsrTaggedInt32InternalDivKindT, \ + TName, TaggedInt, APType, apFromSInt32, apDivModT) \ + +#define pLsrTaggedInt32TaggedIntDivModT(TaggedInt, APType, apFromSInt32, divModTSlow, q, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntDivMod(PlsrTaggedInt32InternalDivKindT, \ + TaggedInt, APType, apFromSInt32, divModTSlow, q, r, a, b) \ + +#define pLsrTaggedInt32TaggedIntDivT(TaggedInt, APType, apFromSInt32, divModTSlow, q, a, b) \ + 
pLsrTaggedInt32InternalTaggedIntDiv(PlsrTaggedInt32InternalDivKindT, \ + TaggedInt, APType, apFromSInt32, divModTSlow, q, a, b) \ + +#define pLsrTaggedInt32TaggedIntModT(TaggedInt, APType, apFromSInt32, divModTSlow, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntMod(PlsrTaggedInt32InternalDivKindT, \ + TaggedInt, APType, apFromSInt32, divModTSlow, r, a, b) \ + +#define pLsrTaggedInt32MkTaggedIntDivModESlow(TName, TaggedInt, APType, apFromSInt32, apDivModE) \ + pLsrTaggedInt32InternalMkTaggedIntDivModSlow(E, PlsrTaggedInt32InternalDivKindE, \ + TName, TaggedInt, APType, apFromSInt32, apDivModE) \ + +#define pLsrTaggedInt32TaggedIntDivModE(TaggedInt, APType, apFromSInt32, divModESlow, q, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntDivMod(PlsrTaggedInt32InternalDivKindE, \ + TaggedInt, APType, apFromSInt32, divModESlow, q, r, a, b) \ + +#define pLsrTaggedInt32TaggedIntDivE(TaggedInt, APType, apFromSInt32, divModESlow, q, a, b) \ + pLsrTaggedInt32InternalTaggedIntDiv(PlsrTaggedInt32InternalDivKindE, \ + TaggedInt, APType, apFromSInt32, divModESlow, q, a, b) \ + +#define pLsrTaggedInt32TaggedIntModE(TaggedInt, APType, apFromSInt32, divModESlow, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntMod(PlsrTaggedInt32InternalDivKindE, \ + TaggedInt, APType, apFromSInt32, divModESlow, r, a, b) \ + +#define pLsrTaggedInt32MkTaggedIntDivModFSlow(TName, TaggedInt, APType, apFromSInt32, apDivModF) \ + pLsrTaggedInt32InternalMkTaggedIntDivModSlow(F, PlsrTaggedInt32InternalDivKindF, \ + TName, TaggedInt, APType, apFromSInt32, apDivModF) \ + +#define pLsrTaggedInt32TaggedIntDivModF(TaggedInt, APType, apFromSInt32, divModFSlow, q, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntDivMod(PlsrTaggedInt32InternalDivKindF, \ + TaggedInt, APType, apFromSInt32, divModFSlow, q, r, a, b) \ + +#define pLsrTaggedInt32TaggedIntDivF(TaggedInt, APType, apFromSInt32, divModFSlow, q, a, b) \ + pLsrTaggedInt32InternalTaggedIntDiv(PlsrTaggedInt32InternalDivKindF, \ + TaggedInt, APType, apFromSInt32, divModFSlow, q, a, b) \ + +#define pLsrTaggedInt32TaggedIntModF(TaggedInt, APType, apFromSInt32, divModFSlow, r, a, b) \ + pLsrTaggedInt32InternalTaggedIntMod(PlsrTaggedInt32InternalDivKindF, \ + TaggedInt, APType, apFromSInt32, divModFSlow, r, a, b) + +#define pLsrTaggedInt32MkTaggedIntGcd(TName, TaggedInt, APType, apFromSInt32, apLessOrEqual, sint32FromAP, apGcd) \ + static TaggedInt pLsrTaggedInt32##TName##Gcd(TaggedInt a, TaggedInt b) \ + { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + sint32 ai = pLsrTaggedInt32InternalUnTag(a); \ + sint32 bi = pLsrTaggedInt32InternalUnTag(b); \ + if (ai<0) ai = -ai; \ + if (bi<0) bi = -bi; \ + while (bi != 0) { \ + sint32 r = ai%bi; \ + ai = bi; \ + bi = r; \ + } \ + return (TaggedInt) (pLsrTaggedInt32InternalTag(ai)); \ + } else { \ + APType aA, bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + aA = apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + bB = apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + bB = (APType)b; \ + } \ + APType r = apGcd(aA, bB); \ + APType min = apFromSInt32(pLsrTaggedInt32InternalMin); \ + APType max = apFromSInt32(pLsrTaggedInt32InternalMax); \ + if (apLessOrEqual (min, r) && apLessOrEqual(r, max)) { \ + return (TaggedInt) pLsrTaggedInt32InternalTag(sint32FromAP(r)); \ + } else { \ + return (TaggedInt) r; \ + } \ + } \ + } + +/* Comparisons */ + +#define pLsrTaggedInt32MkTaggedIntCmpSlow(TName, OpName, TaggedInt, APType, apFromSInt32, apCmp, cmpOp) \ + static 
PlsrBoolean pLsrTaggedInt32##TName##OpName##Slow (TaggedInt a, TaggedInt b) { \ + if(pLsrTaggedInt32InternalAreTagged(a, b)) { \ + return (PlsrBoolean) (((PlsrTaggedInt32) a) cmpOp \ + ((PlsrTaggedInt32) b)); \ + } else { \ + APType aA, bB; \ + if (pLsrTaggedInt32InternalIsTagged(a)) { \ + aA = apFromSInt32(pLsrTaggedInt32InternalUnTag(a)); \ + } else { \ + aA = (APType)a; \ + } \ + if (pLsrTaggedInt32InternalIsTagged(b)) { \ + bB = apFromSInt32(pLsrTaggedInt32InternalUnTag(b)); \ + } else { \ + bB = (APType)b; \ + } \ + return apCmp(aA, bB); \ + } \ + } + +#define pLsrTaggedInt32MkTaggedIntCmp(cmpSlow, cmpOp, dest, a, b) \ + do { \ + if(pLsrTaggedInt32InternalAreTagged((a), (b))) { \ + (dest) = ((PlsrBoolean) (((PlsrTaggedInt32) (a)) cmpOp \ + ((PlsrTaggedInt32) (b)))); \ + } else { \ + (dest) = (cmpSlow((a), (b))); \ + } \ + } while(0) + +#define pLsrTaggedInt32MkTaggedIntEqSlow(TName, TaggedInt, APType, apFromSInt32, apEq) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Eq, TaggedInt, APType, apFromSInt32, apEq, ==) +#define pLsrTaggedInt32TaggedIntEq(eqSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(eqSlow, ==, dest, aa, bb) + + +#define pLsrTaggedInt32MkTaggedIntNeSlow(TName, TaggedInt, APType, apFromSInt32, apNe) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Ne, TaggedInt, APType, apFromSInt32, apNe, !=) +#define pLsrTaggedInt32TaggedIntNe(neSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(neSlow, !=, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntLtSlow(TName, TaggedInt, APType, apFromSInt32, apLt) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Lt, TaggedInt, APType, apFromSInt32, apLt, <) +#define pLsrTaggedInt32TaggedIntLt(ltSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(ltSlow, <, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntGtSlow(TName, TaggedInt, APType, apFromSInt32, apGt) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Gt, TaggedInt, APType, apFromSInt32, apGt, >) +#define pLsrTaggedInt32TaggedIntGt(gtSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(gtSlow, >, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntLeSlow(TName, TaggedInt, APType, apFromSInt32, apLe) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Le, TaggedInt, APType, apFromSInt32, apLe, <=) +#define pLsrTaggedInt32TaggedIntLe(leSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(leSlow, <=, dest, aa, bb) + +#define pLsrTaggedInt32MkTaggedIntGeSlow(TName, TaggedInt, APType, apFromSInt32, apGe) \ + pLsrTaggedInt32MkTaggedIntCmpSlow(TName, Ge, TaggedInt, APType, apFromSInt32, apGe, >=) +#define pLsrTaggedInt32TaggedIntGe(geSlow, dest, aa, bb) \ + pLsrTaggedInt32MkTaggedIntCmp(geSlow, >=, dest, aa, bb) + +/************************ Hashing **********************************/ + +#define pLsrTaggedInt32TaggedIntHash(TaggedInt, APType, apHash, a) \ + ((pLsrTaggedInt32InternalIsTagged((PlsrTaggedInt32) (a))) ? 
\ + pLsrSInt32Hash(pLsrTaggedInt32InternalUnTag(a)) : \ + apHash((APType)a)) + +/************************ Testing **********************************/ +#ifdef PLSR_TAGGED_INT32_TEST + +/************************ Tagging **********************************/ +void pLsrTaggedInt32TestTag(sint32 x) { + PlsrTaggedInt32 r = pLsrTaggedInt32InternalTag(x); + printf("Tagged version of %d is %d\n", x, r); +} + +/************************ UnTagging ********************************/ +void pLsrTaggedInt32TestUnTag(PlsrTaggedInt32 x) { + sint32 r = (sint32) pLsrTaggedInt32InternalUnTag(x); + printf("UnTagged version of %d is %d\n", x, r); +} + +/********************** Tag checking *******************************/ +void pLsrTaggedInt32TestIsTagged(PlsrTaggedInt32 x) { + if (pLsrTaggedInt32InternalIsTagged(x)) { + printf("%d is tagged", x); + } else { + printf("%d is not tagged", x); + } +} + +/***************** Two operand tag checking ************************/ +void pLsrTaggedInt32TestAreTagged(PlsrTaggedInt32 x, PlsrTaggedInt32 y) { + if (pLsrTaggedInt32InternalAreTagged(x, y)) { + printf("%d and %d are tagged", x, y); + } else { + printf("%d and/or %d is not tagged", x, y); + } +} + + +/********************* Range checking *****************************/ +void pLsrTaggedInt32TestSIntInRange(sint32 x, sint64 y) { + if (pLsrTaggedInt32InternalSInt32InRange(x)) { + printf("in range"); + } else { + printf("not in range"); + } + if (pLsrTaggedInt32InternalSInt64InRange(y)) { + printf("in range"); + } else { + printf("not in range"); + } +} + +void pLsrTaggedInt32TestUIntInRange(uint32 x, uint64 y) { + if (pLsrTaggedInt32InternalUInt32InRange(x)) { + printf("in range"); + } else { + printf("not in range"); + } + if (pLsrTaggedInt32InternalUInt64InRange(y)) { + printf("in range"); + } else { + printf("not in range"); + } +} +#endif /* PLSR_TAGGED_INT32_TEST */ + +#endif /* _PLSR_TAGGED_INT32_ */ + diff --git a/runtime/include/hrc/plsr-thunk.h b/runtime/include/hrc/plsr-thunk.h new file mode 100644 index 0000000..c1067c7 --- /dev/null +++ b/runtime/include/hrc/plsr-thunk.h @@ -0,0 +1,10 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Unified Futures Thunk Implementation */ + +#ifdef PLSR_LIGHTWEIGHT_THUNKS + #include "hrc/plsr-lightweight-thunk.h" +#else + #include "hrc/plsr-ptk-thunk.h" +#endif diff --git a/runtime/include/hrc/plsr-util.h b/runtime/include/hrc/plsr-util.h new file mode 100644 index 0000000..ad10e3e --- /dev/null +++ b/runtime/include/hrc/plsr-util.h @@ -0,0 +1,182 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Utilities */ + +#ifndef _PLSR_UTIL_H_ +#define _PLSR_UTIL_H_ + +#include +#include +#include "hrc/pil.h" + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#ifdef PLSR_LINUX +#include +#endif + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +#endif + +/* This is the type of Mil booleans. Mil booleans + * are integers in {0, 1}. Use toPlsrBoolean to + * convert C booleans (i.e. zero/non-zero values) + * to P booleans. + */ +typedef uintp PlsrBoolean; +#define toPlsrBoolean(e) ((PlsrBoolean)(!!(e))) + +/* Absolute value. 
According the c99 spec, converting from a signed to an + * unsigned type of the same size: + * 1) preserves the value if the value is representable (positive) + * 2) is the result of repeatedly adding one more than the maximum value of + * the new type to the old value + * So, assuming a is a negative sint32, then + * ((uint32) a) = (UINT32_MAX + 1) + a (using non-modular arithmetic). This is + * guaranteed to be representable, since |SINT32_MIN| < UINT32_MAX + * So using true arithmetic, we have: + * UINT32_MAX - ((uint32) a) + 1 = UINT32_MAX - (UINT32_MAX + 1 + a) + 1 + * = UINT32_MAX - UINT32_MAX -1 -a + 1 + * = -a + * But note that since SINT32_MIN <= a < 0, + * 0 < ((uint32) a) = (UINT32_MAX + 1) + a <= UINT32_MAX must be true, so + * UINT32_MAX - ((uint32) a) cannot underflow, and must be less than + * UINT32_MAX (since ((uint32) a) must be positive. + * Therefore UINT32_MAX - ((uint32) a) + 1 can be computed precisely in + * 32 bit modular arithmetic. + */ + +#define pLsrUInt32FromNegativeSInt32Abs(a) (UINT32_MAX - ((uint32) a) + 1) +#define pLsrUInt64FromNegativeSInt64Abs(a) (UINT64_MAX - ((uint64) a) + 1) +#define pLsrUIntpFromNegativeSIntpAbs(a) (UINTP_MAX - ((uintp) a) + 1) + +static uint32 pLsrAbs32(sint32 a) { + if (a < 0) { + return UINT32_MAX - ((uint32) a) + 1; + } else { + return a; + } +} + +static uint64 pLsrAbs64(sint64 a) { + if (a < 0) { + return UINT64_MAX - ((uint64) a) + 1; + } else { + return a; + } +} + +static uintp pLsrAbsP(sintp a) { + if (a < 0) { + return UINTP_MAX - ((uintp) a) + 1; + } else { + return a; + } +} + +#ifndef PRT_NORETURN +#ifdef __GNUC__ +#define PRT_NORETURN __attribute__((noreturn)) +#else +#define PRT_NORETURN __declspec(noreturn) +#endif +#endif + +static void pLsrExit(int status) +{ +#ifdef P_USE_PARALLEL_FUTURES + ptkFutureSystemShutdown(); +#endif +#ifdef P_USE_AGC + pgc_kill(); +#endif /* P_USE_AGC */ +#ifdef P_USE_PILLAR + prtExit(status); +#else /* !P_USE_AGC */ + exit(status); +#endif /* !P_USE_AGC */ +} + +#define pLsrHalt(status) \ + do { \ + pLsrExit(status); \ + return 0; \ + } while (0) + +#define pLsrHaltV(status) \ + do { \ + pLsrExit(status); \ + return; \ + } while (0) + +static PRT_NORETURN void pLsrHaltNegOne() +{ + fflush(stdout); + assert(0); + pLsrExit(-1); +} + +#define pLsrRuntimeError(msg) \ + (printf("Runtime error: %s at line %d in file %s\n", msg, __LINE__, __FILE__), \ + pLsrHaltNegOne(), \ + 0) + +static PRT_NORETURN void pLsrRuntimeError_(char * msg) { + pLsrRuntimeError(msg); +} + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ + +static PRT_NORETURN void pLsrRuntimeErrorUnmanaged(char * msg) { + printf("Runtime error: %s at line %d in file %s\n", msg, __LINE__, __FILE__), \ + fflush(stdout); + assert(0); + exit(-1); +} + +# undef to +#pragma pillar_managed(on) +#endif + +void pLsrDisableErrorBox(); +uint64 pLsrEventsTimeStamp(); + +#if (defined(PLSR_ENABLE_EVENTS)) && (defined(_WIN32)) + +static uint64 pLsrEventsStartTime; +static FILE* pLsrEventsLogFile; + +void pLsrEventsInit() +{ + pLsrEventsStartTime = pLsrEventsTimeStamp(); + pLsrEventsLogFile = fopen("events.log", "wt"); +} + +#define pLsrEventsTransition(action, state) \ + fprintf(pLsrEventsLogFile, "%d,%p,%s,%s\n", \ + (uint32) (pLsrEventsTimeStamp() - pLsrEventsStartTime), \ + prtGetTaskHandle(), \ + action, state); \ + +void pLsrEventsShutdown() +{ + fclose(pLsrEventsLogFile); +} + +#else /* PLSR_ENABLE_EVENTS */ + +#define pLsrEventsInit() +#define pLsrEventsTransition(action, state) +#define pLsrEventsShutdown() + +#endif 
/* PLSR_ENABLE_EVENTS */ + +#endif /* !_PLSR_UTIL_H_ */ diff --git a/runtime/include/hrc/plsr-value.h b/runtime/include/hrc/plsr-value.h new file mode 100755 index 0000000..f379f4a --- /dev/null +++ b/runtime/include/hrc/plsr-value.h @@ -0,0 +1,941 @@ +/* The Haskell Research Compiler */ +/* COPYRIGHT_NOTICE_1 */ + +/* Various P Values */ + +#ifndef _PLSR_VALUE_H_ +#define _PLSR_VALUE_H_ + + +/********************************************************************** + * Option Sets and Types + */ + +/* Option sets based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPSetVTable_, VSetTag, "*option set*", pLsrPSetPadding); +#define pLsrPSetVTable (&pLsrPSetVTable_) + +/* Generated code defines: + * pLsrPSetSize + * pLsrPSetOffset + * pLsrPTypeSize + */ + +static PlsrPAny pLsrPSetNew(PlsrPAny v) +{ + PlsrPAny s; + noyield { + pLsrAlloc(PlsrPAny, s, pLsrPSetVTable, pLsrPSetSize); + pLsrWriteBarrierRefOptBase + (s, pLsrObjectField(s, pLsrPSetOffset, PlsrPAny*), v); + } + return s; +} + +#define pLsrPSetGet(s) (pLsrObjectField(s, pLsrPSetOffset, PlsrPAny*)) +#define pLsrPSetIsEmpty(s) ((PlsrBoolean)(!pLsrPSetGet(s))) + +/* Types based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPTypeVTable_, VTypeTag, "*type*", pLsrPTypePadding); +#define pLsrPTypeVTable (&pLsrPTypeVTable_) + +/********************************************************************** + * Rationals + */ + +/* Boxed rationals based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. 
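+ * Depending on P_PRAT_IS_SINTP, the boxed payload is either a raw sintp or
+ * a PlsrRational reference; the conversion macros below dispatch on that
+ * choice, and pLsrPRatContainsRef records whether the payload word must be
+ * scanned as a reference.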
+ */ +pLsrVTableStatic(pLsrPRatVTable_, VRatTag, "*rat*", pLsrPRatPadding); +#define pLsrPRatVTable (&pLsrPRatVTable_) + +/* Generated code defines: + * pLsrPRatSize + * pLsrPRatOffset + */ + +#ifdef P_PRAT_IS_SINTP + +#define pLsrPRatContainsRef 0 + +#define pLsrPRatUnimp(s, dest) \ + do { \ + pLsrRuntimeError("PRat unimplemented"); \ + (dest) = 0; \ + } while (0) + +#define pLsrSIntpFromPRat(pLsrSIntpFromPRatDest, pLsrSIntpFromPRatArg1) \ + do { \ + (pLsrSIntpFromPRatDest) = \ + ((sintp)pLsrObjectField(pLsrSIntpFromPRatArg1, pLsrPRatOffset, sintp*)); \ + } while (0) + +#define pLsrPRatFromRational(pLsrPRatFromRationalDest, pLsrPRatFromRationalArg1) \ + pLsrPRatUnimp("pLsrPRatFromRational", pLsrPRatFromRationalDest) + +#define pLsrRationalFromPRat(pLsrRationalFromPRatDest, pLsrRationalFromPRatArg1)\ + do { \ + sintp pLsrRationalFromPRatI; \ + pLsrSIntpFromPRat(pLsrRationalFromPRatI, pLsrRationalFromPRatArg1); \ + pLsrRationalFromSIntp(pLsrRationalFromPRatDest, pLsrRationalFromPRatI); \ + } while (0) + +/* Conversions */ + +#define pLsrPRatFromUInt32(pLsrPRatFromUInt32Dest, pLsrPRatFromUInt32Arg1) \ + do { \ + noyield { \ + pLsrAlloc(PlsrPAny, (pLsrPRatFromUInt32Dest), pLsrPRatVTable, pLsrPRatSize); \ + pLsrObjectField(pLsrPRatFromUInt32Dest, pLsrPRatOffset, uintp*) = ((uintp) pLsrPRatFromUInt32Arg1); \ + } \ + } while (0) + +#define pLsrPRatFromInteger(pLsrPRatFromIntegerDest, pLsrPRatFromIntegerArg1) \ + pLsrPRatUnimp("pLsrPRatFromInteger", pLsrPRatFromIntegerDest) + +#define pLsrUInt32FromPRat(pLsrUInt32FromPRatDest, pLsrUInt32FromPRatArg1) \ + do { \ + sintp pLsrUInt32FromPRatI; \ + pLsrSIntpFromPRat(pLsrUInt32FromPRatI, pLsrUInt32FromPRatArg1); \ + if (pLsrUInt32FromPRatI < 0 || pLsrUInt32FromPRatI > UINT32_MAX) { \ + pLsrRuntimeError_("pLsrUInt32FromPRat: not in range"); \ + } \ + pLsrUInt32FromPRatDest = (uint32) pLsrUInt32FromPRatI; \ + } while (0) + +#define pLsrUIntpFromPRat(pLsrUIntpFromPRatDest, pLsrUIntpFromPRatArg1) \ + pLsrPRatUnimp("pLsrUIntpFromPRat", pLsrUIntpFromPRatDest) + + +#else /* !P_PRAT_IS_SINTP */ + +#define pLsrPRatContainsRef 1 + +#define pLsrPRatFromRational(pLsrPRatFromRationalDest, pLsrPRatFromRationalArg1) \ + do { \ + noyield { \ + pLsrAlloc(PlsrPAny, (pLsrPRatFromRationalDest), pLsrPRatVTable, pLsrPRatSize); \ + pLsrWriteBarrierRefOptBase \ + (pLsrPRatFromRationalDest, \ + pLsrObjectField(pLsrPRatFromRationalDest, pLsrPRatOffset, PlsrRational*), \ + pLsrPRatFromRationalArg1); \ + } \ + } while (0) + +#define pLsrRationalFromPRat(pLsrRationalFromPRatDest, pLsrRationalFromPRatArg1)\ + do { \ + (pLsrRationalFromPRatDest) = \ + ((PlsrRational)pLsrObjectField(pLsrRationalFromPRatArg1, pLsrPRatOffset, PlsrRational*)); \ + } while (0) + +/* Conversions */ + +#define pLsrPRatFromUInt32(pLsrPRatFromUInt32Dest, pLsrPRatFromUInt32Arg1) \ + do { \ + PlsrRational pLsrPRatFromUInt32R; \ + pLsrRationalFromUInt32(pLsrPRatFromUInt32R, pLsrPRatFromUInt32Arg1); \ + pLsrPRatFromRational(pLsrPRatFromUInt32Dest, pLsrPRatFromUInt32R); \ + } while (0) + +#define pLsrPRatFromInteger(pLsrPRatFromIntegerDest, pLsrPRatFromIntegerArg1) \ + do { \ + PlsrRational pLsrPRatFromIntegerR; \ + pLsrRationalFromInteger(pLsrPRatFromIntegerR, pLsrPRatFromIntegerArg1); \ + pLsrPRatFromRational(pLsrPRatFromIntegerDest, pLsrPRatFromIntegerR); \ + } while (0) + +#define pLsrUInt32FromPRat(pLsrUInt32FromPRatDest, pLsrUInt32FromPRatArg1)\ + do { \ + PlsrRational pLsrUInt32FromPRatR; \ + pLsrRationalFromPRat(pLsrUInt32FromPRatR, pLsrUInt32FromPRatArg1); \ + 
pLsrUInt32FromRational(pLsrUInt32FromPRatDest, pLsrUInt32FromPRatR); \ + } while (0) + +#define pLsrUIntpFromPRat(pLsrUIntpFromPRatDest, pLsrUIntpFromPRatArg1) \ + do { \ + PlsrRational pLsrUIntpFromPRatR; \ + pLsrRationalFromPRat(pLsrUIntpFromPRatR, pLsrUIntpFromPRatArg1); \ + pLsrUIntpFromRational(pLsrUIntpFromPRatDest, pLsrUIntpFromPRatR); \ + } while (0) + +#endif /* P_PRAT_IS_SINTP */ +/********************************************************************** + * Names + */ + +pLsrVTableStatic(pLsrPNameVTable_, VNameTag, "*name*", 0); +#define pLsrPNameVTable (&pLsrPNameVTable_) + +/* PlsrObjectU must be a prefix of this structure */ +/* This structure must always be initialised immediately after allocation + * without an intervening GC. If this is not the case, the mutability + * property of the vtable must be changed. + */ +typedef struct { + PlsrVTable vtable; + uintp tag; + uintp hash; + uintp strLen; + char str[]; +} PlsrPNameU; + +#define pLsrPNameStatic(v, n, h, sl, ...) \ + static PlsrPNameU v = { \ + .vtable = pLsrPNameVTable, \ + .tag = (n), \ + .hash = (h), \ + .strLen = (sl), \ + .str = { __VA_ARGS__ } \ + } + +#define pLsrPNameGetTag(n) (((PlsrPNameU*)(n))->tag) +#define pLsrPNameGetHash(n) (((PlsrPNameU*)(n))->hash) +#define pLsrPNameGetStringLen(n) (((PlsrPNameU*)(n))->strLen) +#define pLsrPNameGetString(n) (((PlsrPNameU*)(n))->str) + +/********************************************************************** + * Floating Point + */ + +/* Generated code defines: + * pLsrPFloatOffset + * pLsrPFloatSize + * pLsrPDoubleOffset + * pLsrPDoubleSize + */ + +/* Boxed floats based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPFloatVTable_, VFloatTag, "*float*", pLsrPFloatPadding); +#define pLsrPFloatVTable (&pLsrPFloatVTable_) + +/* Boxed doubles based on this vtable must always be initialised + * immediately after allocation without an intervening GC. If this is + * not the case, the mutability property of the vtable must be + * changed. + */ +pLsrVTableStatic(pLsrPDoubleVTable_, VDoubleTag, "*double*", pLsrPDoublePadding); +#define pLsrPDoubleVTable (&pLsrPDoubleVTable_) + +#define pLsrPFloatGet(v) (pLsrObjectField((v), pLsrPFloatOffset, float*)) +#define pLsrPDoubleGet(v) (pLsrObjectField((v), pLsrPDoubleOffset, double*)) + + +/********************************************************************** + * Indexes + */ + +typedef struct { + PlsrPAny name; + uintp offset; +} PlsrIdxE; + +/* PlsrObjectU must be a prefix of this structure */ +typedef struct { + PlsrVTable vtable; + uintp len; + PlsrIdxE elts[]; +} PlsrIdxU; + +#define pLsrIdxPadding (sizeof(PlsrIdxU) - (sizeof(PlsrVTable) + sizeof(uintp))) +pLsrVTableStatic(pLsrIdxVTable_, VNoneTag, "*index*", pLsrIdxPadding); +#define pLsrIdxVTable (&pLsrIdxVTable_) + +typedef PlsrIdxU* PlsrIdxB; + +#define pLsrIdxEltStatic {0, 0} + +#define pLsrIdxEmpty(v) \ + static PlsrIdxU v = \ + { .vtable = pLsrIdxVTable, .len = 0, .elts = { pLsrIdxEltStatic } } + +#define pLsrIdxStatic(v, dlen, ... 
) \
+    static PlsrIdxU v = \
+    { .vtable = pLsrIdxVTable, .len = (dlen), .elts = { __VA_ARGS__ } }
+
+static PlsrIdxB pLsrIdxNew(uintp len)
+{
+    uintp dlen = 1;
+    uintp i;
+    PlsrIdxB dct;
+    uintp size;
+
+    /* Choose a power of two at least 1.5 times as large as len*/
+    while(2*dlen < 3*len) dlen *= 2;
+    size = sizeof(PlsrIdxU) + dlen*sizeof(PlsrIdxE);
+    pLsrAlloc(PlsrIdxB, dct, pLsrIdxVTable, size);
+    dct->len = dlen;
+    for(i=0;i<dlen;i++) dct->elts[i].name=0;
+    return dct;
+}
+
+static void pLsrIdxSet(PlsrIdxB idx, PlsrPAny n, uintp e)
+{
+    uintp mask = idx->len-1;
+    PlsrIdxE* elts = idx->elts;
+    uintp j;
+    uintp hash;
+    if (!n) pLsrRuntimeError("Name out of range");
+    hash = pLsrPNameGetHash(n);
+    for(j = hash & mask; elts[j].name; j = (j+1) & mask);
+    pLsrWriteBarrierRefBase(idx, elts[j].name, n);
+    elts[j].offset = e;
+}
+
+static uintp pLsrIdxGet(PlsrIdxB idx, PlsrPAny n)
+{
+    uintp mask = idx->len-1;
+    PlsrIdxE* elts = idx->elts;
+    uintp hash = pLsrPNameGetHash(n);
+    uintp j;
+    for(j = hash & mask; elts[j].name != n; j = (j+1) & mask);
+    return elts[j].offset;
+}
+
+/**********************************************************************
+ * Tuples
+ */
+
+/* Generated code defines:
+ * pLsrPArrayOLenOffset
+ * pLsrPArrayOEltOffset
+ * pLsrPArrayOBaseSize
+ * pLsrPArrayILenOffset
+ * pLsrPArrayIEltOffset
+ * pLsrPArrayIIdxOffset
+ * pLsrPArrayIBaseSize
+ */
+
+/* Base tuples */
+
+#define pLsrTupleStatic(v, t, ...) static t v = { __VA_ARGS__ }
+
+#define pLsrTupleStaticV(v, t, vtb, sz, ...) \
+    static t v = { .vtable = vtb, .f_0 = sz, .extras = { __VA_ARGS__ } }
+
+#define pLsrTupleNewFixed(dest, vt, sz, algn) \
+    pLsrAllocAligned(PlsrObjectB, (dest), (vt), (sz), algn)
+#define pLsrTupleNewVariable(dest, vt, fsz, esz, c, algn) \
+    pLsrAllocAligned(PlsrObjectB, (dest), (vt), (fsz)+(esz)*(c), algn)
+
+#define pLsrTupleNewPinnedFixed(dest, vt, sz, algn) \
+    pLsrAllocAlignedPinned(PlsrObjectB, (dest), (vt), (sz), algn)
+#define pLsrTupleNewPinnedVariable(dest, vt, fsz, esz, c, algn) \
+    pLsrAllocAlignedPinned(PlsrObjectB, (dest), (vt), (fsz)+(esz)*(c), algn)
+
+/* Ordinal arrays */
+
+/* This vtable should only be used with arrays of refs
+ */
+pLsrVTableStatic(pLsrPArrayOVTable_, VArrayTag, "*ordinal array*", pLsrPArrayOPadding);
+#define pLsrPArrayOVTable (&pLsrPArrayOVTable_)
+
+/* This function should only be used for arrays of references */
+static PlsrPAny pLsrPArrayONew(uintp c)
+{
+    PlsrPAny res;
+    pLsrTupleNewVariable(res, pLsrPArrayOVTable, pLsrPArrayOBaseSize,
+                         sizeof(PlsrObjectB), c, sizeof(PlsrObjectB));
+    pLsrObjectField(res, pLsrPArrayOLenOffset, uintp*) = c;
+    return res;
+}
+
+#define pLsrPArrayOGetLen(arr) \
+    (pLsrObjectField((arr), pLsrPArrayOLenOffset, uintp*))
+
+/* This macro is for arrays of references */
+#define pLsrPArrayOElt(arr, i) \
+    (pLsrObjectField((arr), pLsrPArrayOEltOffset+sizeof(PlsrObjectB)*(i), \
+                     PlsrObjectB*))
+
+static void pLsrPArrayOSet(PlsrPAny arr, PlsrPAny i, PlsrObjectB e)
+{
+    uintp ui;
+    pLsrUIntpFromPRat(ui, i);
+    pLsrWriteBarrierRefBase(arr, pLsrPArrayOElt(arr, ui), e);
+}
+
+static PlsrObjectB pLsrPArrayOGet(PlsrPAny arr, PlsrPAny i)
+{
+    uintp ui;
+    pLsrUIntpFromPRat(ui, i);
+    return pLsrPArrayOElt(arr, ui);
+}
+
+#define pLsrPArrayOEltEval(dest, a, i) pLsrObjectEval((dest), pLsrPArrayOElt((a), (i)))
+
+/* Indexed arrays */
+
+/* This vtable should not be used with accurate GC, as it does not
+ * determine its GC info unambiguously. However, the runtime uses
+ * it for arrays with all reference elements.
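+ * Lookups into such arrays go through the PlsrIdxU table above: pLsrIdxSet
+ * and pLsrIdxGet use open addressing with linear probing on the name hash,
+ * and pLsrIdxNew sizes the table to a power of two at least 1.5 times the
+ * element count, so a probe always terminates at a free or matching slot.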
+ */ +pLsrVTableStatic(pLsrPArrayIVTable_, VArrayIdxTag, "*indexed array*", pLsrPArrayIPadding); +#define pLsrPArrayIVTable (&pLsrPArrayIVTable_) + +/* This function should only be used for arrays of references */ +static PlsrPAny pLsrPArrayINew(uintp c, PlsrIdxB idx) +{ + PlsrPAny res; + noyield { + pLsrTupleNewVariable(res, + pLsrPArrayIVTable, + pLsrPArrayIBaseSize, + sizeof(PlsrObjectB), + c, + sizeof(PlsrObjectB)); + pLsrObjectField(res, pLsrPArrayILenOffset, uintp*) = c; + pLsrWriteBarrierRefOptBase(res, + pLsrObjectField(res, pLsrPArrayIIdxOffset, + PlsrIdxB*), + idx); + } + return res; +} + +/* This macro is for arrays of references */ +#define pLsrPArrayIElt(arr, i) \ + (pLsrObjectField((arr), pLsrPArrayIEltOffset+sizeof(PlsrObjectB)*(i), \ + PlsrObjectB*)) + +#define pLsrPArrayIGetLen(arr) \ + (pLsrObjectField((arr), pLsrPArrayILenOffset, uintp*)) + +#define pLsrPArrayIGetIdx(arr) \ + (pLsrObjectField((arr), pLsrPArrayIIdxOffset, PlsrIdxB*)) + +static PlsrObjectB pLsrPArrayIGet(PlsrPAny arr, PlsrPAny n) +{ + uintp i = pLsrIdxGet(pLsrPArrayIGetIdx(arr), n); + return pLsrPArrayIElt(arr, i); +} + +/* Utilities */ + +static PlsrBoolean pLsrPArrayHasIdx(PlsrPAny arr) +{ + return pLsrVTableGetTag(pLsrObjectGetVTable(arr)) == VArrayIdxTag; +} + + +/********************************************************************** + * P Functions + */ + +/* When using accurate GC, this vtable should be used only with closures + * with no free variables. + */ +pLsrVTableStatic(pLsrClosureVTable_, VFunctionTag, "*function*", pLsrPFunctionPadding); +#define pLsrClosureVTable (&pLsrClosureVTable_) + +/* Generated constants: + * pLsrPFunctionCodeOffset + * pLsrPFunctionSize (* for no free variables *) + */ + +typedef PlsrObjectB (*PlsrPFunctionCodeRef)(PlsrObjectB); + +#define pLsrPFunctionApplyRef(clos, arg) \ + (pLsrObjectField(clos, pLsrPFunctionCodeOffset, PlsrPFunctionCodeRef*) \ + (arg)) + +/********************************************************************** + * Sums + */ + +/* All this code is for sums over references only */ + +/* Objects created from this vtable must always be initialised immediately + * after allocation + * without an intervening GC. If this is not the case, the mutability + * property of the vtable must be changed. 
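+ * A sum cell pairs a tag (a name) with a value: pLsrPSumNew below fills
+ * both fields inside a noyield block using write barriers, and
+ * pLsrPSumNewT additionally wraps the value in a value thunk first, so the
+ * stored payload is always a thunk reference.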
+ */ +pLsrVTableStatic(pLsrPSumVTable_, VSumTag, "*sum*", pLsrPSumPadding); +#define pLsrPSumVTable (&pLsrPSumVTable_) + +/* Generated constants: + * pLsrPSumTagOffset + * pLsrPSumValOffset + * pLsrPSumSize + */ + +static PlsrPAny pLsrPSumNew(PlsrPAny tag, PlsrObjectB value) +{ + PlsrPAny v; + noyield { + pLsrAlloc(PlsrPAny, v, pLsrPSumVTable, pLsrPSumSize); + pLsrWriteBarrierRefOptBase + (v, pLsrObjectField(v, pLsrPSumTagOffset, PlsrPAny*), tag); + pLsrWriteBarrierRefOptBase + (v, pLsrObjectField(v, pLsrPSumValOffset, PlsrObjectB*), value); + } + return v; +} + +static PlsrPAny pLsrPSumNewT(PlsrPAny tag, PlsrPAny value) +{ + PlsrThunkBRef valthnk = pLsrThunkNewValRef((PlsrRef) value); + return pLsrPSumNew(tag, (PlsrObjectB)valthnk); +} + +#define pLsrPSumGetTag(s) (pLsrObjectField(s, pLsrPSumTagOffset, PlsrPAny*)) +#define pLsrPSumGetVal(s) (pLsrObjectField(s, pLsrPSumValOffset, PlsrObjectB*)) +#define pLsrPSumGetValEval(dest, s) pLsrObjectEval((dest), pLsrPSumGetVal(s)) + + +/********************************************************************** + * Strings + */ + +static PlsrPAny pLsrCoreCharOrd = 0; + +static void pLsrRegisterCoreCharOrd(PlsrPAny n) +{ + assert(!pLsrCoreCharOrd); // Should only be registered once + assert(n); + pLsrCoreCharOrd = n; +} + +static PlsrBoolean pLsrIsNiceChar(int c) +{ + return c>=32 && c<=126; +} + +static PlsrBoolean pLsrValueToChar(char* c, PlsrPAny v) +{ + assert(pLsrCoreCharOrd); + if (pLsrVTableGetTag(pLsrObjectGetVTable(v)) == VSumTag) { + if(pLsrPSumGetTag(v) == pLsrCoreCharOrd) { + PlsrPAny cv; + pLsrPSumGetValEval(cv, v); + if (cv) { + char i; + pLsrUInt32FromPRat(i, cv); + if (pLsrIsNiceChar(i)) { + *c = i; + return 1; + } + } + } + } + return 0; +} + +static char* pLsrValueToCStringMaybe(PlsrPAny v) +{ + if (!pLsrCoreCharOrd || + pLsrVTableGetTag(pLsrObjectGetVTable(v)) != VArrayTag || + pLsrPArrayHasIdx(v) || + pLsrPArrayOGetLen(v) == 0) + return 0; + { + uintp c = pLsrPArrayOGetLen(v); + char* s = (char*)pLsrAllocC((c+1) * sizeof(char)); + uintp i; + for(i=0; i0) printf(", "); + pLsrValuePrint(pLsrPArrayOElt(v, i)); + } + printf("}"); + } + break; + } + case VArrayIdxTag: { + uintp alen = pLsrPArrayIGetLen(v); + /* Don't stack allocate until Pillar compiler alloca bug is fixed. 
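+ * names[] is C-heap scratch that inverts the index table, mapping each
+ * element offset back to its field name so the fields print in element
+ * order; it is released with pLsrFreeC once printing is done.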
*/ + PlsrPAny *names = (PlsrPAny*) pLsrAllocC(alen*sizeof(PlsrPAny)); + { + uintp i; + PlsrIdxB idx = pLsrPArrayIGetIdx(v); + uintp dlen = idx->len; + + for(i=0; i < dlen; i++) { + if (idx->elts[i].name) { + names[idx->elts[i].offset] = idx->elts[i].name; + } + } + } + { + uintp i; + bool first = 1; + + printf("{"); + for(i=0; i < alen; i++) { + if (!first) printf(", "); + else first=0; + pLsrNamePrint(names[i]); + printf(" => "); + pLsrValuePrint(pLsrPArrayIElt(v, i)); + } + } + printf("}"); + pLsrFreeC(names); + break; + } + case VSetTag: + printf("SET{"); + if (!pLsrPSetIsEmpty(v)) + pLsrValuePrint(pLsrPSetGet(v)); + printf("}"); + break; + case VTypeTag: + printf("TypePH"); + break; + case VSumTag: + printf("sum("); + pLsrNamePrint(pLsrPSumGetTag(v)); + printf(") "); + pLsrValuePrint(pLsrPSumGetVal(v)); + break; + case VFunctionTag: + printf("A Closure"); + break; + case VThunkTag: + pLsrThunkPrintRef((PlsrThunkBRef)v); + break; + default: + printf("Something else: possibly missed a case in ValuePrint"); + break; + } +}; + +/********************************************************************** + * Register Runtime VTables + */ + +#define pLsrPNameSize (sizeof(PlsrPNameU)+1) // plus one for zero byte +#define pLsrPNameLenOff ((unsigned)(&((PlsrPNameU*)0)->strLen)) +#define pLsrIdxSize (sizeof(PlsrIdxU)) +#define pLsrIdxLenOff ((unsigned)(&((PlsrIdxU*)0)->len)) +#define pLsrIdxEltSize (sizeof(PlsrIdxE)) +#define pLsrThunkCutSize () + +/* Note: for PgcIsRef arrays which are all zeroes, we rely on + * c99 static initialization semantics to initialize the fields to zero. + * By convention, we write this as {0, } . + */ +static void pLsrValueRegisterVTables() +{ + static PgcIsRef pLsrPRatRefs[pLsrPRatSize/P_WORD_SIZE] = { 0, pLsrPRatContainsRef }; + static PgcIsRef pLsrPNameRefs[pLsrPNameSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPFloatRefs[pLsrPFloatSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPDoubleRefs[pLsrPDoubleSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPSumRefs[] = { 0, 1, 1 }; + static PgcIsRef pLsrPArrayORefs[pLsrPArrayOBaseSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPArrayIRefs[] = { 0, 0, 1 }; + static PgcIsRef pLsrPFunctionRefs[pLsrPFunctionSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrPSetRefs[] = { 0, 1 }; + static PgcIsRef pLsrPTypeRefs[pLsrPTypeSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrIdxRefs[pLsrIdxSize/P_WORD_SIZE] = { 0, }; + static PgcIsRef pLsrIdxWRefs[pLsrIdxSize/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrIdxEltRefs[] = { 1, 0 }; +#ifdef PLSR_LIGHTWEIGHT_THUNKS + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 2); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 1}; + static PgcIsRef pLsrThunkCutRefs[sizeof(PlsrThunkURef)/P_WORD_SIZE] = {0, 1}; +#else /* !PLSR_LIGHTWEIGHT_THUNKS */ +#ifdef PLSR_THUNK_SYNCHRONIZE +#ifdef PLSR_THUNK_INTERCEPT_CUTS + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 6); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 0, 0, 1}; +#else /* PLSR_THUNK_INTERCEPT_CUTS*/ + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 5); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 0, 1}; +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ +#else /* !PLSR_THUNK_SYNCHRONIZE */ +#ifdef PLSR_THUNK_INTERCEPT_CUTS + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 5); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 0, 1}; +#else /* PLSR_THUNK_INTERCEPT_CUTS*/ + assert(sizeof(PlsrThunkURef)/P_WORD_SIZE == 4); + static PgcIsRef pLsrThunkValRefsRef[] = {0, 0, 0, 1}; +#endif /* PLSR_THUNK_INTERCEPT_CUTS */ +#endif /* !PLSR_THUNK_SYNCHRONIZE */ 
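+    /* Each PgcIsRef array above is a word-by-word reference map for the
+     * corresponding thunk layout; the single 1 entry marks the stored
+     * result slot, whose offset shifts with the PLSR_THUNK_SYNCHRONIZE and
+     * PLSR_THUNK_INTERCEPT_CUTS configuration, hence the per-configuration
+     * asserts on sizeof(PlsrThunkURef). */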
+#endif /* !PLSR_LIGHTWEIGHT_THUNKS */ + static PgcIsRef pLsrThunkValRefs32[sizeof(PlsrThunkU32)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkValRefs64[sizeof(PlsrThunkU64)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkValRefsFloat[sizeof(PlsrThunkUFloat)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkValRefsDouble[sizeof(PlsrThunkUDouble)/P_WORD_SIZE] = {0, }; + +#ifdef PLSR_LIGHTWEIGHT_THUNKS + static PgcIsRef pLsrThunkEvalRefsRef[sizeof(PlsrThunkURef)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefs32[sizeof(PlsrThunkU32)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefs64[sizeof(PlsrThunkU64)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefsFloat[sizeof(PlsrThunkUFloat)/P_WORD_SIZE] = {0, }; + static PgcIsRef pLsrThunkEvalRefsDouble[sizeof(PlsrThunkUDouble)/P_WORD_SIZE] = {0, }; +#endif /* PLSR_LIGHTWEIGHT_THUNKS */ + + assert(pLsrPRatSize/P_WORD_SIZE == 2); + assert(pLsrPSumSize/P_WORD_SIZE == 3); + assert(pLsrPArrayIBaseSize/P_WORD_SIZE == 3); + assert(pLsrPSetSize/P_WORD_SIZE == 2); + assert(pLsrIdxEltSize/P_WORD_SIZE == 2); + + pLsrVTableRegister(pLsrPRatVTable, pLsrDefaultAlignment, pLsrPRatSize, pLsrPRatRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPNameVTable, pLsrDefaultAlignment, pLsrPNameSize, pLsrPNameRefs, 1, + pLsrPNameLenOff, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPFloatVTable, pLsrDefaultAlignment, pLsrPFloatSize, pLsrPFloatRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPDoubleVTable, pLsrDefaultAlignment, pLsrPDoubleSize, pLsrPDoubleRefs, + 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPSumVTable, pLsrDefaultAlignment, pLsrPSumSize, pLsrPSumRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPArrayOVTable, pLsrDefaultAlignment, pLsrPArrayOBaseSize, pLsrPArrayORefs, + P_WORD_SIZE, pLsrPArrayOLenOffset, 1, + PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrPArrayIVTable, pLsrDefaultAlignment, pLsrPArrayIBaseSize, pLsrPArrayIRefs, + P_WORD_SIZE, pLsrPArrayILenOffset, 1, + PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrClosureVTable, pLsrDefaultAlignment, pLsrPFunctionSize, pLsrPFunctionRefs, + 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrPSetVTable, pLsrDefaultAlignment, pLsrPSetSize, pLsrPSetRefs, 0, 0, 0, + PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrPTypeVTable, pLsrDefaultAlignment, pLsrPTypeSize, pLsrPTypeRefs, 0, 0, + 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegisterV(pLsrIdxVTable, pLsrDefaultAlignment, pLsrIdxSize, pLsrIdxRefs, + pLsrIdxEltSize, pLsrIdxLenOff, pLsrIdxEltRefs, + PGC_CREATED_MUTABLE, 0, pLsrIdxWRefs, NULL); +#ifdef PLSR_GC_INDIRECTIONS + pLsrIndirectionVTableRegister(pLsrThunkValVTableRef, sizeof(PlsrThunkURef), pLsrThunkResultOffsetRef); +#else + pLsrVTableRegister(pLsrThunkValVTableRef, pLsrDefaultAlignment, sizeof(PlsrThunkURef), + pLsrThunkValRefsRef, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); +#endif + pLsrVTableRegister(pLsrThunkValVTable32, pLsrDefaultAlignment, sizeof(PlsrThunkU32), + pLsrThunkValRefs32, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrThunkValVTable64, pLsrDefaultAlignment, sizeof(PlsrThunkU64), + pLsrThunkValRefs64, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrThunkValVTableFloat, pLsrDefaultAlignment, sizeof(PlsrThunkUFloat), + pLsrThunkValRefsFloat, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + pLsrVTableRegister(pLsrThunkValVTableDouble, pLsrDefaultAlignment, sizeof(PlsrThunkUDouble), + pLsrThunkValRefsDouble, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + +#ifdef 
PLSR_LIGHTWEIGHT_THUNKS + pLsrVTableRegister(pLsrThunkCutVTable, pLsrDefaultAlignment, sizeof(PlsrThunkURef), + pLsrThunkCutRefs, 0, 0, 0, PGC_ALWAYS_IMMUTABLE, 0); + + pLsrVTableRegister(pLsrThunkEvalVTableRef, pLsrDefaultAlignment, sizeof(PlsrThunkURef), + pLsrThunkEvalRefsRef, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTable32, pLsrDefaultAlignment, sizeof(PlsrThunkU32), + pLsrThunkEvalRefs32, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTable64, pLsrDefaultAlignment, sizeof(PlsrThunkU64), + pLsrThunkEvalRefs64, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTableFloat, pLsrDefaultAlignment, sizeof(PlsrThunkUFloat), + pLsrThunkEvalRefsFloat, 0, 0, 0, PGC_CREATED_MUTABLE, 0); + pLsrVTableRegister(pLsrThunkEvalVTableDouble, pLsrDefaultAlignment, sizeof(PlsrThunkUDouble), + pLsrThunkEvalRefsDouble, 0, 0, 0, PGC_CREATED_MUTABLE, 0); +#endif +} + +/********************************************************************** + * Register Runtime Globals + */ + +#define pLsrValueGlobalRefsCount 1 + +static PlsrRef* pLsrValueGlobalRefs[] = + { + (PlsrRef *) &pLsrCoreCharOrd, + (PlsrRef *) NULL /* This must be last */ + }; + +static void pLsrValueRegisterGlobalRefs() { + assert(pLsrValueGlobalRefs[pLsrValueGlobalRefsCount] == NULL); + pLsrGcRegisterGlobalRefs ((void **)pLsrValueGlobalRefs, pLsrValueGlobalRefsCount); +}; + +static void pLsrValueRegisterGlobals() { + pLsrValueRegisterGlobalRefs(); +}; + +/********************************************************************** + * Runtime Assertions + */ + +static void pLsrValueCheck() +{ + if (pLsrObjectFieldsBase > pLsrPSetOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPSetSize < pLsrPSetOffset+sizeof(PlsrPAny)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPTypeSize) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPRatOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPRatSize < pLsrPRatOffset+sizeof(PlsrRational)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPFloatOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPFloatSize < pLsrPFloatOffset+sizeof(float)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPDoubleOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPDoubleSize < pLsrPDoubleOffset+sizeof(double)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPArrayOLenOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayOEltOffset < pLsrPArrayOLenOffset+sizeof(uintp)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayOBaseSize < pLsrPArrayOEltOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPArrayILenOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayIIdxOffset < pLsrPArrayILenOffset+sizeof(uintp)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayIEltOffset < pLsrPArrayIIdxOffset+sizeof(void*)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPArrayIBaseSize < pLsrPArrayIEltOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPFunctionCodeOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrObjectFieldsBase > pLsrPSumTagOffset) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPSumSize < pLsrPSumTagOffset+sizeof(PlsrPAny)) + pLsrRuntimeError("Bad object model!\n"); + if (pLsrPSumSize < pLsrPSumValOffset+sizeof(PlsrObjectB)) + pLsrRuntimeError("Bad object model!\n"); 
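+    /* The remaining checks apply only when small (tagged) rationals or
+     * integers are enabled: they verify that the compiler-generated bounds
+     * agree with the runtime's notion of the taggable range. */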
+/**********************************************************************
+ * Runtime Assertions
+ */
+
+static void pLsrValueCheck()
+{
+    if (pLsrObjectFieldsBase > pLsrPSetOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSetSize < pLsrPSetOffset+sizeof(PlsrPAny))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPTypeSize)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPRatOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPRatSize < pLsrPRatOffset+sizeof(PlsrRational))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPFloatOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPFloatSize < pLsrPFloatOffset+sizeof(float))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPDoubleOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPDoubleSize < pLsrPDoubleOffset+sizeof(double))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPArrayOLenOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayOEltOffset < pLsrPArrayOLenOffset+sizeof(uintp))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayOBaseSize < pLsrPArrayOEltOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPArrayILenOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayIIdxOffset < pLsrPArrayILenOffset+sizeof(uintp))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayIEltOffset < pLsrPArrayIIdxOffset+sizeof(void*))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPArrayIBaseSize < pLsrPArrayIEltOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPFunctionCodeOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrObjectFieldsBase > pLsrPSumTagOffset)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSumSize < pLsrPSumTagOffset+sizeof(PlsrPAny))
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSumSize < pLsrPSumValOffset+sizeof(PlsrObjectB))
+        pLsrRuntimeError("Bad object model!\n");
+#ifdef P_USE_TAGGED_RATIONALS
+    if (pLsrPSmallRationalMax != pLsrSmallRationalMax)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSmallRationalMin != pLsrSmallRationalMin)
+        pLsrRuntimeError("Bad object model!\n");
+#endif
+#ifdef P_USE_TAGGED_INTEGERS
+    if (pLsrPSmallIntegerMax != pLsrSmallIntegerMax)
+        pLsrRuntimeError("Bad object model!\n");
+    if (pLsrPSmallIntegerMin != pLsrSmallIntegerMin)
+        pLsrRuntimeError("Bad object model!\n");
+#endif
+}
+
+#endif /* !_PLSR_VALUE_H_ */
diff --git a/runtime/include/hrc/plsr-wpo.h b/runtime/include/hrc/plsr-wpo.h
new file mode 100755
index 0000000..31df034
--- /dev/null
+++ b/runtime/include/hrc/plsr-wpo.h
@@ -0,0 +1,168 @@
+/* The Haskell Research Compiler */
+/* COPYRIGHT_NOTICE_1 */
+
+/* Weak Pointer Objects */
+
+#ifndef _PLSR_WPO_H_
+#define _PLSR_WPO_H_
+
+/***********************************************************************************
+ ************* Weak Pointer Object Interface ***************************************
+ **********************************************************************************/
+
+typedef PlsrRef PlsrWpo;
+
+/* Create a new weak pointer object from key, value and optional finalizer.
+ * finalizer may be NULL. Finalizer is assumed to be a PlsrThunkBRef.
+ */
+static PlsrWpo pLsrWpoNew(PlsrRef key, PlsrRef value, PlsrRef finalizer);
+
+/* Create a new weak pointer object from key, value and optional finalizer.
+ * finalizer may be NULL. Finalizer is run using the provided code pointer.
+ */
+static PlsrWpo pLsrWpoNewWithRun(PlsrRef key, PlsrRef value, PlsrRef finalizer, PlsrFinalizerCode run);
+
+/* Read the value (if present) of the weak pointer object. Returns NULL
+ * if not present.
+ */
+static PlsrRef pLsrWpoRead(PlsrWpo wpo);
+
+/* Tombstone the weak pointer object and immediately run its finalizer.
+ * Returns when the finalizer has completed.
+ */
+static void pLsrWpoFinalize(PlsrWpo wpo);
+
+/* Registration code */
+static void pLsrWpoRegisterVTables();
+static void pLsrWpoRegisterGlobals();
+static void pLsrWpoCheckAssertions();
+
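A minimal usage sketch for the interface above, assuming key, value and finalizerThunk are live PlsrRef values produced elsewhere (none of those names are defined by this patch):

    PlsrWpo w = pLsrWpoNew(key, value, finalizerThunk);
    PlsrRef v = pLsrWpoRead(w);   /* NULL once the object has been tombstoned */
    ...
    pLsrWpoFinalize(w);           /* force the finalizer to run eagerly */

pLsrWpoNewWithRun is the variant to use when the finalizer is not a PlsrThunkBRef and must be run through a caller-supplied PlsrFinalizerCode entry point.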
+/***********************************************************************************
+ ************* Weak Pointer Object Implementation **********************************
+ **********************************************************************************/
+
+
+typedef struct PlsrWpoS {
+    PlsrVTable vtable;
+    PlsrRef key;
+    volatile PlsrRef value;
+    PlsrRef finalizer;
+    PlsrFinalizerCode run;
+} PlsrWpoU;
+
+#define pLsrWpoPadding \
+    (sizeof(PlsrWpoU) - sizeof(PlsrVTable) - 3*sizeof(PlsrRef) - sizeof(PlsrFinalizerCode))
+pLsrVTableStatic(pLsrWpoVTable_, VNoneTag, "*weak pointer object*", pLsrWpoPadding);
+#define pLsrWpoVTable (&pLsrWpoVTable_)
+
+
+#ifdef P_USE_PILLAR
+
+
+static PlsrWpo pLsrWpoNewWithRun(PlsrRef key, PlsrRef value, PlsrRef finalizer, PlsrFinalizerCode run)
+{
+    PlsrRef res;
+    pLsrAlloc_(PlsrRef, res, pLsrWpoVTable, sizeof(PlsrWpoU));
+    ((PlsrWpoU*)res)->key=key;
+    ((PlsrWpoU*)res)->value=value;
+    ((PlsrWpoU*)res)->finalizer=finalizer;
+    ((PlsrWpoU*)res)->run=run;
+#ifdef PLSR_WPO_TRACE
+    printf("Creating WPO object %p with finalizer object %p and value %p\n", res, finalizer, value);
+    fflush(stdout);
+#endif
+    return res;
+}
+
+void pLsrWpoRunFinalizer(PlsrRef finalizer)
+{
+    pLsrThunkEvalRef(finalizer);
+}
+
+static PlsrWpo pLsrWpoNew(PlsrRef key, PlsrRef value, PlsrRef finalizer)
+{
+    return pLsrWpoNewWithRun(key, value, finalizer, pLsrWpoRunFinalizer);
+}
+
+static PlsrRef pLsrWpoRead(PlsrWpo wpo)
+{
+#ifdef PLSR_WPO_TRACE
+    printf("Reading WPO object %p with value %p\n", wpo, ((PlsrWpoU*)wpo)->value);
+    fflush(stdout);
+#endif
+
+    return ((PlsrWpoU*)wpo)->value;
+}
+
+static void pLsrWpoFinalize(PlsrWpo wpo)
+{
+    ((PlsrWpoU*)wpo)->value = NULL;
+    if (((PlsrWpoU*)wpo)->finalizer) {
+        ((PlsrWpoU*)wpo)->run(((PlsrWpoU*)wpo)->finalizer);
+    }
+}
+
+
+#pragma pillar_managed(off)
+
+static void pLsrWpoTombstone(void * wpo)
+{
+#ifdef PLSR_WPO_TRACE
+    printf("Tombstoning WPO object %p with value %p\n", wpo, ((PlsrWpoU*)wpo)->value);
+    fflush(stdout);
+#endif
+    assert(((PlsrWpoU*)wpo)->value);
+    ((PlsrWpoU*)wpo)->value = NULL;
+    if (((PlsrWpoU*)wpo)->finalizer) {
+        pLsrFinalizerAdd(((PlsrWpoU*)wpo)->run, ((PlsrWpoU*)wpo)->finalizer);
+    }
+}
+
+#pragma pillar_managed(on)
+
+#define pLsrWpoSize (sizeof(PlsrWpoU))
+
+static void pLsrWpoRegisterVTables()
+{
+    /* XXX Temporarily disable weak pointer semantics pending GC fixes. -leaf */
+    static PgcIsRef pLsrWpoRefs[] = { 0, 1, 1, 1, 0 };
+    static PgcIsRef pLsrWpoWRefs[] = { 0, 0, 0, 0, 0 };
+    pLsrVTableRegisterV(pLsrWpoVTable, pLsrDefaultAlignment, pLsrWpoSize, pLsrWpoRefs,
+                        0, 0, 0,
+                        PGC_ALWAYS_MUTABLE, 0, pLsrWpoWRefs, NULL);
+    pgc_set_wpo_vtable((struct VTable *) pLsrWpoVTable);
+    pgc_set_wpo_finalizer(pLsrWpoTombstone);
+}
+
+static void pLsrWpoRegisterGlobals()
+{
+}
+
+static void pLsrWpoCheckAssertions()
+{
+}
+
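When the cleanup action is ordinary native code rather than a thunk, pLsrWpoNewWithRun lets the caller supply the entry point directly; pLsrWpoTombstone then hands that (run, finalizer) pair to pLsrFinalizerAdd instead of evaluating a thunk. A hedged sketch follows; myRelease and handle are hypothetical, and PlsrFinalizerCode is assumed to be a void function over PlsrRef, as suggested by pLsrWpoRunFinalizer above.

    static void myRelease(PlsrRef handle)
    {
        /* release whatever external resource handle stands for */
    }

    PlsrWpo w = pLsrWpoNewWithRun(key, value, handle, myRelease);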
+#else /* !P_USE_PILLAR */
+
+static PlsrWpo pLsrWpoNew(PlsrRef key, PlsrRef value, PlsrRef finalizer)
+{
+    pLsrRuntimeError("Weak pointer objects not supported");
+    return 0;
+}
+
+static void pLsrWpoRegisterVTables()
+{
+}
+
+static void pLsrWpoRegisterGlobals()
+{
+}
+
+static void pLsrWpoCheckAssertions()
+{
+}
+
+#endif /* P_USE_PILLAR */
+
+#endif /* !_PLSR_WPO_H_ */
diff --git a/runtime/include/hrc/plsr.h b/runtime/include/hrc/plsr.h
new file mode 100644
index 0000000..edbcc69
--- /dev/null
+++ b/runtime/include/hrc/plsr.h
@@ -0,0 +1,183 @@
+/* The Haskell Research Compiler */
+/* COPYRIGHT_NOTICE_1 */
+
+/* The P language-specific runtime */
+
+#ifndef _PLSR_H_
+#define _PLSR_H_
+
+/**********************************************************************
+ * Control
+ */
+
+/* User Control macros:
+ *   P_WORD_SIZE - number of bytes in a pointer
+ *   P_USE_PILLAR - compiled for pillar
+ *   P_USE_CGC - use BDW conservative garbage collector
+ *   P_USE_AGC - use a PSL accurate garbage collector
+ *             - value specifies kind (see PlsrAgcKind)
+ *   P_AGC_LOCK_PARAM - the lock parameter to pass to pgc_init
+ *   PLSR_ENABLE_EVENTS - Turn on the event logging code.
+ *   P_USE_FAST_ALLOC - use an inlined fast path for allocation
+ *   PLSR_INSTRUMENT_ALLOCATION - collect and report allocation stats
+ *   PLSR_INSTRUMENT_GCS - collect and report per gc allocation stats
+ *   PLSR_INSTRUMENT_VTB_ALC - collect and report per vtable allocation stats
+ *   P_USE_GC_WRITE_BARRIERS - use write barriers for the GC
+ *   P_ALL_BARRIERS - do all barriers for barrier validation
+ *   P_USE_PARALLEL_FUTURES - use the parallel futures system
+ *   P_DO_VTABLE_CHANGE - do vtable changing for immutability etc.
+ *   P_PRAT_USE_SINTP - use sintp to represent P Rationals (unsafe)
+ *   P_USE_TAGGED_INTEGERS - use low bit tagging on small ap integers
+ *   P_USE_TAGGED_RATIONALS - use low bit tagging on small ap rationals
+ *   P_TAGGED_INT32_PORTABLE - see commentary in plsr-tagged-int32.h
+ *   P_TAGGED_INT32_ASSERT_SMALL - use 32 bit integers with runtime checks
+ *   P_TAGGED_INT32_ASSUME_SMALL - use 32 bit integers with no checks
+ *   PLSR_DISABLE_TAILCALL - disable tailcalls
+ *   PLSR_GMP_USE_GCMALLOC - use gc malloc for gmp wrappers
+ *   PLSR_GMP_USE_PCDECL - use pcdecl to avoid pinning gmp wrappers
+ *   PLSR_GMP_USE_MALLOC - use malloc to avoid pinning gmp wrappers
+ *   PLSR_GMP_FORCE_GC - force malloc heap limits for gmp wrappers
+ *   PLSR_GMP_USE_PINNING - use pinning in gmp wrappers
+ *   PLSR_GMP_USE_GALLOCATE - use guaranteed allocation in gmp wrappers
+ *   PLSR_GMP_USE_DEFAULT - use a backend default gmp implementation
+ *   PLSR_TAGGED_INT32_TEST - generate test functions
+ *   PLSR_TAGGED_INTEGER_RECOVER - Check AP int output for taggable integers
+ *   PLSR_THUNK_INTERCEPT_CUTS - Intercept cuts out of thunks
+ *   PLSR_THUNK_SYNCHRONIZE - use synchronized thunks
+ *   PLSR_NO_GMP_INTEGERS - don't use the GMP
+ *   PLSR_STACK_SIZE_WORKER - default stack size (mb) for ipc worker threads
+ *   PLSR_STACK_SIZE_MAIN - default stack size (mb) for ipc main threads
+ *   PLSR_SINGLE_THREADED - no runtime concurrency allowed
+ * Derived Control macros:
+ *   P_USE_MCRT - compiled for C and use mcrt
+ *   P_USE_PTHREADS - use pthreads for threading
+ *   PLSR_GNU_ASM - use gnu style asm
+ *   PLSR_MS_ASM - use ms style asm
+ */
+
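As a hedged illustration of how these knobs are supplied (the particular combination below is an example, not a configuration mandated by this patch), a 64-bit single-threaded C build on Linux could simply pass the macros on the compiler command line; plsr-client.c is a hypothetical translation unit:

    gcc -c -DP_WORD_SIZE=8 -DPLSR_LINUX -DPLSR_SINGLE_THREADED plsr-client.c

Whatever combination is chosen must also satisfy the consistency checks that follow.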
+#error "Can't use parallel futures with the single threaded runtime" +#endif + +#if defined(P_USE_PARALLEL_FUTURES) && !defined(PLSR_THUNK_SYNCHRONIZE) +#error "Must use synchronized thunks with parallel futures" +#endif + +#ifdef USE_PTHREADS +#define P_USE_PTHREADS +#else +#if !defined(P_USE_PILLAR) && defined(P_USE_PARALLEL_FUTURES) +#define P_USE_MCRT +#endif /* !P_USE_PILLAR && P_USE_PARALLEL_FUTURES */ +#endif /* USE_PTHREADS */ + +/* Memory allocation */ +#ifdef P_USE_CGC +# define GC_NOT_DLL +# ifdef DEBUG +# define GC_DEBUG +# endif /* DEBUG */ +#endif /* P_USE_CGC */ + +/********************************************************************** + * C header files + */ + +#ifdef P_USE_PILLAR +# pragma pillar_managed(off) +# define to __to__ +#endif /* P_USE_PILLAR */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef PLSR_LINUX +#include +#endif +#include + +/* Memory allocation and collection utilities */ + +#define PgcIsRef bool + +enum PlsrAgcKind { + PlsrAKMf = 1, + PlsrAKCgc = 2, + PlsrAKTgc = 3 +}; + +#ifdef P_USE_CGC +# ifdef P_USE_PILLAR +# error "CGC and Pillar combination not supported" +# endif /* P_USE_PILLAR */ +# include "toolkit/gc.h" +# define P_VTABLE_RESERVE P_WORD_SIZE +#else /* !P_USE_CGC */ +#ifdef P_USE_AGC +# ifdef P_USE_PILLAR +# undef PgcIsRef +# include "pgc/pgc.h" +# define P_VTABLE_RESERVE PGC_VTABLE_RESERVE +# else /* !P_USR_PILLAR */ +# error "AGC and non-Pillar combination not support" +# endif /* !P_USE_PILLAR */ +#else /* !P_USE_AGC */ +# include +# define P_VTABLE_RESERVE P_WORD_SIZE +#endif /* !P_USE_AGC */ +#endif /* !P_USE_CGC */ + +#if !defined(P_USE_PTHREADS) && (defined(P_USE_MCRT) || defined(P_USE_PARALLEL_FUTURES)) +#include +#endif /* defined(P_USE_MCRT) || defined(P_USE_PARALLEL_FUTURES) */ + +#ifdef P_USE_PILLAR +# undef to +# pragma pillar_managed(on) +# include "prt/prt.h" +# include "prt/prtcodegenerator.h" +#endif /* P_USE_PILLAR */ + +/* Toolkit headers - these work in both C and Pillar */ + +#if defined(P_USE_PARALLEL_FUTURES) || defined(PLSR_THUNK_SYNCHRONIZE) +#define CONCURRENCY +#else /* !P_USE_PARALLEL_FUTURES */ +#undef CONCURRENCY +#endif /* !P_USE_PARALLEL_FUTURES */ +#include "toolkit/ptkfuture.h" +typedef ref PtkRef; + +/********************************************************************** + * Runtime proper + */ + +#include "hrc/plsr-util.h" +#include "hrc/plsr-objects.h" +#include "hrc/plsr-synchronization.h" +#include "hrc/plsr-gc.h" +#include "hrc/plsr-finalizer.h" +#include "hrc/plsr-numeric.h" +#include "hrc/plsr-thunk.h" +#include "hrc/plsr-value.h" +#include "hrc/plsr-wpo.h" +#include "hrc/plsr-prims.h" +#include "hrc/plsr-params.h" +#include "hrc/plsr-main.h" + +#endif /* !_PLSR_H_ */