diff --git a/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2Native.kt b/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2Native.kt index 3163a532f4f..81a4e22c729 100644 --- a/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2Native.kt +++ b/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2Native.kt @@ -153,6 +153,7 @@ class K2Native : CLICompiler<K2NativeCompilerArguments>() { put(LIGHT_DEBUG, arguments.lightDebug) put(STATIC_FRAMEWORK, selectFrameworkType(configuration, arguments, outputKind)) put(OVERRIDE_CLANG_OPTIONS, arguments.clangOptions.toNonNullList()) + put(ALLOCATION_MODE, arguments.allocator) put(PRINT_IR, arguments.printIr) put(PRINT_IR_WITH_DESCRIPTORS, arguments.printIrWithDescriptors) diff --git a/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2NativeCompilerArguments.kt b/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2NativeCompilerArguments.kt index 173bc1d478d..ebcc4cd4595 100644 --- a/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2NativeCompilerArguments.kt +++ b/backend.native/cli.bc/src/org/jetbrains/kotlin/cli/bc/K2NativeCompilerArguments.kt @@ -242,6 +242,9 @@ class K2NativeCompilerArguments : CommonCompilerArguments() { @Argument(value="-Xoverride-clang-options", valueDescription = "", description = "Explicit list of Clang options") var clangOptions: Array<String>? = null + @Argument(value="-Xallocator", valueDescription = "std | mimalloc", description = "Allocator used in runtime") + var allocator: String = "std" + override fun configureAnalysisFlags(collector: MessageCollector): MutableMap<AnalysisFlag<*>, Any> = super.configureAnalysisFlags(collector).also { val useExperimental = it[AnalysisFlags.useExperimental] as List<*> diff --git a/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfig.kt b/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfig.kt index ba9fd28ac12..6fb3a1de34d 100644 --- a/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfig.kt +++ b/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfig.kt @@ -13,20 +13,13 @@ import org.jetbrains.kotlin.config.CommonConfigurationKeys import org.jetbrains.kotlin.config.CompilerConfiguration import org.jetbrains.kotlin.descriptors.ModuleDescriptor import org.jetbrains.kotlin.konan.CURRENT +import org.jetbrains.kotlin.konan.CompilerVersion import org.jetbrains.kotlin.konan.MetaVersion import org.jetbrains.kotlin.konan.TempFiles import org.jetbrains.kotlin.konan.file.File import org.jetbrains.kotlin.konan.library.KonanLibrary import org.jetbrains.kotlin.konan.properties.loadProperties -import org.jetbrains.kotlin.konan.target.Distribution -import org.jetbrains.kotlin.konan.target.HostManager -import org.jetbrains.kotlin.konan.target.KonanTarget -import org.jetbrains.kotlin.konan.target.PlatformManager import org.jetbrains.kotlin.konan.target.* -import org.jetbrains.kotlin.util.Logger -import kotlin.system.exitProcess -import org.jetbrains.kotlin.library.toUnresolvedLibraries -import org.jetbrains.kotlin.konan.CompilerVersion import org.jetbrains.kotlin.library.KotlinLibrary import org.jetbrains.kotlin.library.resolver.TopologicalLibraryOrder @@ -116,6 +109,18 @@ class KonanConfig(val project: Project, val configuration: CompilerConfiguration add(if (debug) "debug.bc" else "release.bc") add(if (memoryModel == MemoryModel.STRICT) "strict.bc" else "relaxed.bc") if (shouldCoverLibraries || shouldCoverSources) add("profileRuntime.bc") + if
(configuration.get(KonanConfigKeys.ALLOCATION_MODE) == "mimalloc") { + if (!target.supportsMimallocAllocator()) { + configuration.report(CompilerMessageSeverity.STRONG_WARNING, + "Mimalloc allocator isn't supported on target ${target.name}. Falling back to the standard allocator.") + add("std_alloc.bc") + } else { + add("opt_alloc.bc") + add("mimalloc.bc") + } + } else { + add("std_alloc.bc") + } }.map { File(distribution.defaultNatives(target)).child(it).absolutePath } diff --git a/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfigurationKeys.kt b/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfigurationKeys.kt index b26c5af4d00..5e07cbecb82 100644 --- a/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfigurationKeys.kt +++ b/backend.native/compiler/ir/backend.native/src/org/jetbrains/kotlin/backend/konan/KonanConfigurationKeys.kt @@ -88,6 +88,8 @@ class KonanConfigKeys { = CompilerConfigurationKey.create("program or library name") val OVERRIDE_CLANG_OPTIONS: CompilerConfigurationKey<List<String>> = CompilerConfigurationKey.create("arguments for clang") + val ALLOCATION_MODE: CompilerConfigurationKey<String> + = CompilerConfigurationKey.create("allocation mode") val PRINT_BITCODE: CompilerConfigurationKey<Boolean> = CompilerConfigurationKey.create("print bitcode") val PRINT_DESCRIPTORS: CompilerConfigurationKey<Boolean> diff --git a/backend.native/tests/build.gradle b/backend.native/tests/build.gradle index 3c2ce4608b7..83c3569a812 100644 --- a/backend.native/tests/build.gradle +++ b/backend.native/tests/build.gradle @@ -317,8 +317,8 @@ def createTestTasks(File testRoot, Class taskType, Closure taskConfigurati void dependsOnPlatformLibs(Task t) { if (!useCustomDist) { - def testTarget = project.testTarget - if (testTarget != null && testTarget != project.hostName) { + def testTarget = project.target + if (testTarget != project.platformManager.Companion.host) { t.dependsOn(":${testTarget}PlatformLibs") } else { t.dependsOn(':distPlatformLibs') diff --git a/build-tools/src/main/groovy/org/jetbrains/kotlin/CompileToBitcode.groovy b/build-tools/src/main/groovy/org/jetbrains/kotlin/CompileToBitcode.groovy deleted file mode 100644 index 8191f27d4c0..00000000000 --- a/build-tools/src/main/groovy/org/jetbrains/kotlin/CompileToBitcode.groovy +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright 2010-2017 JetBrains s.r.o. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ - -package org.jetbrains.kotlin - -import org.gradle.api.DefaultTask -import org.gradle.api.tasks.* -import org.gradle.api.tasks.InputDirectory -import org.gradle.api.tasks.OutputFile -import org.gradle.api.tasks.TaskAction -import org.jetbrains.kotlin.konan.target.Family -import org.jetbrains.kotlin.konan.target.HostManager -import org.jetbrains.kotlin.konan.target.KonanTarget - -class CompileCppToBitcode extends DefaultTask { - private String name = "main" - private String target = "host" - private File srcRoot; - - protected List compilerArgs = [] - protected List linkerArgs = [] - - @InputDirectory - File getSrcRoot() { - return srcRoot ?: project.file("src/$name") - } - - @OutputFile - File getOutFile() { - return new File(getTargetDir(), "${name}.bc") - } - - private File getSrcDir() { - return new File(this.getSrcRoot(), "cpp") - } - - private File getHeadersDir() { - return new File(this.getSrcRoot(), "headers") - } - - private File getTargetDir() { - return new File(project.buildDir, target) - } - - private File getObjDir() { - return new File(getTargetDir(), name) - } - - void name(String value) { - name = value - } - - void target(String value) { - target = value - } - - void srcRoot(File value) { - srcRoot = value - } - - protected List getCompilerArgs() { - return compilerArgs - } - - protected List getLinkerArgs() { - return linkerArgs - } - - protected String getTarget() { - return target - } - - void compilerArgs(String... args) { - compilerArgs.addAll(args) - } - - void compilerArgs(List args) { - compilerArgs.addAll(args) - } - - void linkerArgs(String... args) { - linkerArgs.addAll(args) - } - - void linkerArgs(List args) { - linkerArgs.addAll(args) - } - - private Boolean targetingMinGW() { - def hostManager = new HostManager() - return hostManager.targetByName(this.target).family == Family.MINGW - } - - @TaskAction - void compile() { - // the strange code below seems to be required due to some Gradle (Groovy?) behaviour - File headersDir = this.getHeadersDir() - File srcDir = this.getSrcDir() - List compilerArgs = this.getCompilerArgs() - List linkerArgs = this.getLinkerArgs() - File objDir = this.getObjDir() - objDir.mkdirs() - Boolean targetingMinGW = this.targetingMinGW() - - project.execKonanClang(this.target) { - workingDir objDir - executable "clang++" - args '-std=c++14' - args '-Werror' - args '-O2' - if (!targetingMinGW) { - args '-fPIC' - } - args compilerArgs - - args "-I$headersDir" - args '-c', '-emit-llvm' - args project.fileTree(srcDir) { - include('**/*.cpp') - include('**/*.mm') // Objective-C++ - } - } - - project.exec { - executable "$project.llvmDir/bin/llvm-link" - args project.fileTree(objDir).include('**/*.bc').sort { a, b -> (a.name <=> b.name) } - - args linkerArgs - - args '-o', outFile - } - } -} diff --git a/build-tools/src/main/kotlin/org/jetbrains/kotlin/CompileToBitcode.kt b/build-tools/src/main/kotlin/org/jetbrains/kotlin/CompileToBitcode.kt new file mode 100644 index 00000000000..18e8c23da3a --- /dev/null +++ b/build-tools/src/main/kotlin/org/jetbrains/kotlin/CompileToBitcode.kt @@ -0,0 +1,97 @@ +/* + * Copyright 2010-2017 JetBrains s.r.o. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.jetbrains.kotlin + +import org.gradle.api.Action +import org.gradle.api.DefaultTask +import org.gradle.api.tasks.InputDirectory +import org.gradle.api.tasks.OutputFile +import org.gradle.api.tasks.TaskAction +import org.jetbrains.kotlin.konan.target.Family +import org.jetbrains.kotlin.konan.target.HostManager +import org.jetbrains.kotlin.konan.target.KonanTarget + +import java.io.File +import javax.inject.Inject + +open class CompileToBitcode @Inject constructor(@InputDirectory val srcRoot: File, + val folderName: String, + val target: String) : DefaultTask() { + enum class Language { + C, CPP + } + + val compilerArgs = mutableListOf<String>() + val linkerArgs = mutableListOf<String>() + val excludeFiles = mutableListOf<String>() + var srcDir = File(srcRoot, "cpp") + var headersDir = File(srcRoot, "headers") + var skipLinkagePhase = false + var excludedTargets = mutableListOf<String>() + var language = Language.CPP + + private val targetDir by lazy { File(project.buildDir, target) } + + private val objDir by lazy { File(targetDir, folderName) } + + private val KonanTarget.isMINGW + get() = this.family == Family.MINGW + + @OutputFile + val outFile = File(targetDir, "${folderName}.bc") + + @TaskAction + fun compile() { + if (target in excludedTargets) return + objDir.mkdirs() + val plugin = project.convention.getPlugin(ExecClang::class.java) + val commonFlags = listOf("-c", "-emit-llvm", "-I$headersDir") + val (executable, defaultFlags, srcFilesPatterns) = + when (language) { + Language.C -> Triple("clang", + // Used flags provided by original build of allocator C code.
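+ // (in particular, -ftls-model=initial-exec below matches mimalloc's own build settings and keeps access to its thread-local heaps cheap)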
+ commonFlags + listOf("-std=gnu11", "-O3", "-Wall", "-Wextra", "-Wno-unknown-pragmas", + "-ftls-model=initial-exec"), + listOf("**/*.c")) + Language.CPP -> Triple("clang++", + commonFlags + listOfNotNull("-std=c++14", "-Werror", "-O2", + "-fPIC".takeIf { !HostManager().targetByName(target).isMINGW }), + listOf("**/*.cpp", "**/*.mm")) + } + + plugin.execKonanClang(target, Action { + it.workingDir = objDir + it.executable = executable + it.args = defaultFlags + compilerArgs + + project.fileTree(srcDir) { + it.include(srcFilesPatterns) + it.exclude(excludeFiles) + }.files.map { it.absolutePath } + }) + + if (!skipLinkagePhase) { + project.exec { + val llvmDir = project.findProperty("llvmDir") + it.executable = "$llvmDir/bin/llvm-link" + it.args = listOf("-o", outFile.absolutePath) + linkerArgs + + project.fileTree(objDir) { + it.include("**/*.bc") + }.files.map { it.absolutePath } + } + } + } +} diff --git a/build-tools/src/main/kotlin/org/jetbrains/kotlin/Utils.kt b/build-tools/src/main/kotlin/org/jetbrains/kotlin/Utils.kt index 45507d13597..86afb4915c5 100644 --- a/build-tools/src/main/kotlin/org/jetbrains/kotlin/Utils.kt +++ b/build-tools/src/main/kotlin/org/jetbrains/kotlin/Utils.kt @@ -244,4 +244,7 @@ fun compileSwift(project: Project, target: KonanTarget, sources: List, o """.trimMargin()) check(exitCode == 0, { "Compilation failed" }) check(output.toFile().exists(), { "Compiler swiftc hasn't produced an output file: $output" }) -} \ No newline at end of file +} + +fun targetSupportsMimallocAllocator(targetName: String) = + HostManager().targetByName(targetName).supportsMimallocAllocator() \ No newline at end of file diff --git a/common/build.gradle b/common/build.gradle index c20d29ee972..14307301bd8 100644 --- a/common/build.gradle +++ b/common/build.gradle @@ -2,22 +2,16 @@ * Copyright 2010-2018 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license * that can be found in the LICENSE file. */ -import org.jetbrains.kotlin.CompileCppToBitcode +import org.jetbrains.kotlin.CompileToBitcode // TODO: consider using some Gradle plugins to build and test targetList.each { targetName -> - task ("${targetName}Hash", type: CompileCppToBitcode) { - name 'hash' - target targetName - } + tasks.create("${targetName}Hash", CompileToBitcode, file("src/hash"), 'hash', targetName) } targetList.each { targetName -> - task ("${targetName}Files", type: CompileCppToBitcode) { - name 'files' - target targetName - } + tasks.create("${targetName}Files", CompileToBitcode, file("src/files"), 'files', targetName) } task build { diff --git a/runtime/build.gradle b/runtime/build.gradle index 9e2bc0d5694..2275ee5ec1c 100644 --- a/runtime/build.gradle +++ b/runtime/build.gradle @@ -1,21 +1,23 @@ /* - * Copyright 2010-2018 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license + * Copyright 2010-2019 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license * that can be found in the LICENSE file. 
*/ -import org.jetbrains.kotlin.CompileCppToBitcode +import org.jetbrains.kotlin.CompileToBitcode +import org.jetbrains.kotlin.UtilsKt // TODO: consider using some Gradle plugins to build and test -void includeRuntime(CompileCppToBitcode task) { - task.compilerArgs '-I' + project.file('../common/src/hash/headers') - task.compilerArgs '-I' + project.file('src/main/cpp') +void includeRuntime(CompileToBitcode task) { + task.compilerArgs.add('-I' + project.file('../common/src/hash/headers')) + task.compilerArgs.add('-I' + project.file('src/main/cpp')) } targetList.each { targetName -> - task("${targetName}Runtime", type: CompileCppToBitcode) { - name "runtime" - srcRoot file('src/main') + tasks.create("${targetName}Runtime", CompileToBitcode, file('src/main'), "runtime", targetName).configure { dependsOn ":common:${targetName}Hash" + dependsOn "${targetName}StdAlloc" + dependsOn "${targetName}OptAlloc" + dependsOn "${targetName}Mimalloc" dependsOn "${targetName}Launcher" dependsOn "${targetName}Debug" dependsOn "${targetName}Release" @@ -24,63 +26,53 @@ targetList.each { targetName -> dependsOn "${targetName}ProfileRuntime" dependsOn "${targetName}ObjC" dependsOn "${targetName}ExceptionsSupport" - target targetName includeRuntime(delegate) - linkerArgs project.file("../common/build/$targetName/hash.bc").path + linkerArgs.add(project.file("../common/build/$targetName/hash.bc").path) } - task("${targetName}Launcher", type: CompileCppToBitcode) { - name "launcher" - srcRoot file('src/launcher') - target targetName - includeRuntime(delegate) + tasks.create("${targetName}Mimalloc", CompileToBitcode, file('src/mimalloc'), "mimalloc", targetName).configure { + language = CompileToBitcode.Language.C + excludeFiles.addAll(["**/alloc-override*.c", "**/page-queue.c", "**/static.c"]) + if (!UtilsKt.targetSupportsMimallocAllocator(targetName)) + excludedTargets.add(targetName) + srcDir = new File(srcRoot, "c") + compilerArgs.add("-DKONAN_MI_MALLOC=1") + headersDir = new File(srcDir, "include") } - task ("${targetName}Debug", type: CompileCppToBitcode) { - name "debug" - srcRoot file('src/debug') - target targetName + tasks.create("${targetName}Launcher", CompileToBitcode, file('src/launcher'), "launcher", targetName).configure { includeRuntime(delegate) } - task ("${targetName}ExceptionsSupport", type: CompileCppToBitcode) { - name "exceptionsSupport" - srcRoot file('src/exceptions_support') - target targetName + tasks.create("${targetName}Debug", CompileToBitcode, file('src/debug'), "debug", targetName).configure { includeRuntime(delegate) } - task ("${targetName}Release", type: CompileCppToBitcode) { - name "release" - srcRoot file('src/release') - target targetName + tasks.create("${targetName}StdAlloc", CompileToBitcode, file('src/std_alloc'), "std_alloc", targetName) + + tasks.create("${targetName}OptAlloc", CompileToBitcode, file('src/opt_alloc'), "opt_alloc", targetName) + + tasks.create("${targetName}ExceptionsSupport", CompileToBitcode, file('src/exceptions_support'), + "exceptionsSupport", targetName).configure { includeRuntime(delegate) } - task ("${targetName}Strict", type: CompileCppToBitcode) { - name "strict" - srcRoot file('src/strict') - target targetName + tasks.create("${targetName}Release", CompileToBitcode, file('src/release'), "release", targetName).configure { includeRuntime(delegate) } - task ("${targetName}Relaxed", type: CompileCppToBitcode) { - name "relaxed" - srcRoot file('src/relaxed') - target targetName + tasks.create("${targetName}Strict", CompileToBitcode, 
file('src/strict'), "strict", targetName).configure { includeRuntime(delegate) } - task ("${targetName}ProfileRuntime", type: CompileCppToBitcode) { - name "profileRuntime" - srcRoot file('src/profile_runtime') - target targetName + tasks.create("${targetName}Relaxed", CompileToBitcode, file('src/relaxed'), "relaxed", targetName).configure { + includeRuntime(delegate) } - task ("${targetName}ObjC", type: CompileCppToBitcode) { - name "objc" - srcRoot file('src/objc') - target targetName + tasks.create("${targetName}ProfileRuntime", CompileToBitcode, file('src/profile_runtime'), + "profileRuntime", targetName) + + tasks.create("${targetName}ObjC", CompileToBitcode, file('src/objc'), "objc", targetName).configure { includeRuntime(delegate) } } diff --git a/runtime/src/main/cpp/Porting.cpp b/runtime/src/main/cpp/Porting.cpp index 21428161691..315700c8925 100644 --- a/runtime/src/main/cpp/Porting.cpp +++ b/runtime/src/main/cpp/Porting.cpp @@ -216,8 +216,10 @@ extern "C" void dlfree(void*); #define calloc_impl dlcalloc #define free_impl dlfree #else -#define calloc_impl ::calloc -#define free_impl ::free +extern "C" void* konan_calloc_impl(size_t, size_t); +extern "C" void konan_free_impl(void*); +#define calloc_impl konan_calloc_impl +#define free_impl konan_free_impl #endif void* calloc(size_t count, size_t size) { diff --git a/runtime/src/mimalloc/c/LICENSE b/runtime/src/mimalloc/c/LICENSE new file mode 100644 index 00000000000..4151dbe4ab1 --- /dev/null +++ b/runtime/src/mimalloc/c/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Microsoft Corporation, Daan Leijen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/runtime/src/mimalloc/c/alloc-aligned.c b/runtime/src/mimalloc/c/alloc-aligned.c new file mode 100644 index 00000000000..5a59a63ab26 --- /dev/null +++ b/runtime/src/mimalloc/c/alloc-aligned.c @@ -0,0 +1,204 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +#include // memset, memcpy + +// ------------------------------------------------------ +// Aligned Allocation +// ------------------------------------------------------ + +static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { + // note: we don't require `size > offset`, we just guarantee that + // the address at offset is aligned regardless of the allocated size. + mi_assert(alignment > 0 && alignment % sizeof(void*) == 0); + if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see ) + if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see ) + const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` + + // try if there is a small block available with just the right alignment + if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { + mi_page_t* page = _mi_heap_get_free_small_page(heap,size); + const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0; + if (mi_likely(page->free != NULL && is_aligned)) + { + #if MI_STAT>1 + mi_heap_stat_increase( heap, malloc, size); + #endif + void* p = _mi_page_malloc(heap,page,size); // TODO: inline _mi_page_malloc + mi_assert_internal(p != NULL); + mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); + if (zero) _mi_block_zero_init(page,p,size); + return p; + } + } + + // use regular allocation if it is guaranteed to fit the alignment constraints + if (offset==0 && alignment<=size && size<=MI_MEDIUM_OBJ_SIZE_MAX && (size&align_mask)==0) { + void* p = _mi_heap_malloc_zero(heap, size, zero); + mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); + return p; + } + + // otherwise over-allocate + void* p = _mi_heap_malloc_zero(heap, size + alignment - 1, zero); + if (p == NULL) return NULL; + + // .. and align within the allocation + uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask); + mi_assert_internal(adjust % sizeof(uintptr_t) == 0); + void* aligned_p = (adjust == alignment ? 
p : (void*)((uintptr_t)p + adjust)); + if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true); + mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); + mi_assert_internal( p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p),_mi_ptr_page(aligned_p),aligned_p) ); + return aligned_p; +} + + +mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false); +} + +mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_malloc_aligned_at(heap, size, alignment, 0); +} + +mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true); +} + +mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_zalloc_aligned_at(heap, size, alignment, 0); +} + +mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(count, size, &total)) return NULL; + return mi_heap_zalloc_aligned_at(heap, total, alignment, offset); +} + +mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_calloc_aligned_at(heap,count,size,alignment,0); +} + +mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset); +} + +mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment); +} + +mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset); +} + +mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment); +} + +mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset); +} + +mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { + return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment); +} + + +static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept { + mi_assert(alignment > 0); + if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero); + if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero); + size_t size = mi_usable_size(p); + if (newsize <= size && newsize >= (size - (size / 2)) + && (((uintptr_t)p + offset) % alignment) == 0) { + return p; // reallocation still fits, is aligned and not more than 50% waste + } + else { + void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset); + if (newp != NULL) { + if (zero && newsize > size) { + const mi_page_t* page = _mi_ptr_page(newp); + if (page->is_zero) { 
+ // already zero initialized + mi_assert_expensive(mi_mem_is_zero(newp,newsize)); + } + else { + // also set last word in the previous allocation to zero to ensure any padding is zero-initialized + size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); + memset((uint8_t*)newp + start, 0, newsize - start); + } + } + memcpy(newp, p, (newsize > size ? size : newsize)); + mi_free(p); // only free if successful + } + return newp; + } +} + +static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept { + mi_assert(alignment > 0); + if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero); + size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL) + return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero); +} + +mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false); +} + +mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false); +} + +mi_decl_allocator void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true); +} + +mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true); +} + +mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(newcount, size, &total)) return NULL; + return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset); +} + +mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(newcount, size, &total)) return NULL; + return mi_heap_rezalloc_aligned(heap, p, total, alignment); +} + +mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); +} + +mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment); +} + +mi_decl_allocator void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); +} + +mi_decl_allocator void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { + return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment); +} + +mi_decl_allocator void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { + return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset); +} + +mi_decl_allocator void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) 
mi_attr_noexcept { + return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment); +} + diff --git a/runtime/src/mimalloc/c/alloc-override-osx.c b/runtime/src/mimalloc/c/alloc-override-osx.c new file mode 100644 index 00000000000..fef4b929f4e --- /dev/null +++ b/runtime/src/mimalloc/c/alloc-override-osx.c @@ -0,0 +1,232 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#if !KONAN_MI_MALLOC +#include "mimalloc.h" +#include "mimalloc-internal.h" + +#if defined(MI_MALLOC_OVERRIDE) + +#if !defined(__APPLE__) +#error "this file should only be included on macOS" +#endif + +/* ------------------------------------------------------ + Override system malloc on macOS + This is done through the malloc zone interface. +------------------------------------------------------ */ + +#include +#include +#include // memset + +#if defined(MAC_OS_X_VERSION_10_6) && \ + MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 +// only available from OSX 10.6 +extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import)); +#endif + + +/* ------------------------------------------------------ + malloc zone members +------------------------------------------------------ */ + +static size_t zone_size(malloc_zone_t* zone, const void* p) { + return 0; // as we cannot guarantee that `p` comes from us, just return 0 +} + +static void* zone_malloc(malloc_zone_t* zone, size_t size) { + return mi_malloc(size); +} + +static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) { + return mi_calloc(count, size); +} + +static void* zone_valloc(malloc_zone_t* zone, size_t size) { + return mi_malloc_aligned(size, _mi_os_page_size()); +} + +static void zone_free(malloc_zone_t* zone, void* p) { + return mi_free(p); +} + +static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { + return mi_realloc(p, newsize); +} + +static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) { + return mi_malloc_aligned(size,alignment); +} + +static void zone_destroy(malloc_zone_t* zone) { + // todo: ignore for now? 
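+ // nothing to tear down: the registered zone is a process-lifetime static (see _mi_macos_override_malloc below)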
+} + +static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) { + size_t i; + for (i = 0; i < count; i++) { + ps[i] = zone_malloc(zone, size); + if (ps[i] == NULL) break; + } + return i; +} + +static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) { + for(size_t i = 0; i < count; i++) { + zone_free(zone, ps[i]); + ps[i] = NULL; + } +} + +static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) { + mi_collect(false); + return 0; +} + +static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) { + zone_free(zone,p); +} + + +/* ------------------------------------------------------ + Introspection members +------------------------------------------------------ */ + +static kern_return_t intro_enumerator(task_t task, void* p, + unsigned type_mask, vm_address_t zone_address, + memory_reader_t reader, + vm_range_recorder_t recorder) +{ + // todo: enumerate all memory + return KERN_SUCCESS; +} + +static size_t intro_good_size(malloc_zone_t* zone, size_t size) { + return mi_good_size(size); +} + +static boolean_t intro_check(malloc_zone_t* zone) { + return true; +} + +static void intro_print(malloc_zone_t* zone, boolean_t verbose) { + mi_stats_print(NULL); +} + +static void intro_log(malloc_zone_t* zone, void* p) { + // todo? +} + +static void intro_force_lock(malloc_zone_t* zone) { + // todo? +} + +static void intro_force_unlock(malloc_zone_t* zone) { + // todo? +} + +static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) { + // todo... + stats->blocks_in_use = 0; + stats->size_in_use = 0; + stats->max_size_in_use = 0; + stats->size_allocated = 0; +} + +static boolean_t intro_zone_locked(malloc_zone_t* zone) { + return false; +} + + +/* ------------------------------------------------------ + At process start, override the default allocator +------------------------------------------------------ */ + +static malloc_zone_t* mi_get_default_zone() +{ + // The first returned zone is the real default + malloc_zone_t** zones = NULL; + unsigned count = 0; + kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count); + if (ret == KERN_SUCCESS && count > 0) { + return zones[0]; + } + else { + // fallback + return malloc_default_zone(); + } +} + + +static void __attribute__((constructor)) _mi_macos_override_malloc() +{ + static malloc_introspection_t intro; + memset(&intro, 0, sizeof(intro)); + + intro.enumerator = &intro_enumerator; + intro.good_size = &intro_good_size; + intro.check = &intro_check; + intro.print = &intro_print; + intro.log = &intro_log; + intro.force_lock = &intro_force_lock; + intro.force_unlock = &intro_force_unlock; + + static malloc_zone_t zone; + memset(&zone, 0, sizeof(zone)); + + zone.version = 4; + zone.zone_name = "mimalloc"; + zone.size = &zone_size; + zone.introspect = &intro; + zone.malloc = &zone_malloc; + zone.calloc = &zone_calloc; + zone.valloc = &zone_valloc; + zone.free = &zone_free; + zone.realloc = &zone_realloc; + zone.destroy = &zone_destroy; + zone.batch_malloc = &zone_batch_malloc; + zone.batch_free = &zone_batch_free; + + malloc_zone_t* purgeable_zone = NULL; + +#if defined(MAC_OS_X_VERSION_10_6) && \ + MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 + // switch to version 9 on OSX 10.6 to support memalign. 
+ zone.version = 9; + zone.memalign = &zone_memalign; + zone.free_definite_size = &zone_free_definite_size; + zone.pressure_relief = &zone_pressure_relief; + intro.zone_locked = &intro_zone_locked; + + // force the purgeable zone to exist to avoid strange bugs + if (malloc_default_purgeable_zone) { + purgeable_zone = malloc_default_purgeable_zone(); + } +#endif + + // Register our zone + malloc_zone_register(&zone); + + // Unregister the default zone, this makes our zone the new default + // as that was the last registered. + malloc_zone_t *default_zone = mi_get_default_zone(); + malloc_zone_unregister(default_zone); + + // Reregister the default zone so free and realloc in that zone keep working. + malloc_zone_register(default_zone); + + // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs + // earlier than the default zone. + if (purgeable_zone != NULL) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } +} + +#endif // MI_MALLOC_OVERRIDE +#endif \ No newline at end of file diff --git a/runtime/src/mimalloc/c/alloc-override.c b/runtime/src/mimalloc/c/alloc-override.c new file mode 100644 index 00000000000..3bde565ab35 --- /dev/null +++ b/runtime/src/mimalloc/c/alloc-override.c @@ -0,0 +1,197 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#if !KONAN_MI_MALLOC +#if !defined(MI_IN_ALLOC_C) +#error "this file should be included from 'alloc.c' (so aliases can work)" +#endif + +#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL)) +#error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" +#endif + +#if defined(MI_MALLOC_OVERRIDE) && !defined(_WIN32) + +// ------------------------------------------------------ +// Override system malloc +// ------------------------------------------------------ + +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) + // use aliasing to alias the exported function to one of our `mi_` functions + #if (defined(__GNUC__) && __GNUC__ >= 9) + #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"), copy(fun))) + #else + #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"))) + #endif + #define MI_FORWARD1(fun,x) MI_FORWARD(fun) + #define MI_FORWARD2(fun,x,y) MI_FORWARD(fun) + #define MI_FORWARD3(fun,x,y,z) MI_FORWARD(fun) + #define MI_FORWARD0(fun,x) MI_FORWARD(fun) + #define MI_FORWARD02(fun,x,y) MI_FORWARD(fun) +#else + // use forwarding by calling our `mi_` function + #define MI_FORWARD1(fun,x) { return fun(x); } + #define MI_FORWARD2(fun,x,y) { return fun(x,y); } + #define MI_FORWARD3(fun,x,y,z) { return fun(x,y,z); } + #define MI_FORWARD0(fun,x) { fun(x); } + #define MI_FORWARD02(fun,x,y) { fun(x,y); } +#endif + +#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE) + // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` + // See: + struct mi_interpose_s { + const void* replacement; + const void* target; + }; + #define MI_INTERPOSEX(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } + #define 
MI_INTERPOSE_MI(fun) MI_INTERPOSEX(fun,mi_##fun) + __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = + { + MI_INTERPOSE_MI(malloc), + MI_INTERPOSE_MI(calloc), + MI_INTERPOSE_MI(realloc), + MI_INTERPOSE_MI(free), + MI_INTERPOSE_MI(strdup), + MI_INTERPOSE_MI(strndup) + }; +#elif defined(_MSC_VER) + // cannot override malloc unless using a dll. + // we just override new/delete which does work in a static library. +#else + // On all other systems forward to our API + void* malloc(size_t size) mi_attr_noexcept MI_FORWARD1(mi_malloc, size); + void* calloc(size_t size, size_t n) mi_attr_noexcept MI_FORWARD2(mi_calloc, size, n); + void* realloc(void* p, size_t newsize) mi_attr_noexcept MI_FORWARD2(mi_realloc, p, newsize); + void free(void* p) mi_attr_noexcept MI_FORWARD0(mi_free, p); +#endif + +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) +#pragma GCC visibility push(default) +#endif + +// ------------------------------------------------------ +// Override new/delete +// This is not really necessary as they usually call +// malloc/free anyway, but it improves performance. +// ------------------------------------------------------ +#ifdef __cplusplus + // ------------------------------------------------------ + // With a C++ compiler we override the new/delete operators. + // see + // ------------------------------------------------------ + #include + void operator delete(void* p) noexcept MI_FORWARD0(mi_free,p); + void operator delete[](void* p) noexcept MI_FORWARD0(mi_free,p); + + void* operator new(std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n); + void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n); + + void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); } + void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { UNUSED(tag); return mi_new_nothrow(n); } + + #if (__cplusplus >= 201402L || _MSC_VER >= 1916) + void operator delete (void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n); + void operator delete[](void* p, std::size_t n) MI_FORWARD02(mi_free_size,p,n); + #endif + + #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) + void operator delete (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast(al)); } + void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast(al)); } + void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; + void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; + + void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } + void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } + void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } + void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } + #endif + +#elif (defined(__GNUC__) || defined(__clang__)) + // ------------------------------------------------------ + // Override by defining the mangled C++ names of the operators (as + // used by GCC and CLang). 
+ // See + // ------------------------------------------------------ + void _ZdlPv(void* p) MI_FORWARD0(mi_free,p); // delete + void _ZdaPv(void* p) MI_FORWARD0(mi_free,p); // delete[] + void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n); + void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n); + void _ZdlPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } + void _ZdaPvSt11align_val_t(void* p, size_t al) { mi_free_aligned(p,al); } + void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } + void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); } + + typedef struct mi_nothrow_s { } mi_nothrow_t; + #if (MI_INTPTR_SIZE==8) + void* _Znwm(size_t n) MI_FORWARD1(mi_new,n); // new 64-bit + void* _Znam(size_t n) MI_FORWARD1(mi_new,n); // new[] 64-bit + void* _ZnwmSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al); + void* _ZnamSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al); + void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + #elif (MI_INTPTR_SIZE==4) + void* _Znwj(size_t n) MI_FORWARD1(mi_new,n); // new 64-bit + void* _Znaj(size_t n) MI_FORWARD1(mi_new,n); // new[] 64-bit + void* _ZnwjSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al); + void* _ZnajSt11align_val_t(size_t n, size_t al) MI_FORWARD2(mi_new_aligned, n, al); + void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { UNUSED(tag); return mi_new_nothrow(n); } + void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { UNUSED(tag); return mi_new_aligned_nothrow(n,al); } + #else + #error "define overloads for new/delete for this platform (just for performance, can be skipped)" + #endif +#endif // __cplusplus + + +#ifdef __cplusplus +extern "C" { +#endif + +// ------------------------------------------------------ +// Posix & Unix functions definitions +// ------------------------------------------------------ + +void* reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize); +size_t malloc_size(void* p) MI_FORWARD1(mi_usable_size,p); +size_t malloc_usable_size(void *p) MI_FORWARD1(mi_usable_size,p); +void cfree(void* p) MI_FORWARD0(mi_free, p); + +// no forwarding here due to aliasing/name mangling issues +void* valloc(size_t size) { return mi_valloc(size); } +void* pvalloc(size_t size) { return mi_pvalloc(size); } +void* reallocarray(void* p, size_t count, size_t size) { return mi_reallocarray(p, count, size); } +void* memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } +void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); } +int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); } + +#if defined(__GLIBC__) && defined(__linux__) + // forward __libc interface (needed for 
glibc-based Linux distributions) + void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size); + void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size); + void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size); + void __libc_free(void* p) MI_FORWARD0(mi_free,p); + void __libc_cfree(void* p) MI_FORWARD0(mi_free,p); + + void* __libc_valloc(size_t size) { return mi_valloc(size); } + void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); } + void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); } + int __posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p,alignment,size); } +#endif + +#ifdef __cplusplus +} +#endif + +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__MACH__) +#pragma GCC visibility pop +#endif + +#endif // MI_MALLOC_OVERRIDE && !_WIN32 +#endif diff --git a/runtime/src/mimalloc/c/alloc-posix.c b/runtime/src/mimalloc/c/alloc-posix.c new file mode 100644 index 00000000000..505e42e4489 --- /dev/null +++ b/runtime/src/mimalloc/c/alloc-posix.c @@ -0,0 +1,151 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018,2019, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +// ------------------------------------------------------------------------ +// mi prefixed publi definitions of various Posix, Unix, and C++ functions +// for convenience and used when overriding these functions. +// ------------------------------------------------------------------------ + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +// ------------------------------------------------------ +// Posix & Unix functions definitions +// ------------------------------------------------------ + +#include +#include // memcpy +#include // getenv + +#ifndef EINVAL +#define EINVAL 22 +#endif +#ifndef ENOMEM +#define ENOMEM 12 +#endif + + +size_t mi_malloc_size(const void* p) mi_attr_noexcept { + return mi_usable_size(p); +} + +size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { + return mi_usable_size(p); +} + +void mi_cfree(void* p) mi_attr_noexcept { + if (mi_is_in_heap_region(p)) { + mi_free(p); + } +} + +int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept { + // Note: The spec dictates we should not modify `*p` on an error. 
(issue#27) + // + if (p == NULL) return EINVAL; + if (alignment % sizeof(void*) != 0) return EINVAL; // natural alignment + if (!_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 + void* q = mi_malloc_aligned(size, alignment); + if (q==NULL && size != 0) return ENOMEM; + *p = q; + return 0; +} + +void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { + return mi_malloc_aligned(size, alignment); +} + +void* mi_valloc(size_t size) mi_attr_noexcept { + return mi_malloc_aligned(size, _mi_os_page_size()); +} + +void* mi_pvalloc(size_t size) mi_attr_noexcept { + size_t psize = _mi_os_page_size(); + if (size >= SIZE_MAX - psize) return NULL; // overflow + size_t asize = ((size + psize - 1) / psize) * psize; + return mi_malloc_aligned(asize, psize); +} + +void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { + if (alignment==0 || !_mi_is_power_of_two(alignment)) return NULL; + if ((size&(alignment-1)) != 0) return NULL; // C11 requires integral multiple, see + return mi_malloc_aligned(size, alignment); +} + +void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD + void* newp = mi_reallocn(p,count,size); + if (newp==NULL) errno = ENOMEM; + return newp; +} + +void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft + void* res = mi_expand(p, newsize); + if (res == NULL) errno = ENOMEM; + return res; +} + +unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { + if (s==NULL) return NULL; + size_t len; + for(len = 0; s[len] != 0; len++) { } + size_t size = (len+1)*sizeof(unsigned short); + unsigned short* p = (unsigned short*)mi_malloc(size); + if (p != NULL) { + memcpy(p,s,size); + } + return p; +} + +unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { + return (unsigned char*)mi_strdup((const char*)s); +} + +int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept { + if (buf==NULL || name==NULL) return EINVAL; + if (size != NULL) *size = 0; + #pragma warning(suppress:4996) + char* p = getenv(name); + if (p==NULL) { + *buf = NULL; + } + else { + *buf = mi_strdup(p); + if (*buf==NULL) return ENOMEM; + if (size != NULL) *size = strlen(p); + } + return 0; +} + +int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept { + if (buf==NULL || name==NULL) return EINVAL; + if (size != NULL) *size = 0; +#if !defined(_WIN32) || (defined(WINAPI_FAMILY) && (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP)) + // not supported + *buf = NULL; + return EINVAL; +#else + #pragma warning(suppress:4996) + unsigned short* p = (unsigned short*)_wgetenv((const wchar_t*)name); + if (p==NULL) { + *buf = NULL; + } + else { + *buf = mi_wcsdup(p); + if (*buf==NULL) return ENOMEM; + if (size != NULL) *size = wcslen((const wchar_t*)p); + } + return 0; +#endif +} + +void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft + return mi_recalloc_aligned_at(p, newcount, size, alignment, offset); +} + +void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft + return mi_recalloc_aligned(p, newcount, size, alignment); +} diff --git a/runtime/src/mimalloc/c/alloc.c b/runtime/src/mimalloc/c/alloc.c new file mode 100644 index 00000000000..e68b48d2025 --- /dev/null +++ b/runtime/src/mimalloc/c/alloc.c @@ -0,0 +1,707 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, 
Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +#include // memset, memcpy, strlen +#include // malloc, exit + +#define MI_IN_ALLOC_C +#include "alloc-override.c" +#undef MI_IN_ALLOC_C + +// ------------------------------------------------------ +// Allocation +// ------------------------------------------------------ + +// Fast allocation in a page: just pop from the free list. +// Fall back to generic allocation only if the list is empty. +extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { + mi_assert_internal(page->block_size==0||page->block_size >= size); + mi_block_t* block = page->free; + if (mi_unlikely(block == NULL)) { + return _mi_malloc_generic(heap, size); // slow path + } + mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); + // pop from the free list + page->free = mi_block_next(page,block); + page->used++; + mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); +#if (MI_DEBUG!=0) + if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); } +#elif (MI_SECURE!=0) + block->next = 0; // don't leak internal data +#endif +#if (MI_STAT>1) + if(size <= MI_LARGE_OBJ_SIZE_MAX) { + size_t bin = _mi_bin(size); + mi_heap_stat_increase(heap,normal[bin], 1); + } +#endif + return block; +} + +// allocate a small block +extern inline mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { + mi_assert(size <= MI_SMALL_SIZE_MAX); + mi_page_t* page = _mi_heap_get_free_small_page(heap,size); + return _mi_page_malloc(heap, page, size); +} + +extern inline mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept { + return mi_heap_malloc_small(mi_get_default_heap(), size); +} + + +// zero initialized small block +mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept { + void* p = mi_malloc_small(size); + if (p != NULL) { memset(p, 0, size); } + return p; +} + +// The main allocation function +extern inline mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { + mi_assert(heap!=NULL); + mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + void* p; + if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { + p = mi_heap_malloc_small(heap, size); + } + else { + p = _mi_malloc_generic(heap, size); + } + #if MI_STAT>1 + if (p != NULL) { + if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + mi_heap_stat_increase( heap, malloc, mi_good_size(size) ); // overestimate for aligned sizes + } + #endif + return p; +} + +extern inline mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept { + return mi_heap_malloc(mi_get_default_heap(), size); +} + +void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size) { + // note: we need to initialize the whole block to zero, not just size + // or the recalloc/rezalloc functions cannot safely expand in place (see issue #63) + UNUSED(size); + mi_assert_internal(p != NULL); + mi_assert_internal(size > 0 && page->block_size >= size); + mi_assert_internal(_mi_ptr_page(p)==page); + if (page->is_zero) { + // already zero initialized memory? 
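+ // only the block's first word (the free-list link) can be stale here, so clearing it is enough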
+ ((mi_block_t*)p)->next = 0; // clear the free list pointer + mi_assert_expensive(mi_mem_is_zero(p,page->block_size)); + } + else { + // otherwise memset + memset(p, 0, page->block_size); + } +} + +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) { + void* p = mi_heap_malloc(heap,size); + if (zero && p != NULL) { + _mi_block_zero_init(_mi_ptr_page(p),p,size); // todo: can we avoid getting the page again? + } + return p; +} + +extern inline mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { + return _mi_heap_malloc_zero(heap, size, true); +} + +mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept { + return mi_heap_zalloc(mi_get_default_heap(),size); +} + + +// ------------------------------------------------------ +// Check for double free in secure and debug mode +// This is somewhat expensive so only enabled for secure mode 4 +// ------------------------------------------------------ + +#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) +// linear check if the free list contains a specific element +static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { + while (list != NULL) { + if (elem==list) return true; + list = mi_block_next(page, list); + } + return false; +} + +static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block, const mi_block_t* n) { + size_t psize; + uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); + if (n == NULL || ((uint8_t*)n >= pstart && (uint8_t*)n < (pstart + psize))) { + // Suspicious: the decoded value is in the same page (or NULL). + // Walk the free lists to verify positively if it is already freed + if (mi_list_contains(page, page->free, block) || + mi_list_contains(page, page->local_free, block) || + mi_list_contains(page, (const mi_block_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&page->thread_free)), block)) + { + _mi_fatal_error("double free detected of block %p with size %zu\n", block, page->block_size); + return true; + } + } + return false; +} + +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + mi_block_t* n = mi_block_nextx(page, block, page->cookie); // pretend it is freed, and get the decoded first field + if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? + (n==NULL || mi_is_in_same_segment(block, n))) // quick check: in same segment or NULL? + { + // Suspicous: decoded value in block is in the same segment (or NULL) -- maybe a double free? 
+ // (continue in separate function to improve code generation) + return mi_check_is_double_freex(page, block, n); + } + return false; +} +#else +static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { + UNUSED(page); + UNUSED(block); + return false; +} +#endif + + +// ------------------------------------------------------ +// Free +// ------------------------------------------------------ + +// multi-threaded free +static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + mi_thread_free_t tfree; + mi_thread_free_t tfreex; + bool use_delayed; + + mi_segment_t* segment = _mi_page_segment(page); + if (segment->page_kind==MI_PAGE_HUGE) { + // huge page segments are always abandoned and can be freed immediately + mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0); + mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&segment->abandoned_next))==NULL); + // claim it and free + mi_heap_t* heap = mi_get_default_heap(); + // paranoia: if this it the last reference, the cas should always succeed + if (mi_atomic_cas_strong(&segment->thread_id,heap->thread_id,0)) { + mi_block_set_next(page, block, page->free); + page->free = block; + page->used--; + page->is_zero = false; + mi_assert(page->used == 0); + mi_tld_t* tld = heap->tld; + if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) { + _mi_stat_decrease(&tld->stats.giant, page->block_size); + } + else { + _mi_stat_decrease(&tld->stats.huge, page->block_size); + } + _mi_segment_page_free(page,true,&tld->segments); + } + return; + } + + do { + tfree = page->thread_free; + use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE || + (mi_tf_delayed(tfree) == MI_NO_DELAYED_FREE && page->used == mi_atomic_read_relaxed(&page->thread_freed)+1) // data-race but ok, just optimizes early release of the page + ); + if (mi_unlikely(use_delayed)) { + // unlikely: this only happens on the first concurrent free in a page that is in the full list + tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); + } + else { + // usual: directly add to page thread_free list + mi_block_set_next(page, block, mi_tf_block(tfree)); + tfreex = mi_tf_set_block(tfree,block); + } + } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); + + if (mi_likely(!use_delayed)) { + // increment the thread free count and return + mi_atomic_increment(&page->thread_freed); + } + else { + // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) + mi_heap_t* heap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap)); + mi_assert_internal(heap != NULL); + if (heap != NULL) { + // add to the delayed free list of this heap. 
(do this atomically as the lock only protects heap memory validity) + mi_block_t* dfree; + do { + dfree = (mi_block_t*)heap->thread_delayed_free; + mi_block_set_nextx(heap,block,dfree, heap->cookie); + } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree)); + } + + // and reset the MI_DELAYED_FREEING flag + do { + tfreex = tfree = page->thread_free; + mi_assert_internal(mi_tf_delayed(tfree) == MI_NEVER_DELAYED_FREE || mi_tf_delayed(tfree) == MI_DELAYED_FREEING); + if (mi_tf_delayed(tfree) != MI_NEVER_DELAYED_FREE) tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); + } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); + } +} + + +// regular free +static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block) +{ + #if (MI_DEBUG) + memset(block, MI_DEBUG_FREED, page->block_size); + #endif + + // and push it on the free list + if (mi_likely(local)) { + // owning thread can free a block directly + if (mi_check_is_double_free(page, block)) return; + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + page->used--; + if (mi_unlikely(mi_page_all_free(page))) { + _mi_page_retire(page); + } + else if (mi_unlikely(mi_page_is_in_full(page))) { + _mi_page_unfull(page); + } + } + else { + _mi_free_block_mt(page,block); + } +} + + +// Adjust a block that was allocated aligned, to the actual start of the block in the page. +mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) { + mi_assert_internal(page!=NULL && p!=NULL); + size_t diff = (uint8_t*)p - _mi_page_start(segment, page, NULL); + size_t adjust = (diff % page->block_size); + return (mi_block_t*)((uintptr_t)p - adjust); +} + + +static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool local, void* p) { + mi_block_t* block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p); + _mi_free_block(page, local, block); +} + +// Free a block +void mi_free(void* p) mi_attr_noexcept +{ +#if (MI_DEBUG>0) + if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) { + _mi_error_message("trying to free an invalid (unaligned) pointer: %p\n", p); + return; + } +#endif + + const mi_segment_t* const segment = _mi_ptr_segment(p); + if (mi_unlikely(segment == NULL)) return; // checks for (p==NULL) + +#if (MI_DEBUG!=0) + if (mi_unlikely(!mi_is_in_heap_region(p))) { + _mi_warning_message("possibly trying to free a pointer that does not point to a valid heap region: 0x%p\n" + "(this may still be a valid very large allocation (over 64MiB))\n", p); + if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) { + _mi_warning_message("(yes, the previous pointer 0x%p was valid after all)\n", p); + } + } +#endif +#if (MI_DEBUG!=0 || MI_SECURE>=4) + if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) { + _mi_error_message("trying to free a pointer that does not point to a valid heap space: %p\n", p); + return; + } +#endif + + const uintptr_t tid = _mi_thread_id(); + mi_page_t* const page = _mi_segment_page_of(segment, p); + +#if (MI_STAT>1) + mi_heap_t* heap = mi_heap_get_default(); + mi_heap_stat_decrease(heap, malloc, mi_usable_size(p)); + if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap, normal[_mi_bin(page->block_size)], 1); + } + // huge page stat is accounted for in `_mi_page_retire` +#endif + + if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks + // local, and not full or aligned + mi_block_t* block = (mi_block_t*)p; + if (mi_check_is_double_free(page,block)) return; + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + page->used--; + if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } + } + else { + // non-local, aligned blocks, or a full page; use the more generic path + mi_free_generic(segment, page, tid == segment->thread_id, p); + } +} + +bool _mi_free_delayed_block(mi_block_t* block) { + // get segment and page + const mi_segment_t* segment = _mi_ptr_segment(block); + mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(_mi_thread_id() == segment->thread_id); + mi_page_t* page = _mi_segment_page_of(segment, block); + if (mi_tf_delayed(page->thread_free) == MI_DELAYED_FREEING) { + // we might already start delayed freeing while another thread has not yet + // reset the delayed_freeing flag; in that case don't free it quite yet if + // this is the last block remaining. + if (page->used - page->thread_freed == 1) return false; + } + _mi_free_block(page,true,block); + return true; +} + +// Bytes available in a block +size_t mi_usable_size(const void* p) mi_attr_noexcept { + if (p==NULL) return 0; + const mi_segment_t* segment = _mi_ptr_segment(p); + const mi_page_t* page = _mi_segment_page_of(segment,p); + size_t size = page->block_size; + if (mi_unlikely(mi_page_has_aligned(page))) { + ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)_mi_page_ptr_unalign(segment,page,p); + mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); + return (size - adjust); + } + else { + return size; + } +} + + +// ------------------------------------------------------ +// ensure explicit external inline definitions are emitted! 
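+// (Descriptive note, inferred from the `_mi_externs` array below: when this
+// file is compiled as C++, the `extern inline` functions above are only
+// emitted out-of-line if they are odr-used; taking their addresses in a
+// dummy array forces the compiler to emit the definitions.)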
+// ------------------------------------------------------ + +#ifdef __cplusplus +void* _mi_externs[] = { + (void*)&_mi_page_malloc, + (void*)&mi_malloc, + (void*)&mi_malloc_small, + (void*)&mi_heap_malloc, + (void*)&mi_heap_zalloc, + (void*)&mi_heap_malloc_small +}; +#endif + + +// ------------------------------------------------------ +// Allocation extensions +// ------------------------------------------------------ + +void mi_free_size(void* p, size_t size) mi_attr_noexcept { + UNUSED_RELEASE(size); + mi_assert(p == NULL || size <= mi_usable_size(p)); + mi_free(p); +} + +void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { + UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free_size(p,size); +} + +void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { + UNUSED_RELEASE(alignment); + mi_assert(((uintptr_t)p % alignment) == 0); + mi_free(p); +} + +extern inline mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(count,size,&total)) return NULL; + return mi_heap_zalloc(heap,total); +} + +mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept { + return mi_heap_calloc(mi_get_default_heap(),count,size); +} + +// Uninitialized `calloc` +extern mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(count,size,&total)) return NULL; + return mi_heap_malloc(heap, total); +} + +mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { + return mi_heap_mallocn(mi_get_default_heap(),count,size); +} + +// Expand in place or fail +mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept { + if (p == NULL) return NULL; + size_t size = mi_usable_size(p); + if (newsize > size) return NULL; + return p; // it fits +} + +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) { + if (p == NULL) return _mi_heap_malloc_zero(heap,newsize,zero); + size_t size = mi_usable_size(p); + if (newsize <= size && newsize >= (size / 2)) { + return p; // reallocation still fits and not more than 50% waste + } + void* newp = mi_heap_malloc(heap,newsize); + if (mi_likely(newp != NULL)) { + if (zero && newsize > size) { + // also set last word in the previous allocation to zero to ensure any padding is zero-initialized + size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); + memset((uint8_t*)newp + start, 0, newsize - start); + } + memcpy(newp, p, (newsize > size ? 
size : newsize)); + mi_free(p); // only free if successful + } + return newp; +} + +mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { + return _mi_heap_realloc_zero(heap, p, newsize, false); +} + +mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(count, size, &total)) return NULL; + return mi_heap_realloc(heap, p, total); +} + + +// Reallocate but free `p` on errors +mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { + void* newp = mi_heap_realloc(heap, p, newsize); + if (newp==NULL && p!=NULL) mi_free(p); + return newp; +} + +mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept { + return _mi_heap_realloc_zero(heap, p, newsize, true); +} + +mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept { + size_t total; + if (mi_mul_overflow(count, size, &total)) return NULL; + return mi_heap_rezalloc(heap, p, total); +} + + +mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { + return mi_heap_realloc(mi_get_default_heap(),p,newsize); +} + +mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { + return mi_heap_reallocn(mi_get_default_heap(),p,count,size); +} + +// Reallocate but free `p` on errors +mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { + return mi_heap_reallocf(mi_get_default_heap(),p,newsize); +} + +mi_decl_allocator void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { + return mi_heap_rezalloc(mi_get_default_heap(), p, newsize); +} + +mi_decl_allocator void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { + return mi_heap_recalloc(mi_get_default_heap(), p, count, size); +} + + + +// ------------------------------------------------------ +// strdup, strndup, and realpath +// ------------------------------------------------------ + +// `strdup` using mi_malloc +char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept { + if (s == NULL) return NULL; + size_t n = strlen(s); + char* t = (char*)mi_heap_malloc(heap,n+1); + if (t != NULL) memcpy(t, s, n + 1); + return t; +} + +char* mi_strdup(const char* s) mi_attr_noexcept { + return mi_heap_strdup(mi_get_default_heap(), s); +} + +// `strndup` using mi_malloc +char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept { + if (s == NULL) return NULL; + size_t m = strlen(s); + if (n > m) n = m; + char* t = (char*)mi_heap_malloc(heap, n+1); + if (t == NULL) return NULL; + memcpy(t, s, n); + t[n] = 0; + return t; +} + +char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { + return mi_heap_strndup(mi_get_default_heap(),s,n); +} + +#ifndef __wasi__ +// `realpath` using mi_malloc +#ifdef _WIN32 +#ifndef PATH_MAX +#define PATH_MAX MAX_PATH +#endif +#include +#include +char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { + // todo: use GetFullPathNameW to allow longer file names + char buf[PATH_MAX]; + DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? 
buf : resolved_name), NULL); + if (res == 0) { + errno = GetLastError(); return NULL; + } + else if (res > PATH_MAX) { + errno = EINVAL; return NULL; + } + else if (resolved_name != NULL) { + return resolved_name; + } + else { + return mi_heap_strndup(heap, buf, PATH_MAX); + } +} +#else +#include // pathconf +static size_t mi_path_max() { + static size_t path_max = 0; + if (path_max <= 0) { + long m = pathconf("/",_PC_PATH_MAX); + if (m <= 0) path_max = 4096; // guess + else if (m < 256) path_max = 256; // at least 256 + else path_max = m; + } + return path_max; +} + +char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { + if (resolved_name != NULL) { + return realpath(fname,resolved_name); + } + else { + size_t n = mi_path_max(); + char* buf = (char*)mi_malloc(n+1); + if (buf==NULL) return NULL; + char* rname = realpath(fname,buf); + char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL` + mi_free(buf); + return result; + } +} +#endif + +char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { + return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name); +} +#endif + +/*------------------------------------------------------- +C++ new and new_aligned +The standard requires calling into `get_new_handler` and +throwing the bad_alloc exception on failure. If we compile +with a C++ compiler we can implement this precisely. If we +use a C compiler we cannot throw a `bad_alloc` exception +but we call `exit` instead (i.e. not returning). +-------------------------------------------------------*/ + +#ifdef __cplusplus +#include +static bool mi_try_new_handler(bool nothrow) { + std::new_handler h = std::get_new_handler(); + if (h==NULL) { + if (!nothrow) throw std::bad_alloc(); + return false; + } + else { + h(); + return true; + } +} +#else +#include +#ifndef ENOMEM +#define ENOMEM 12 +#endif +typedef void (*std_new_handler_t)(); + +#if (defined(__GNUC__) || defined(__clang__)) +std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv() { + return NULL; +} +std_new_handler_t mi_get_new_handler() { + return _ZSt15get_new_handlerv(); +} +#else +// note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`. 
+std_new_handler_t mi_get_new_handler() { + return NULL; +} +#endif + +static bool mi_try_new_handler(bool nothrow) { + std_new_handler_t h = mi_get_new_handler(); + if (h==NULL) { + if (!nothrow) exit(ENOMEM); + return false; + } + else { + h(); + return true; + } +} +#endif + +static mi_decl_noinline void* mi_try_new(size_t n, bool nothrow ) { + void* p = NULL; + while(p == NULL && mi_try_new_handler(nothrow)) { + p = mi_malloc(n); + } + return p; +} + +void* mi_new(size_t n) { + void* p = mi_malloc(n); + if (mi_unlikely(p == NULL)) return mi_try_new(n,false); + return p; +} + +void* mi_new_aligned(size_t n, size_t alignment) { + void* p; + do { p = mi_malloc_aligned(n, alignment); } + while(p == NULL && mi_try_new_handler(false)); + return p; +} + +void* mi_new_nothrow(size_t n) { + void* p = mi_malloc(n); + if (mi_unlikely(p == NULL)) return mi_try_new(n,true); + return p; +} + +void* mi_new_aligned_nothrow(size_t n, size_t alignment) { + void* p; + do { p = mi_malloc_aligned(n, alignment); } + while (p == NULL && mi_try_new_handler(true)); + return p; +} diff --git a/runtime/src/mimalloc/c/heap.c b/runtime/src/mimalloc/c/heap.c new file mode 100644 index 00000000000..daa9b241c34 --- /dev/null +++ b/runtime/src/mimalloc/c/heap.c @@ -0,0 +1,527 @@ +/*---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +#include // memset, memcpy + + +/* ----------------------------------------------------------- + Helpers +----------------------------------------------------------- */ + +// return `true` if ok, `false` to break +typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2); + +// Visit all pages in a heap; returns `false` if break was called. 
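+// A minimal sketch of such a visitor, assuming only the typedef above (the
+// name `mi_heap_page_count_visitor` is illustrative, not part of mimalloc):
+//
+//   static bool mi_heap_page_count_visitor(mi_heap_t* heap, mi_page_queue_t* pq,
+//                                          mi_page_t* page, void* arg1, void* arg2) {
+//     UNUSED(heap); UNUSED(pq); UNUSED(page); UNUSED(arg2);
+//     (*(size_t*)arg1)++;   // tally one page
+//     return true;          // keep visiting
+//   }
+//
+//   // usage: size_t n = 0; mi_heap_visit_pages(heap, &mi_heap_page_count_visitor, &n, NULL);
+//
+// The generic walker that drives such visitors: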
+static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2) +{ + if (heap==NULL || heap->page_count==0) return 0; + + // visit all pages + #if MI_DEBUG>1 + size_t total = heap->page_count; + #endif + size_t count = 0; + for (size_t i = 0; i <= MI_BIN_FULL; i++) { + mi_page_queue_t* pq = &heap->pages[i]; + mi_page_t* page = pq->first; + while(page != NULL) { + mi_page_t* next = page->next; // save next in case the page gets removed from the queue + mi_assert_internal(page->heap == heap); + count++; + if (!fn(heap, pq, page, arg1, arg2)) return false; + page = next; // and continue + } + } + mi_assert_internal(count == total); + return true; +} + + +#if MI_DEBUG>1 +static bool _mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { + UNUSED(arg1); + UNUSED(arg2); + UNUSED(pq); + mi_assert_internal(page->heap == heap); + mi_segment_t* segment = _mi_page_segment(page); + mi_assert_internal(segment->thread_id == heap->thread_id); + mi_assert_expensive(_mi_page_is_valid(page)); + return true; +} + +static bool mi_heap_is_valid(mi_heap_t* heap) { + mi_assert_internal(heap!=NULL); + mi_heap_visit_pages(heap, &_mi_heap_page_is_valid, NULL, NULL); + return true; +} +#endif + + + + +/* ----------------------------------------------------------- + "Collect" pages by migrating `local_free` and `thread_free` + lists and freeing empty pages. This is done when a thread + stops (and in that case abandons pages if there are still + blocks alive) +----------------------------------------------------------- */ + +typedef enum mi_collect_e { + NORMAL, + FORCE, + ABANDON +} mi_collect_t; + + +static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) { + UNUSED(arg2); + UNUSED(heap); + mi_collect_t collect = *((mi_collect_t*)arg_collect); + _mi_page_free_collect(page, collect >= ABANDON); + if (mi_page_all_free(page)) { + // no more used blocks, free the page. TODO: should we retire here and be less aggressive? + _mi_page_free(page, pq, collect != NORMAL); + } + else if (collect == ABANDON) { + // still used blocks but the thread is done; abandon the page + _mi_page_abandon(page, pq); + } + return true; // don't break +} + +static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { + UNUSED(arg1); + UNUSED(arg2); + UNUSED(heap); + UNUSED(pq); + _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE); + return true; // don't break +} + +static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) +{ + if (!mi_heap_is_initialized(heap)) return; + _mi_deferred_free(heap, collect > NORMAL); + + // collect (some) abandoned pages + if (collect >= NORMAL && !heap->no_reclaim) { + if (collect == NORMAL) { + // this may free some segments (but also take ownership of abandoned pages) + _mi_segment_try_reclaim_abandoned(heap, false, &heap->tld->segments); + } + #if MI_DEBUG + else if (collect == ABANDON && _mi_is_main_thread() && mi_heap_is_backing(heap)) { + // the main thread is abandoned, try to free all abandoned segments. + // if all memory is freed by now, all segments should be freed. 
+ _mi_segment_try_reclaim_abandoned(heap, true, &heap->tld->segments); + } + #endif + } + + // if abandoning, mark all pages to no longer add to delayed_free + if (collect == ABANDON) { + //for (mi_page_t* page = heap->pages[MI_BIN_FULL].first; page != NULL; page = page->next) { + // _mi_page_use_delayed_free(page, false); // set thread_free.delayed to MI_NO_DELAYED_FREE + //} + mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL); + } + + // free thread delayed blocks. + // (if abandoning, after this there are no more local references into the pages.) + _mi_heap_delayed_free(heap); + + // collect all pages owned by this thread + mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); + mi_assert_internal( collect != ABANDON || heap->thread_delayed_free == NULL ); + + // collect segment caches + if (collect >= FORCE) { + _mi_segment_thread_collect(&heap->tld->segments); + } + + // collect regions + if (collect >= FORCE && _mi_is_main_thread()) { + _mi_mem_collect(&heap->tld->stats); + } +} + +void _mi_heap_collect_abandon(mi_heap_t* heap) { + mi_heap_collect_ex(heap, ABANDON); +} + +void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept { + mi_heap_collect_ex(heap, (force ? FORCE : NORMAL)); +} + +void mi_collect(bool force) mi_attr_noexcept { + mi_heap_collect(mi_get_default_heap(), force); +} + + +/* ----------------------------------------------------------- + Heap new +----------------------------------------------------------- */ + +mi_heap_t* mi_heap_get_default(void) { + mi_thread_init(); + return mi_get_default_heap(); +} + +mi_heap_t* mi_heap_get_backing(void) { + mi_heap_t* heap = mi_heap_get_default(); + mi_assert_internal(heap!=NULL); + mi_heap_t* bheap = heap->tld->heap_backing; + mi_assert_internal(bheap!=NULL); + mi_assert_internal(bheap->thread_id == _mi_thread_id()); + return bheap; +} + +uintptr_t _mi_heap_random(mi_heap_t* heap) { + uintptr_t r = heap->random; + heap->random = _mi_random_shuffle(r); + return r; +} + +mi_heap_t* mi_heap_new(void) { + mi_heap_t* bheap = mi_heap_get_backing(); + mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); + if (heap==NULL) return NULL; + memcpy(heap, &_mi_heap_empty, sizeof(mi_heap_t)); + heap->tld = bheap->tld; + heap->thread_id = _mi_thread_id(); + heap->cookie = ((uintptr_t)heap ^ _mi_heap_random(bheap)) | 1; + heap->random = _mi_heap_random(bheap); + heap->no_reclaim = true; // don't reclaim abandoned pages or otherwise destroy is unsafe + return heap; +} + +// zero out the page queues +static void mi_heap_reset_pages(mi_heap_t* heap) { + mi_assert_internal(mi_heap_is_initialized(heap)); + // TODO: copy full empty heap instead? + memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); +#ifdef MI_MEDIUM_DIRECT + memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium)); +#endif + memcpy(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); + heap->thread_delayed_free = NULL; + heap->page_count = 0; +} + +// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. 
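+// For context, a commented usage sketch of the two tear-down paths (the local
+// names below are illustrative): `mi_heap_delete` migrates any live pages to
+// the backing heap, while `mi_heap_destroy` frees every page of the heap.
+//
+//   mi_heap_t* h = mi_heap_new();
+//   void* p = mi_heap_malloc(h, 128);
+//   mi_heap_delete(h);       // `p` stays valid; its page moves to the backing heap
+//   // ...or instead, when no block from `h` is referenced anymore:
+//   // mi_heap_destroy(h);   // frees all pages of `h`, invalidating `p`
+//
+// Freeing the heap structure itself: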
+static void mi_heap_free(mi_heap_t* heap) { + mi_assert_internal(mi_heap_is_initialized(heap)); + if (mi_heap_is_backing(heap)) return; // dont free the backing heap + + // reset default + if (mi_heap_is_default(heap)) { + _mi_heap_set_default_direct(heap->tld->heap_backing); + } + // and free the used memory + mi_free(heap); +} + + +/* ----------------------------------------------------------- + Heap destroy +----------------------------------------------------------- */ + +static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { + UNUSED(arg1); + UNUSED(arg2); + UNUSED(heap); + UNUSED(pq); + + // ensure no more thread_delayed_free will be added + _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE); + + // stats + if (page->block_size > MI_LARGE_OBJ_SIZE_MAX) { + if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) { + _mi_stat_decrease(&heap->tld->stats.giant,page->block_size); + } + else { + _mi_stat_decrease(&heap->tld->stats.huge, page->block_size); + } + } + #if (MI_STAT>1) + size_t inuse = page->used - page->thread_freed; + if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) { + mi_heap_stat_decrease(heap,normal[_mi_bin(page->block_size)], inuse); + } + mi_heap_stat_decrease(heap,malloc, page->block_size * inuse); // todo: off for aligned blocks... + #endif + + // pretend it is all free now + mi_assert_internal(page->thread_freed<=0xFFFF); + page->used = (uint16_t)page->thread_freed; + + // and free the page + _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments); + + return true; // keep going +} + +void _mi_heap_destroy_pages(mi_heap_t* heap) { + mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL); + mi_heap_reset_pages(heap); +} + +void mi_heap_destroy(mi_heap_t* heap) { + mi_assert(mi_heap_is_initialized(heap)); + mi_assert(heap->no_reclaim); + mi_assert_expensive(mi_heap_is_valid(heap)); + if (!mi_heap_is_initialized(heap)) return; + if (!heap->no_reclaim) { + // don't free in case it may contain reclaimed pages + mi_heap_delete(heap); + } + else { + // free all pages + _mi_heap_destroy_pages(heap); + mi_heap_free(heap); + } +} + + + +/* ----------------------------------------------------------- + Safe Heap delete +----------------------------------------------------------- */ + +// Tranfer the pages from one heap to the other +static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { + mi_assert_internal(heap!=NULL); + if (from==NULL || from->page_count == 0) return; + + // unfull all full pages in the `from` heap + mi_page_t* page = from->pages[MI_BIN_FULL].first; + while (page != NULL) { + mi_page_t* next = page->next; + _mi_page_unfull(page); + page = next; + } + mi_assert_internal(from->pages[MI_BIN_FULL].first == NULL); + + // free outstanding thread delayed free blocks + _mi_heap_delayed_free(from); + + // transfer all pages by appending the queues; this will set + // a new heap field which is ok as all pages are unfull'd and thus + // other threads won't access this field anymore (see `mi_free_block_mt`) + for (size_t i = 0; i < MI_BIN_FULL; i++) { + mi_page_queue_t* pq = &heap->pages[i]; + mi_page_queue_t* append = &from->pages[i]; + size_t pcount = _mi_page_queue_append(heap, pq, append); + heap->page_count += pcount; + from->page_count -= pcount; + } + mi_assert_internal(from->thread_delayed_free == NULL); + mi_assert_internal(from->page_count == 0); + + // and reset the `from` heap + mi_heap_reset_pages(from); +} + +// Safe delete a heap without freeing any still allocated blocks in 
that heap. +void mi_heap_delete(mi_heap_t* heap) +{ + mi_assert(mi_heap_is_initialized(heap)); + mi_assert_expensive(mi_heap_is_valid(heap)); + if (!mi_heap_is_initialized(heap)) return; + + if (!mi_heap_is_backing(heap)) { + // tranfer still used pages to the backing heap + mi_heap_absorb(heap->tld->heap_backing, heap); + } + else { + // the backing heap abandons its pages + _mi_heap_collect_abandon(heap); + } + mi_assert_internal(heap->page_count==0); + mi_heap_free(heap); +} + +mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { + mi_assert(mi_heap_is_initialized(heap)); + if (!mi_heap_is_initialized(heap)) return NULL; + mi_assert_expensive(mi_heap_is_valid(heap)); + mi_heap_t* old = mi_get_default_heap(); + _mi_heap_set_default_direct(heap); + return old; +} + + + + +/* ----------------------------------------------------------- + Analysis +----------------------------------------------------------- */ + +// static since it is not thread safe to access heaps from other threads. +static mi_heap_t* mi_heap_of_block(const void* p) { + if (p == NULL) return NULL; + mi_segment_t* segment = _mi_ptr_segment(p); + bool valid = (_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(valid); + if (mi_unlikely(!valid)) return NULL; + return _mi_segment_page_of(segment,p)->heap; +} + +bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { + mi_assert(heap != NULL); + if (!mi_heap_is_initialized(heap)) return false; + return (heap == mi_heap_of_block(p)); +} + + +static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) { + UNUSED(heap); + UNUSED(pq); + bool* found = (bool*)vfound; + mi_segment_t* segment = _mi_page_segment(page); + void* start = _mi_page_start(segment, page, NULL); + void* end = (uint8_t*)start + (page->capacity * page->block_size); + *found = (p >= start && p < end); + return (!*found); // continue if not found +} + +bool mi_heap_check_owned(mi_heap_t* heap, const void* p) { + mi_assert(heap != NULL); + if (!mi_heap_is_initialized(heap)) return false; + if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false; // only aligned pointers + bool found = false; + mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found); + return found; +} + +bool mi_check_owned(const void* p) { + return mi_heap_check_owned(mi_get_default_heap(), p); +} + +/* ----------------------------------------------------------- + Visit all heap blocks and areas + Todo: enable visiting abandoned pages, and + enable visiting all blocks of all heaps across threads +----------------------------------------------------------- */ + +// Separate struct to keep `mi_page_t` out of the public interface +typedef struct mi_heap_area_ex_s { + mi_heap_area_t area; + mi_page_t* page; +} mi_heap_area_ex_t; + +static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) { + mi_assert(xarea != NULL); + if (xarea==NULL) return true; + const mi_heap_area_t* area = &xarea->area; + mi_page_t* page = xarea->page; + mi_assert(page != NULL); + if (page == NULL) return true; + + _mi_page_free_collect(page,true); + mi_assert_internal(page->local_free == NULL); + if (page->used == 0) return true; + + size_t psize; + uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize); + + if (page->capacity == 1) { + // optimize page with one block + mi_assert_internal(page->used == 1 && page->free == NULL); + return visitor(page->heap, area, pstart, page->block_size, arg); + } + + // create a 
bitmap of free blocks. + #define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) + uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; + memset(free_map, 0, sizeof(free_map)); + + size_t free_count = 0; + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + free_count++; + mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); + size_t offset = (uint8_t*)block - pstart; + mi_assert_internal(offset % page->block_size == 0); + size_t blockidx = offset / page->block_size; // Todo: avoid division? + mi_assert_internal( blockidx < MI_MAX_BLOCKS); + size_t bitidx = (blockidx / sizeof(uintptr_t)); + size_t bit = blockidx - (bitidx * sizeof(uintptr_t)); + free_map[bitidx] |= ((uintptr_t)1 << bit); + } + mi_assert_internal(page->capacity == (free_count + page->used)); + + // walk through all blocks skipping the free ones + size_t used_count = 0; + for (size_t i = 0; i < page->capacity; i++) { + size_t bitidx = (i / sizeof(uintptr_t)); + size_t bit = i - (bitidx * sizeof(uintptr_t)); + uintptr_t m = free_map[bitidx]; + if (bit == 0 && m == UINTPTR_MAX) { + i += (sizeof(uintptr_t) - 1); // skip a run of free blocks + } + else if ((m & ((uintptr_t)1 << bit)) == 0) { + used_count++; + uint8_t* block = pstart + (i * page->block_size); + if (!visitor(page->heap, area, block, page->block_size, arg)) return false; + } + } + mi_assert_internal(page->used == used_count); + return true; +} + +typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); + + +static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { + UNUSED(heap); + UNUSED(pq); + mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; + mi_heap_area_ex_t xarea; + xarea.page = page; + xarea.area.reserved = page->reserved * page->block_size; + xarea.area.committed = page->capacity * page->block_size; + xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL); + xarea.area.used = page->used - page->thread_freed; // race is ok + xarea.area.block_size = page->block_size; + return fun(heap, &xarea, arg); +} + +// Visit all heap pages as areas +static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) { + if (visitor == NULL) return false; + return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{ +} + +// Just to pass arguments +typedef struct mi_visit_blocks_args_s { + bool visit_blocks; + mi_block_visit_fun* visitor; + void* arg; +} mi_visit_blocks_args_t; + +static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) { + mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; + if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; + if (args->visit_blocks) { + return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg); + } + else { + return true; + } +} + +// Visit all blocks in a heap +bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_visit_blocks_args_t args = { visit_blocks, visitor, arg }; + return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args); +} + diff --git a/runtime/src/mimalloc/c/include/mimalloc-atomic.h b/runtime/src/mimalloc/c/include/mimalloc-atomic.h new file mode 100644 index 00000000000..56c1320170b --- /dev/null +++ 
b/runtime/src/mimalloc/c/include/mimalloc-atomic.h @@ -0,0 +1,248 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_ATOMIC_H +#define MIMALLOC_ATOMIC_H + +// ------------------------------------------------------ +// Atomics +// We need to be portable between C, C++, and MSVC. +// ------------------------------------------------------ + +#if defined(_MSC_VER) +#define _Atomic(tp) tp +#define ATOMIC_VAR_INIT(x) x +#elif defined(__cplusplus) +#include +#define _Atomic(tp) std::atomic +#else +#include +#endif + +#define mi_atomic_cast(tp,x) (volatile _Atomic(tp)*)(x) + +// ------------------------------------------------------ +// Atomic operations specialized for mimalloc +// ------------------------------------------------------ + +// Atomically add a 64-bit value; returns the previous value. +// Note: not using _Atomic(int64_t) as it is only used for statistics. +static inline void mi_atomic_add64(volatile int64_t* p, int64_t add); + +// Atomically add a value; returns the previous value. Memory ordering is relaxed. +static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add); + +// Atomically compare and exchange a value; returns `true` if successful. +// May fail spuriously. Memory ordering as release on success, and relaxed on failure. +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); + +// Atomically compare and exchange a value; returns `true` if successful. +// Memory ordering is acquire-release +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); + +// Atomically exchange a value. Memory ordering is acquire-release. +static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange); + +// Atomically read a value. Memory ordering is relaxed. +static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p); + +// Atomically read a value. Memory ordering is acquire. +static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p); + +// Atomically write a value. Memory ordering is release. +static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x); + +// Yield +static inline void mi_atomic_yield(void); + + + +// Atomically add a value; returns the previous value. +static inline uintptr_t mi_atomic_addu(volatile _Atomic(uintptr_t)* p, uintptr_t add) { + return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, (intptr_t)add); +} +// Atomically subtract a value; returns the previous value. +static inline uintptr_t mi_atomic_subu(volatile _Atomic(uintptr_t)* p, uintptr_t sub) { + return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, -((intptr_t)sub)); +} + +// Atomically increment a value; returns the incremented result. +static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) { + return mi_atomic_addu(p, 1); +} + +// Atomically decrement a value; returns the decremented result. 
+static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) { + return mi_atomic_subu(p, 1); +} + +// Atomically read a pointer; Memory order is relaxed. +static inline void* mi_atomic_read_ptr_relaxed(volatile _Atomic(void*) const * p) { + return (void*)mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)p); +} + +// Atomically read a pointer; Memory order is acquire. +static inline void* mi_atomic_read_ptr(volatile _Atomic(void*) const * p) { + return (void*)mi_atomic_read((const volatile _Atomic(uintptr_t)*)p); +} + +// Atomically write a pointer +static inline void mi_atomic_write_ptr(volatile _Atomic(void*)* p, void* x) { + mi_atomic_write((volatile _Atomic(uintptr_t)*)p, (uintptr_t)x ); +} + +// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously. +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_ptr_weak(volatile _Atomic(void*)* p, void* desired, void* expected) { + return mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected); +} + +// Atomically compare and exchange a pointer; returns `true` if successful. +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_ptr_strong(volatile _Atomic(void*)* p, void* desired, void* expected) { + return mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected); +} + +// Atomically exchange a pointer value. +static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exchange) { + return (void*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)p, (uintptr_t)exchange); +} + + +#ifdef _MSC_VER +#define WIN32_LEAN_AND_MEAN +#include +#include +#ifdef _WIN64 +typedef LONG64 msc_intptr_t; +#define RC64(f) f##64 +#else +typedef LONG msc_intptr_t; +#define RC64(f) f +#endif +static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { + return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); +} +static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { + return (expected == (uintptr_t)RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); +} +static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { + return mi_atomic_cas_strong(p,desired,expected); +} +static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { + return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); +} +static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) { + return *p; +} +static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) { + return mi_atomic_read(p); +} +static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + mi_atomic_exchange(p,x); +} +static inline void mi_atomic_yield(void) { + YieldProcessor(); +} +static inline void mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) { + #ifdef _WIN64 + mi_atomic_add(p,add); + #else + int64_t current; + int64_t sum; + do { + current = *p; + sum = current + add; + } while (_InterlockedCompareExchange64(p, sum, current) != current); + #endif +} + +#else +#ifdef __cplusplus +#define MI_USING_STD using namespace std; +#else +#define MI_USING_STD +#endif +static inline void 
mi_atomic_add64(volatile int64_t* p, int64_t add) { + MI_USING_STD + atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed); +} +static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { + MI_USING_STD + return atomic_fetch_add_explicit(p, add, memory_order_relaxed); +} +static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { + MI_USING_STD + return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed); +} +static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { + MI_USING_STD + return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed); +} +static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { + MI_USING_STD + return atomic_exchange_explicit(p, exchange, memory_order_acq_rel); +} +static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) { + MI_USING_STD + return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed); +} +static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) { + MI_USING_STD + return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire); +} +static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + MI_USING_STD + return atomic_store_explicit(p, x, memory_order_release); +} + +#if defined(__cplusplus) + #include + static inline void mi_atomic_yield(void) { + std::this_thread::yield(); + } +#elif (defined(__GNUC__) || defined(__clang__)) && \ + (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) +#if defined(__x86_64__) || defined(__i386__) + static inline void mi_atomic_yield(void) { + asm volatile ("pause" ::: "memory"); + } +#elif defined(__arm__) || defined(__aarch64__) + #if KONAN_MI_MALLOC + #if defined(__arm__) + #include + static inline void mi_atomic_yield(void) { + sched_yield(); + } + #else + static inline void mi_atomic_yield(void) { + asm volatile("yield"); + } + #endif + #else + static inline void mi_atomic_yield(void) { + asm volatile("yield"); + } + #endif +#endif +#elif defined(__wasi__) + #include + static inline void mi_atomic_yield(void) { + sched_yield(); + } +#else + #include + static inline void mi_atomic_yield(void) { + sleep(0); + } +#endif + +#endif + +#endif // __MIMALLOC_ATOMIC_H diff --git a/runtime/src/mimalloc/c/include/mimalloc-internal.h b/runtime/src/mimalloc/c/include/mimalloc-internal.h new file mode 100644 index 00000000000..b0997d1e3bd --- /dev/null +++ b/runtime/src/mimalloc/c/include/mimalloc-internal.h @@ -0,0 +1,498 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_INTERNAL_H +#define MIMALLOC_INTERNAL_H + +#include "mimalloc-types.h" + +#if defined(MI_MALLOC_OVERRIDE) && (defined(__APPLE__) || defined(__OpenBSD__)) +#define MI_TLS_RECURSE_GUARD +#endif + +#if (MI_DEBUG>0) +#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) +#else +#define mi_trace_message(...) 
+#endif + +#if defined(_MSC_VER) +#define mi_decl_noinline __declspec(noinline) +#define mi_attr_noreturn +#elif defined(__GNUC__) || defined(__clang__) +#define mi_decl_noinline __attribute__((noinline)) +#define mi_attr_noreturn __attribute__((noreturn)) +#else +#define mi_decl_noinline +#define mi_attr_noreturn +#endif + + +// "options.c" +void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, const char* fmt, ...); +void _mi_error_message(const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_options_init(void); +void _mi_fatal_error(const char* fmt, ...) mi_attr_noreturn; + +// "init.c" +extern mi_stats_t _mi_stats_main; +extern const mi_page_t _mi_page_empty; +bool _mi_is_main_thread(void); +uintptr_t _mi_random_shuffle(uintptr_t x); +uintptr_t _mi_random_init(uintptr_t seed /* can be zero */); +bool _mi_preloading(); // true while the C runtime is not ready + +// os.c +size_t _mi_os_page_size(void); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_stats_t* stats); // to allocate thread local data +void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data +size_t _mi_os_good_alloc_size(size_t size); + +// memory.c +void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld); +void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats); + +bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats); +bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats); +bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); +bool _mi_mem_protect(void* addr, size_t size); +bool _mi_mem_unprotect(void* addr, size_t size); + +void _mi_mem_collect(mi_stats_t* stats); + +// "segment.c" +mi_page_t* _mi_segment_page_alloc(size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); +void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); +void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); +bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); +void _mi_segment_thread_collect(mi_segments_tld_t* tld); +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size); // page start for any page + +// "page.c" +void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc; + +void _mi_page_retire(mi_page_t* page); // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
+void _mi_heap_delayed_free(mi_heap_t* heap); + +void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); + +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments + +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats +uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD in "os.c" + +// "heap.c" +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +uintptr_t _mi_heap_random(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); + +// "stats.c" +void _mi_stats_done(mi_stats_t* stats); +double _mi_clock_end(double start); +double _mi_clock_start(void); + +// "alloc.c" +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero); +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero); +mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p); +bool _mi_free_delayed_block(mi_block_t* block); +void _mi_block_zero_init(const mi_page_t* page, void* p, size_t size); + +#if MI_DEBUG>1 +bool _mi_page_is_valid(mi_page_t* page); +#endif + + +// ------------------------------------------------------ +// Branches +// ------------------------------------------------------ + +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) __builtin_expect((x),0) +#define mi_likely(x) __builtin_expect((x),1) +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + + + +/* ----------------------------------------------------------- + Inlined definitions +----------------------------------------------------------- */ +#define UNUSED(x) (void)(x) +#if (MI_DEBUG>0) +#define UNUSED_RELEASE(x) +#else +#define UNUSED_RELEASE(x) UNUSED(x) +#endif + +#define MI_INIT4(x) x(),x(),x(),x() +#define MI_INIT8(x) MI_INIT4(x),MI_INIT4(x) +#define MI_INIT16(x) MI_INIT8(x),MI_INIT8(x) +#define MI_INIT32(x) MI_INIT16(x),MI_INIT16(x) +#define MI_INIT64(x) MI_INIT32(x),MI_INIT32(x) +#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x) +#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x) + + +// Overflow detecting multiply +#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX) +static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) { +#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5 +#include // UINT_MAX, ULONG_MAX +#if (SIZE_MAX == UINT_MAX) + return __builtin_umul_overflow(count, size, total); +#elif (SIZE_MAX == ULONG_MAX) + return __builtin_umull_overflow(count, size, total); +#else + return __builtin_umulll_overflow(count, size, total); +#endif +#else /* __builtin_umul_overflow is unavailable */ + *total = count * size; + return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) + && size > 0 && (SIZE_MAX / size) < count); +#endif +} + +// Is `x` a power of two? 
(0 is considered a power of two) +static inline bool _mi_is_power_of_two(uintptr_t x) { + return ((x & (x - 1)) == 0); +} + +// Align upwards +static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) { // power of two? + return ((sz + mask) & ~mask); + } + else { + return (((sz + mask)/alignment)*alignment); + } +} + +// Is memory zero initialized? +static inline bool mi_mem_is_zero(void* p, size_t size) { + for (size_t i = 0; i < size; i++) { + if (((uint8_t*)p)[i] != 0) return false; + } + return true; +} + +// Align a byte size to a size in _machine words_, +// i.e. byte size == `wsize*sizeof(void*)`. +static inline size_t _mi_wsize_from_size(size_t size) { + mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t)); + return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); +} + + +/* ----------------------------------------------------------- + The thread local default heap +----------------------------------------------------------- */ + +extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap +extern mi_heap_t _mi_heap_main; // statically allocated main backing heap +extern bool _mi_process_is_initialized; + +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from + +static inline mi_heap_t* mi_get_default_heap(void) { +#ifdef MI_TLS_RECURSE_GUARD + // on some platforms, like macOS, the dynamic loader calls `malloc` + // to initialize thread local data. To avoid recursion, we need to avoid + // accessing the thread local `_mi_default_heap` until our module is loaded + // and use the statically allocated main heap until that time. + // TODO: patch ourselves dynamically to avoid this check every time? 
+ if (!_mi_process_is_initialized) return &_mi_heap_main; +#endif + return _mi_heap_default; +} + +static inline bool mi_heap_is_default(const mi_heap_t* heap) { + return (heap == mi_get_default_heap()); +} + +static inline bool mi_heap_is_backing(const mi_heap_t* heap) { + return (heap->tld->heap_backing == heap); +} + +static inline bool mi_heap_is_initialized(mi_heap_t* heap) { + mi_assert_internal(heap != NULL); + return (heap != &_mi_heap_empty); +} + +static inline uintptr_t _mi_ptr_cookie(const void* p) { + return ((uintptr_t)p ^ _mi_heap_main.cookie); +} + +/* ----------------------------------------------------------- + Pages +----------------------------------------------------------- */ + +static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { + mi_assert_internal(size <= MI_SMALL_SIZE_MAX); + return heap->pages_free_direct[_mi_wsize_from_size(size)]; +} + +// Get the page belonging to a certain size class +static inline mi_page_t* _mi_get_free_small_page(size_t size) { + return _mi_heap_get_free_small_page(mi_get_default_heap(), size); +} + +// Segment that contains the pointer +static inline mi_segment_t* _mi_ptr_segment(const void* p) { + // mi_assert_internal(p != NULL); + return (mi_segment_t*)((uintptr_t)p & ~MI_SEGMENT_MASK); +} + +// Segment belonging to a page +static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { + mi_segment_t* segment = _mi_ptr_segment(page); + mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); + return segment; +} + +// used internally +static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { + // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages + ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; + mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE); + uintptr_t idx = (uintptr_t)diff >> segment->page_shift; + mi_assert_internal(idx < segment->capacity); + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); + return idx; +} + +// Get the page containing the pointer +static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { + uintptr_t idx = _mi_segment_page_idx_of(segment, p); + return &((mi_segment_t*)segment)->pages[idx]; +} + +// Quick page start for initialized pages +static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { + return _mi_segment_page_start(segment, page, page->block_size, page_size); +} + +// Get the page containing the pointer +static inline mi_page_t* _mi_ptr_page(void* p) { + return _mi_segment_page_of(_mi_ptr_segment(p), p); +} + +// Thread free access +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~0x03); +} +static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { + return (mi_delayed_t)(tf & 0x03); +} +static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { + return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); +} +static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { + return mi_tf_make(mi_tf_block(tf),delayed); +} +static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { + return mi_tf_make(block, mi_tf_delayed(tf)); +} + +// are all blocks in a page freed? 
+static inline bool mi_page_all_free(const mi_page_t* page) { + mi_assert_internal(page != NULL); + return (page->used - page->thread_freed == 0); +} + +// are there immediately available blocks +static inline bool mi_page_immediate_available(const mi_page_t* page) { + mi_assert_internal(page != NULL); + return (page->free != NULL); +} +// are there free blocks in this page? +static inline bool mi_page_has_free(mi_page_t* page) { + mi_assert_internal(page != NULL); + bool hasfree = (mi_page_immediate_available(page) || page->local_free != NULL || (mi_tf_block(page->thread_free) != NULL)); + mi_assert_internal(hasfree || page->used - page->thread_freed == page->capacity); + return hasfree; +} + +// are all blocks in use? +static inline bool mi_page_all_used(mi_page_t* page) { + mi_assert_internal(page != NULL); + return !mi_page_has_free(page); +} + +// is more than 7/8th of a page in use? +static inline bool mi_page_mostly_used(const mi_page_t* page) { + if (page==NULL) return true; + uint16_t frac = page->reserved / 8U; + return (page->reserved - page->used + page->thread_freed <= frac); +} + +static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { + return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; +} + + + +//----------------------------------------------------------- +// Page flags +//----------------------------------------------------------- +static inline bool mi_page_is_in_full(const mi_page_t* page) { + return page->flags.x.in_full; +} + +static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { + page->flags.x.in_full = in_full; +} + +static inline bool mi_page_has_aligned(const mi_page_t* page) { + return page->flags.x.has_aligned; +} + +static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { + page->flags.x.has_aligned = has_aligned; +} + + +// ------------------------------------------------------------------- +// Encoding/Decoding the free list next pointers +// Note: we pass a `null` value to be used as the `NULL` value for the +// end of a free list. This is to prevent the cookie itself to ever +// be present among user blocks (as `cookie^0==cookie`). 
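The `mi_block_set_nextx`/`mi_block_nextx` helpers that follow implement this scheme: the stored word is `next ^ cookie`, and the page's `null` sentinel stands in for NULL so the raw cookie value never appears among user blocks. A minimal standalone sketch of the idea (illustrative names only, not the patch's code):

// Standalone sketch of the XOR free-list encoding described above.
// The names (block_t, set_next, get_next) are illustrative only.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct block_s { uintptr_t next; } block_t;

// Store `next ^ cookie`; a designated `null` sentinel replaces NULL so the
// raw cookie is never written to memory (cookie ^ 0 == cookie).
static void set_next(const void* null, block_t* b, const block_t* next, uintptr_t cookie) {
  if (next == NULL) next = (const block_t*)null;
  b->next = (uintptr_t)next ^ cookie;
}

static block_t* get_next(const void* null, const block_t* b, uintptr_t cookie) {
  block_t* n = (block_t*)(b->next ^ cookie);
  return ((void*)n == null) ? NULL : n;
}

int main(void) {
  uintptr_t cookie = 0x5bd1e995u;   // illustrative random cookie
  block_t sentinel;                 // plays the role of the per-page `null` value
  block_t a, b;

  set_next(&sentinel, &a, &b, cookie);    // a -> b
  set_next(&sentinel, &b, NULL, cookie);  // b -> end of list

  printf("a.next decodes to &b: %d\n", get_next(&sentinel, &a, cookie) == &b);
  printf("b.next decodes to NULL: %d\n", get_next(&sentinel, &b, cookie) == NULL);
  printf("stored word differs from raw pointer: %d\n", a.next != (uintptr_t)&b);
  return 0;
}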
+// ------------------------------------------------------------------- + +static inline bool mi_is_in_same_segment(const void* p, const void* q) { + return (_mi_ptr_segment(p) == _mi_ptr_segment(q)); +} + +static inline bool mi_is_in_same_page(const void* p, const void* q) { + mi_segment_t* segmentp = _mi_ptr_segment(p); + mi_segment_t* segmentq = _mi_ptr_segment(q); + if (segmentp != segmentq) return false; + uintptr_t idxp = _mi_segment_page_idx_of(segmentp, p); + uintptr_t idxq = _mi_segment_page_idx_of(segmentq, q); + return (idxp == idxq); +} + +static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t cookie ) { + #ifdef MI_ENCODE_FREELIST + mi_block_t* b = (mi_block_t*)(block->next ^ cookie); + if (mi_unlikely((void*)b==null)) { b = NULL; } + return b; + #else + UNUSED(cookie); UNUSED(null); + return (mi_block_t*)block->next; + #endif +} + +static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t cookie) { + #ifdef MI_ENCODE_FREELIST + if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; } + block->next = (mi_encoded_t)next ^ cookie; + #else + UNUSED(cookie); UNUSED(null); + block->next = (mi_encoded_t)next; + #endif +} + +static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) { + #ifdef MI_ENCODE_FREELIST + mi_block_t* next = mi_block_nextx(page,block,page->cookie); + // check for free list corruption: is `next` at least in our segment range? + // TODO: check if `next` is `page->block_size` aligned? + if (next!=NULL && !mi_is_in_same_page(block, next)) { + _mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next); + next = NULL; + } + return next; + #else + UNUSED(page); + return mi_block_nextx(page,block,0); + #endif +} + +static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) { + #ifdef MI_ENCODE_FREELIST + mi_block_set_nextx(page,block,next, page->cookie); + #else + UNUSED(page); + mi_block_set_nextx(page,block, next,0); + #endif +} + +// ------------------------------------------------------------------- +// Getting the thread id should be performant +// as it is called in the fast path of `_mi_free`, +// so we specialize for various platforms. +// ------------------------------------------------------------------- +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} +#elif (defined(__GNUC__) || defined(__clang__)) && \ + (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) +#if KONAN_MI_MALLOC + #include <pthread.h> + pthread_t pthread_self(void); +#endif +// TLS register on x86 is in the FS or GS register +// see: https://akkadia.org/drepper/tls.pdf +static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { + uintptr_t tid; + #if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); // 32-bit always uses GS + #elif defined(__MACH__) + #if KONAN_MI_MALLOC + #include <TargetConditionals.h> + #if TARGET_OS_EMBEDDED // iOS/tvOS/watchOS devices.
+ tid = pthread_self(); + #else + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); // x86_64 macOS uses GS + #endif + #else + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); // x86_64 macOS uses GS + #endif + #elif defined(__x86_64__) + __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); + #elif defined(__aarch64__) + asm volatile ("mrs %0, tpidr_el0" : "=r" (tid)); + #endif + return tid; +} +#else +// otherwise use standard C +static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&_mi_heap_default; +} +#endif + + +#endif diff --git a/runtime/src/mimalloc/c/include/mimalloc-new-delete.h b/runtime/src/mimalloc/c/include/mimalloc-new-delete.h new file mode 100644 index 00000000000..050f94334eb --- /dev/null +++ b/runtime/src/mimalloc/c/include/mimalloc-new-delete.h @@ -0,0 +1,52 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018,2019 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_NEW_DELETE_H +#define MIMALLOC_NEW_DELETE_H + +// ---------------------------------------------------------------------------- +// This header provides convenient overrides for the new and +// delete operations in C++. +// +// This header should be included in only one source file! +// +// On Windows, or when linking dynamically with mimalloc, these +// can be more performant than the standard new-delete operations. 
+// See +// --------------------------------------------------------------------------- +#if defined(__cplusplus) + #include <new> + #include <mimalloc.h> + + void operator delete(void* p) noexcept { mi_free(p); }; + void operator delete[](void* p) noexcept { mi_free(p); }; + + void* operator new(std::size_t n) noexcept(false) { return mi_new(n); } + void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); } + + void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } + void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } + + #if (__cplusplus >= 201402L || _MSC_VER >= 1916) + void operator delete (void* p, std::size_t n) { mi_free_size(p,n); }; + void operator delete[](void* p, std::size_t n) { mi_free_size(p,n); }; + #endif + + #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) + void operator delete (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); } + void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); } + void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); }; + void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); }; + + void* operator new( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); } + void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); } + void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); } + void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); } + #endif +#endif + +#endif // MIMALLOC_NEW_DELETE_H diff --git a/runtime/src/mimalloc/c/include/mimalloc-override.h b/runtime/src/mimalloc/c/include/mimalloc-override.h new file mode 100644 index 00000000000..201fb8b49ba --- /dev/null +++ b/runtime/src/mimalloc/c/include/mimalloc-override.h @@ -0,0 +1,66 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018,2019 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_OVERRIDE_H +#define MIMALLOC_OVERRIDE_H + +/* ---------------------------------------------------------------------------- +This header can be used to statically redirect malloc/free and new/delete +to the mimalloc variants. This can be useful if one can include this file on +each source file in a project (but be careful when using external code to +not accidentally mix pointers from different allocators).
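As a usage sketch (not part of this patch), a C translation unit can opt in to this static redirection by including the override header after the system headers it needs; this assumes `runtime/src/mimalloc/c/include` is on the include path and the mimalloc objects are linked into the final binary:

// Sketch: route plain malloc/free through mimalloc via the override macros.
#include <stdio.h>
#include <string.h>
#include <mimalloc-override.h>   // after this point, malloc/free/realloc expand to mi_* calls

int main(void) {
  char* s = malloc(32);          // expands to mi_malloc(32)
  if (s == NULL) return 1;
  strcpy(s, "allocated by mimalloc");
  printf("%s (usable size: %zu)\n", s, malloc_usable_size(s));  // expands to mi_usable_size(s)
  free(s);                       // expands to mi_free(s)
  return 0;
}

Because the redirection is purely macro-based, every translation unit that allocates or frees a given pointer has to see the same header, otherwise pointers from the system allocator and mimalloc get mixed, as the comment above warns.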
+-----------------------------------------------------------------------------*/ + +#include <mimalloc.h> + +// Standard C allocation +#define malloc(n) mi_malloc(n) +#define calloc(n,c) mi_calloc(n,c) +#define realloc(p,n) mi_realloc(p,n) +#define free(p) mi_free(p) + +#define strdup(s) mi_strdup(s) +#define strndup(s) mi_strndup(s) +#define realpath(f,n) mi_realpath(f,n) + +// Microsoft extensions +#define _expand(p,n) mi_expand(p,n) +#define _msize(p) mi_usable_size(p) +#define _recalloc(p,n,c) mi_recalloc(p,n,c) + +#define _strdup(s) mi_strdup(s) +#define _strndup(s) mi_strndup(s) +#define _wcsdup(s) (wchar_t*)mi_wcsdup((const unsigned short*)(s)) +#define _mbsdup(s) mi_mbsdup(s) +#define _dupenv_s(b,n,v) mi_dupenv_s(b,n,v) +#define _wdupenv_s(b,n,v) mi_wdupenv_s((unsigned short*)(b),n,(const unsigned short*)(v)) + +// Various Posix and Unix variants +#define reallocf(p,n) mi_reallocf(p,n) +#define malloc_size(p) mi_usable_size(p) +#define malloc_usable_size(p) mi_usable_size(p) +#define cfree(p) mi_free(p) + +#define valloc(n) mi_valloc(n) +#define pvalloc(n) mi_pvalloc(n) +#define reallocarray(p,s,n) mi_reallocarray(p,s,n) +#define memalign(a,n) mi_memalign(a,n) +#define aligned_alloc(a,n) mi_aligned_alloc(a,n) +#define posix_memalign(p,a,n) mi_posix_memalign(p,a,n) +#define _posix_memalign(p,a,n) mi_posix_memalign(p,a,n) + +// Microsoft aligned variants +#define _aligned_malloc(n,a) mi_malloc_aligned(n,a) +#define _aligned_realloc(p,n,a) mi_realloc_aligned(p,n,a) +#define _aligned_recalloc(p,s,n,a) mi_aligned_recalloc(p,s,n,a) +#define _aligned_msize(p,a,o) mi_usable_size(p) +#define _aligned_free(p) mi_free(p) +#define _aligned_offset_malloc(n,a,o) mi_malloc_aligned_at(n,a,o) +#define _aligned_offset_realloc(p,n,a,o) mi_realloc_aligned_at(p,n,a,o) +#define _aligned_offset_recalloc(p,s,n,a,o) mi_recalloc_aligned_at(p,s,n,a,o) + +#endif // MIMALLOC_OVERRIDE_H diff --git a/runtime/src/mimalloc/c/include/mimalloc-types.h b/runtime/src/mimalloc/c/include/mimalloc-types.h new file mode 100644 index 00000000000..bbb7bfdee18 --- /dev/null +++ b/runtime/src/mimalloc/c/include/mimalloc-types.h @@ -0,0 +1,431 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_TYPES_H +#define MIMALLOC_TYPES_H + +#include <stddef.h> // ptrdiff_t +#include <stdint.h> // uintptr_t, uint16_t, etc +#include <mimalloc-atomic.h> // _Atomic + +// ------------------------------------------------------ +// Variants +// ------------------------------------------------------ + +// Define NDEBUG in the release version to disable assertions. +#if !KONAN_MI_MALLOC + #define NDEBUG +#endif + +// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). +// #define MI_STAT 1 + +// Define MI_SECURE to enable security mitigations +// #define MI_SECURE 1 // guard page around metadata +// #define MI_SECURE 2 // guard page around each mimalloc page +// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) +// #define MI_SECURE 4 // checks for double free.
(may be more expensive) + +#if !defined(MI_SECURE) +#define MI_SECURE 0 +#endif + +// Define MI_DEBUG for debug mode +// #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. +// #define MI_DEBUG 2 // + internal assertion checks +// #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON) +#if !defined(MI_DEBUG) +#if !defined(NDEBUG) || defined(_DEBUG) +#define MI_DEBUG 2 +#else +#define MI_DEBUG 0 +#endif +#endif + +// Encoded free lists allow detection of corrupted free lists +// and can detect buffer overflows and double `free`s. +#if (MI_SECURE>=3 || MI_DEBUG>=1) +#define MI_ENCODE_FREELIST 1 +#endif + +// ------------------------------------------------------ +// Platform specific values +// ------------------------------------------------------ + + +// ------------------------------------------------------ +// Size of a pointer. +// We assume that `sizeof(void*)==sizeof(intptr_t)` +// and it holds for all platforms we know of. +// +// However, the C standard only requires that: +// p == (void*)((intptr_t)p)) +// but we also need: +// i == (intptr_t)((void*)i) +// or otherwise one might define an intptr_t type that is larger than a pointer... +// ------------------------------------------------------ + +#if INTPTR_MAX == 9223372036854775807LL +# define MI_INTPTR_SHIFT (3) +#elif INTPTR_MAX == 2147483647LL +# define MI_INTPTR_SHIFT (2) +#else +#error platform must be 32 or 64 bits +#endif + +#define MI_INTPTR_SIZE (1<= 655360) +#error "define more bins" +#endif + +// The free lists use encoded next fields +// (Only actually encodes when MI_ENCODED_FREELIST is defined.) +typedef uintptr_t mi_encoded_t; + +// free lists contain blocks +typedef struct mi_block_s { + mi_encoded_t next; +} mi_block_t; + + +// The delayed flags are used for efficient multi-threaded free-ing +typedef enum mi_delayed_e { + MI_NO_DELAYED_FREE = 0, + MI_USE_DELAYED_FREE = 1, + MI_DELAYED_FREEING = 2, + MI_NEVER_DELAYED_FREE = 3 +} mi_delayed_t; + + +// The `in_full` and `has_aligned` page flags are put in a union to efficiently +// test if both are false (`full_aligned == 0`) in the `mi_free` routine. +typedef union mi_page_flags_s { + uint8_t full_aligned; + struct { + uint8_t in_full : 1; + uint8_t has_aligned : 1; + } x; +} mi_page_flags_t; + +// Thread free list. +// We use the bottom 2 bits of the pointer for mi_delayed_t flags +typedef uintptr_t mi_thread_free_t; + +// A page contains blocks of one specific size (`block_size`). +// Each page has three list of free blocks: +// `free` for blocks that can be allocated, +// `local_free` for freed blocks that are not yet available to `mi_malloc` +// `thread_free` for freed blocks by other threads +// The `local_free` and `thread_free` lists are migrated to the `free` list +// when it is exhausted. The separate `local_free` list is necessary to +// implement a monotonic heartbeat. The `thread_free` list is needed for +// avoiding atomic operations in the common case. +// +// `used - thread_freed` == actual blocks that are in use (alive) +// `used - thread_freed + |free| + |local_free| == capacity` +// +// note: we don't count `freed` (as |free|) instead of `used` to reduce +// the number of memory accesses in the `mi_page_all_free` function(s). 
+// note: the funny layout here is due to: +// - access is optimized for `mi_free` and `mi_page_alloc` +// - using `uint16_t` does not seem to slow things down +typedef struct mi_page_s { + // "owned" by the segment + uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]` + uint8_t segment_in_use:1; // `true` if the segment allocated this page + uint8_t is_reset:1; // `true` if the page memory was reset + uint8_t is_committed:1; // `true` if the page virtual memory is committed + uint8_t is_zero_init:1; // `true` if the page was zero initialized + + // layout like this to optimize access in `mi_malloc` and `mi_free` + uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` + uint16_t reserved; // number of blocks reserved in memory + mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) + bool is_zero; // `true` if the blocks in the free list are zero initialized + + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + #ifdef MI_ENCODE_FREELIST + uintptr_t cookie; // random cookie to encode the free lists + #endif + size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + volatile _Atomic(uintptr_t) thread_freed; // at least this number of blocks are in `thread_free` + volatile _Atomic(mi_thread_free_t) thread_free; // list of deferred free blocks freed by other threads + + // less accessed info + size_t block_size; // size available in each block (always `>0`) + mi_heap_t* heap; // the owning heap + struct mi_page_s* next; // next page owned by this thread with the same `block_size` + struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + + // improve page index calculation + // without padding: 10 words on 64-bit, 11 on 32-bit. Secure adds one word + #if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST)) + void* padding[1]; // 12 words on 64-bit with cookie, 12 words on 32-bit plain + #endif +} mi_page_t; + + + +typedef enum mi_page_kind_e { + MI_PAGE_SMALL, // small blocks go into 64kb pages inside a segment + MI_PAGE_MEDIUM, // medium blocks go into 512kb pages inside a segment + MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment + MI_PAGE_HUGE // huge blocks (>512kb) are put into a single page in a segment of the exact size (but still 2mb aligned) +} mi_page_kind_t; + +// Segments are large allocated memory blocks (2mb on 64 bit) from +// the OS. Inside segments we allocated fixed size _pages_ that +// contain blocks. +typedef struct mi_segment_s { + // memory fields + size_t memid; // id for the os-level memory manager + bool mem_is_fixed; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages) + bool mem_is_committed; // `true` if the whole segment is eagerly committed + + // segment fields + struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc` + struct mi_segment_s* prev; + volatile _Atomic(struct mi_segment_s*) abandoned_next; + size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) + size_t used; // count of pages in use (`used <= capacity`) + size_t capacity; // count of available pages (`#free + used`) + size_t segment_size;// for huge pages this may be different from `MI_SEGMENT_SIZE` + size_t segment_info_size; // space we are using from the first page for segment meta-data and possible guard pages. + uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` + + // layout like this to optimize access in `mi_free` + size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). + volatile _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment + mi_page_kind_t page_kind; // kind of pages: small, large, or huge + mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages +} mi_segment_t; + + +// ------------------------------------------------------ +// Heaps +// Provide first-class heaps to allocate from. +// A heap just owns a set of pages for allocation and +// can only be allocate/reallocate from the thread that created it. +// Freeing blocks can be done from any thread though. +// Per thread, the segments are shared among its heaps. +// Per thread, there is always a default heap that is +// used for allocation; it is initialized to statically +// point to an empty heap to avoid initialization checks +// in the fast path. +// ------------------------------------------------------ + +// Thread local data +typedef struct mi_tld_s mi_tld_t; + +// Pages of a certain block size are held in a queue. +typedef struct mi_page_queue_s { + mi_page_t* first; + mi_page_t* last; + size_t block_size; +} mi_page_queue_t; + +#define MI_BIN_FULL (MI_BIN_HUGE+1) + +// A heap owns a set of pages. +struct mi_heap_s { + mi_tld_t* tld; + mi_page_t* pages_free_direct[MI_SMALL_WSIZE_MAX + 2]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. + mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + volatile _Atomic(mi_block_t*) thread_delayed_free; + uintptr_t thread_id; // thread this heap belongs too + uintptr_t cookie; + uintptr_t random; // random number used for secure allocation + size_t page_count; // total number of pages in the `pages` queues. + bool no_reclaim; // `true` if this heap should not reclaim abandoned pages +}; + + + +// ------------------------------------------------------ +// Debug +// ------------------------------------------------------ + +#define MI_DEBUG_UNINIT (0xD0) +#define MI_DEBUG_FREED (0xDF) + + +#if (MI_DEBUG) +// use our own assertion to print without memory allocation +void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func ); +#define mi_assert(expr) ((expr) ? 
(void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) +#else +#define mi_assert(x) +#endif + +#if (MI_DEBUG>1) +#define mi_assert_internal mi_assert +#else +#define mi_assert_internal(x) +#endif + +#if (MI_DEBUG>2) +#define mi_assert_expensive mi_assert +#else +#define mi_assert_expensive(x) +#endif + +// ------------------------------------------------------ +// Statistics +// ------------------------------------------------------ + +#ifndef MI_STAT +#if (MI_DEBUG>0) +#define MI_STAT 2 +#else +#define MI_STAT 0 +#endif +#endif + +typedef struct mi_stat_count_s { + int64_t allocated; + int64_t freed; + int64_t peak; + int64_t current; +} mi_stat_count_t; + +typedef struct mi_stat_counter_s { + int64_t total; + int64_t count; +} mi_stat_counter_t; + +typedef struct mi_stats_s { + mi_stat_count_t segments; + mi_stat_count_t pages; + mi_stat_count_t reserved; + mi_stat_count_t committed; + mi_stat_count_t reset; + mi_stat_count_t page_committed; + mi_stat_count_t segments_abandoned; + mi_stat_count_t pages_abandoned; + mi_stat_count_t threads; + mi_stat_count_t huge; + mi_stat_count_t giant; + mi_stat_count_t malloc; + mi_stat_count_t segments_cache; + mi_stat_counter_t pages_extended; + mi_stat_counter_t mmap_calls; + mi_stat_counter_t commit_calls; + mi_stat_counter_t page_no_retire; + mi_stat_counter_t searches; + mi_stat_counter_t huge_count; + mi_stat_counter_t giant_count; +#if MI_STAT>1 + mi_stat_count_t normal[MI_BIN_HUGE+1]; +#endif +} mi_stats_t; + + +void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); +void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); + +#if (MI_STAT) +#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) +#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) +#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) +#else +#define mi_stat_increase(stat,amount) (void)0 +#define mi_stat_decrease(stat,amount) (void)0 +#define mi_stat_counter_increase(stat,amount) (void)0 +#endif + +#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) + + +// ------------------------------------------------------ +// Thread Local data +// ------------------------------------------------------ + +// Queue of segments +typedef struct mi_segment_queue_s { + mi_segment_t* first; + mi_segment_t* last; +} mi_segment_queue_t; + + +// Segments thread local data +typedef struct mi_segments_tld_s { + mi_segment_queue_t small_free; // queue of segments with free small pages + mi_segment_queue_t medium_free; // queue of segments with free medium pages + size_t count; // current number of segments; + size_t peak_count; // peak number of segments + size_t current_size; // current size of all segments + size_t peak_size; // peak size of all segments + size_t cache_count; // number of segments in the cache + size_t cache_size; // total size of all segments in the cache + mi_segment_t* cache; // (small) cache of segments + mi_stats_t* stats; // points to tld stats +} mi_segments_tld_t; + +// OS thread local data +typedef struct mi_os_tld_s { + size_t region_idx; // start point for next allocation + mi_stats_t* stats; // points to tld stats +} mi_os_tld_t; + +// Thread local data +struct mi_tld_s { + unsigned long long heartbeat; // monotonic heartbeat count + bool recurse; // true if 
deferred was called; used to prevent infinite recursion. + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_segments_tld_t segments; // segment tld + mi_os_tld_t os; // os tld + mi_stats_t stats; // statistics +}; + +#endif diff --git a/runtime/src/mimalloc/c/include/mimalloc.h b/runtime/src/mimalloc/c/include/mimalloc.h new file mode 100644 index 00000000000..7f26896c039 --- /dev/null +++ b/runtime/src/mimalloc/c/include/mimalloc.h @@ -0,0 +1,330 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_H +#define MIMALLOC_H + +#define MI_MALLOC_VERSION 120 // major + 2 digits minor + +// ------------------------------------------------------ +// Compiler specific attributes +// ------------------------------------------------------ + +#ifdef __cplusplus + #if (__GNUC__ <= 5) || (_MSC_VER <= 1900) + #define mi_attr_noexcept throw() + #else + #define mi_attr_noexcept noexcept + #endif +#else + #define mi_attr_noexcept +#endif + +#ifdef _MSC_VER + #if !defined(MI_SHARED_LIB) + #define mi_decl_export + #elif defined(MI_SHARED_LIB_EXPORT) + #define mi_decl_export __declspec(dllexport) + #else + #define mi_decl_export __declspec(dllimport) + #endif + #if (_MSC_VER >= 1900) && !defined(__EDG__) + #define mi_decl_allocator __declspec(allocator) __declspec(restrict) + #else + #define mi_decl_allocator __declspec(restrict) + #endif + #define mi_decl_thread __declspec(thread) + #define mi_attr_malloc + #define mi_attr_alloc_size(s) + #define mi_attr_alloc_size2(s1,s2) + #define mi_cdecl __cdecl +#elif defined(__GNUC__) || defined(__clang__) + #define mi_decl_thread __thread + #define mi_decl_export __attribute__((visibility("default"))) + #define mi_decl_allocator + #define mi_attr_malloc __attribute__((malloc)) + #if defined(__clang_major__) && (__clang_major__ < 4) + #define mi_attr_alloc_size(s) + #define mi_attr_alloc_size2(s1,s2) + #else + #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) + #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) + #endif + #define mi_cdecl // leads to warnings... 
__attribute__((cdecl)) +#else + #define mi_decl_thread __thread + #define mi_decl_export + #define mi_decl_allocator + #define mi_attr_malloc + #define mi_attr_alloc_size(s) + #define mi_attr_alloc_size2(s1,s2) + #define mi_cdecl +#endif + +// ------------------------------------------------------ +// Includes +// ------------------------------------------------------ + +#include <stddef.h> // size_t +#include <stdbool.h> // bool + +#ifdef __cplusplus +extern "C" { +#endif + +// ------------------------------------------------------ +// Standard malloc interface +// ------------------------------------------------------ + +mi_decl_export mi_decl_allocator void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_export mi_decl_allocator void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); + +mi_decl_export void mi_free(void* p) mi_attr_noexcept; +mi_decl_export char* mi_strdup(const char* s) mi_attr_noexcept; +mi_decl_export char* mi_strndup(const char* s, size_t n) mi_attr_noexcept; +mi_decl_export char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept; + +// ------------------------------------------------------ +// Extended functionality +// ------------------------------------------------------ +#define MI_SMALL_WSIZE_MAX (128) +#define MI_SMALL_SIZE_MAX (MI_SMALL_WSIZE_MAX*sizeof(void*)) + +mi_decl_export mi_decl_allocator void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); + +mi_decl_export mi_decl_allocator void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_export mi_decl_allocator void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_export mi_decl_allocator void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); + + +mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept; +mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; + +typedef void (mi_deferred_free_fun)(bool force, unsigned long long heartbeat); +mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free) mi_attr_noexcept; + +typedef void (mi_output_fun)(const char* msg); +mi_decl_export void mi_register_output(mi_output_fun* out) mi_attr_noexcept; + +mi_decl_export void mi_collect(bool force) mi_attr_noexcept; +mi_decl_export int mi_version(void) mi_attr_noexcept; +mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; +mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; +mi_decl_export void mi_stats_print(mi_output_fun* out) mi_attr_noexcept; + +mi_decl_export void mi_process_init(void) mi_attr_noexcept; +mi_decl_export void mi_thread_init(void) mi_attr_noexcept; +mi_decl_export void mi_thread_done(void) mi_attr_noexcept; +mi_decl_export void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept; + + +//
------------------------------------------------------------------------------------- +// Aligned allocation +// Note that `alignment` always follows `size` for consistency with unaligned +// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. +// ------------------------------------------------------------------------------------- + +mi_decl_export mi_decl_allocator void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export mi_decl_allocator void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_export mi_decl_allocator void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); +mi_decl_export mi_decl_allocator void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); + + +// ------------------------------------------------------ +// Heaps +// ------------------------------------------------------ +struct mi_heap_s; +typedef struct mi_heap_s mi_heap_t; + +mi_decl_export mi_heap_t* mi_heap_new(void); +mi_decl_export void mi_heap_delete(mi_heap_t* heap); +mi_decl_export void mi_heap_destroy(mi_heap_t* heap); +mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap); +mi_decl_export mi_heap_t* mi_heap_get_default(void); +mi_decl_export mi_heap_t* mi_heap_get_backing(void); +mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept; + +mi_decl_export mi_decl_allocator void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_export mi_decl_allocator void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_export mi_decl_allocator void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); + +mi_decl_export mi_decl_allocator void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_export mi_decl_allocator void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept; +mi_decl_export mi_decl_allocator void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); + +mi_decl_export char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept; +mi_decl_export char* 
mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept; +mi_decl_export char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept; + +mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_export mi_decl_allocator void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); +mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_export mi_decl_allocator void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); + + +// -------------------------------------------------------------------------------- +// Zero initialized re-allocation. +// Only valid on memory that was originally allocated with zero initialization too. +// e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc. 
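A minimal sketch of this zero-initialized reallocation contract, assuming `mimalloc.h` is on the include path and the library is linked: memory obtained zeroed (here via `mi_calloc`) may be grown with `mi_recalloc`, and the newly added tail is zero as well.

// Sketch: grow a zero-initialized buffer with mi_recalloc.
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // 8 zero-initialized ints
  int* xs = (int*)mi_calloc(8, sizeof(int));
  if (xs == NULL) return 1;
  xs[0] = 7;

  // grow to 32 ints; elements 0..7 are preserved, 8..31 are zero
  xs = (int*)mi_recalloc(xs, 32, sizeof(int));
  if (xs == NULL) return 1;

  printf("xs[0]=%d xs[8]=%d xs[31]=%d\n", xs[0], xs[8], xs[31]);  // prints 7 0 0
  mi_free(xs);
  return 0;
}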
+// see +// -------------------------------------------------------------------------------- + +mi_decl_export mi_decl_allocator void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_recalloc(void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); + +mi_decl_export mi_decl_allocator void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export mi_decl_allocator void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); +mi_decl_export mi_decl_allocator void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); + +mi_decl_export mi_decl_allocator void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_export mi_decl_allocator void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(3,4); + +mi_decl_export mi_decl_allocator void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_export mi_decl_allocator void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(3); +mi_decl_export mi_decl_allocator void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(3,4); +mi_decl_export mi_decl_allocator void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(3,4); + + +// ------------------------------------------------------ +// Analysis +// ------------------------------------------------------ + +mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p); + +mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p); +mi_decl_export bool mi_check_owned(const void* p); + +// An area of heap space contains blocks of a single size. 
+typedef struct mi_heap_area_s { + void* blocks; // start of the area containing heap blocks + size_t reserved; // bytes reserved for this area (virtual) + size_t committed; // current available bytes for this area + size_t used; // bytes in use by allocated blocks + size_t block_size; // size in bytes of each block +} mi_heap_area_t; + +typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); + +mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); + +// Experimental +mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +mi_decl_export bool mi_is_redirected() mi_attr_noexcept; + +// ------------------------------------------------------ +// Convenience +// ------------------------------------------------------ + +#define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp))) +#define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp))) +#define mi_calloc_tp(tp,n) ((tp*)mi_calloc(n,sizeof(tp))) +#define mi_mallocn_tp(tp,n) ((tp*)mi_mallocn(n,sizeof(tp))) +#define mi_reallocn_tp(p,tp,n) ((tp*)mi_reallocn(p,n,sizeof(tp))) +#define mi_recalloc_tp(p,tp,n) ((tp*)mi_recalloc(p,n,sizeof(tp))) + +#define mi_heap_malloc_tp(hp,tp) ((tp*)mi_heap_malloc(hp,sizeof(tp))) +#define mi_heap_zalloc_tp(hp,tp) ((tp*)mi_heap_zalloc(hp,sizeof(tp))) +#define mi_heap_calloc_tp(hp,tp,n) ((tp*)mi_heap_calloc(hp,n,sizeof(tp))) +#define mi_heap_mallocn_tp(hp,tp,n) ((tp*)mi_heap_mallocn(hp,n,sizeof(tp))) +#define mi_heap_reallocn_tp(hp,p,tp,n) ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp))) +#define mi_heap_recalloc_tp(hp,p,tp,n) ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp))) + + +// ------------------------------------------------------ +// Options, all `false` by default +// ------------------------------------------------------ + +typedef enum mi_option_e { + // stable options + mi_option_show_errors, + mi_option_show_stats, + mi_option_verbose, + // the following options are experimental + mi_option_eager_commit, + mi_option_eager_region_commit, + mi_option_large_os_pages, // implies eager commit + mi_option_reserve_huge_os_pages, + mi_option_segment_cache, + mi_option_page_reset, + mi_option_cache_reset, + mi_option_reset_decommits, + mi_option_eager_commit_delay, + mi_option_segment_reset, + mi_option_os_tag, + mi_option_max_errors, + _mi_option_last +} mi_option_t; + + +mi_decl_export bool mi_option_is_enabled(mi_option_t option); +mi_decl_export void mi_option_enable(mi_option_t option); +mi_decl_export void mi_option_disable(mi_option_t option); +mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable); +mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable); + +mi_decl_export long mi_option_get(mi_option_t option); +mi_decl_export void mi_option_set(mi_option_t option, long value); +mi_decl_export void mi_option_set_default(mi_option_t option, long value); + + +// ------------------------------------------------------------------------------------------------------- +// "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions. +// (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.) 
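A short sketch combining the heap, analysis, and option entry points declared above: allocate into a private heap, walk its live blocks with `mi_heap_visit_blocks`, then release everything at once with `mi_heap_destroy` (again assuming the header is on the include path and the library is linked):

// Sketch: first-class heaps plus block visiting and the stats option.
#include <mimalloc.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool count_block(const mi_heap_t* heap, const mi_heap_area_t* area,
                        void* block, size_t block_size, void* arg) {
  (void)heap; (void)area;
  if (block != NULL) {          // some callbacks describe a whole area (block == NULL); skip those
    size_t* count = (size_t*)arg;
    *count += 1;
    printf("block %p of size %zu\n", block, block_size);
  }
  return true;                  // keep visiting
}

int main(void) {
  mi_option_set_enabled(mi_option_show_stats, true);  // statistics are printed at process exit

  mi_heap_t* heap = mi_heap_new();                     // heap with its own pages
  if (heap == NULL) return 1;
  for (int i = 1; i <= 4; i++) {
    if (mi_heap_malloc(heap, (size_t)i * 64) == NULL) return 1;
  }

  size_t count = 0;
  mi_heap_visit_blocks(heap, true /* visit individual blocks */, &count_block, &count);
  printf("heap contains %zu live blocks\n", count);

  mi_heap_destroy(heap);        // releases all remaining blocks of this heap at once
  return 0;
}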
+// ------------------------------------------------------------------------------------------------------- + +mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; +mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; +mi_decl_export void mi_cfree(void* p) mi_attr_noexcept; +mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept; + +mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept; +mi_decl_export void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); + +mi_decl_export void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); +mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2,3); + +mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept; +mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept; + +mi_decl_export unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept; +mi_decl_export unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept; +mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept; +mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept; + +mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept; +mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept; +mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept; + +mi_decl_export void* mi_new(size_t n) mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export void* mi_new_aligned(size_t n, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export void* mi_new_nothrow(size_t n) mi_attr_malloc mi_attr_alloc_size(1); +mi_decl_export void* mi_new_aligned_nothrow(size_t n, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1); + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/runtime/src/mimalloc/c/init.c b/runtime/src/mimalloc/c/init.c new file mode 100644 index 00000000000..81413aa92ff --- /dev/null +++ b/runtime/src/mimalloc/c/init.c @@ -0,0 +1,549 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ +#include "mimalloc.h" +#include "mimalloc-internal.h" + +#include // memcpy, memset +#include // atexit + +// Empty page used to initialize the small free pages array +const mi_page_t _mi_page_empty = { + 0, false, false, false, false, 0, 0, + { 0 }, false, + NULL, // free + #if MI_ENCODE_FREELIST + 0, + #endif + 0, // used + NULL, + ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0), + 0, NULL, NULL, NULL + #if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST)) + , { NULL } // padding + #endif +}; + +#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) +#define MI_SMALL_PAGES_EMPTY \ + { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } + + +// Empty page queues for every bin +#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } +#define MI_PAGE_QUEUES_EMPTY \ + { QNULL(1), \ + QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ + QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \ + QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \ + QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 384), QNULL( 448), QNULL( 512), /* 32 */ \ + QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \ + QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \ + QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ + QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ + QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ + QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ + QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ } + +#define MI_STAT_COUNT_NULL() {0,0,0,0} + +// Empty statistics +#if MI_STAT>1 +#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) } +#else +#define MI_STAT_COUNT_END_NULL() +#endif + +#define MI_STATS_NULL \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ + MI_STAT_COUNT_END_NULL() + +// -------------------------------------------------------- +// Statically allocate an empty heap as the initial +// thread local value for the default heap, +// and statically allocate the backing heap for the main +// thread so it can function without doing any allocation +// itself (as accessing a thread local for the first time +// may lead to allocation itself on some platforms) +// -------------------------------------------------------- + +const mi_heap_t _mi_heap_empty = { + NULL, + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + ATOMIC_VAR_INIT(NULL), + 0, + 0, + 0, + 0, + false +}; + +// the thread-local default heap for allocation +mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; + + +#define tld_main_stats 
((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats))) + +static mi_tld_t tld_main = { + 0, false, + &_mi_heap_main, + { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments + { 0, tld_main_stats }, // os + { MI_STATS_NULL } // stats +}; + +mi_heap_t _mi_heap_main = { + &tld_main, + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + NULL, + 0, // thread id +#if MI_INTPTR_SIZE==8 // the cookie of the main heap can be fixed (unlike page cookies that need to be secure!) + 0xCDCDCDCDCDCDCDCDUL, +#else + 0xCDCDCDCDUL, +#endif + 0, // random + 0, // page count + false // can reclaim +}; + +bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. + +mi_stats_t _mi_stats_main = { MI_STATS_NULL }; + +/* ----------------------------------------------------------- + Initialization of random numbers +----------------------------------------------------------- */ + +#if defined(_WIN32) +#include +#elif defined(__APPLE__) +#include +#else +#include +#endif + +uintptr_t _mi_random_shuffle(uintptr_t x) { + #if (MI_INTPTR_SIZE==8) + // by Sebastiano Vigna, see: + x ^= x >> 30; + x *= 0xbf58476d1ce4e5b9UL; + x ^= x >> 27; + x *= 0x94d049bb133111ebUL; + x ^= x >> 31; + #elif (MI_INTPTR_SIZE==4) + // by Chris Wellons, see: + x ^= x >> 16; + x *= 0x7feb352dUL; + x ^= x >> 15; + x *= 0x846ca68bUL; + x ^= x >> 16; + #endif + return x; +} + +uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) { +#ifdef __wasi__ // no ASLR when using WebAssembly, and time granularity may be coarse + uintptr_t x; + arc4random_buf(&x, sizeof x); +#else + // Hopefully, ASLR makes our function address random + uintptr_t x = (uintptr_t)((void*)&_mi_random_init); + x ^= seed; + // xor with high res time +#if defined(_WIN32) + LARGE_INTEGER pcount; + QueryPerformanceCounter(&pcount); + x ^= (uintptr_t)(pcount.QuadPart); +#elif defined(__APPLE__) + x ^= (uintptr_t)mach_absolute_time(); +#else + struct timespec time; + clock_gettime(CLOCK_MONOTONIC, &time); + x ^= (uintptr_t)time.tv_sec; + x ^= (uintptr_t)time.tv_nsec; +#endif + // and do a few randomization steps + uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; + for (uintptr_t i = 0; i < max; i++) { + x = _mi_random_shuffle(x); + } +#endif + return x; +} + +/* ----------------------------------------------------------- + Initialization and freeing of the thread local heaps +----------------------------------------------------------- */ + +typedef struct mi_thread_data_s { + mi_heap_t heap; // must come first due to cast in `_mi_heap_done` + mi_tld_t tld; +} mi_thread_data_t; + +// Initialize the thread local default heap, called from `mi_thread_init` +static bool _mi_heap_init(void) { + if (mi_heap_is_initialized(_mi_heap_default)) return true; + if (_mi_is_main_thread()) { + // the main heap is statically allocated + _mi_heap_set_default_direct(&_mi_heap_main); + mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); + } + else { + // use `_mi_os_alloc` to allocate directly from the OS + mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation? 
+ if (td == NULL) { + _mi_error_message("failed to allocate thread local heap memory\n"); + return false; + } + mi_tld_t* tld = &td->tld; + mi_heap_t* heap = &td->heap; + memcpy(heap, &_mi_heap_empty, sizeof(*heap)); + heap->thread_id = _mi_thread_id(); + heap->random = _mi_random_init(heap->thread_id); + heap->cookie = ((uintptr_t)heap ^ _mi_heap_random(heap)) | 1; + heap->tld = tld; + memset(tld, 0, sizeof(*tld)); + tld->heap_backing = heap; + tld->segments.stats = &tld->stats; + tld->os.stats = &tld->stats; + _mi_heap_set_default_direct(heap); + } + return false; +} + +// Free the thread local default heap (called from `mi_thread_done`) +static bool _mi_heap_done(mi_heap_t* heap) { + if (!mi_heap_is_initialized(heap)) return true; + + // reset default heap + _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); + + // todo: delete all non-backing heaps? + + // switch to backing heap and free it + heap = heap->tld->heap_backing; + if (!mi_heap_is_initialized(heap)) return false; + + // collect if not the main thread + if (heap != &_mi_heap_main) { + _mi_heap_collect_abandon(heap); + } + + // merge stats + _mi_stats_done(&heap->tld->stats); + + // free if not the main thread + if (heap != &_mi_heap_main) { + _mi_os_free(heap, sizeof(mi_thread_data_t), &_mi_stats_main); + } +#if (MI_DEBUG > 0) + else { + _mi_heap_destroy_pages(heap); + mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); + } +#endif + return false; +} + + + +// -------------------------------------------------------- +// Try to run `mi_thread_done()` automatically so any memory +// owned by the thread but not yet released can be abandoned +// and re-owned by another thread. +// +// 1. windows dynamic library: +// call from DllMain on DLL_THREAD_DETACH +// 2. windows static library: +// use `FlsAlloc` to call a destructor when the thread is done +// 3. unix, pthreads: +// use a pthread key to call a destructor when a pthread is done +// +// In the last two cases we also need to call `mi_process_init` +// to set up the thread local keys. 
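For platforms that end up in the `#pragma message` fallback below, or when deterministic teardown is preferred, a worker thread can bracket its allocations explicitly with the exported `mi_thread_init`/`mi_thread_done`; a minimal pthread-based sketch (on pthread and Windows builds the destructors registered below make the explicit `mi_thread_done` call optional):

// Sketch: explicit per-thread bracketing of the thread-local heap.
#include <mimalloc.h>
#include <pthread.h>
#include <stdio.h>

static void* worker(void* arg) {
  (void)arg;
  mi_thread_init();                      // ensure this thread has a thread-local heap
  void* buf = mi_malloc(4096);           // served from this thread's heap
  if (buf != NULL) {
    printf("worker allocated %zu usable bytes\n", mi_usable_size(buf));
    mi_free(buf);
  }
  mi_thread_done();                      // abandon/release the thread-local heap
  return NULL;
}

int main(void) {
  pthread_t t;
  if (pthread_create(&t, NULL, &worker, NULL) != 0) return 1;
  pthread_join(t, NULL);
  return 0;
}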
+// -------------------------------------------------------- + +static void _mi_thread_done(mi_heap_t* default_heap); + +#ifdef __wasi__ +// no pthreads in the WebAssembly Standard Interface +#elif !defined(_WIN32) +#define MI_USE_PTHREADS +#endif + +#if defined(_WIN32) && defined(MI_SHARED_LIB) + // nothing to do as it is done in DllMain +#elif defined(_WIN32) && !defined(MI_SHARED_LIB) + // use thread local storage keys to detect thread ending + #include + #include + static DWORD mi_fls_key; + static void NTAPI mi_fls_done(PVOID value) { + if (value!=NULL) _mi_thread_done((mi_heap_t*)value); + } +#elif defined(MI_USE_PTHREADS) + // use pthread locol storage keys to detect thread ending + #include + static pthread_key_t mi_pthread_key; + static void mi_pthread_done(void* value) { + if (value!=NULL) _mi_thread_done((mi_heap_t*)value); + } +#elif defined(__wasi__) +// no pthreads in the WebAssembly Standard Interface +#else + #pragma message("define a way to call mi_thread_done when a thread is done") +#endif + +// Set up handlers so `mi_thread_done` is called automatically +static void mi_process_setup_auto_thread_done(void) { + static bool tls_initialized = false; // fine if it races + if (tls_initialized) return; + tls_initialized = true; + #if defined(_WIN32) && defined(MI_SHARED_LIB) + // nothing to do as it is done in DllMain + #elif defined(_WIN32) && !defined(MI_SHARED_LIB) + mi_fls_key = FlsAlloc(&mi_fls_done); + #elif defined(MI_USE_PTHREADS) + pthread_key_create(&mi_pthread_key, &mi_pthread_done); + #endif +} + + +bool _mi_is_main_thread(void) { + return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); +} + +// This is called from the `mi_malloc_generic` +void mi_thread_init(void) mi_attr_noexcept +{ + // ensure our process has started already + mi_process_init(); + + // initialize the thread local default heap + // (this will call `_mi_heap_set_default_direct` and thus set the + // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) + if (_mi_heap_init()) return; // returns true if already initialized + + // don't further initialize for the main thread + if (_mi_is_main_thread()) return; + + _mi_stat_increase(&mi_get_default_heap()->tld->stats.threads, 1); + + //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); +} + +void mi_thread_done(void) mi_attr_noexcept { + _mi_thread_done(mi_get_default_heap()); +} + +static void _mi_thread_done(mi_heap_t* heap) { + // stats + if (!_mi_is_main_thread() && mi_heap_is_initialized(heap)) { + _mi_stat_decrease(&heap->tld->stats.threads, 1); + } + // abandon the thread local heap + if (_mi_heap_done(heap)) return; // returns true if already ran +} + +void _mi_heap_set_default_direct(mi_heap_t* heap) { + mi_assert_internal(heap != NULL); + _mi_heap_default = heap; + + // ensure the default heap is passed to `_mi_thread_done` + // setting to a non-NULL value also ensures `mi_thread_done` is called. 
+ #if defined(_WIN32) && defined(MI_SHARED_LIB) + // nothing to do as it is done in DllMain + #elif defined(_WIN32) && !defined(MI_SHARED_LIB) + FlsSetValue(mi_fls_key, heap); + #elif defined(MI_USE_PTHREADS) + pthread_setspecific(mi_pthread_key, heap); + #endif +} + + + +// -------------------------------------------------------- +// Run functions on process init/done, and thread init/done +// -------------------------------------------------------- +static void mi_process_done(void); + +static bool os_preloading = true; // true until this module is initialized +static bool mi_redirected = false; // true if malloc redirects to mi_malloc + +// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. +bool _mi_preloading() { + return os_preloading; +} + +bool mi_is_redirected() mi_attr_noexcept { + return mi_redirected; +} + +// Communicate with the redirection module on Windows +#if defined(_WIN32) && defined(MI_SHARED_LIB) +#ifdef __cplusplus +extern "C" { +#endif +mi_decl_export void _mi_redirect_entry(DWORD reason) { + // called on redirection; careful as this may be called before DllMain + if (reason == DLL_PROCESS_ATTACH) { + mi_redirected = true; + } + else if (reason == DLL_PROCESS_DETACH) { + mi_redirected = false; + } + else if (reason == DLL_THREAD_DETACH) { + mi_thread_done(); + } +} +__declspec(dllimport) bool mi_allocator_init(const char** message); +__declspec(dllimport) void mi_allocator_done(); +#ifdef __cplusplus +} +#endif +#else +static bool mi_allocator_init(const char** message) { + if (message != NULL) *message = NULL; + return true; +} +static void mi_allocator_done() { + // nothing to do +} +#endif + +// Called once by the process loader +static void mi_process_load(void) { + os_preloading = false; + atexit(&mi_process_done); + _mi_options_init(); + mi_process_init(); + //mi_stats_reset(); + if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); + + // show message from the redirector (if present) + const char* msg = NULL; + mi_allocator_init(&msg); + if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) { + _mi_fputs(NULL,NULL,msg); + } + + if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { + size_t pages = mi_option_get(mi_option_reserve_huge_os_pages); + double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB) + mi_reserve_huge_os_pages(pages, max_secs, NULL); + } +} + +// Initialize the process; called by thread_init or the process loader +void mi_process_init(void) mi_attr_noexcept { + // ensure we are called once + if (_mi_process_is_initialized) return; + // access _mi_heap_default before setting _mi_process_is_initialized to ensure + // that the TLS slot is allocated without getting into recursion on macOS + // when using dynamic linking with interpose. 
+ mi_heap_t* h = mi_get_default_heap(); + _mi_process_is_initialized = true; + + _mi_heap_main.thread_id = _mi_thread_id(); + _mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id); + uintptr_t random = _mi_random_init(_mi_heap_main.thread_id) ^ (uintptr_t)h; + #ifndef __APPLE__ + _mi_heap_main.cookie = (uintptr_t)&_mi_heap_main ^ random; + #endif + _mi_heap_main.random = _mi_random_shuffle(random); + mi_process_setup_auto_thread_done(); + _mi_os_init(); + #if (MI_DEBUG) + _mi_verbose_message("debug level : %d\n", MI_DEBUG); + #endif + _mi_verbose_message("secure level: %d\n", MI_SECURE); + mi_thread_init(); + mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) +} + +// Called when the process is done (through `at_exit`) +static void mi_process_done(void) { + // only shutdown if we were initialized + if (!_mi_process_is_initialized) return; + // ensure we are called once + static bool process_done = false; + if (process_done) return; + process_done = true; + + #ifndef NDEBUG + mi_collect(true); + #endif + if (mi_option_is_enabled(mi_option_show_stats) || + mi_option_is_enabled(mi_option_verbose)) { + mi_stats_print(NULL); + } + mi_allocator_done(); + _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); + os_preloading = true; // don't call the C runtime anymore +} + + + +#if defined(_WIN32) && defined(MI_SHARED_LIB) + // Windows DLL: easy to hook into process_init and thread_done + __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { + UNUSED(reserved); + UNUSED(inst); + if (reason==DLL_PROCESS_ATTACH) { + mi_process_load(); + } + else if (reason==DLL_THREAD_DETACH) { + if (!mi_is_redirected()) mi_thread_done(); + } + return TRUE; + } + +#elif defined(__cplusplus) + // C++: use static initialization to detect process start + static bool _mi_process_init(void) { + mi_process_load(); + return (_mi_heap_main.thread_id != 0); + } + static bool mi_initialized = _mi_process_init(); + +#elif defined(__GNUC__) || defined(__clang__) + // GCC,Clang: use the constructor attribute + static void __attribute__((constructor)) _mi_process_init(void) { + mi_process_load(); + } + +#elif defined(_MSC_VER) + // MSVC: use data section magic for static libraries + // See + static int _mi_process_init(void) { + mi_process_load(); + return 0; + } + typedef int(*_crt_cb)(void); + #ifdef _M_X64 + __pragma(comment(linker, "/include:" "_mi_msvc_initu")) + #pragma section(".CRT$XIU", long, read) + #else + __pragma(comment(linker, "/include:" "__mi_msvc_initu")) + #endif + #pragma data_seg(".CRT$XIU") + _crt_cb _mi_msvc_initu[] = { &_mi_process_init }; + #pragma data_seg() + +#else +#pragma message("define a way to call mi_process_load on your platform") +#endif diff --git a/runtime/src/mimalloc/c/memory.c b/runtime/src/mimalloc/c/memory.c new file mode 100644 index 00000000000..dd03cf9565f --- /dev/null +++ b/runtime/src/mimalloc/c/memory.c @@ -0,0 +1,546 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
+and the segment and huge object allocation by mimalloc. There may be multiple
+implementations of this (one could be the identity going directly to the OS,
+another could be a simple cache etc), but the current one uses large "regions".
+In contrast to the rest of mimalloc, the "regions" are shared between threads and
+need to be accessed using atomic operations.
+We need this memory layer between the raw OS calls because of:
+1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
+   to reuse memory effectively.
+2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
+   an OS allocation/free is still (much) too expensive relative to the accesses in that
+   object :-( (`malloc-large` tests this). This means we need a cheaper way to
+   reuse memory.
+3. This layer can help with a NUMA aware allocation in the future.
+
+Possible issues:
+- (2) can potentially be addressed too with a small cache per thread which is much
+  simpler. Generally though that requires shrinking of huge pages, and may overuse
+  memory per thread. (and is not compatible with `sbrk`).
+- Since the current regions are per-process, we need atomic operations to
+  claim blocks which may be contended
+- In the worst case, we need to search the whole region map (16KiB for 256GiB)
+  linearly. At what point will direct OS calls be faster? Is there a way to
+  do this better without adding too much complexity?
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+
+// Internal raw OS interface
+size_t  _mi_os_large_page_size();
+bool    _mi_os_protect(void* addr, size_t size);
+bool    _mi_os_unprotect(void* addr, size_t size);
+bool    _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool    _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
+bool    _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
+bool    _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+void*   _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
+void    _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
+void*   _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment);
+bool    _mi_os_is_huge_reserved(void* p);
+
+// Constants
+#if (MI_INTPTR_SIZE==8)
+#define MI_HEAP_REGION_MAX_SIZE   (256 * (1ULL << 30))  // 256GiB => 16KiB for the region map
+#elif (MI_INTPTR_SIZE==4)
+#define MI_HEAP_REGION_MAX_SIZE   (3 * (1UL << 30))  // 3GiB => 196 bytes for the region map
+#else
+#error "define the maximum heap space allowed for regions on this platform"
+#endif
+
+#define MI_SEGMENT_ALIGN          MI_SEGMENT_SIZE
+
+#define MI_REGION_MAP_BITS        (MI_INTPTR_SIZE * 8)
+#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_REGION_MAP_BITS)
+#define MI_REGION_MAX_ALLOC_SIZE  ((MI_REGION_MAP_BITS/4)*MI_SEGMENT_SIZE)  // 64MiB
+#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)
+#define MI_REGION_MAP_FULL        UINTPTR_MAX
+
+
+typedef uintptr_t mi_region_info_t;
+
+static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) {
+  return ((uintptr_t)start | ((uintptr_t)(is_large?1:0) << 1) | (is_committed?1:0));
+}
+
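For illustration only (a hedged sketch with hypothetical values, not part of the patch): because every region start is MI_SEGMENT_SIZE (4MiB) aligned, its two low bits are zero and are free to carry the flags, which is what the packed info word above relies on and what mi_region_info_read below recovers.

#include <stdbool.h>
#include <stdint.h>

// Sketch: pack and unpack a region info word by hand for a hypothetical
// 4MiB-aligned start address; mirrors mi_region_info_create / mi_region_info_read.
static void example_region_info(void) {
  uintptr_t start = (uintptr_t)0x7f0000000000ULL;          // 4MiB aligned => low bits are zero
  uintptr_t info  = start | (1u << 1) | 0u;                // bit 1: is_large = true, bit 0: is_committed = false
  void* unpacked     = (void*)(info & ~(uintptr_t)0x03);   // recovers the original start
  bool  is_large     = (info & 0x02) != 0;                 // true
  bool  is_committed = (info & 0x01) != 0;                 // false
  (void)unpacked; (void)is_large; (void)is_committed;
}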
+static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, bool* is_committed) { + if (is_large) *is_large = ((info&0x02) != 0); + if (is_committed) *is_committed = ((info&0x01) != 0); + return (void*)(info & ~0x03); +} + + +// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with +// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. +typedef struct mem_region_s { + volatile _Atomic(uintptr_t) map; // in-use bit per MI_SEGMENT_SIZE block + volatile _Atomic(mi_region_info_t) info; // start of virtual memory area, and flags + volatile _Atomic(uintptr_t) dirty_mask; // bit per block if the contents are not zero'd +} mem_region_t; + + +// The region map; 16KiB for a 256GiB HEAP_REGION_MAX +// TODO: in the future, maintain a map per NUMA node for numa aware allocation +static mem_region_t regions[MI_REGION_MAX]; + +static volatile _Atomic(uintptr_t) regions_count; // = 0; // allocated regions + + +/* ---------------------------------------------------------------------------- +Utility functions +-----------------------------------------------------------------------------*/ + +// Blocks (of 4MiB) needed for the given size. +static size_t mi_region_block_count(size_t size) { + mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); + return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE; +} + +// The bit mask for a given number of blocks at a specified bit index. +static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) { + mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS); + return ((((uintptr_t)1 << blocks) - 1) << bitidx); +} + +// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. +static size_t mi_good_commit_size(size_t size) { + if (size > (SIZE_MAX - _mi_os_large_page_size())) return size; + return _mi_align_up(size, _mi_os_large_page_size()); +} + +// Return if a pointer points into a region reserved by us. +bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + if (p==NULL) return false; + size_t count = mi_atomic_read_relaxed(®ions_count); + for (size_t i = 0; i < count; i++) { + uint8_t* start = (uint8_t*)mi_region_info_read( mi_atomic_read_relaxed(®ions[i].info), NULL, NULL); + if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true; + } + return false; +} + + +/* ---------------------------------------------------------------------------- +Commit from a region +-----------------------------------------------------------------------------*/ + +// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`. +// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written +// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. +// (not being able to claim is not considered an error so check for `p != NULL` afterwards). 
+static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, + size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) +{ + size_t mask = mi_region_block_mask(blocks,bitidx); + mi_assert_internal(mask != 0); + mi_assert_internal((mask & mi_atomic_read_relaxed(®ion->map)) == mask); + mi_assert_internal(®ions[idx] == region); + + // ensure the region is reserved + mi_region_info_t info = mi_atomic_read(®ion->info); + if (info == 0) + { + bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit); + bool region_large = *allow_large; + void* start = NULL; + if (region_large) { + start = _mi_os_try_alloc_from_huge_reserved(MI_REGION_SIZE, MI_SEGMENT_ALIGN); + if (start != NULL) { region_commit = true; } + } + if (start == NULL) { + start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, region_commit, ®ion_large, tld); + } + mi_assert_internal(!(region_large && !*allow_large)); + + if (start == NULL) { + // failure to allocate from the OS! unclaim the blocks and fail + size_t map; + do { + map = mi_atomic_read_relaxed(®ion->map); + } while (!mi_atomic_cas_weak(®ion->map, map & ~mask, map)); + return false; + } + + // set the newly allocated region + info = mi_region_info_create(start,region_large,region_commit); + if (mi_atomic_cas_strong(®ion->info, info, 0)) { + // update the region count + mi_atomic_increment(®ions_count); + } + else { + // failed, another thread allocated just before us! + // we assign it to a later slot instead (up to 4 tries). + for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { + if (mi_atomic_cas_strong(®ions[idx+i].info, info, 0)) { + mi_atomic_increment(®ions_count); + start = NULL; + break; + } + } + if (start != NULL) { + // free it if we didn't succeed to save it to some other region + _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats); + } + // and continue with the memory at our index + info = mi_atomic_read(®ion->info); + } + } + mi_assert_internal(info == mi_atomic_read(®ion->info)); + mi_assert_internal(info != 0); + + // Commit the blocks to memory + bool region_is_committed = false; + bool region_is_large = false; + void* start = mi_region_info_read(info,®ion_is_large,®ion_is_committed); + mi_assert_internal(!(region_is_large && !*allow_large)); + mi_assert_internal(start!=NULL); + + // set dirty bits + uintptr_t m; + do { + m = mi_atomic_read(®ion->dirty_mask); + } while (!mi_atomic_cas_weak(®ion->dirty_mask, m | mask, m)); + *is_zero = ((m & mask) == 0); // no dirty bit set in our claimed range? 
+ + void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); + if (*commit && !region_is_committed) { + // ensure commit + bool commit_zero = false; + _mi_os_commit(blocks_start, mi_good_commit_size(size), &commit_zero, tld->stats); // only commit needed size (unless using large OS pages) + if (commit_zero) *is_zero = true; + } + else if (!*commit && region_is_committed) { + // but even when no commit is requested, we might have committed anyway (in a huge OS page for example) + *commit = true; + } + + // and return the allocation + mi_assert_internal(blocks_start != NULL); + *allow_large = region_is_large; + *p = blocks_start; + *id = (idx*MI_REGION_MAP_BITS) + bitidx; + return true; +} + +// Use bit scan forward to quickly find the first zero bit if it is available +#if defined(_MSC_VER) +#define MI_HAVE_BITSCAN +#include +static inline size_t mi_bsf(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + #if (MI_INTPTR_SIZE==8) + _BitScanForward64(&idx, x); + #else + _BitScanForward(&idx, x); + #endif + return idx; +} +static inline size_t mi_bsr(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + #if (MI_INTPTR_SIZE==8) + _BitScanReverse64(&idx, x); + #else + _BitScanReverse(&idx, x); + #endif + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +#define MI_HAVE_BITSCAN +static inline size_t mi_bsf(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctzl(x)); +} +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - __builtin_clzl(x)); +} +#endif + +// Allocate `blocks` in a `region` at `idx` of a given `size`. +// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written +// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. +// (not being able to claim is not considered an error so check for `p != NULL` afterwards). +static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, + bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld) +{ + mi_assert_internal(p != NULL && id != NULL); + mi_assert_internal(blocks < MI_REGION_MAP_BITS); + + const uintptr_t mask = mi_region_block_mask(blocks, 0); + const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; + uintptr_t map = mi_atomic_read(®ion->map); + if (map==MI_REGION_MAP_FULL) return true; + + #ifdef MI_HAVE_BITSCAN + size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible + #else + size_t bitidx = 0; // otherwise start at 0 + #endif + uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + + // scan linearly for a free range of zero bits + while(bitidx <= bitidx_max) { + if ((map & m) == 0) { // are the mask bits free at bitidx? + mi_assert_internal((m >> bitidx) == mask); // no overflow? + uintptr_t newmap = map | m; + mi_assert_internal((newmap^map) >> bitidx == mask); + if (!mi_atomic_cas_weak(®ion->map, newmap, map)) { // TODO: use strong cas here? + // no success, another thread claimed concurrently.. keep going + map = mi_atomic_read(®ion->map); + continue; + } + else { + // success, we claimed the bits + // now commit the block memory -- this can still fail + return mi_region_commit_blocks(region, idx, bitidx, blocks, + size, commit, allow_large, is_zero, p, id, tld); + } + } + else { + // on to the next bit range + #ifdef MI_HAVE_BITSCAN + size_t shift = (blocks == 1 ? 
1 : mi_bsr(map & m) - bitidx + 1); + mi_assert_internal(shift > 0 && shift <= blocks); + #else + size_t shift = 1; + #endif + bitidx += shift; + m <<= shift; + } + } + // no error, but also no bits found + return true; +} + +// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim. +// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written +// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. +// (not being able to claim is not considered an error so check for `p != NULL` afterwards). +static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, + bool* commit, bool* allow_large, bool* is_zero, + void** p, size_t* id, mi_os_tld_t* tld) +{ + // check if there are available blocks in the region.. + mi_assert_internal(idx < MI_REGION_MAX); + mem_region_t* region = ®ions[idx]; + uintptr_t m = mi_atomic_read_relaxed(®ion->map); + if (m != MI_REGION_MAP_FULL) { // some bits are zero + bool ok = (*commit || *allow_large); // committing or allow-large is always ok + if (!ok) { + // otherwise skip incompatible regions if possible. + // this is not guaranteed due to multiple threads allocating at the same time but + // that's ok. In secure mode, large is never allowed for any thread, so that works out; + // otherwise we might just not be able to reset/decommit individual pages sometimes. + mi_region_info_t info = mi_atomic_read_relaxed(®ion->info); + bool is_large; + bool is_committed; + void* start = mi_region_info_read(info,&is_large,&is_committed); + ok = (start == NULL || (*commit || !is_committed) || (*allow_large || !is_large)); // Todo: test with one bitmap operation? + } + if (ok) { + return mi_region_alloc_blocks(region, idx, blocks, size, commit, allow_large, is_zero, p, id, tld); + } + } + return true; // no error, but no success either +} + +/* ---------------------------------------------------------------------------- + Allocation +-----------------------------------------------------------------------------*/ + +// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`. +// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`) +void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, + size_t* id, mi_os_tld_t* tld) +{ + mi_assert_internal(id != NULL && tld != NULL); + mi_assert_internal(size > 0); + *id = SIZE_MAX; + *is_zero = false; + bool default_large = false; + if (large==NULL) large = &default_large; // ensure `large != NULL` + + // use direct OS allocation for huge blocks or alignment (with `id = SIZE_MAX`) + if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) { + *is_zero = true; + return _mi_os_alloc_aligned(mi_good_commit_size(size), alignment, *commit, large, tld); // round up size + } + + // always round size to OS page size multiple (so commit/decommit go over the entire range) + // TODO: use large OS page size here? + size = _mi_align_up(size, _mi_os_page_size()); + + // calculate the number of needed blocks + size_t blocks = mi_region_block_count(size); + mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE); + + // find a range of free blocks + void* p = NULL; + size_t count = mi_atomic_read(®ions_count); + size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention? 
+ for (size_t visited = 0; visited < count; visited++, idx++) { + if (idx >= count) idx = 0; // wrap around + if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error + if (p != NULL) break; + } + + if (p == NULL) { + // no free range in existing regions -- try to extend beyond the count.. but at most 8 regions + for (idx = count; idx < mi_atomic_read_relaxed(®ions_count) + 8 && idx < MI_REGION_MAX; idx++) { + if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error + if (p != NULL) break; + } + } + + if (p == NULL) { + // we could not find a place to allocate, fall back to the os directly + _mi_warning_message("unable to allocate from region: size %zu\n", size); + *is_zero = true; + p = _mi_os_alloc_aligned(size, alignment, commit, large, tld); + } + else { + tld->region_idx = idx; // next start of search? currently not used as we use first-fit + } + + mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0); + return p; +} + + + +/* ---------------------------------------------------------------------------- +Free +-----------------------------------------------------------------------------*/ + +// Free previously allocated memory with a given id. +void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + if (p==NULL) return; + if (size==0) return; + if (id == SIZE_MAX) { + // was a direct OS allocation, pass through + _mi_os_free(p, size, stats); + } + else { + // allocated in a region + mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return; + // we can align the size up to page size (as we allocate that way too) + // this ensures we fully commit/decommit/reset + size = _mi_align_up(size, _mi_os_page_size()); + size_t idx = (id / MI_REGION_MAP_BITS); + size_t bitidx = (id % MI_REGION_MAP_BITS); + size_t blocks = mi_region_block_count(size); + size_t mask = mi_region_block_mask(blocks, bitidx); + mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? + mem_region_t* region = ®ions[idx]; + mi_assert_internal((mi_atomic_read_relaxed(®ion->map) & mask) == mask ); // claimed? + mi_region_info_t info = mi_atomic_read(®ion->info); + bool is_large; + bool is_eager_committed; + void* start = mi_region_info_read(info,&is_large,&is_eager_committed); + mi_assert_internal(start != NULL); + void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); + mi_assert_internal(blocks_start == p); // not a pointer in our area? + mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS); + if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`? + + // decommit (or reset) the blocks to reduce the working set. + // TODO: implement delayed decommit/reset as these calls are too expensive + // if the memory is reused soon. + // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large + if (!is_large) { + if (mi_option_is_enabled(mi_option_segment_reset)) { + if (!is_eager_committed && // cannot reset large pages + (mi_option_is_enabled(mi_option_eager_commit) || // cannot reset halfway committed segments, use `option_page_reset` instead + mi_option_is_enabled(mi_option_reset_decommits))) // but we can decommit halfway committed segments + { + _mi_os_reset(p, size, stats); + //_mi_os_decommit(p, size, stats); // todo: and clear dirty bits? 
+ } + } + } + if (!is_eager_committed) { + // adjust commit statistics as we commit again when re-using the same slot + _mi_stat_decrease(&stats->committed, mi_good_commit_size(size)); + } + + // TODO: should we free empty regions? currently only done _mi_mem_collect. + // this frees up virtual address space which might be useful on 32-bit systems? + + // and unclaim + uintptr_t map; + uintptr_t newmap; + do { + map = mi_atomic_read_relaxed(®ion->map); + newmap = map & ~mask; + } while (!mi_atomic_cas_weak(®ion->map, newmap, map)); + } +} + + +/* ---------------------------------------------------------------------------- + collection +-----------------------------------------------------------------------------*/ +void _mi_mem_collect(mi_stats_t* stats) { + // free every region that has no segments in use. + for (size_t i = 0; i < regions_count; i++) { + mem_region_t* region = ®ions[i]; + if (mi_atomic_read_relaxed(®ion->map) == 0) { + // if no segments used, try to claim the whole region + uintptr_t m; + do { + m = mi_atomic_read_relaxed(®ion->map); + } while(m == 0 && !mi_atomic_cas_weak(®ion->map, ~((uintptr_t)0), 0 )); + if (m == 0) { + // on success, free the whole region (unless it was huge reserved) + bool is_eager_committed; + void* start = mi_region_info_read(mi_atomic_read(®ion->info), NULL, &is_eager_committed); + if (start != NULL && !_mi_os_is_huge_reserved(start)) { + _mi_os_free_ex(start, MI_REGION_SIZE, is_eager_committed, stats); + } + // and release + mi_atomic_write(®ion->info,0); + mi_atomic_write(®ion->map,0); + } + } + } +} + +/* ---------------------------------------------------------------------------- + Other +-----------------------------------------------------------------------------*/ + +bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats) { + return _mi_os_commit(p, size, is_zero, stats); +} + +bool _mi_mem_decommit(void* p, size_t size, mi_stats_t* stats) { + return _mi_os_decommit(p, size, stats); +} + +bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats) { + return _mi_os_reset(p, size, stats); +} + +bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats) { + return _mi_os_unreset(p, size, is_zero, stats); +} + +bool _mi_mem_protect(void* p, size_t size) { + return _mi_os_protect(p, size); +} + +bool _mi_mem_unprotect(void* p, size_t size) { + return _mi_os_unprotect(p, size); +} diff --git a/runtime/src/mimalloc/c/options.c b/runtime/src/mimalloc/c/options.c new file mode 100644 index 00000000000..d6b0558b0f6 --- /dev/null +++ b/runtime/src/mimalloc/c/options.c @@ -0,0 +1,410 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +#include +#include // strtol +#include // strncpy, strncat, strlen, strstr +#include // toupper +#include + +static uintptr_t mi_max_error_count = 16; // stop outputting errors after this + +static void mi_add_stderr_output(); + +int mi_version(void) mi_attr_noexcept { + return MI_MALLOC_VERSION; +} + +#ifdef _WIN32 +#include +#endif + +// -------------------------------------------------------- +// Options +// These can be accessed by multiple threads and may be +// concurrently initialized, but an initializing data race +// is ok since they resolve to the same value. +// -------------------------------------------------------- +typedef enum mi_init_e { + UNINIT, // not yet initialized + DEFAULTED, // not found in the environment, use default value + INITIALIZED // found in environment or set explicitly +} mi_init_t; + +typedef struct mi_option_desc_s { + long value; // the value + mi_init_t init; // is it initialized yet? (from the environment) + mi_option_t option; // for debugging: the option index should match the option + const char* name; // option name without `mimalloc_` prefix +} mi_option_desc_t; + +#define MI_OPTION(opt) mi_option_##opt, #opt +#define MI_OPTION_DESC(opt) {0, UNINIT, MI_OPTION(opt) } + +static mi_option_desc_t options[_mi_option_last] = +{ + // stable options + { MI_DEBUG, UNINIT, MI_OPTION(show_errors) }, + { 0, UNINIT, MI_OPTION(show_stats) }, + { 0, UNINIT, MI_OPTION(verbose) }, + + // the following options are experimental and not all combinations make sense. + { 1, UNINIT, MI_OPTION(eager_commit) }, // note: needs to be on when eager_region_commit is enabled + #ifdef _WIN32 // and BSD? + { 0, UNINIT, MI_OPTION(eager_region_commit) }, // don't commit too eagerly on windows (just for looks...) + #else + { 1, UNINIT, MI_OPTION(eager_region_commit) }, + #endif + { 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's + { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, + { 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread + { 0, UNINIT, MI_OPTION(page_reset) }, + { 0, UNINIT, MI_OPTION(cache_reset) }, + { 0, UNINIT, MI_OPTION(reset_decommits) }, // note: cannot enable this if secure is on + { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed + { 0, UNINIT, MI_OPTION(segment_reset) }, // reset segment memory on free (needs eager commit) + { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose + { 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output +}; + +static void mi_option_init(mi_option_desc_t* desc); + +void _mi_options_init(void) { + // called on process load; should not be called before the CRT is initialized! + // (e.g. 
do not call this from process_init as that may run before CRT initialization) + mi_add_stderr_output(); // now it safe to use stderr for output + for(int i = 0; i < _mi_option_last; i++ ) { + mi_option_t option = (mi_option_t)i; + mi_option_get(option); // initialize + if (option != mi_option_verbose) { + mi_option_desc_t* desc = &options[option]; + _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value); + } + } + mi_max_error_count = mi_option_get(mi_option_max_errors); +} + +long mi_option_get(mi_option_t option) { + mi_assert(option >= 0 && option < _mi_option_last); + mi_option_desc_t* desc = &options[option]; + mi_assert(desc->option == option); // index should match the option + if (mi_unlikely(desc->init == UNINIT)) { + mi_option_init(desc); + } + return desc->value; +} + +void mi_option_set(mi_option_t option, long value) { + mi_assert(option >= 0 && option < _mi_option_last); + mi_option_desc_t* desc = &options[option]; + mi_assert(desc->option == option); // index should match the option + desc->value = value; + desc->init = INITIALIZED; +} + +void mi_option_set_default(mi_option_t option, long value) { + mi_assert(option >= 0 && option < _mi_option_last); + mi_option_desc_t* desc = &options[option]; + if (desc->init != INITIALIZED) { + desc->value = value; + } +} + +bool mi_option_is_enabled(mi_option_t option) { + return (mi_option_get(option) != 0); +} + +void mi_option_set_enabled(mi_option_t option, bool enable) { + mi_option_set(option, (enable ? 1 : 0)); +} + +void mi_option_set_enabled_default(mi_option_t option, bool enable) { + mi_option_set_default(option, (enable ? 1 : 0)); +} + +void mi_option_enable(mi_option_t option) { + mi_option_set_enabled(option,true); +} + +void mi_option_disable(mi_option_t option) { + mi_option_set_enabled(option,false); +} + + +static void mi_out_stderr(const char* msg) { + #ifdef _WIN32 + // on windows with redirection, the C runtime cannot handle locale dependent output + // after the main thread closes so we use direct console output. + if (!_mi_preloading()) { _cputs(msg); } + #else + fputs(msg, stderr); + #endif +} + +// Since an output function can be registered earliest in the `main` +// function we also buffer output that happens earlier. When +// an output function is registered it is called immediately with +// the output up to that point. +#ifndef MI_MAX_DELAY_OUTPUT +#define MI_MAX_DELAY_OUTPUT (32*1024) +#endif +static char out_buf[MI_MAX_DELAY_OUTPUT+1]; +static _Atomic(uintptr_t) out_len; + +static void mi_out_buf(const char* msg) { + if (msg==NULL) return; + if (mi_atomic_read_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; + size_t n = strlen(msg); + if (n==0) return; + // claim space + uintptr_t start = mi_atomic_addu(&out_len, n); + if (start >= MI_MAX_DELAY_OUTPUT) return; + // check bound + if (start+n >= MI_MAX_DELAY_OUTPUT) { + n = MI_MAX_DELAY_OUTPUT-start-1; + } + memcpy(&out_buf[start], msg, n); +} + +static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf) { + if (out==NULL) return; + // claim (if `no_more_buf == true`, no more output will be added after this point) + size_t count = mi_atomic_addu(&out_len, (no_more_buf ? 
MI_MAX_DELAY_OUTPUT : 1)); + // and output the current contents + if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; + out_buf[count] = 0; + out(out_buf); + if (!no_more_buf) { + out_buf[count] = '\n'; // if continue with the buffer, insert a newline + } +} + + +// Once this module is loaded, switch to this routine +// which outputs to stderr and the delayed output buffer. +static void mi_out_buf_stderr(const char* msg) { + mi_out_stderr(msg); + mi_out_buf(msg); +} + + + +// -------------------------------------------------------- +// Default output handler +// -------------------------------------------------------- + +// Should be atomic but gives errors on many platforms as generally we cannot cast a function pointer to a uintptr_t. +// For now, don't register output from multiple threads. +#pragma warning(suppress:4180) +static mi_output_fun* volatile mi_out_default; // = NULL + +static mi_output_fun* mi_out_get_default(void) { + mi_output_fun* out = mi_out_default; + return (out == NULL ? &mi_out_buf : out); +} + +void mi_register_output(mi_output_fun* out) mi_attr_noexcept { + mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer + if (out!=NULL) mi_out_buf_flush(out,true); // output all the delayed output now +} + +// add stderr to the delayed output after the module is loaded +static void mi_add_stderr_output() { + mi_out_buf_flush(&mi_out_stderr, false); // flush current contents to stderr + mi_out_default = &mi_out_buf_stderr; // and add stderr to the delayed output +} + +// -------------------------------------------------------- +// Messages, all end up calling `_mi_fputs`. +// -------------------------------------------------------- +#define MAX_ERROR_COUNT (10) +static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings + +// When overriding malloc, we may recurse into mi_vfprintf if an allocation +// inside the C runtime causes another message. +static mi_decl_thread bool recurse = false; + +void _mi_fputs(mi_output_fun* out, const char* prefix, const char* message) { + if (recurse) return; + if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) out = mi_out_get_default(); + recurse = true; + if (prefix != NULL) out(prefix); + out(message); + recurse = false; + return; +} + +// Define our own limited `fprintf` that avoids memory allocation. +// We do this using `snprintf` with a limited buffer. +static void mi_vfprintf( mi_output_fun* out, const char* prefix, const char* fmt, va_list args ) { + char buf[512]; + if (fmt==NULL) return; + if (recurse) return; + recurse = true; + vsnprintf(buf,sizeof(buf)-1,fmt,args); + recurse = false; + _mi_fputs(out,prefix,buf); +} + + +void _mi_fprintf( mi_output_fun* out, const char* fmt, ... ) { + va_list args; + va_start(args,fmt); + mi_vfprintf(out,NULL,fmt,args); + va_end(args); +} + +void _mi_trace_message(const char* fmt, ...) { + if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher + va_list args; + va_start(args, fmt); + mi_vfprintf(NULL, "mimalloc: ", fmt, args); + va_end(args); +} + +void _mi_verbose_message(const char* fmt, ...) { + if (!mi_option_is_enabled(mi_option_verbose)) return; + va_list args; + va_start(args,fmt); + mi_vfprintf(NULL, "mimalloc: ", fmt, args); + va_end(args); +} + +void _mi_error_message(const char* fmt, ...) 
{ + if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; + if (mi_atomic_increment(&error_count) > mi_max_error_count) return; + va_list args; + va_start(args,fmt); + mi_vfprintf(NULL, "mimalloc: error: ", fmt, args); + va_end(args); + mi_assert(false); +} + +void _mi_warning_message(const char* fmt, ...) { + if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; + if (mi_atomic_increment(&error_count) > mi_max_error_count) return; + va_list args; + va_start(args,fmt); + mi_vfprintf(NULL, "mimalloc: warning: ", fmt, args); + va_end(args); +} + + +#if MI_DEBUG +void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) { + _mi_fprintf(NULL,"mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion); + abort(); +} +#endif + +mi_attr_noreturn void _mi_fatal_error(const char* fmt, ...) { + va_list args; + va_start(args, fmt); + mi_vfprintf(NULL, "mimalloc: fatal: ", fmt, args); + va_end(args); + #if (MI_SECURE>=0) + abort(); + #endif +} + +// -------------------------------------------------------- +// Initialize options by checking the environment +// -------------------------------------------------------- + +static void mi_strlcpy(char* dest, const char* src, size_t dest_size) { + dest[0] = 0; + #pragma warning(suppress:4996) + strncpy(dest, src, dest_size - 1); + dest[dest_size - 1] = 0; +} + +static void mi_strlcat(char* dest, const char* src, size_t dest_size) { + #pragma warning(suppress:4996) + strncat(dest, src, dest_size - 1); + dest[dest_size - 1] = 0; +} + +#if defined _WIN32 +// On Windows use GetEnvironmentVariable instead of getenv to work +// reliably even when this is invoked before the C runtime is initialized. +// i.e. when `_mi_preloading() == true`. +// Note: on windows, environment names are not case sensitive. +#include +static bool mi_getenv(const char* name, char* result, size_t result_size) { + result[0] = 0; + size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); + return (len > 0 && len < result_size); +} +#else +static bool mi_getenv(const char* name, char* result, size_t result_size) { + const char* s = getenv(name); + if (s == NULL) { + // in unix environments we check the upper case name too. + char buf[64+1]; + size_t len = strlen(name); + if (len >= sizeof(buf)) len = sizeof(buf) - 1; + for (size_t i = 0; i < len; i++) { + buf[i] = toupper(name[i]); + } + buf[len] = 0; + s = getenv(buf); + } + if (s != NULL && strlen(s) < result_size) { + mi_strlcpy(result, s, result_size); + return true; + } + else { + return false; + } +} +#endif +static void mi_option_init(mi_option_desc_t* desc) { + #ifndef _WIN32 + // cannot call getenv() when still initializing the C runtime. 
+ if (_mi_preloading()) return; + #endif + // Read option value from the environment + char buf[64+1]; + mi_strlcpy(buf, "mimalloc_", sizeof(buf)); + mi_strlcat(buf, desc->name, sizeof(buf)); + char s[64+1]; + if (mi_getenv(buf, s, sizeof(s))) { + size_t len = strlen(s); + if (len >= sizeof(buf)) len = sizeof(buf) - 1; + for (size_t i = 0; i < len; i++) { + buf[i] = (char)toupper(s[i]); + } + buf[len] = 0; + if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) { + desc->value = 1; + desc->init = INITIALIZED; + } + else if (strstr("0;FALSE;NO;OFF", buf) != NULL) { + desc->value = 0; + desc->init = INITIALIZED; + } + else { + char* end = buf; + long value = strtol(buf, &end, 10); + if (*end == 0) { + desc->value = value; + desc->init = INITIALIZED; + } + else { + _mi_warning_message("environment option mimalloc_%s has an invalid value: %s\n", desc->name, buf); + desc->init = DEFAULTED; + } + } + } + else { + desc->init = DEFAULTED; + } + mi_assert_internal(desc->init != UNINIT); +} diff --git a/runtime/src/mimalloc/c/os.c b/runtime/src/mimalloc/c/os.c new file mode 100644 index 00000000000..8f5afc5b469 --- /dev/null +++ b/runtime/src/mimalloc/c/os.c @@ -0,0 +1,950 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#ifndef _DEFAULT_SOURCE +#define _DEFAULT_SOURCE // ensure mmap flags are defined +#endif + +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +#include // strerror +#include + +#if defined(_WIN32) +#include +#elif defined(__wasi__) +// stdlib.h is all we need, and has already been included in mimalloc.h +#else +#include // mmap +#include // sysconf +#if defined(__linux__) +#include // linux mmap flags +#endif +#if defined(__APPLE__) +#include +#endif +#endif + +/* ----------------------------------------------------------- + Initialization. + On windows initializes support for aligned allocation and + large OS pages (if MIMALLOC_LARGE_OS_PAGES is true). +----------------------------------------------------------- */ +bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); +bool _mi_os_is_huge_reserved(void* p); +void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment); + +static void* mi_align_up_ptr(void* p, size_t alignment) { + return (void*)_mi_align_up((uintptr_t)p, alignment); +} + +static uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { + return (sz / alignment) * alignment; +} + +static void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); +} + +// page size (initialized properly in `os_init`) +static size_t os_page_size = 4096; + +// minimal allocation granularity +static size_t os_alloc_granularity = 4096; + +// if non-zero, use large page allocation +static size_t large_os_page_size = 0; + +// OS (small) page size +size_t _mi_os_page_size() { + return os_page_size; +} + +// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB) +size_t _mi_os_large_page_size() { + return (large_os_page_size != 0 ? 
large_os_page_size : _mi_os_page_size()); +} + +static bool use_large_os_page(size_t size, size_t alignment) { + // if we have access, check the size and alignment requirements + if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false; + return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0); +} + +// round to a good OS allocation size (bounded by max 12.5% waste) +size_t _mi_os_good_alloc_size(size_t size) { + size_t align_size; + if (size < 512*KiB) align_size = _mi_os_page_size(); + else if (size < 2*MiB) align_size = 64*KiB; + else if (size < 8*MiB) align_size = 256*KiB; + else if (size < 32*MiB) align_size = 1*MiB; + else align_size = 4*MiB; + if (size >= (SIZE_MAX - align_size)) return size; // possible overflow? + return _mi_align_up(size, align_size); +} + +#if defined(_WIN32) +// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. +// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) +// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) +// We hide MEM_EXTENDED_PARAMETER to compile with older SDK's. +#include +typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); +typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ PVOID, ULONG); +static PVirtualAlloc2 pVirtualAlloc2 = NULL; +static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; + +static bool mi_win_enable_large_os_pages() +{ + if (large_os_page_size > 0) return true; + + // Try to see if large OS pages are supported + // To use large pages on Windows, we first need access permission + // Set "Lock pages in memory" permission in the group policy editor + // + unsigned long err = 0; + HANDLE token = NULL; + BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (ok) { + TOKEN_PRIVILEGES tp; + ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid); + if (ok) { + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); + if (ok) { + err = GetLastError(); + ok = (err == ERROR_SUCCESS); + if (ok) { + large_os_page_size = GetLargePageMinimum(); + } + } + } + CloseHandle(token); + } + if (!ok) { + if (err == 0) err = GetLastError(); + _mi_warning_message("cannot enable large OS page support, error %lu\n", err); + } + return (ok!=0); +} + +void _mi_os_init(void) { + // get the page size + SYSTEM_INFO si; + GetSystemInfo(&si); + if (si.dwPageSize > 0) os_page_size = si.dwPageSize; + if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity; + // get the VirtualAlloc2 function + HINSTANCE hDll; + hDll = LoadLibrary(TEXT("kernelbase.dll")); + if (hDll != NULL) { + // use VirtualAlloc2FromApp if possible as it is available to Windows store apps + pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); + if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); + FreeLibrary(hDll); + } + hDll = LoadLibrary(TEXT("ntdll.dll")); + if (hDll != NULL) { + pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx"); + FreeLibrary(hDll); + } + if 
(mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { + mi_win_enable_large_os_pages(); + } +} +#elif defined(__wasi__) +void _mi_os_init() { + os_page_size = 0x10000; // WebAssembly has a fixed page size: 64KB + os_alloc_granularity = 16; +} +#else +void _mi_os_init() { + // get the page size + long result = sysconf(_SC_PAGESIZE); + if (result > 0) { + os_page_size = (size_t)result; + os_alloc_granularity = os_page_size; + } + if (mi_option_is_enabled(mi_option_large_os_pages)) { + large_os_page_size = (1UL << 21); // 2MiB + } +} +#endif + + +/* ----------------------------------------------------------- + Raw allocation on Windows (VirtualAlloc) and Unix's (mmap). +----------------------------------------------------------- */ + +static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats) +{ + if (addr == NULL || size == 0 || _mi_os_is_huge_reserved(addr)) return true; + bool err = false; +#if defined(_WIN32) + err = (VirtualFree(addr, 0, MEM_RELEASE) == 0); +#elif defined(__wasi__) + err = 0; // WebAssembly's heap cannot be shrunk +#else + err = (munmap(addr, size) == -1); +#endif + if (was_committed) _mi_stat_decrease(&stats->committed, size); + _mi_stat_decrease(&stats->reserved, size); + if (err) { +#pragma warning(suppress:4996) + _mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size); + return false; + } + else { + return true; + } +} + +static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size); + +#ifdef _WIN32 +static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { +#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) + // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages + if ((size % ((uintptr_t)1 << 30)) == 0 /* 1GiB multiple */ + && (flags & MEM_LARGE_PAGES) != 0 && (flags & MEM_COMMIT) != 0 && (flags & MEM_RESERVE) != 0 + && (addr != NULL || try_alignment == 0 || try_alignment % _mi_os_page_size() == 0) + && pNtAllocateVirtualMemoryEx != NULL) + { + #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE + #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10) + #endif + MEM_EXTENDED_PARAMETER param = { 0, 0 }; + param.Type = 5; // == MemExtendedParameterAttributeFlags; + param.ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; + SIZE_T psize = size; + void* base = addr; + NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, ¶m, 1); + if (err == 0) { + return base; + } + else { + // else fall back to regular large OS pages + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error 0x%lx)\n", err); + } + } +#endif +#if (MI_INTPTR_SIZE >= 8) + // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations + void* hint; + if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) { + return VirtualAlloc(hint, size, flags, PAGE_READWRITE); + } +#endif +#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) + // on modern Windows try use VirtualAlloc2 for aligned allocation + if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { + MEM_ADDRESS_REQUIREMENTS reqs = { 0 }; + reqs.Alignment = try_alignment; + MEM_EXTENDED_PARAMETER param = { 0 }; + param.Type = MemExtendedParameterAddressRequirements; + param.Pointer = &reqs; + return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); + } 
+#endif + return VirtualAlloc(addr, size, flags, PAGE_READWRITE); +} + +static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { + mi_assert_internal(!(large_only && !allow_large)); + static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; + void* p = NULL; + if ((large_only || use_large_os_page(size, try_alignment)) + && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { + uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + if (!large_only && try_ok > 0) { + // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. + // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. + mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); + } + else { + // large OS pages must always reserve and commit. + *is_large = true; + p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES); + if (large_only) return p; + // fall back to non-large page allocation on error (`p == NULL`). + if (p == NULL) { + mi_atomic_write(&large_page_try_ok,10); // on error, don't try again for the next N allocations + } + } + } + if (p == NULL) { + *is_large = ((flags&MEM_LARGE_PAGES) != 0); + p = mi_win_virtual_allocx(addr, size, try_alignment, flags); + } + if (p == NULL) { + _mi_warning_message("unable to allocate memory: error code: %i, addr: %p, size: 0x%x, large only: %d, allow_large: %d\n", GetLastError(), addr, size, large_only, allow_large); + } + return p; +} + +#elif defined(__wasi__) +static void* mi_wasm_heap_grow(size_t size, size_t try_alignment) { + uintptr_t base = __builtin_wasm_memory_size(0) * _mi_os_page_size(); + uintptr_t aligned_base = _mi_align_up(base, (uintptr_t) try_alignment); + size_t alloc_size = _mi_align_up( aligned_base - base + size, _mi_os_page_size()); + mi_assert(alloc_size >= size && (alloc_size % _mi_os_page_size()) == 0); + if (alloc_size < size) return NULL; + if (__builtin_wasm_memory_grow(0, alloc_size / _mi_os_page_size()) == SIZE_MAX) { + errno = ENOMEM; + return NULL; + } + return (void*)aligned_base; +} +#else +#define MI_OS_USE_MMAP +static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { + void* p = NULL; + #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) + // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations + void* hint; + if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment, size)) != NULL) { + p = mmap(hint,size,protect_flags,flags,fd,0); + if (p==MAP_FAILED) p = NULL; // fall back to regular mmap + } + #else + UNUSED(try_alignment); + #endif + if (p==NULL) { + p = mmap(addr,size,protect_flags,flags,fd,0); + if (p==MAP_FAILED) p = NULL; + } + return p; +} + +static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { + void* p = NULL; + #if !defined(MAP_ANONYMOUS) + #define MAP_ANONYMOUS MAP_ANON + #endif + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + int fd = -1; + #if defined(MAP_ALIGNED) // BSD + if (try_alignment > 0) { + size_t n = _mi_bsr(try_alignment); + if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB + flags |= MAP_ALIGNED(n); + } + } + #endif + #if defined(PROT_MAX) + protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD + #endif + #if defined(VM_MAKE_TAG) + // macOS: tracking 
anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) + int os_tag = (int)mi_option_get(mi_option_os_tag); + if (os_tag < 100 || os_tag > 255) os_tag = 100; + fd = VM_MAKE_TAG(os_tag); + #endif + if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { + static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; + uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + if (!large_only && try_ok > 0) { + // If the OS is not configured for large OS pages, or the user does not have + // enough permission, the `mmap` will always fail (but it might also fail for other reasons). + // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times + // to avoid too many failing calls to mmap. + mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); + } + else { + int lflags = flags; + int lfd = fd; + #ifdef MAP_ALIGNED_SUPER + lflags |= MAP_ALIGNED_SUPER; + #endif + #ifdef MAP_HUGETLB + lflags |= MAP_HUGETLB; + #endif + #ifdef MAP_HUGE_1GB + if ((size % ((uintptr_t)1 << 30)) == 0) { + lflags |= MAP_HUGE_1GB; + } + else + #endif + { + #ifdef MAP_HUGE_2MB + lflags |= MAP_HUGE_2MB; + #endif + } + #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB + lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + #endif + if (large_only || lflags != flags) { + // try large OS page allocation + *is_large = true; + p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); + #ifdef MAP_HUGE_1GB + if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); + lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); + p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); + } + #endif + if (large_only) return p; + if (p == NULL) { + mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations + } + } + } + } + if (p == NULL) { + *is_large = false; + p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); + #if defined(MADV_HUGEPAGE) + // Many Linux systems don't allow MAP_HUGETLB but they support instead + // transparent huge pages (TPH). It is not required to call `madvise` with MADV_HUGE + // though since properly aligned allocations will already use large pages if available + // in that case -- in particular for our large regions (in `memory.c`). + // However, some systems only allow TPH if called with explicit `madvise`, so + // when large OS pages are enabled for mimalloc, we call `madvice` anyways. + if (allow_large && use_large_os_page(size, try_alignment)) { + if (madvise(p, size, MADV_HUGEPAGE) == 0) { + *is_large = true; // possibly + }; + } + #endif + } + return p; +} +#endif + +// On 64-bit systems, we can do efficient aligned allocation by using +// the 4TiB to 30TiB area to allocate them. 
+#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED))) +static volatile _Atomic(intptr_t) aligned_base; + +// Return a 4MiB aligned address that is probably available +static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { + if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL; + if ((size%MI_SEGMENT_SIZE) != 0) return NULL; + intptr_t hint = mi_atomic_add(&aligned_base, size); + if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) + intptr_t init = ((intptr_t)4 << 40); // start at 4TiB area + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode + uintptr_t r = _mi_random_init((uintptr_t)&mi_os_get_aligned_hint ^ hint); + init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFF)); // (randomly 0-64k)*4MiB == 0 to 256GiB + #endif + mi_atomic_cas_strong(mi_atomic_cast(uintptr_t, &aligned_base), init, hint + size); + hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all + } + if (hint%try_alignment != 0) return NULL; + return (void*)hint; +} +#else +static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { + UNUSED(try_alignment); UNUSED(size); + return NULL; +} +#endif + + +// Primitive allocation from the OS. +// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. +static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) { + mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + if (size == 0) return NULL; + if (!commit) allow_large = false; + + void* p = NULL; + /* + if (commit && allow_large) { + p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment); + if (p != NULL) { + *is_large = true; + return p; + } + } + */ + + #if defined(_WIN32) + int flags = MEM_RESERVE; + if (commit) flags |= MEM_COMMIT; + p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); + #elif defined(__wasi__) + *is_large = false; + p = mi_wasm_heap_grow(size, try_alignment); + #else + int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); + p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); + #endif + mi_stat_counter_increase(stats->mmap_calls, 1); + if (p != NULL) { + _mi_stat_increase(&stats->reserved, size); + if (commit) { _mi_stat_increase(&stats->committed, size); } + } + return p; +} + + +// Primitive aligned allocation from the OS. +// This function guarantees the allocated memory is aligned. 
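+// The approach (see the body): first allocate with the alignment passed as a
+// hint; if the returned pointer is not aligned, over-allocate by `alignment`
+// bytes and either re-allocate at an aligned address inside that range
+// (Windows, where only the original base pointer can be freed) or unmap the
+// unaligned head and tail around the aligned middle (mmap-based systems).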
+static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) { + mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); + mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + if (!commit) allow_large = false; + if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; + size = _mi_align_up(size, _mi_os_page_size()); + + // try first with a hint (this will be aligned directly on Win 10+ or BSD) + void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats); + if (p == NULL) return NULL; + + // if not aligned, free it, overallocate, and unmap around it + if (((uintptr_t)p % alignment != 0)) { + mi_os_mem_free(p, size, commit, stats); + if (size >= (SIZE_MAX - alignment)) return NULL; // overflow + size_t over_size = size + alignment; + +#if _WIN32 + // over-allocate and than re-allocate exactly at an aligned address in there. + // this may fail due to threads allocating at the same time so we + // retry this at most 3 times before giving up. + // (we can not decommit around the overallocation on Windows, because we can only + // free the original pointer, not one pointing inside the area) + int flags = MEM_RESERVE; + if (commit) flags |= MEM_COMMIT; + for (int tries = 0; tries < 3; tries++) { + // over-allocate to determine a virtual memory range + p = mi_os_mem_alloc(over_size, alignment, commit, false, is_large, stats); + if (p == NULL) return NULL; // error + if (((uintptr_t)p % alignment) == 0) { + // if p happens to be aligned, just decommit the left-over area + _mi_os_decommit((uint8_t*)p + size, over_size - size, stats); + break; + } + else { + // otherwise free and allocate at an aligned address in there + mi_os_mem_free(p, over_size, commit, stats); + void* aligned_p = mi_align_up_ptr(p, alignment); + p = mi_win_virtual_alloc(aligned_p, size, alignment, flags, false, allow_large, is_large); + if (p == aligned_p) break; // success! + if (p != NULL) { // should not happen? + mi_os_mem_free(p, size, commit, stats); + p = NULL; + } + } + } +#else + // overallocate... + p = mi_os_mem_alloc(over_size, alignment, commit, false, is_large, stats); + if (p == NULL) return NULL; + // and selectively unmap parts around the over-allocated area. 
+ void* aligned_p = mi_align_up_ptr(p, alignment); + size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; + size_t mid_size = _mi_align_up(size, _mi_os_page_size()); + size_t post_size = over_size - pre_size - mid_size; + mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size); + if (pre_size > 0) mi_os_mem_free(p, pre_size, commit, stats); + if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); + // we can return the aligned pointer on `mmap` systems + p = aligned_p; +#endif + } + + mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0)); + return p; +} + +/* ----------------------------------------------------------- + OS API: alloc, free, alloc_aligned +----------------------------------------------------------- */ + +void* _mi_os_alloc(size_t size, mi_stats_t* stats) { + if (size == 0) return NULL; + size = _mi_os_good_alloc_size(size); + bool is_large = false; + return mi_os_mem_alloc(size, 0, true, false, &is_large, stats); +} + +void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats) { + if (size == 0 || p == NULL) return; + size = _mi_os_good_alloc_size(size); + mi_os_mem_free(p, size, was_committed, stats); +} + +void _mi_os_free(void* p, size_t size, mi_stats_t* stats) { + _mi_os_free_ex(p, size, true, stats); +} + +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld) +{ + if (size == 0) return NULL; + size = _mi_os_good_alloc_size(size); + alignment = _mi_align_up(alignment, _mi_os_page_size()); + bool allow_large = false; + if (large != NULL) { + allow_large = *large; + *large = false; + } + return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), tld->stats); +} + + + +/* ----------------------------------------------------------- + OS memory API: reset, commit, decommit, protect, unprotect. +----------------------------------------------------------- */ + + +// OS page align within a given area, either conservative (pages inside the area only), +// or not (straddling pages outside the area is possible) +static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) { + mi_assert(addr != NULL && size > 0); + if (newsize != NULL) *newsize = 0; + if (size == 0 || addr == NULL) return NULL; + + // page align conservatively within the range + void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size()) + : mi_align_down_ptr(addr, _mi_os_page_size())); + void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) + : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); + ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start; + if (diff <= 0) return NULL; + + mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size)); + if (newsize != NULL) *newsize = (size_t)diff; + return start; +} + +static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) { + return mi_os_page_align_areax(true, addr, size, newsize); +} + +// Commit/Decommit memory. +// Usuelly commit is aligned liberal, while decommit is aligned conservative. 
+// (but not for the reset version where we want commit to be conservative as well) +static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) { + // page align in the range, commit liberally, decommit conservative + *is_zero = false; + size_t csize; + void* start = mi_os_page_align_areax(conservative, addr, size, &csize); + if (csize == 0 || _mi_os_is_huge_reserved(addr)) return true; + int err = 0; + if (commit) { + _mi_stat_increase(&stats->committed, csize); + _mi_stat_counter_increase(&stats->commit_calls, 1); + } + else { + _mi_stat_decrease(&stats->committed, csize); + } + + #if defined(_WIN32) + if (commit) { + // if the memory was already committed, the call succeeds but it is not zero'd + // *is_zero = true; + void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE); + err = (p == start ? 0 : GetLastError()); + } + else { + BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT); + err = (ok ? 0 : GetLastError()); + } + #elif defined(__wasi__) + // WebAssembly guests can't control memory protection + #else + err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE)); + if (err != 0) { err = errno; } + #endif + if (err != 0) { + _mi_warning_message("commit/decommit error: start: 0x%p, csize: 0x%x, err: %i\n", start, csize, err); + } + mi_assert_internal(err == 0); + return (err == 0); +} + +bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { + return mi_os_commitx(addr, size, true, false /* conservative? */, is_zero, stats); +} + +bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) { + bool is_zero; + return mi_os_commitx(addr, size, false, true /* conservative? */, &is_zero, stats); +} + +bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { + return mi_os_commitx(addr, size, true, true /* conservative? */, is_zero, stats); +} + + +// Signal to the OS that the address range is no longer in use +// but may be used later again. This will release physical memory +// pages and reduce swapping while keeping the memory committed. +// We page align to a conservative area inside the range to reset. +static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) { + // page align conservatively within the range + size_t csize; + void* start = mi_os_page_align_area_conservative(addr, size, &csize); + if (csize == 0 || _mi_os_is_huge_reserved(addr)) return true; + if (reset) _mi_stat_increase(&stats->reset, csize); + else _mi_stat_decrease(&stats->reset, csize); + if (!reset) return true; // nothing to do on unreset! 
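+
+  // What follows is platform specific: Windows uses MEM_RESET (plus a
+  // VirtualUnlock to drop the pages from the working set), while the Unix
+  // path prefers madvise(MADV_FREE) and falls back to MADV_DONTNEED when
+  // MADV_FREE is not supported.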
+ + #if (MI_DEBUG>1) + if (MI_SECURE==0) { + memset(start, 0, csize); // pretend it is eagerly reset + } + #endif + +#if defined(_WIN32) + // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory + void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE); + mi_assert_internal(p == start); + #if 1 + if (p == start && start != NULL) { + VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set + } + #endif + if (p != start) return false; +#else +#if defined(MADV_FREE) + static int advice = MADV_FREE; + int err = madvise(start, csize, advice); + if (err != 0 && errno == EINVAL && advice == MADV_FREE) { + // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on + advice = MADV_DONTNEED; + err = madvise(start, csize, advice); + } +#elif defined(__wasi__) + int err = 0; +#else + int err = madvise(start, csize, MADV_DONTNEED); +#endif + if (err != 0) { + _mi_warning_message("madvise reset error: start: 0x%p, csize: 0x%x, errno: %i\n", start, csize, errno); + } + //mi_assert(err == 0); + if (err != 0) return false; +#endif + return true; +} + +// Signal to the OS that the address range is no longer in use +// but may be used later again. This will release physical memory +// pages and reduce swapping while keeping the memory committed. +// We page align to a conservative area inside the range to reset. +bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { + if (mi_option_is_enabled(mi_option_reset_decommits)) { + return _mi_os_decommit(addr,size,stats); + } + else { + return mi_os_resetx(addr, size, true, stats); + } +} + +bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) { + if (mi_option_is_enabled(mi_option_reset_decommits)) { + return _mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!) + } + else { + *is_zero = false; + return mi_os_resetx(addr, size, false, stats); + } +} + + +// Protect a region in memory to be not accessible. +static bool mi_os_protectx(void* addr, size_t size, bool protect) { + // page align conservatively within the range + size_t csize = 0; + void* start = mi_os_page_align_area_conservative(addr, size, &csize); + if (csize == 0) return false; + if (_mi_os_is_huge_reserved(addr)) { + _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); + } + int err = 0; +#ifdef _WIN32 + DWORD oldprotect = 0; + BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); + err = (ok ? 0 : GetLastError()); +#elif defined(__wasi__) + err = 0; +#else + err = mprotect(start, csize, protect ? 
PROT_NONE : (PROT_READ | PROT_WRITE)); + if (err != 0) { err = errno; } +#endif + if (err != 0) { + _mi_warning_message("mprotect error: start: 0x%p, csize: 0x%x, err: %i\n", start, csize, err); + } + return (err == 0); +} + +bool _mi_os_protect(void* addr, size_t size) { + return mi_os_protectx(addr, size, true); +} + +bool _mi_os_unprotect(void* addr, size_t size) { + return mi_os_protectx(addr, size, false); +} + + + +bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { + // page align conservatively within the range + mi_assert_internal(oldsize > newsize && p != NULL); + if (oldsize < newsize || p == NULL) return false; + if (oldsize == newsize) return true; + + // oldsize and newsize should be page aligned or we cannot shrink precisely + void* addr = (uint8_t*)p + newsize; + size_t size = 0; + void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size); + if (size == 0 || start != addr) return false; + +#ifdef _WIN32 + // we cannot shrink on windows, but we can decommit + return _mi_os_decommit(start, size, stats); +#else + return mi_os_mem_free(start, size, true, stats); +#endif +} + + +/* ---------------------------------------------------------------------------- +Support for huge OS pages (1Gib) that are reserved up-front and never +released. Only regions are allocated in here (see `memory.c`) so the memory +will be reused. +-----------------------------------------------------------------------------*/ +#define MI_HUGE_OS_PAGE_SIZE ((size_t)1 << 30) // 1GiB + +typedef struct mi_huge_info_s { + volatile _Atomic(void*) start; // start of huge page area (32TiB) + volatile _Atomic(size_t) reserved; // total reserved size + volatile _Atomic(size_t) used; // currently allocated +} mi_huge_info_t; + +static mi_huge_info_t os_huge_reserved = { NULL, 0, ATOMIC_VAR_INIT(0) }; + +bool _mi_os_is_huge_reserved(void* p) { + return (mi_atomic_read_ptr(&os_huge_reserved.start) != NULL && + p >= mi_atomic_read_ptr(&os_huge_reserved.start) && + (uint8_t*)p < (uint8_t*)mi_atomic_read_ptr(&os_huge_reserved.start) + mi_atomic_read(&os_huge_reserved.reserved)); +} + +void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment) +{ + // only allow large aligned allocations (e.g. regions) + if (size < MI_SEGMENT_SIZE || (size % MI_SEGMENT_SIZE) != 0) return NULL; + if (try_alignment > MI_SEGMENT_SIZE) return NULL; + if (mi_atomic_read_ptr(&os_huge_reserved.start)==NULL) return NULL; + if (mi_atomic_read(&os_huge_reserved.used) >= mi_atomic_read(&os_huge_reserved.reserved)) return NULL; // already full + + // always aligned + mi_assert_internal(mi_atomic_read(&os_huge_reserved.used) % MI_SEGMENT_SIZE == 0 ); + mi_assert_internal( (uintptr_t)mi_atomic_read_ptr(&os_huge_reserved.start) % MI_SEGMENT_SIZE == 0 ); + + // try to reserve space + size_t base = mi_atomic_addu( &os_huge_reserved.used, size ); + if ((base + size) > os_huge_reserved.reserved) { + // "free" our over-allocation + mi_atomic_subu( &os_huge_reserved.used, size); + return NULL; + } + + // success! 
+ uint8_t* p = (uint8_t*)mi_atomic_read_ptr(&os_huge_reserved.start) + base; + mi_assert_internal( (uintptr_t)p % MI_SEGMENT_SIZE == 0 ); + return p; +} + +/* +static void mi_os_free_huge_reserved() { + uint8_t* addr = os_huge_reserved.start; + size_t total = os_huge_reserved.reserved; + os_huge_reserved.reserved = 0; + os_huge_reserved.start = NULL; + for( size_t current = 0; current < total; current += MI_HUGE_OS_PAGE_SIZE) { + _mi_os_free(addr + current, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main); + } +} +*/ + +#if !(MI_INTPTR_SIZE >= 8 && (defined(_WIN32) || defined(MI_OS_USE_MMAP))) +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + UNUSED(pages); UNUSED(max_secs); + if (pages_reserved != NULL) *pages_reserved = 0; + return ENOMEM; +} +#else +int mi_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_reserved ) mi_attr_noexcept +{ + if (pages_reserved != NULL) *pages_reserved = 0; + if (max_secs==0) return ETIMEDOUT; // timeout + if (pages==0) return 0; // ok + if (!mi_atomic_cas_ptr_strong(&os_huge_reserved.start,(void*)1,NULL)) return ETIMEDOUT; // already reserved + + // Set the start address after the 32TiB area + uint8_t* start = (uint8_t*)((uintptr_t)32 << 40); // 32TiB virtual start address + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_random_init((uintptr_t)&mi_reserve_huge_os_pages); + start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF)); // (randomly 0-1024)*1GiB == 0 to 1TiB + #endif + + // Allocate one page at the time but try to place them contiguously + // We allocate one page at the time to be able to abort if it takes too long + double start_t = _mi_clock_start(); + uint8_t* addr = start; // current top of the allocations + for (size_t page = 0; page < pages; page++, addr += MI_HUGE_OS_PAGE_SIZE ) { + // allocate a page + void* p = NULL; + bool is_large = true; + #ifdef _WIN32 + if (page==0) { mi_win_enable_large_os_pages(); } + p = mi_win_virtual_alloc(addr, MI_HUGE_OS_PAGE_SIZE, 0, MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE, true, true, &is_large); + #elif defined(MI_OS_USE_MMAP) + p = mi_unix_mmap(addr, MI_HUGE_OS_PAGE_SIZE, 0, PROT_READ | PROT_WRITE, true, true, &is_large); + #else + // always fail + #endif + + // Did we succeed at a contiguous address? 
+ if (p != addr) { + // no success, issue a warning and return with an error + if (p != NULL) { + _mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr); + _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main ); + } + else { + #ifdef _WIN32 + int err = GetLastError(); + #else + int err = errno; + #endif + _mi_warning_message("could not allocate huge page %zu at 0x%p, error: %i\n", page, addr, err); + } + return ENOMEM; + } + // success, record it + if (page==0) { + mi_atomic_write_ptr(&os_huge_reserved.start, addr); // don't switch the order of these writes + mi_atomic_write(&os_huge_reserved.reserved, MI_HUGE_OS_PAGE_SIZE); + } + else { + mi_atomic_addu(&os_huge_reserved.reserved,MI_HUGE_OS_PAGE_SIZE); + } + _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); + _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); + if (pages_reserved != NULL) { *pages_reserved = page + 1; } + + // check for timeout + double elapsed = _mi_clock_end(start_t); + if (elapsed > max_secs) return ETIMEDOUT; + if (page >= 1) { + double estimate = ((elapsed / (double)(page+1)) * (double)pages); + if (estimate > 1.5*max_secs) return ETIMEDOUT; // seems like we are going to timeout + } + } + _mi_verbose_message("reserved %zu huge pages\n", pages); + return 0; +} +#endif + diff --git a/runtime/src/mimalloc/c/page-queue.c b/runtime/src/mimalloc/c/page-queue.c new file mode 100644 index 00000000000..95443a69bb5 --- /dev/null +++ b/runtime/src/mimalloc/c/page-queue.c @@ -0,0 +1,361 @@ +/*---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ----------------------------------------------------------- + Definition of page queues for each block size +----------------------------------------------------------- */ + +#ifndef MI_IN_PAGE_C +#error "this file should be included from 'page.c'" +#endif + +/* ----------------------------------------------------------- + Minimal alignment in machine words (i.e. 
`sizeof(void*)`) +----------------------------------------------------------- */ + +#if (MI_MAX_ALIGN_SIZE > 4*MI_INTPTR_SIZE) + #error "define alignment for more than 4x word size for this platform" +#elif (MI_MAX_ALIGN_SIZE > 2*MI_INTPTR_SIZE) + #define MI_ALIGN4W // 4 machine words minimal alignment +#elif (MI_MAX_ALIGN_SIZE > MI_INTPTR_SIZE) + #define MI_ALIGN2W // 2 machine words minimal alignment +#else + // ok, default alignment is 1 word +#endif + + +/* ----------------------------------------------------------- + Queue query +----------------------------------------------------------- */ + + +static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { + return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t))); +} + +static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { + return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); +} + +static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { + return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX); +} + +/* ----------------------------------------------------------- + Bins +----------------------------------------------------------- */ + +// Bit scan reverse: return the index of the highest bit. +static inline uint8_t mi_bsr32(uint32_t x); + +#if defined(_MSC_VER) +#include +static inline uint8_t mi_bsr32(uint32_t x) { + uint32_t idx; + _BitScanReverse((DWORD*)&idx, x); + return (uint8_t)idx; +} +#elif defined(__GNUC__) || defined(__clang__) +static inline uint8_t mi_bsr32(uint32_t x) { + return (31 - __builtin_clz(x)); +} +#else +static inline uint8_t mi_bsr32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, + 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, + }; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x++; + return debruijn[(x*0x076be629) >> 27]; +} +#endif + +// Bit scan reverse: return the index of the highest bit. +uint8_t _mi_bsr(uintptr_t x) { + if (x == 0) return 0; +#if MI_INTPTR_SIZE==8 + uint32_t hi = (x >> 32); + return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi)); +#elif MI_INTPTR_SIZE==4 + return mi_bsr32(x); +#else +# error "define bsr for non-32 or 64-bit platforms" +#endif +} + +// Return the bin for a given field size. +// Returns MI_BIN_HUGE if the size is too large. +// We use `wsize` for the size in "machine word sizes", +// i.e. byte size == `wsize*sizeof(void*)`. +extern inline uint8_t _mi_bin(size_t size) { + size_t wsize = _mi_wsize_from_size(size); + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + #if defined(MI_ALIGN4W) + else if (wsize <= 4) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } + #elif defined(MI_ALIGN2W) + else if (wsize <= 8) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } + #else + else if (wsize <= 8) { + bin = (uint8_t)wsize; + } + #endif + else if (wsize > MI_LARGE_OBJ_WSIZE_MAX) { + bin = MI_BIN_HUGE; + } + else { + #if defined(MI_ALIGN4W) + if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes + #endif + wsize--; + // find the highest bit + uint8_t b = mi_bsr32((uint32_t)wsize); + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
+ // - adjust with 3 because we use do not round the first 8 sizes + // which each get an exact bin + bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + mi_assert_internal(bin < MI_BIN_HUGE); + } + mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE); + return bin; +} + + + +/* ----------------------------------------------------------- + Queue of pages with free blocks +----------------------------------------------------------- */ + +size_t _mi_bin_size(uint8_t bin) { + return _mi_heap_empty.pages[bin].block_size; +} + +// Good size for allocation +size_t mi_good_size(size_t size) mi_attr_noexcept { + if (size <= MI_LARGE_OBJ_SIZE_MAX) { + return _mi_bin_size(_mi_bin(size)); + } + else { + return _mi_align_up(size,_mi_os_page_size()); + } +} + +#if (MI_DEBUG>1) +static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_page_t* list = queue->first; + while (list != NULL) { + mi_assert_internal(list->next == NULL || list->next->prev == list); + mi_assert_internal(list->prev == NULL || list->prev->next == list); + if (list == page) break; + list = list->next; + } + return (list == page); +} + +#endif + +#if (MI_DEBUG>1) +static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) { + return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]); +} +#endif + +static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->block_size)); + mi_heap_t* heap = page->heap; + mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL); + mi_page_queue_t* pq = &heap->pages[bin]; + mi_assert_internal(bin >= MI_BIN_HUGE || page->block_size == pq->block_size); + mi_assert_expensive(mi_page_queue_contains(pq, page)); + return pq; +} + +static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { + uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : _mi_bin(page->block_size)); + mi_assert_internal(bin <= MI_BIN_FULL); + mi_page_queue_t* pq = &heap->pages[bin]; + mi_assert_internal(mi_page_is_in_full(page) || page->block_size == pq->block_size); + return pq; +} + +// The current small page array is for efficiency and for each +// small size (up to 256) it points directly to the page for that +// size without having to compute the bin. This means when the +// current free page queue is updated for a small bin, we need to update a +// range of entries in `_mi_page_small_free`. 
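+// Example (illustrative, 64-bit): a 64-byte block is wsize 8; because of the
+// minimal alignment several adjacent word sizes share that bin, so the loop
+// below repoints the whole run of `pages_free_direct` slots between the
+// previous bin's size and the current one. Small allocations can then index
+// the array directly without recomputing the bin.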
+static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) { + mi_assert_internal(mi_heap_contains_queue(heap,pq)); + size_t size = pq->block_size; + if (size > MI_SMALL_SIZE_MAX) return; + + mi_page_t* page = pq->first; + if (pq->first == NULL) page = (mi_page_t*)&_mi_page_empty; + + // find index in the right direct page array + size_t start; + size_t idx = _mi_wsize_from_size(size); + mi_page_t** pages_free = heap->pages_free_direct; + + if (pages_free[idx] == page) return; // already set + + // find start slot + if (idx<=1) { + start = 0; + } + else { + // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped + uint8_t bin = _mi_bin(size); + const mi_page_queue_t* prev = pq - 1; + while( bin == _mi_bin(prev->block_size) && prev > &heap->pages[0]) { + prev--; + } + start = 1 + _mi_wsize_from_size(prev->block_size); + if (start > idx) start = idx; + } + + // set size range to the right page + mi_assert(start <= idx); + for (size_t sz = start; sz <= idx; sz++) { + pages_free[sz] = page; + } +} + +/* +static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { + return (queue->first == NULL); +} +*/ + +static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_expensive(mi_page_queue_contains(queue, page)); + mi_assert_internal(page->block_size == queue->block_size || (page->block_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + if (page->prev != NULL) page->prev->next = page->next; + if (page->next != NULL) page->next->prev = page->prev; + if (page == queue->last) queue->last = page->prev; + if (page == queue->first) { + queue->first = page->next; + // update first + mi_heap_t* heap = page->heap; + mi_assert_internal(mi_heap_contains_queue(heap, queue)); + mi_heap_queue_first_update(heap,queue); + } + page->heap->page_count--; + page->next = NULL; + page->prev = NULL; + mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL); + mi_page_set_in_full(page,false); +} + + +static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(page->heap == NULL); + mi_assert_internal(!mi_page_queue_contains(queue, page)); + mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); + mi_assert_internal(page->block_size == queue->block_size || + (page->block_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + + mi_page_set_in_full(page, mi_page_queue_is_full(queue)); + mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap); + page->next = queue->first; + page->prev = NULL; + if (queue->first != NULL) { + mi_assert_internal(queue->first->prev == NULL); + queue->first->prev = page; + queue->first = page; + } + else { + queue->first = queue->last = page; + } + + // update direct + mi_heap_queue_first_update(heap, queue); + heap->page_count++; +} + + +static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_expensive(mi_page_queue_contains(from, page)); + mi_assert_expensive(!mi_page_queue_contains(to, page)); + mi_assert_internal((page->block_size == to->block_size && page->block_size == from->block_size) || + (page->block_size == to->block_size && mi_page_queue_is_full(from)) || + (page->block_size == from->block_size && mi_page_queue_is_full(to)) || + 
(page->block_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || + (page->block_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + + if (page->prev != NULL) page->prev->next = page->next; + if (page->next != NULL) page->next->prev = page->prev; + if (page == from->last) from->last = page->prev; + if (page == from->first) { + from->first = page->next; + // update first + mi_heap_t* heap = page->heap; + mi_assert_internal(mi_heap_contains_queue(heap, from)); + mi_heap_queue_first_update(heap, from); + } + + page->prev = to->last; + page->next = NULL; + if (to->last != NULL) { + mi_assert_internal(page->heap == to->last->heap); + to->last->next = page; + to->last = page; + } + else { + to->first = page; + to->last = page; + mi_heap_queue_first_update(page->heap, to); + } + + mi_page_set_in_full(page, mi_page_queue_is_full(to)); +} + +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) { + mi_assert_internal(mi_heap_contains_queue(heap,pq)); + mi_assert_internal(pq->block_size == append->block_size); + + if (append->first==NULL) return 0; + + // set append pages to new heap and count + size_t count = 0; + for (mi_page_t* page = append->first; page != NULL; page = page->next) { + mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap); + count++; + } + + if (pq->last==NULL) { + // take over afresh + mi_assert_internal(pq->first==NULL); + pq->first = append->first; + pq->last = append->last; + mi_heap_queue_first_update(heap, pq); + } + else { + // append to end + mi_assert_internal(pq->last!=NULL); + mi_assert_internal(append->first!=NULL); + pq->last->next = append->first; + append->first->prev = pq->last; + pq->last = append->last; + } + return count; +} diff --git a/runtime/src/mimalloc/c/page.c b/runtime/src/mimalloc/c/page.c new file mode 100644 index 00000000000..437cd0a57a1 --- /dev/null +++ b/runtime/src/mimalloc/c/page.c @@ -0,0 +1,818 @@ +/*---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ----------------------------------------------------------- + The core of the allocator. Every segment contains + pages of a certain block size. The main function + exported is `mi_malloc_generic`. 
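+
+  Each page keeps three free lists: `free` (handed out to the owning
+  thread), `local_free` (blocks freed by the owner while `free` is being
+  consumed) and an atomic `thread_free` list for frees coming from other
+  threads; `_mi_page_free_collect` below merges them back into `free`.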
+----------------------------------------------------------- */ + +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +/* ----------------------------------------------------------- + Definition of page queues for each block size +----------------------------------------------------------- */ + +#define MI_IN_PAGE_C +#include "page-queue.c" +#undef MI_IN_PAGE_C + + +/* ----------------------------------------------------------- + Page helpers +----------------------------------------------------------- */ + +// Index a block in a page +static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t i) { + mi_assert_internal(page != NULL); + mi_assert_internal(i <= page->reserved); + return (mi_block_t*)((uint8_t*)page_start + (i * page->block_size)); +} + +static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_stats_t* stats); + + +#if (MI_DEBUG>1) +static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { + size_t count = 0; + while (head != NULL) { + mi_assert_internal(page == _mi_ptr_page(head)); + count++; + head = mi_block_next(page, head); + } + return count; +} + +/* +// Start of the page available memory +static inline uint8_t* mi_page_area(const mi_page_t* page) { + return _mi_page_start(_mi_page_segment(page), page, NULL); +} +*/ + +static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { + size_t psize; + uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize); + mi_block_t* start = (mi_block_t*)page_area; + mi_block_t* end = (mi_block_t*)(page_area + psize); + while(p != NULL) { + if (p < start || p >= end) return false; + p = mi_block_next(page, p); + } + return true; +} + +static bool mi_page_is_valid_init(mi_page_t* page) { + mi_assert_internal(page->block_size > 0); + mi_assert_internal(page->used <= page->capacity); + mi_assert_internal(page->capacity <= page->reserved); + + mi_segment_t* segment = _mi_page_segment(page); + uint8_t* start = _mi_page_start(segment,page,NULL); + mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL)); + //mi_assert_internal(start + page->capacity*page->block_size == page->top); + + mi_assert_internal(mi_page_list_is_valid(page,page->free)); + mi_assert_internal(mi_page_list_is_valid(page,page->local_free)); + + #if MI_DEBUG>3 // generally too expensive to check this + if (page->flags.is_zero) { + for(mi_block_t* block = page->free; block != NULL; mi_block_next(page,block)) { + mi_assert_expensive(mi_mem_is_zero(block + 1, page->block_size - sizeof(mi_block_t))); + } + } + #endif + + mi_block_t* tfree = mi_tf_block(page->thread_free); + mi_assert_internal(mi_page_list_is_valid(page, tfree)); + size_t tfree_count = mi_page_list_count(page, tfree); + mi_assert_internal(tfree_count <= page->thread_freed + 1); + + size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free); + mi_assert_internal(page->used + free_count == page->capacity); + + return true; +} + +bool _mi_page_is_valid(mi_page_t* page) { + mi_assert_internal(mi_page_is_valid_init(page)); + #if MI_SECURE + mi_assert_internal(page->cookie != 0); + #endif + if (page->heap!=NULL) { + mi_segment_t* segment = _mi_page_segment(page); + mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == page->heap->thread_id || segment->thread_id==0); + if (segment->page_kind != MI_PAGE_HUGE) { + mi_page_queue_t* pq = mi_page_queue_of(page); + mi_assert_internal(mi_page_queue_contains(pq, 
page)); + mi_assert_internal(pq->block_size==page->block_size || page->block_size > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page)); + mi_assert_internal(mi_heap_contains_queue(page->heap,pq)); + } + } + return true; +} +#endif + + +void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay ) { + mi_thread_free_t tfree; + mi_thread_free_t tfreex; + + do { + tfreex = tfree = page->thread_free; + if (mi_unlikely(mi_tf_delayed(tfree) < MI_DELAYED_FREEING)) { + tfreex = mi_tf_set_delayed(tfree,delay); + } + else if (mi_unlikely(mi_tf_delayed(tfree) == MI_DELAYED_FREEING)) { + mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done. + continue; // and try again + } + } + while((mi_tf_delayed(tfreex) != mi_tf_delayed(tfree)) && // avoid atomic operation if already equal + !mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); +} + + +/* ----------------------------------------------------------- + Page collect the `local_free` and `thread_free` lists +----------------------------------------------------------- */ + +// Collect the local `thread_free` list using an atomic exchange. +// Note: The exchange must be done atomically as this is used right after +// moving to the full list in `mi_page_collect_ex` and we need to +// ensure that there was no race where the page became unfull just before the move. +static void _mi_page_thread_free_collect(mi_page_t* page) +{ + mi_block_t* head; + mi_thread_free_t tfree; + mi_thread_free_t tfreex; + do { + tfree = page->thread_free; + head = mi_tf_block(tfree); + tfreex = mi_tf_set_block(tfree,NULL); + } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); + + // return if the list is empty + if (head == NULL) return; + + // find the tail -- also to get a proper count (without data races) + uintptr_t max_count = page->capacity; // cannot collect more than capacity + uintptr_t count = 1; + mi_block_t* tail = head; + mi_block_t* next; + while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { + count++; + tail = next; + } + // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) + if (count > max_count) { + _mi_fatal_error("corrupted thread-free list\n"); + return; // the thread-free items cannot be freed + } + + // and append the current local free list + mi_block_set_next(page,tail, page->local_free); + page->local_free = head; + + // update counts now + mi_atomic_subu(&page->thread_freed, count); + page->used -= count; +} + +void _mi_page_free_collect(mi_page_t* page, bool force) { + mi_assert_internal(page!=NULL); + + // collect the thread free list + if (force || mi_tf_block(page->thread_free) != NULL) { // quick test to avoid an atomic operation + _mi_page_thread_free_collect(page); + } + + // and the local free list + if (page->local_free != NULL) { + if (mi_likely(page->free == NULL)) { + // usual case + page->free = page->local_free; + page->local_free = NULL; + page->is_zero = false; + } + else if (force) { + // append -- only on shutdown (force) as this is a linear operation + mi_block_t* tail = page->local_free; + mi_block_t* next; + while ((next = mi_block_next(page, tail)) != NULL) { + tail = next; + } + mi_block_set_next(page, tail, page->free); + page->free = page->local_free; + page->local_free = NULL; + page->is_zero = false; + } + } + + mi_assert_internal(!force || page->local_free == NULL); +} + + + +/* ----------------------------------------------------------- + Page 
fresh and retire +----------------------------------------------------------- */ + +// called from segments when reclaiming abandoned pages +void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { + mi_assert_expensive(mi_page_is_valid_init(page)); + mi_assert_internal(page->heap == NULL); + mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); + _mi_page_free_collect(page,false); + mi_page_queue_t* pq = mi_page_queue(heap, page->block_size); + mi_page_queue_push(heap, pq, page); + mi_assert_expensive(_mi_page_is_valid(page)); +} + +// allocate a fresh page from a segment +static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size) { + mi_assert_internal(pq==NULL||mi_heap_contains_queue(heap, pq)); + mi_page_t* page = _mi_segment_page_alloc(block_size, &heap->tld->segments, &heap->tld->os); + if (page == NULL) return NULL; + mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); + mi_page_init(heap, page, block_size, &heap->tld->stats); + _mi_stat_increase( &heap->tld->stats.pages, 1); + if (pq!=NULL) mi_page_queue_push(heap, pq, page); // huge pages use pq==NULL + mi_assert_expensive(_mi_page_is_valid(page)); + return page; +} + +// Get a fresh page to use +static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { + mi_assert_internal(mi_heap_contains_queue(heap, pq)); + + // try to reclaim an abandoned page first + mi_page_t* page = pq->first; + if (!heap->no_reclaim && + _mi_segment_try_reclaim_abandoned(heap, false, &heap->tld->segments) && + page != pq->first) + { + // we reclaimed, and we got lucky with a reclaimed page in our queue + page = pq->first; + if (page->free != NULL) return page; + } + // otherwise allocate the page + page = mi_page_fresh_alloc(heap, pq, pq->block_size); + if (page==NULL) return NULL; + mi_assert_internal(pq->block_size==page->block_size); + mi_assert_internal(pq==mi_page_queue(heap,page->block_size)); + return page; +} + +/* ----------------------------------------------------------- + Do any delayed frees + (put there by other threads if they deallocated in a full page) +----------------------------------------------------------- */ +void _mi_heap_delayed_free(mi_heap_t* heap) { + // take over the list + mi_block_t* block; + do { + block = (mi_block_t*)heap->thread_delayed_free; + } while (block != NULL && !mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), NULL, block)); + + // and free them all + while(block != NULL) { + mi_block_t* next = mi_block_nextx(heap,block, heap->cookie); + // use internal free instead of regular one to keep stats etc correct + if (!_mi_free_delayed_block(block)) { + // we might already start delayed freeing while another thread has not yet + // reset the delayed_freeing flag; in that case delay it further by reinserting. 
+ mi_block_t* dfree; + do { + dfree = (mi_block_t*)heap->thread_delayed_free; + mi_block_set_nextx(heap, block, dfree, heap->cookie); + } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree)); + + } + block = next; + } +} + +/* ----------------------------------------------------------- + Unfull, abandon, free and retire +----------------------------------------------------------- */ + +// Move a page from the full list back to a regular list +void _mi_page_unfull(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_expensive(_mi_page_is_valid(page)); + mi_assert_internal(mi_page_is_in_full(page)); + + _mi_page_use_delayed_free(page, MI_NO_DELAYED_FREE); + if (!mi_page_is_in_full(page)) return; + + mi_heap_t* heap = page->heap; + mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL]; + mi_page_set_in_full(page, false); // to get the right queue + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); + mi_page_set_in_full(page, true); + mi_page_queue_enqueue_from(pq, pqfull, page); +} + +static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { + mi_assert_internal(pq == mi_page_queue_of(page)); + mi_assert_internal(!mi_page_immediate_available(page)); + mi_assert_internal(!mi_page_is_in_full(page)); + + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE); + if (mi_page_is_in_full(page)) return; + + mi_page_queue_enqueue_from(&page->heap->pages[MI_BIN_FULL], pq, page); + _mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set +} + + +// Abandon a page with used blocks at the end of a thread. +// Note: only call if it is ensured that no references exist from +// the `page->heap->thread_delayed_free` into this page. +// Currently only called through `mi_heap_collect_ex` which ensures this. +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { + mi_assert_internal(page != NULL); + mi_assert_expensive(_mi_page_is_valid(page)); + mi_assert_internal(pq == mi_page_queue_of(page)); + mi_assert_internal(page->heap != NULL); + +#if MI_DEBUG > 1 + mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap)); +#endif + + // remove from our page list + mi_segments_tld_t* segments_tld = &page->heap->tld->segments; + mi_page_queue_remove(pq, page); + + // page is no longer associated with our heap + mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL); + +#if MI_DEBUG>1 + // check there are no references left.. 
+ for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->cookie)) { + mi_assert_internal(_mi_ptr_page(block) != page); + } +#endif + + // and abandon it + mi_assert_internal(page->heap == NULL); + _mi_segment_page_abandon(page,segments_tld); +} + + +// Free a page with no more free blocks +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { + mi_assert_internal(page != NULL); + mi_assert_expensive(_mi_page_is_valid(page)); + mi_assert_internal(pq == mi_page_queue_of(page)); + mi_assert_internal(mi_page_all_free(page)); + #if MI_DEBUG>1 + // check if we can safely free + mi_thread_free_t free = mi_tf_set_delayed(page->thread_free,MI_NEVER_DELAYED_FREE); + free = mi_atomic_exchange(&page->thread_free, free); + mi_assert_internal(mi_tf_delayed(free) != MI_DELAYED_FREEING); + #endif + + mi_page_set_has_aligned(page, false); + + // account for huge pages here + // (note: no longer necessary as huge pages are always abandoned) + if (page->block_size > MI_LARGE_OBJ_SIZE_MAX) { + if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) { + _mi_stat_decrease(&page->heap->tld->stats.giant, page->block_size); + } + else { + _mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size); + } + } + + // remove from the page list + // (no need to do _mi_heap_delayed_free first as all blocks are already free) + mi_segments_tld_t* segments_tld = &page->heap->tld->segments; + mi_page_queue_remove(pq, page); + + // and free it + mi_assert_internal(page->heap == NULL); + _mi_segment_page_free(page, force, segments_tld); +} + +// Retire a page with no more used blocks +// Important to not retire too quickly though as new +// allocations might coming. +// Note: called from `mi_free` and benchmarks often +// trigger this due to freeing everything and then +// allocating again so careful when changing this. +void _mi_page_retire(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_expensive(_mi_page_is_valid(page)); + mi_assert_internal(mi_page_all_free(page)); + + mi_page_set_has_aligned(page, false); + + // don't retire too often.. + // (or we end up retiring and re-allocating most of the time) + // NOTE: refine this more: we should not retire if this + // is the only page left with free blocks. It is not clear + // how to check this efficiently though... + // for now, we don't retire if it is the only page left of this size class. + mi_page_queue_t* pq = mi_page_queue_of(page); + if (mi_likely(page->block_size <= (MI_SMALL_SIZE_MAX/4))) { + // if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) { + if (pq->last==page && pq->first==page) { + mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); + return; // dont't retire after all + } + } + + _mi_page_free(page, pq, false); +} + + +/* ----------------------------------------------------------- + Initialize the initial free list in a page. + In secure mode we initialize a randomized list by + alternating between slices. 
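+  (For intuition: with MI_MAX_SLICE_SHIFT = 6 an extend of 64 blocks becomes
+  64 slices of one block each, i.e. an essentially shuffled order, while
+  larger extends keep 64 slices with several consecutive blocks per slice so
+  the shuffling cost stays bounded.)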
+----------------------------------------------------------- */ + +#define MI_MAX_SLICE_SHIFT (6) // at most 64 slices +#define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT) +#define MI_MIN_SLICES (2) + +static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t extend, mi_stats_t* const stats) { + UNUSED(stats); + #if (MI_SECURE<=2) + mi_assert_internal(page->free == NULL); + mi_assert_internal(page->local_free == NULL); + #endif + mi_assert_internal(page->capacity + extend <= page->reserved); + void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + const size_t bsize = page->block_size; + + // initialize a randomized free list + // set up `slice_count` slices to alternate between + size_t shift = MI_MAX_SLICE_SHIFT; + while ((extend >> shift) == 0) { + shift--; + } + const size_t slice_count = (size_t)1U << shift; + const size_t slice_extend = extend / slice_count; + mi_assert_internal(slice_extend >= 1); + mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice + size_t counts[MI_MAX_SLICES]; // available objects in the slice + for (size_t i = 0; i < slice_count; i++) { + blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend); + counts[i] = slice_extend; + } + counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?) + + // and initialize the free list by randomly threading through them + // set up first element + size_t current = _mi_heap_random(heap) % slice_count; + counts[current]--; + mi_block_t* const free_start = blocks[current]; + // and iterate through the rest + uintptr_t rnd = heap->random; + for (size_t i = 1; i < extend; i++) { + // call random_shuffle only every INTPTR_SIZE rounds + const size_t round = i%MI_INTPTR_SIZE; + if (round == 0) rnd = _mi_random_shuffle(rnd); + // select a random next slice index + size_t next = ((rnd >> 8*round) & (slice_count-1)); + while (counts[next]==0) { // ensure it still has space + next++; + if (next==slice_count) next = 0; + } + // and link the current block to it + counts[next]--; + mi_block_t* const block = blocks[current]; + blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block + mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` + current = next; + } + // prepend to the free list (usually NULL) + mi_block_set_next(page, blocks[current], page->free); // end of the list + page->free = free_start; + heap->random = _mi_random_shuffle(rnd); +} + +static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t extend, mi_stats_t* const stats) +{ + UNUSED(stats); + #if (MI_SECURE <= 2) + mi_assert_internal(page->free == NULL); + mi_assert_internal(page->local_free == NULL); + #endif + mi_assert_internal(page->capacity + extend <= page->reserved); + void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); + const size_t bsize = page->block_size; + mi_block_t* const start = mi_page_block_at(page, page_area, page->capacity); + + // initialize a sequential free list + mi_block_t* const last = mi_page_block_at(page, page_area, page->capacity + extend - 1); + mi_block_t* block = start; + while(block <= last) { + mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize); + mi_block_set_next(page,block,next); + block = next; + } + // prepend to free list (usually `NULL`) + mi_block_set_next(page, last, page->free); + page->free = start; +} + +/* 
----------------------------------------------------------- + Page initialize and extend the capacity +----------------------------------------------------------- */ + +#define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well. +#if (MI_SECURE>0) +#define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many +#else +#define MI_MIN_EXTEND (1) +#endif + +// Extend the capacity (up to reserved) by initializing a free list +// We do at most `MI_MAX_EXTEND` to avoid touching too much memory +// Note: we also experimented with "bump" allocation on the first +// allocations but this did not speed up any benchmark (due to an +// extra test in malloc? or cache effects?) +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* stats) { + UNUSED(stats); + mi_assert_expensive(mi_page_is_valid_init(page)); + #if (MI_SECURE<=2) + mi_assert(page->free == NULL); + mi_assert(page->local_free == NULL); + if (page->free != NULL) return; + #endif + if (page->capacity >= page->reserved) return; + + size_t page_size; + _mi_page_start(_mi_page_segment(page), page, &page_size); + mi_stat_counter_increase(stats->pages_extended, 1); + + // calculate the extend count + size_t extend = page->reserved - page->capacity; + size_t max_extend = (page->block_size >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)page->block_size); + if (max_extend < MI_MIN_EXTEND) max_extend = MI_MIN_EXTEND; + + if (extend > max_extend) { + // ensure we don't touch memory beyond the page to reduce page commit. + // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%. + extend = (max_extend==0 ? 1 : max_extend); + } + + mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved); + mi_assert_internal(extend < (1UL<<16)); + + // and append the extend the free list + if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { + mi_page_free_list_extend(page, extend, stats ); + } + else { + mi_page_free_list_extend_secure(heap, page, extend, stats); + } + // enable the new free list + page->capacity += (uint16_t)extend; + mi_stat_increase(stats->page_committed, extend * page->block_size); + + // extension into zero initialized memory preserves the zero'd free list + if (!page->is_zero_init) { + page->is_zero = false; + } + mi_assert_expensive(mi_page_is_valid_init(page)); +} + +// Initialize a fresh page +static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_stats_t* stats) { + mi_assert(page != NULL); + mi_segment_t* segment = _mi_page_segment(page); + mi_assert(segment != NULL); + mi_assert_internal(block_size > 0); + // set fields + size_t page_size; + _mi_segment_page_start(segment, page, block_size, &page_size); + page->block_size = block_size; + mi_assert_internal(page_size / block_size < (1L<<16)); + page->reserved = (uint16_t)(page_size / block_size); + #ifdef MI_ENCODE_FREELIST + page->cookie = _mi_heap_random(heap) | 1; + #endif + page->is_zero = page->is_zero_init; + + mi_assert_internal(page->capacity == 0); + mi_assert_internal(page->free == NULL); + mi_assert_internal(page->used == 0); + mi_assert_internal(page->thread_free == 0); + mi_assert_internal(page->thread_freed == 0); + mi_assert_internal(page->next == NULL); + mi_assert_internal(page->prev == NULL); + mi_assert_internal(!mi_page_has_aligned(page)); + #if (MI_ENCODE_FREELIST) + mi_assert_internal(page->cookie != 0); + #endif + mi_assert_expensive(mi_page_is_valid_init(page)); + + // initialize an initial free 
list + mi_page_extend_free(heap,page,stats); + mi_assert(mi_page_immediate_available(page)); +} + + +/* ----------------------------------------------------------- + Find pages with free blocks +-------------------------------------------------------------*/ + +// Find a page with free blocks of `page->block_size`. +static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq) +{ + // search through the pages in "next fit" order + mi_page_t* rpage = NULL; + size_t count = 0; + size_t page_free_count = 0; + mi_page_t* page = pq->first; + while( page != NULL) + { + mi_page_t* next = page->next; // remember next + count++; + + // 0. collect freed blocks by us and other threads + _mi_page_free_collect(page,false); + + // 1. if the page contains free blocks, we are done + if (mi_page_immediate_available(page)) { + // If all blocks are free, we might retire this page instead. + // do this at most 8 times to bound allocation time. + // (note: this can happen if a page was earlier not retired due + // to having neighbours that were mostly full or due to concurrent frees) + if (page_free_count < 8 && mi_page_all_free(page)) { + page_free_count++; + if (rpage != NULL) _mi_page_free(rpage,pq,false); + rpage = page; + page = next; + continue; // and keep looking + } + else { + break; // pick this one + } + } + + // 2. Try to extend + if (page->capacity < page->reserved) { + mi_page_extend_free(heap, page, &heap->tld->stats); + mi_assert_internal(mi_page_immediate_available(page)); + break; + } + + // 3. If the page is completely full, move it to the `mi_pages_full` + // queue so we don't visit long-lived pages too often. + mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); + mi_page_to_full(page,pq); + + page = next; + } // for each page + + mi_stat_counter_increase(heap->tld->stats.searches,count); + + if (page == NULL) { + page = rpage; + rpage = NULL; + } + if (rpage != NULL) { + _mi_page_free(rpage,pq,false); + } + + if (page == NULL) { + page = mi_page_fresh(heap, pq); + } + else { + mi_assert(pq->first == page); + } + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); + return page; +} + + +// Find a page with free blocks of `size`. +static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { + mi_page_queue_t* pq = mi_page_queue(heap,size); + mi_page_t* page = pq->first; + if (page != NULL) { + if ((MI_SECURE >= 3) && page->capacity < page->reserved && ((_mi_heap_random(heap) & 1) == 1)) { + // in secure mode, we extend half the time to increase randomness + mi_page_extend_free(heap, page, &heap->tld->stats); + mi_assert_internal(mi_page_immediate_available(page)); + } + else { + _mi_page_free_collect(page,false); + } + if (mi_page_immediate_available(page)) { + return page; // fast path + } + } + return mi_page_queue_find_free_ex(heap, pq); +} + + +/* ----------------------------------------------------------- + Users can register a deferred free function called + when the `free` list is empty. Since the `local_free` + is separate this is deterministically called after + a certain number of allocations. 
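+  (A typical use, sketched: register a callback with
+  `mi_register_deferred_free` that drains an application-side queue of
+  objects to release; `_mi_deferred_free` below invokes it on the allocating
+  thread, passing the `force` flag and a heartbeat counter, and guards
+  against recursion via `tld->recurse`.)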
+----------------------------------------------------------- */ + +static mi_deferred_free_fun* volatile deferred_free = NULL; + +void _mi_deferred_free(mi_heap_t* heap, bool force) { + heap->tld->heartbeat++; + if (deferred_free != NULL && !heap->tld->recurse) { + heap->tld->recurse = true; + deferred_free(force, heap->tld->heartbeat); + heap->tld->recurse = false; + } +} + +void mi_register_deferred_free(mi_deferred_free_fun* fn) mi_attr_noexcept { + deferred_free = fn; +} + + +/* ----------------------------------------------------------- + General allocation +----------------------------------------------------------- */ + +// A huge page is allocated directly without being in a queue. +// Because huge pages contain just one block, and the segment contains +// just that page, we always treat them as abandoned and any thread +// that frees the block can free the whole page and segment directly. +static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { + size_t block_size = _mi_os_good_alloc_size(size); + mi_assert_internal(_mi_bin(block_size) == MI_BIN_HUGE); + mi_page_t* page = mi_page_fresh_alloc(heap,NULL,block_size); + if (page != NULL) { + mi_assert_internal(mi_page_immediate_available(page)); + mi_assert_internal(page->block_size == block_size); + mi_assert_internal(_mi_page_segment(page)->page_kind==MI_PAGE_HUGE); + mi_assert_internal(_mi_page_segment(page)->used==1); + mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue + mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL); + + if (page->block_size > MI_HUGE_OBJ_SIZE_MAX) { + _mi_stat_increase(&heap->tld->stats.giant, block_size); + _mi_stat_counter_increase(&heap->tld->stats.giant_count, 1); + } + else { + _mi_stat_increase(&heap->tld->stats.huge, block_size); + _mi_stat_counter_increase(&heap->tld->stats.huge_count, 1); + } + } + return page; +} + + +// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. +void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept +{ + mi_assert_internal(heap != NULL); + + // initialize if necessary + if (mi_unlikely(!mi_heap_is_initialized(heap))) { + mi_thread_init(); // calls `_mi_heap_init` in turn + heap = mi_get_default_heap(); + } + mi_assert_internal(mi_heap_is_initialized(heap)); + + // call potential deferred free routines + _mi_deferred_free(heap, false); + + // free delayed frees from other threads + _mi_heap_delayed_free(heap); + + // huge allocation? + mi_page_t* page; + if (mi_unlikely(size > MI_LARGE_OBJ_SIZE_MAX)) { + if (mi_unlikely(size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see ) + page = NULL; + } + else { + page = mi_huge_page_alloc(heap,size); + } + } + else { + // otherwise find a page with free blocks in our size segregated queues + page = mi_find_free_page(heap,size); + } + if (page == NULL) return NULL; // out of memory + + mi_assert_internal(mi_page_immediate_available(page)); + mi_assert_internal(page->block_size >= size); + + // and try again, this time succeeding! (i.e. 
this should never recurse) + return _mi_page_malloc(heap, page, size); +} diff --git a/runtime/src/mimalloc/c/segment.c b/runtime/src/mimalloc/c/segment.c new file mode 100644 index 00000000000..dcc6a04b398 --- /dev/null +++ b/runtime/src/mimalloc/c/segment.c @@ -0,0 +1,743 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" + +#include // memset +#include + +#define MI_PAGE_HUGE_ALIGN (256*1024) + +/* ----------------------------------------------------------- + Segment allocation + We allocate pages inside big OS allocated "segments" + (4mb on 64-bit). This is to avoid splitting VMA's on Linux + and reduce fragmentation on other OS's. Each thread + owns its own segments. + + Currently we have: + - small pages (64kb), 64 in one segment + - medium pages (512kb), 8 in one segment + - large pages (4mb), 1 in one segment + - huge blocks > MI_LARGE_OBJ_SIZE_MAX (512kb) are directly allocated by the OS + + In any case the memory for a segment is virtual and only + committed on demand (i.e. we are careful to not touch the memory + until we actually allocate a block there) + + If a thread ends, it "abandons" pages with used blocks + and there is an abandoned segment list whose segments can + be reclaimed by still running threads, much like work-stealing. +----------------------------------------------------------- */ + + +/* ----------------------------------------------------------- + Queue of segments containing free pages +----------------------------------------------------------- */ + + +#if (MI_DEBUG>1) +static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, mi_segment_t* segment) { + mi_assert_internal(segment != NULL); + mi_segment_t* list = queue->first; + while (list != NULL) { + if (list == segment) break; + mi_assert_internal(list->next==NULL || list->next->prev == list); + mi_assert_internal(list->prev==NULL || list->prev->next == list); + list = list->next; + } + return (list == segment); +} +#endif + +static bool mi_segment_queue_is_empty(const mi_segment_queue_t* queue) { + return (queue->first == NULL); +} + +static void mi_segment_queue_remove(mi_segment_queue_t* queue, mi_segment_t* segment) { + mi_assert_expensive(mi_segment_queue_contains(queue, segment)); + if (segment->prev != NULL) segment->prev->next = segment->next; + if (segment->next != NULL) segment->next->prev = segment->prev; + if (segment == queue->first) queue->first = segment->next; + if (segment == queue->last) queue->last = segment->prev; + segment->next = NULL; + segment->prev = NULL; +} + +static void mi_segment_enqueue(mi_segment_queue_t* queue, mi_segment_t* segment) { + mi_assert_expensive(!mi_segment_queue_contains(queue, segment)); + segment->next = NULL; + segment->prev = queue->last; + if (queue->last != NULL) { + mi_assert_internal(queue->last->next == NULL); + queue->last->next = segment; + queue->last = segment; + } + else { + queue->last = queue->first = segment; + } +} + +static mi_segment_queue_t* mi_segment_free_queue_of_kind(mi_page_kind_t kind, mi_segments_tld_t* tld) { + if (kind == MI_PAGE_SMALL) return &tld->small_free; + else if (kind == 
MI_PAGE_MEDIUM) return &tld->medium_free; + else return NULL; +} + +static mi_segment_queue_t* mi_segment_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { + return mi_segment_free_queue_of_kind(segment->page_kind, tld); +} + +// remove from free queue if it is in one +static void mi_segment_remove_from_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); // may be NULL + bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); + if (in_queue) { + mi_segment_queue_remove(queue, segment); + } +} + +static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_segment_enqueue(mi_segment_free_queue(segment, tld), segment); +} + + +/* ----------------------------------------------------------- + Invariant checking +----------------------------------------------------------- */ + +#if (MI_DEBUG > 1) +static bool mi_segment_is_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); + bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); + if (in_queue) { + mi_assert_expensive(mi_segment_queue_contains(queue, segment)); + } + return in_queue; +} + +static size_t mi_segment_pagesize(mi_segment_t* segment) { + return ((size_t)1 << segment->page_shift); +} +static bool mi_segment_is_valid(mi_segment_t* segment) { + mi_assert_internal(segment != NULL); + mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(segment->used <= segment->capacity); + mi_assert_internal(segment->abandoned <= segment->used); + size_t nfree = 0; + for (size_t i = 0; i < segment->capacity; i++) { + if (!segment->pages[i].segment_in_use) nfree++; + } + mi_assert_internal(nfree + segment->used == segment->capacity); + mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 + mi_assert_internal(segment->page_kind == MI_PAGE_HUGE || + (mi_segment_pagesize(segment) * segment->capacity == segment->segment_size)); + return true; +} +#endif + +/* ----------------------------------------------------------- + Segment size calculations +----------------------------------------------------------- */ + +// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) +uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size) +{ + size_t psize = (segment->page_kind == MI_PAGE_HUGE ? 
segment->segment_size : (size_t)1 << segment->page_shift); + uint8_t* p = (uint8_t*)segment + page->segment_idx*psize; + + if (page->segment_idx == 0) { + // the first page starts after the segment info (and possible guard page) + p += segment->segment_info_size; + psize -= segment->segment_info_size; + // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) + if (block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) { + size_t adjust = block_size - ((uintptr_t)p % block_size); + if (adjust < block_size) { + p += adjust; + psize -= adjust; + } + mi_assert_internal((uintptr_t)p % block_size == 0); + } + } + + if (MI_SECURE > 1 || (MI_SECURE == 1 && page->segment_idx == segment->capacity - 1)) { + // secure == 1: the last page has an os guard page at the end + // secure > 1: every page has an os guard page + psize -= _mi_os_page_size(); + } + + if (page_size != NULL) *page_size = psize; + mi_assert_internal(_mi_ptr_page(p) == page); + mi_assert_internal(_mi_ptr_segment(p) == segment); + return p; +} + +static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) { + /* + if (mi_option_is_enabled(mi_option_secure)) { + // always reserve maximally so the protection falls on + // the same address area, as we need to reuse them from the caches interchangably. + capacity = MI_SMALL_PAGES_PER_SEGMENT; + } + */ + const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; + size_t guardsize = 0; + size_t isize = 0; + + if (MI_SECURE == 0) { + // normally no guard pages + isize = _mi_align_up(minsize, 16 * MI_MAX_ALIGN_SIZE); + } + else { + // in secure mode, we set up a protected page in between the segment info + // and the page data (and one at the end of the segment) + const size_t page_size = _mi_os_page_size(); + isize = _mi_align_up(minsize, page_size); + guardsize = page_size; + required = _mi_align_up(required, page_size); + } +; + if (info_size != NULL) *info_size = isize; + if (pre_size != NULL) *pre_size = isize + guardsize; + return (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + 2*guardsize, MI_PAGE_HUGE_ALIGN) ); +} + + +/* ---------------------------------------------------------------------------- +Segment caches +We keep a small segment cache per thread to increase local +reuse and avoid setting/clearing guard pages in secure mode. +------------------------------------------------------------------------------- */ + +static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { + if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1); + else _mi_stat_decrease(&tld->stats->segments,1); + tld->count += (segment_size >= 0 ? 
1 : -1); + if (tld->count > tld->peak_count) tld->peak_count = tld->count; + tld->current_size += segment_size; + if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size; +} + + +static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) { + segment->thread_id = 0; + mi_segments_track_size(-((long)segment_size),tld); + if (MI_SECURE != 0) { + mi_assert_internal(!segment->mem_is_fixed); + _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set + } + _mi_mem_free(segment, segment_size, segment->memid, tld->stats); +} + + +// The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use, +#define MI_SEGMENT_CACHE_FRACTION (8) + +// note: returned segment may be partially reset +static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t* tld) { + if (segment_size != 0 && segment_size != MI_SEGMENT_SIZE) return NULL; + mi_segment_t* segment = tld->cache; + if (segment == NULL) return NULL; + tld->cache_count--; + tld->cache = segment->next; + segment->next = NULL; + mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); + _mi_stat_decrease(&tld->stats->segments_cache, 1); + return segment; +} + +static bool mi_segment_cache_full(mi_segments_tld_t* tld) +{ + if (tld->count == 1 && tld->cache_count==0) return false; // always cache at least the final segment of a thread + size_t max_cache = mi_option_get(mi_option_segment_cache); + if (tld->cache_count < max_cache + && tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION)) // at least allow a 1 element cache + ) { + return false; + } + // take the opportunity to reduce the segment cache if it is too large (now) + // TODO: this never happens as we check against peak usage, should we use current usage instead? + while (tld->cache_count > max_cache) { //(1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { + mi_segment_t* segment = mi_segment_cache_pop(0,tld); + mi_assert_internal(segment != NULL); + if (segment != NULL) mi_segment_os_free(segment, segment->segment_size, tld); + } + return true; +} + +static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld)); + mi_assert_internal(segment->next == NULL); + if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) { + return false; + } + mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); + if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_cache_reset)) { + _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats); + } + segment->next = tld->cache; + tld->cache = segment; + tld->cache_count++; + _mi_stat_increase(&tld->stats->segments_cache,1); + return true; +} + +// called by threads that are terminating to free cached segments +void _mi_segment_thread_collect(mi_segments_tld_t* tld) { + mi_segment_t* segment; + while ((segment = mi_segment_cache_pop(0,tld)) != NULL) { + mi_segment_os_free(segment, segment->segment_size, tld); + } + mi_assert_internal(tld->cache_count == 0); + mi_assert_internal(tld->cache == NULL); +} + + +/* ----------------------------------------------------------- + Segment allocation +----------------------------------------------------------- */ + +// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . 
+static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +{ + // calculate needed sizes first + size_t capacity; + if (page_kind == MI_PAGE_HUGE) { + mi_assert_internal(page_shift == MI_SEGMENT_SHIFT && required > 0); + capacity = 1; + } + else { + mi_assert_internal(required == 0); + size_t page_size = (size_t)1 << page_shift; + capacity = MI_SEGMENT_SIZE / page_size; + mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0); + mi_assert_internal(capacity >= 1 && capacity <= MI_SMALL_PAGES_PER_SEGMENT); + } + size_t info_size; + size_t pre_size; + size_t segment_size = mi_segment_size(capacity, required, &pre_size, &info_size); + mi_assert_internal(segment_size >= required); + size_t page_size = (page_kind == MI_PAGE_HUGE ? segment_size : (size_t)1 << page_shift); + + // Try to get it from our thread local cache first + bool eager_delay = (tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay)); + bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); + bool commit = eager || (page_kind > MI_PAGE_MEDIUM); + bool protection_still_good = false; + bool is_zero = false; + mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld); + if (segment != NULL) { + if (MI_SECURE!=0) { + mi_assert_internal(!segment->mem_is_fixed); + if (segment->page_kind != page_kind) { + _mi_mem_unprotect(segment, segment->segment_size); // reset protection if the page kind differs + } + else { + protection_still_good = true; // otherwise, the guard pages are still in place + } + } + if (!segment->mem_is_committed && page_kind > MI_PAGE_MEDIUM) { + mi_assert_internal(!segment->mem_is_fixed); + _mi_mem_commit(segment, segment->segment_size, &is_zero, tld->stats); + segment->mem_is_committed = true; + } + if (!segment->mem_is_fixed && + (mi_option_is_enabled(mi_option_cache_reset) || mi_option_is_enabled(mi_option_page_reset))) { + bool reset_zero = false; + _mi_mem_unreset(segment, segment->segment_size, &reset_zero, tld->stats); + if (reset_zero) is_zero = true; + } + } + else { + // Allocate the segment from the OS + size_t memid; + bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy + segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_zero, &memid, os_tld); + if (segment == NULL) return NULL; // failed to allocate + if (!commit) { + // ensure the initial info is committed + bool commit_zero = false; + _mi_mem_commit(segment, info_size, &commit_zero, tld->stats); + if (commit_zero) is_zero = true; + } + segment->memid = memid; + segment->mem_is_fixed = mem_large; + segment->mem_is_committed = commit; + mi_segments_track_size((long)segment_size, tld); + } + mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); + + // zero the segment info (but not the `mem` fields) + ptrdiff_t ofs = offsetof(mi_segment_t,next); + memset((uint8_t*)segment + ofs, 0, info_size - ofs); + + // guard pages + if ((MI_SECURE != 0) && !protection_still_good) { + // in secure mode, we set up a protected page in between the segment info + // and the page data + mi_assert_internal( info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0); + _mi_mem_protect( (uint8_t*)segment + info_size, (pre_size - info_size) ); + size_t os_page_size = _mi_os_page_size(); + if (MI_SECURE <= 1) { + // and protect the last page too + _mi_mem_protect( (uint8_t*)segment + segment_size - 
os_page_size, os_page_size ); + } + else { + // protect every page + for (size_t i = 0; i < capacity; i++) { + _mi_mem_protect( (uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size ); + } + } + } + + // initialize + segment->page_kind = page_kind; + segment->capacity = capacity; + segment->page_shift = page_shift; + segment->segment_size = segment_size; + segment->segment_info_size = pre_size; + segment->thread_id = _mi_thread_id(); + segment->cookie = _mi_ptr_cookie(segment); + for (uint8_t i = 0; i < segment->capacity; i++) { + segment->pages[i].segment_idx = i; + segment->pages[i].is_reset = false; + segment->pages[i].is_committed = commit; + segment->pages[i].is_zero_init = is_zero; + } + _mi_stat_increase(&tld->stats->page_committed, segment->segment_info_size); + //fprintf(stderr,"mimalloc: alloc segment at %p\n", (void*)segment); + return segment; +} + + +static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { + UNUSED(force); + //fprintf(stderr,"mimalloc: free segment at %p\n", (void*)segment); + mi_assert(segment != NULL); + mi_segment_remove_from_free_queue(segment,tld); + + mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment)); + mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); + mi_assert(segment->next == NULL); + mi_assert(segment->prev == NULL); + _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); + + // update reset memory statistics + /* + for (uint8_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (page->is_reset) { + page->is_reset = false; + mi_stat_decrease( tld->stats->reset,mi_page_size(page)); + } + } + */ + + if (!force && mi_segment_cache_push(segment, tld)) { + // it is put in our cache + } + else { + // otherwise return it to the OS + mi_segment_os_free(segment, segment->segment_size, tld); + } +} + +/* ----------------------------------------------------------- + Free page management inside a segment +----------------------------------------------------------- */ + + +static bool mi_segment_has_free(const mi_segment_t* segment) { + return (segment->used < segment->capacity); +} + +static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_stats_t* stats) { + mi_assert_internal(mi_segment_has_free(segment)); + mi_assert_expensive(mi_segment_is_valid(segment)); + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (!page->segment_in_use) { + if (page->is_reset || !page->is_committed) { + size_t psize; + uint8_t* start = _mi_page_start(segment, page, &psize); + if (!page->is_committed) { + mi_assert_internal(!segment->mem_is_fixed); + page->is_committed = true; + bool is_zero = false; + _mi_mem_commit(start,psize,&is_zero,stats); + if (is_zero) page->is_zero_init = true; + } + if (page->is_reset) { + mi_assert_internal(!segment->mem_is_fixed); + page->is_reset = false; + bool is_zero = false; + _mi_mem_unreset(start, psize, &is_zero, stats); + if (is_zero) page->is_zero_init = true; + } + } + return page; + } + } + mi_assert(false); + return NULL; +} + + +/* ----------------------------------------------------------- + Free +----------------------------------------------------------- */ + +static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); + +static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_stats_t* stats) { + UNUSED(stats); + mi_assert_internal(page->segment_in_use); + 
mi_assert_internal(mi_page_all_free(page)); + mi_assert_internal(page->is_committed); + size_t inuse = page->capacity * page->block_size; + _mi_stat_decrease(&stats->page_committed, inuse); + _mi_stat_decrease(&stats->pages, 1); + + // reset the page memory to reduce memory pressure? + if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) { + size_t psize; + uint8_t* start = _mi_page_start(segment, page, &psize); + page->is_reset = true; + _mi_mem_reset(start, psize, stats); + } + + // zero the page data, but not the segment fields + page->is_zero_init = false; + ptrdiff_t ofs = offsetof(mi_page_t,capacity); + memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs); + page->segment_in_use = false; + segment->used--; +} + +void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) +{ + mi_assert(page != NULL); + mi_segment_t* segment = _mi_page_segment(page); + mi_assert_expensive(mi_segment_is_valid(segment)); + + // mark it as free now + mi_segment_page_clear(segment, page, tld->stats); + + if (segment->used == 0) { + // no more used pages; remove from the free list and free the segment + mi_segment_free(segment, force, tld); + } + else { + if (segment->used == segment->abandoned) { + // only abandoned pages; remove from free list and abandon + mi_segment_abandon(segment,tld); + } + else if (segment->used + 1 == segment->capacity) { + mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // for now we only support small and medium pages + // move back to segments free list + mi_segment_insert_in_free_queue(segment,tld); + } + } +} + + +/* ----------------------------------------------------------- + Abandonment +----------------------------------------------------------- */ + +// When threads terminate, they can leave segments with +// live blocks (reached through other threads). 
Such segments +// are "abandoned" and will be reclaimed by other threads to +// reuse their pages and/or free them eventually +static volatile _Atomic(mi_segment_t*) abandoned; // = NULL; +static volatile _Atomic(uintptr_t) abandoned_count; // = 0; + +static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_assert_internal(segment->used == segment->abandoned); + mi_assert_internal(segment->used > 0); + mi_assert_internal(segment->abandoned_next == NULL); + mi_assert_expensive(mi_segment_is_valid(segment)); + + // remove the segment from the free page queue if needed + mi_segment_remove_from_free_queue(segment,tld); + mi_assert_internal(segment->next == NULL && segment->prev == NULL); + + // all pages in the segment are abandoned; add it to the abandoned list + _mi_stat_increase(&tld->stats->segments_abandoned, 1); + mi_segments_track_size(-((long)segment->segment_size), tld); + segment->thread_id = 0; + mi_segment_t* next; + do { + next = (mi_segment_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&abandoned)); + mi_atomic_write_ptr(mi_atomic_cast(void*,&segment->abandoned_next), next); + } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&abandoned), segment, next)); + mi_atomic_increment(&abandoned_count); +} + +void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { + mi_assert(page != NULL); + mi_segment_t* segment = _mi_page_segment(page); + mi_assert_expensive(mi_segment_is_valid(segment)); + segment->abandoned++; + _mi_stat_increase(&tld->stats->pages_abandoned, 1); + mi_assert_internal(segment->abandoned <= segment->used); + if (segment->used == segment->abandoned) { + // all pages are abandoned, abandon the entire segment + mi_segment_abandon(segment,tld); + } +} + +bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld) { + uintptr_t reclaimed = 0; + uintptr_t atmost; + if (try_all) { + atmost = abandoned_count+16; // close enough + } + else { + atmost = abandoned_count/8; // at most 1/8th of all outstanding (estimated) + if (atmost < 8) atmost = 8; // but at least 8 + } + + // for `atmost` `reclaimed` abandoned segments... + while(atmost > reclaimed) { + // try to claim the head of the abandoned segments + mi_segment_t* segment; + do { + segment = (mi_segment_t*)abandoned; + } while(segment != NULL && !mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&abandoned), (mi_segment_t*)segment->abandoned_next, segment)); + if (segment==NULL) break; // stop early if no more segments available + + // got it. 
+ mi_atomic_decrement(&abandoned_count); + segment->thread_id = _mi_thread_id(); + segment->abandoned_next = NULL; + mi_segments_track_size((long)segment->segment_size,tld); + mi_assert_internal(segment->next == NULL && segment->prev == NULL); + mi_assert_expensive(mi_segment_is_valid(segment)); + _mi_stat_decrease(&tld->stats->segments_abandoned,1); + + // add its abandoned pages to the current thread + mi_assert(segment->abandoned == segment->used); + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* page = &segment->pages[i]; + if (page->segment_in_use) { + segment->abandoned--; + mi_assert(page->next == NULL); + _mi_stat_decrease(&tld->stats->pages_abandoned, 1); + if (mi_page_all_free(page)) { + // if everything free by now, free the page + mi_segment_page_clear(segment,page,tld->stats); + } + else { + // otherwise reclaim it + _mi_page_reclaim(heap,page); + } + } + } + mi_assert(segment->abandoned == 0); + if (segment->used == 0) { // due to page_clear + mi_segment_free(segment,false,tld); + } + else { + reclaimed++; + // add its free pages to the the current thread free small segment queue + if (segment->page_kind <= MI_PAGE_MEDIUM && mi_segment_has_free(segment)) { + mi_segment_insert_in_free_queue(segment,tld); + } + } + } + return (reclaimed>0); +} + + +/* ----------------------------------------------------------- + Small page allocation +----------------------------------------------------------- */ + +// Allocate a small page inside a segment. +// Requires that the page has free pages +static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { + mi_assert_internal(mi_segment_has_free(segment)); + mi_page_t* page = mi_segment_find_free(segment, tld->stats); + page->segment_in_use = true; + segment->used++; + mi_assert_internal(segment->used <= segment->capacity); + if (segment->used == segment->capacity) { + // if no more free pages, remove from the queue + mi_assert_internal(!mi_segment_has_free(segment)); + mi_segment_remove_from_free_queue(segment,tld); + } + return page; +} + +static mi_page_t* mi_segment_page_alloc(mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { + mi_segment_queue_t* free_queue = mi_segment_free_queue_of_kind(kind,tld); + if (mi_segment_queue_is_empty(free_queue)) { + mi_segment_t* segment = mi_segment_alloc(0,kind,page_shift,tld,os_tld); + if (segment == NULL) return NULL; + mi_segment_enqueue(free_queue, segment); + } + mi_assert_internal(free_queue->first != NULL); + return mi_segment_page_alloc_in(free_queue->first,tld); +} + +static mi_page_t* mi_segment_small_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { + return mi_segment_page_alloc(MI_PAGE_SMALL,MI_SMALL_PAGE_SHIFT,tld,os_tld); +} + +static mi_page_t* mi_segment_medium_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { + return mi_segment_page_alloc(MI_PAGE_MEDIUM, MI_MEDIUM_PAGE_SHIFT, tld, os_tld); +} + +/* ----------------------------------------------------------- + large page allocation +----------------------------------------------------------- */ + +static mi_page_t* mi_segment_large_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { + mi_segment_t* segment = mi_segment_alloc(0,MI_PAGE_LARGE,MI_LARGE_PAGE_SHIFT,tld,os_tld); + if (segment == NULL) return NULL; + segment->used = 1; + mi_page_t* page = &segment->pages[0]; + page->segment_in_use = true; + return page; +} + +static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) +{ + 
mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT,tld,os_tld); + if (segment == NULL) return NULL; + mi_assert_internal(segment->segment_size - segment->segment_info_size >= size); + segment->used = 1; + segment->thread_id = 0; // huge pages are immediately abandoned + mi_page_t* page = &segment->pages[0]; + page->segment_in_use = true; + return page; +} + +/* ----------------------------------------------------------- + Page allocation and free +----------------------------------------------------------- */ + +mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { + mi_page_t* page; + if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { + page = mi_segment_small_page_alloc(tld,os_tld); + } + else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { + page = mi_segment_medium_page_alloc(tld, os_tld); + } + else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) { + page = mi_segment_large_page_alloc(tld, os_tld); + } + else { + page = mi_segment_huge_page_alloc(block_size,tld,os_tld); + } + mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page))); + return page; +} diff --git a/runtime/src/mimalloc/c/static.c b/runtime/src/mimalloc/c/static.c new file mode 100644 index 00000000000..f8aa4c1e88d --- /dev/null +++ b/runtime/src/mimalloc/c/static.c @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#if !KONAN_MI_MALLOC +#define _DEFAULT_SOURCE + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +// For a static override we create a single object file +// containing the whole library. If it is linked first +// it will override all the standard library allocation +// functions (on Unix's). +#include "stats.c" +#include "os.c" +#include "memory.c" +#include "segment.c" +#include "page.c" +#include "heap.c" +#include "alloc.c" +#include "alloc-aligned.c" +#include "alloc-posix.c" +#include "init.c" +#include "options.c" +#endif \ No newline at end of file diff --git a/runtime/src/mimalloc/c/stats.c b/runtime/src/mimalloc/c/stats.c new file mode 100644 index 00000000000..50bd029db0a --- /dev/null +++ b/runtime/src/mimalloc/c/stats.c @@ -0,0 +1,463 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h>  // fputs, stderr
+#include <string.h> // memset
+
+
+/* -----------------------------------------------------------
+  Statistics operations
+----------------------------------------------------------- */
+
+static bool mi_is_in_main(void* stat) {
+  return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
+         && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
+}
+
+static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  if (mi_is_in_main(stat))
+  {
+    // add atomically (for abandoned pages)
+    mi_atomic_add64(&stat->current,amount);
+    if (stat->current > stat->peak) stat->peak = stat->current; // racing.. it's ok
+    if (amount > 0) {
+      mi_atomic_add64(&stat->allocated,amount);
+    }
+    else {
+      mi_atomic_add64(&stat->freed, -amount);
+    }
+  }
+  else {
+    // add thread local
+    stat->current += amount;
+    if (stat->current > stat->peak) stat->peak = stat->current;
+    if (amount > 0) {
+      stat->allocated += amount;
+    }
+    else {
+      stat->freed += -amount;
+    }
+  }
+}
+
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
+  if (mi_is_in_main(stat)) {
+    mi_atomic_add64( &stat->count, 1 );
+    mi_atomic_add64( &stat->total, (int64_t)amount );
+  }
+  else {
+    stat->count++;
+    stat->total += amount;
+  }
+}
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update(stat, (int64_t)amount);
+}
+
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update(stat, -((int64_t)amount));
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
+  if (stat==src) return;
+  mi_atomic_add64( &stat->allocated, src->allocated * unit);
+  mi_atomic_add64( &stat->current, src->current * unit);
+  mi_atomic_add64( &stat->freed, src->freed * unit);
+  // peak scores do not work across threads..
+ mi_atomic_add64( &stat->peak, src->peak * unit); +} + +static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) { + if (stat==src) return; + mi_atomic_add64( &stat->total, src->total * unit); + mi_atomic_add64( &stat->count, src->count * unit); +} + +// must be thread safe as it is called from stats_merge +static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { + if (stats==src) return; + mi_stat_add(&stats->segments, &src->segments,1); + mi_stat_add(&stats->pages, &src->pages,1); + mi_stat_add(&stats->reserved, &src->reserved, 1); + mi_stat_add(&stats->committed, &src->committed, 1); + mi_stat_add(&stats->reset, &src->reset, 1); + mi_stat_add(&stats->page_committed, &src->page_committed, 1); + + mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); + mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1); + mi_stat_add(&stats->threads, &src->threads, 1); + + mi_stat_add(&stats->malloc, &src->malloc, 1); + mi_stat_add(&stats->segments_cache, &src->segments_cache, 1); + mi_stat_add(&stats->huge, &src->huge, 1); + mi_stat_add(&stats->giant, &src->giant, 1); + + mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1); + mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1); + mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1); + + mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); + mi_stat_counter_add(&stats->searches, &src->searches, 1); + mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); + mi_stat_counter_add(&stats->giant_count, &src->giant_count, 1); +#if MI_STAT>1 + for (size_t i = 0; i <= MI_BIN_HUGE; i++) { + if (src->normal[i].allocated > 0 || src->normal[i].freed > 0) { + mi_stat_add(&stats->normal[i], &src->normal[i], 1); + } + } +#endif +} + +/* ----------------------------------------------------------- + Display statistics +----------------------------------------------------------- */ + +// unit > 0 : size in binary bytes +// unit == 0: count as decimal +// unit < 0 : count in binary +static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const char* fmt) { + char buf[32]; + int len = 32; + const char* suffix = (unit <= 0 ? " " : "b"); + double base = (unit == 0 ? 1000.0 : 1024.0); + if (unit>0) n *= unit; + + double pos = (double)(n < 0 ? -n : n); + if (pos < base) + snprintf(buf,len, "%d %s ", (int)n, suffix); + else if (pos < base*base) + snprintf(buf, len, "%.1f k%s", (double)n / base, suffix); + else if (pos < base*base*base) + snprintf(buf, len, "%.1f m%s", (double)n / (base*base), suffix); + else + snprintf(buf, len, "%.1f g%s", (double)n / (base*base*base), suffix); + + _mi_fprintf(out, (fmt==NULL ? 
"%11s" : fmt), buf); +} + + +static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out) { + mi_printf_amount(n,unit,out,NULL); +} + +static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out) { + if (unit==1) _mi_fprintf(out,"%11s"," "); + else mi_print_amount(n,0,out); +} + +static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out ) { + _mi_fprintf(out,"%10s:", msg); + if (unit>0) { + mi_print_amount(stat->peak, unit, out); + mi_print_amount(stat->allocated, unit, out); + mi_print_amount(stat->freed, unit, out); + mi_print_amount(unit, 1, out); + mi_print_count(stat->allocated, unit, out); + if (stat->allocated > stat->freed) + _mi_fprintf(out, " not all freed!\n"); + else + _mi_fprintf(out, " ok\n"); + } + else if (unit<0) { + mi_print_amount(stat->peak, -1, out); + mi_print_amount(stat->allocated, -1, out); + mi_print_amount(stat->freed, -1, out); + if (unit==-1) { + _mi_fprintf(out, "%22s", ""); + } + else { + mi_print_amount(-unit, 1, out); + mi_print_count((stat->allocated / -unit), 0, out); + } + if (stat->allocated > stat->freed) + _mi_fprintf(out, " not all freed!\n"); + else + _mi_fprintf(out, " ok\n"); + } + else { + mi_print_amount(stat->peak, 1, out); + mi_print_amount(stat->allocated, 1, out); + _mi_fprintf(out, "\n"); + } +} + +static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out ) { + _mi_fprintf(out, "%10s:", msg); + mi_print_amount(stat->total, -1, out); + _mi_fprintf(out, "\n"); +} + +static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out) { + double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count); + _mi_fprintf(out, "%10s: %7.1f avg\n", msg, avg); +} + + +static void mi_print_header(mi_output_fun* out ) { + _mi_fprintf(out,"%10s: %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "unit ", "count "); +} + +#if MI_STAT>1 +static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out) { + bool found = false; + char buf[64]; + for (size_t i = 0; i <= max; i++) { + if (bins[i].allocated > 0) { + found = true; + int64_t unit = _mi_bin_size((uint8_t)i); + snprintf(buf, 64, "%s %3zu", fmt, i); + mi_stat_add(all, &bins[i], unit); + mi_stat_print(&bins[i], buf, unit, out); + } + } + //snprintf(buf, 64, "%s all", fmt); + //mi_stat_print(all, buf, 1); + if (found) { + _mi_fprintf(out, "\n"); + mi_print_header(out); + } +} +#endif + + +static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit); + +static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out) mi_attr_noexcept { + mi_print_header(out); + #if MI_STAT>1 + mi_stat_count_t normal = { 0,0,0,0 }; + mi_stats_print_bins(&normal, stats->normal, MI_BIN_HUGE, "normal",out); + mi_stat_print(&normal, "normal", 1, out); + mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out); + mi_stat_print(&stats->giant, "giant", (stats->giant_count.count == 0 ? 
1 : -(stats->giant.allocated / stats->giant_count.count)), out); + mi_stat_count_t total = { 0,0,0,0 }; + mi_stat_add(&total, &normal, 1); + mi_stat_add(&total, &stats->huge, 1); + mi_stat_add(&total, &stats->giant, 1); + mi_stat_print(&total, "total", 1, out); + _mi_fprintf(out, "malloc requested: "); + mi_print_amount(stats->malloc.allocated, 1, out); + _mi_fprintf(out, "\n\n"); + #endif + mi_stat_print(&stats->reserved, "reserved", 1, out); + mi_stat_print(&stats->committed, "committed", 1, out); + mi_stat_print(&stats->reset, "reset", 1, out); + mi_stat_print(&stats->page_committed, "touched", 1, out); + mi_stat_print(&stats->segments, "segments", -1, out); + mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out); + mi_stat_print(&stats->segments_cache, "-cached", -1, out); + mi_stat_print(&stats->pages, "pages", -1, out); + mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out); + mi_stat_counter_print(&stats->pages_extended, "-extended", out); + mi_stat_counter_print(&stats->page_no_retire, "-noretire", out); + mi_stat_counter_print(&stats->mmap_calls, "mmaps", out); + mi_stat_counter_print(&stats->commit_calls, "commits", out); + mi_stat_print(&stats->threads, "threads", -1, out); + mi_stat_counter_print_avg(&stats->searches, "searches", out); + + if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs); + + double user_time; + double sys_time; + size_t peak_rss; + size_t page_faults; + size_t page_reclaim; + size_t peak_commit; + mi_process_info(&user_time, &sys_time, &peak_rss, &page_faults, &page_reclaim, &peak_commit); + _mi_fprintf(out,"%10s: user: %.3f s, system: %.3f s, faults: %lu, reclaims: %lu, rss: ", "process", user_time, sys_time, (unsigned long)page_faults, (unsigned long)page_reclaim ); + mi_printf_amount((int64_t)peak_rss, 1, out, "%s"); + if (peak_commit > 0) { + _mi_fprintf(out,", commit charge: "); + mi_printf_amount((int64_t)peak_commit, 1, out, "%s"); + } + _mi_fprintf(out,"\n"); +} + +double _mi_clock_end(double start); +double _mi_clock_start(void); +static double mi_time_start = 0.0; + +static mi_stats_t* mi_stats_get_default(void) { + mi_heap_t* heap = mi_heap_get_default(); + return &heap->tld->stats; +} + +static void mi_stats_merge_from(mi_stats_t* stats) { + if (stats != &_mi_stats_main) { + mi_stats_add(&_mi_stats_main, stats); + memset(stats, 0, sizeof(mi_stats_t)); + } +} + +void mi_stats_reset(void) mi_attr_noexcept { + mi_stats_t* stats = mi_stats_get_default(); + if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); } + memset(&_mi_stats_main, 0, sizeof(mi_stats_t)); + mi_time_start = _mi_clock_start(); +} + +void mi_stats_merge(void) mi_attr_noexcept { + mi_stats_merge_from( mi_stats_get_default() ); +} + +void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` + mi_stats_merge_from(stats); +} + + +static void mi_stats_print_ex(mi_stats_t* stats, double secs, mi_output_fun* out) { + mi_stats_merge_from(stats); + _mi_stats_print(&_mi_stats_main, secs, out); +} + +void mi_stats_print(mi_output_fun* out) mi_attr_noexcept { + mi_stats_print_ex(mi_stats_get_default(),_mi_clock_end(mi_time_start),out); +} + +void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept { + _mi_stats_print(mi_stats_get_default(), _mi_clock_end(mi_time_start), out); +} + + + +// -------------------------------------------------------- +// Basic timer for convenience +// -------------------------------------------------------- + +#ifdef _WIN32 +#include +static double mi_to_seconds(LARGE_INTEGER t) { + static 
double freq = 0.0; + if (freq <= 0.0) { + LARGE_INTEGER f; + QueryPerformanceFrequency(&f); + freq = (double)(f.QuadPart); + } + return ((double)(t.QuadPart) / freq); +} + +static double mi_clock_now(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return mi_to_seconds(t); +} +#else +#include +#ifdef CLOCK_REALTIME +static double mi_clock_now(void) { + struct timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return (double)t.tv_sec + (1.0e-9 * (double)t.tv_nsec); +} +#else +// low resolution timer +static double mi_clock_now(void) { + return ((double)clock() / (double)CLOCKS_PER_SEC); +} +#endif +#endif + + +static double mi_clock_diff = 0.0; + +double _mi_clock_start(void) { + if (mi_clock_diff == 0.0) { + double t0 = mi_clock_now(); + mi_clock_diff = mi_clock_now() - t0; + } + return mi_clock_now(); +} + +double _mi_clock_end(double start) { + double end = mi_clock_now(); + return (end - start - mi_clock_diff); +} + + +// -------------------------------------------------------- +// Basic process statistics +// -------------------------------------------------------- + +#if defined(_WIN32) +#include +#include +#pragma comment(lib,"psapi.lib") + +static double filetime_secs(const FILETIME* ftime) { + ULARGE_INTEGER i; + i.LowPart = ftime->dwLowDateTime; + i.HighPart = ftime->dwHighDateTime; + double secs = (double)(i.QuadPart) * 1.0e-7; // FILETIME is in 100 nano seconds + return secs; +} +static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { + FILETIME ct; + FILETIME ut; + FILETIME st; + FILETIME et; + GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); + *utime = filetime_secs(&ut); + *stime = filetime_secs(&st); + + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + *peak_rss = (size_t)info.PeakWorkingSetSize; + *page_faults = (size_t)info.PageFaultCount; + *peak_commit = (size_t)info.PeakPagefileUsage; + *page_reclaim = 0; +} + +#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) +#include +#include +#include + +#if defined(__APPLE__) && defined(__MACH__) +#include +#endif + +static double timeval_secs(const struct timeval* tv) { + return (double)tv->tv_sec + ((double)tv->tv_usec * 1.0e-6); +} + +static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { + struct rusage rusage; + getrusage(RUSAGE_SELF, &rusage); +#if defined(__APPLE__) && defined(__MACH__) + *peak_rss = rusage.ru_maxrss; +#else + *peak_rss = rusage.ru_maxrss * 1024; +#endif + *page_faults = rusage.ru_majflt; + *page_reclaim = rusage.ru_minflt; + *peak_commit = 0; + *utime = timeval_secs(&rusage.ru_utime); + *stime = timeval_secs(&rusage.ru_stime); +} + +#else +#ifndef __wasi__ +// WebAssembly instances are not processes +#pragma message("define a way to get process info") +#endif + +static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) { + *peak_rss = 0; + *page_faults = 0; + *page_reclaim = 0; + *peak_commit = 0; + *utime = 0.0; + *stime = 0.0; +} +#endif diff --git a/runtime/src/opt_alloc/cpp/AllocImpl.cpp b/runtime/src/opt_alloc/cpp/AllocImpl.cpp new file mode 100644 index 00000000000..1bdb93f4d66 --- /dev/null +++ b/runtime/src/opt_alloc/cpp/AllocImpl.cpp @@ -0,0 +1,17 @@ +/* + * Copyright 2010-2019 JetBrains s.r.o. 
Use of this source code is governed by the Apache 2.0 license + * that can be found in the LICENSE file. + */ +#include +#include + +extern "C" { +void* mi_calloc(size_t, size_t); +void mi_free(void*); +void* konan_calloc_impl(size_t n_elements, size_t elem_size) { + return mi_calloc(n_elements, elem_size); +} +void konan_free_impl (void* mem) { + mi_free(mem); +} +} // extern "C" diff --git a/runtime/src/std_alloc/cpp/AllocImpl.cpp b/runtime/src/std_alloc/cpp/AllocImpl.cpp new file mode 100644 index 00000000000..ce85a81ecbb --- /dev/null +++ b/runtime/src/std_alloc/cpp/AllocImpl.cpp @@ -0,0 +1,17 @@ +/* + * Copyright 2010-2019 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license + * that can be found in the LICENSE file. + */ +#include +#include + +extern "C" { +// Memory operations. +void* konan_calloc_impl(size_t n_elements, size_t elem_size) { + return calloc(n_elements, elem_size); +} +void konan_free_impl (void* mem) { + free(mem); +} +} + diff --git a/shared/src/main/kotlin/org/jetbrains/kotlin/konan/target/KonanTargetExtenstions.kt b/shared/src/main/kotlin/org/jetbrains/kotlin/konan/target/KonanTargetExtenstions.kt index f736d61e3d8..4ca9734e46e 100644 --- a/shared/src/main/kotlin/org/jetbrains/kotlin/konan/target/KonanTargetExtenstions.kt +++ b/shared/src/main/kotlin/org/jetbrains/kotlin/konan/target/KonanTargetExtenstions.kt @@ -4,4 +4,20 @@ fun KonanTarget.supportsCodeCoverage(): Boolean = this == KonanTarget.MINGW_X64 || this == KonanTarget.LINUX_X64 || this == KonanTarget.MACOS_X64 || - this == KonanTarget.IOS_X64 \ No newline at end of file + this == KonanTarget.IOS_X64 + +fun KonanTarget.supportsMimallocAllocator(): Boolean = + when(this) { + is KonanTarget.LINUX_X64 -> true + is KonanTarget.MINGW_X86 -> true + is KonanTarget.MINGW_X64 -> true + is KonanTarget.MACOS_X64 -> true + is KonanTarget.LINUX_ARM64 -> true + is KonanTarget.LINUX_ARM32_HFP -> true + is KonanTarget.ANDROID_X64 -> true + is KonanTarget.ANDROID_ARM64 -> true + is KonanTarget.IOS_ARM32 -> true + is KonanTarget.IOS_ARM64 -> true + is KonanTarget.IOS_X64 -> true + else -> false // watchOS/tvOS/android_x86/android_arm32 aren't tested; linux_mips32/linux_mipsel32 need linking with libatomic. + } \ No newline at end of file
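The new KonanTarget.supportsMimallocAllocator() extension gives other components one place to check whether a target has a working mimalloc port before wiring in the mimalloc runtime parts. A minimal Kotlin sketch of such a caller follows; pickAllocator is purely illustrative and not part of this change, the "std"/"mimalloc" strings are assumed allocator names, and HostManager.host is assumed to resolve the compiling host's KonanTarget:

import org.jetbrains.kotlin.konan.target.HostManager
import org.jetbrains.kotlin.konan.target.KonanTarget
import org.jetbrains.kotlin.konan.target.supportsMimallocAllocator

// Illustrative helper (assumption, not in this patch): fall back to the
// standard allocator whenever the target has no tested mimalloc port.
fun pickAllocator(target: KonanTarget = HostManager.host): String =
    if (target.supportsMimallocAllocator()) "mimalloc" else "std"

fun main() {
    // Prints e.g. "Allocator for linux_x64: mimalloc" on a supported host.
    println("Allocator for ${HostManager.host}: ${pickAllocator()}")
}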