diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 190ba3e04a..c8e7a41974 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -326,6 +326,23 @@ def __init__(self, device_id: int):
         self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
         self._dev_id = device_id
 
+        # Set a higher release threshold to improve performance when there are no active allocations.
+        # By default, the release threshold is 0, which means memory is immediately released back
+        # to the OS when there are no active suballocations, causing performance issues.
+        # Check current release threshold
+        current_threshold = handle_return(
+            driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
+        )
+        # If threshold is 0 (default), set it to maximum to retain memory in the pool
+        if int(current_threshold) == 0:
+            handle_return(
+                driver.cuMemPoolSetAttribute(
+                    self._handle,
+                    driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                    driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
+                )
+            )
+
     def allocate(self, size: int, stream: Stream = None) -> Buffer:
         """Allocate a buffer of the requested size.
 
diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
index 3a9c7076a7..56fd8ac700 100644
--- a/cuda_core/docs/source/release/0.X.Y-notes.rst
+++ b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -36,4 +36,5 @@ None.
 Fixes and enhancements
 ----------------------
 
+- Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations by setting a higher release threshold (addresses issue #771).
 - Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867).
\ No newline at end of file
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 2454046465..2ba7b418fe 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -10,7 +10,7 @@
 
 import pytest
 
-from cuda.core.experimental import Buffer, Device, MemoryResource
+from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource
 from cuda.core.experimental._memory import DLDeviceType
 from cuda.core.experimental._utils.cuda_utils import handle_return
 
@@ -257,3 +257,29 @@ def test_buffer_dunder_dlpack_device_failure():
     buffer = dummy_mr.allocate(size=1024)
     with pytest.raises(BufferError, match=r"^buffer is neither device-accessible nor host-accessible$"):
         buffer.__dlpack_device__()
+
+
+def test_device_memory_resource_initialization():
+    """Test that DeviceMemoryResource can be initialized successfully.
+
+    This test verifies that the DeviceMemoryResource initializes properly,
+    including the release threshold configuration for performance optimization.
+    """
+    device = Device()
+    if not device.properties.memory_pools_supported:
+        pytest.skip("memory pools not supported")
+    device.set_current()
+
+    # This should succeed and configure the memory pool release threshold
+    mr = DeviceMemoryResource(device.device_id)
+
+    # Verify basic properties
+    assert mr.device_id == device.device_id
+    assert mr.is_device_accessible is True
+    assert mr.is_host_accessible is False
+
+    # Test allocation/deallocation works
+    buffer = mr.allocate(1024)
+    assert buffer.size == 1024
+    assert buffer.device_id == device.device_id
+    buffer.close()
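
For reference, a minimal sketch (not part of the patch) of how the new behaviour could be checked from user code. It assumes cuda.bindings.driver is importable alongside cuda.core, and it reads the private _handle attribute that the patch itself uses internally, so it is illustrative only:

    # Sketch only: verify the pool's release threshold is non-zero after construction.
    from cuda.bindings import driver
    from cuda.core.experimental import Device, DeviceMemoryResource
    from cuda.core.experimental._utils.cuda_utils import handle_return

    device = Device()
    device.set_current()
    mr = DeviceMemoryResource(device.device_id)

    # The patch raises the threshold from 0 to UINT64_MAX, so pool memory is
    # retained rather than released back to the OS between allocations.
    threshold = handle_return(
        driver.cuMemPoolGetAttribute(
            mr._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
        )
    )
    assert int(threshold) != 0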