chore: Replace isinstance(obj, T) with type(obj) is T comparisons #1292

bharatr21 · 2025-12-01T06:05:53Z

Description

Replace isinstance(obj, T) checks with type(obj) is T to optimize cuda.core.launch()

Additional Notes

Since the original issue requested profiling, I wrote a benchmarking script in Cython to measure the effect of using type() in place of isinstance() checks; it showed a ~5x speedup.
I would appreciate some guidance on whether I've done the profiling correctly.

Created a file benchmark_isinstance_cython.pyx :

from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdint cimport (intptr_t,
                         int8_t, int16_t, int32_t, int64_t,
                         uint8_t, uint16_t, uint32_t, uint64_t)
from libcpp cimport bool as cpp_bool
from libcpp.complex cimport complex as cpp_complex
from libcpp.vector cimport vector

import ctypes
import numpy
import time
from statistics import mean, stdev


# C++ complex element types written by the numpy complex64 / complex128
# branches of the prepare helpers.
ctypedef cpp_complex.complex[float] cpp_single_complex
ctypedef cpp_complex.complex[double] cpp_double_complex

# Cache type objects: each ctypes / numpy scalar class is bound once to a
# module-level name that the type checks below compare against.
cdef:
    # ctypes scalar classes
    object ctypes_bool = ctypes.c_bool
    object ctypes_int8 = ctypes.c_int8
    object ctypes_int16 = ctypes.c_int16
    object ctypes_int32 = ctypes.c_int32
    object ctypes_int64 = ctypes.c_int64
    object ctypes_uint8 = ctypes.c_uint8
    object ctypes_uint16 = ctypes.c_uint16
    object ctypes_uint32 = ctypes.c_uint32
    object ctypes_uint64 = ctypes.c_uint64
    object ctypes_float = ctypes.c_float
    object ctypes_double = ctypes.c_double

    # numpy scalar classes
    object numpy_bool = numpy.bool_
    object numpy_int8 = numpy.int8
    object numpy_int16 = numpy.int16
    object numpy_int32 = numpy.int32
    object numpy_int64 = numpy.int64
    object numpy_uint8 = numpy.uint8
    object numpy_uint16 = numpy.uint16
    object numpy_uint32 = numpy.uint32
    object numpy_uint64 = numpy.uint64
    # NOTE(review): float16 is cached here, but none of the prepare helpers in
    # this file has a float16 branch.
    object numpy_float16 = numpy.float16
    object numpy_float32 = numpy.float32
    object numpy_float64 = numpy.float64
    object numpy_complex64 = numpy.complex64
    object numpy_complex128 = numpy.complex128

# Limitation due to cython/cython#534
# `voidptr` is the element type of the `vector[voidptr]` locals declared in
# the benchmark functions.
ctypedef void* voidptr


# ============================================================================
# Version 1: Current implementation using isinstance()
# ============================================================================

cdef inline int prepare_ctypes_arg_isinstance(
        vector[void*]& data,
        vector[void*]& data_addresses,
        arg,
        const size_t idx) except -1:
    """Copy a ctypes scalar's ``.value`` into a newly allocated C slot.

    The type is matched with ``isinstance`` against the cached ctypes
    classes, so instances of subclasses of those classes are accepted too.

    Parameters
    ----------
    data, data_addresses
        Output vectors; on success the new allocation is stored at position
        ``idx`` of both.
    arg
        Candidate Python object.
    idx
        Slot to fill in both vectors.

    Returns
    -------
    0 if ``arg`` was handled (the caller is then responsible for releasing
    ``data[idx]`` with ``PyMem_Free``); 1 if ``arg`` matches none of the
    handled ctypes classes, in which case nothing is allocated and the
    vectors are left untouched.

    Raises
    ------
    MemoryError
        If ``PyMem_Malloc`` fails.
    """
    cdef void* ptr

    # Each branch allocates a slot of the matching C width and stores the
    # converted value. The NULL check must precede the store: PyMem_Malloc
    # reports failure by returning NULL.
    if isinstance(arg, ctypes_bool):
        ptr = PyMem_Malloc(sizeof(cpp_bool))
        if ptr == NULL:
            raise MemoryError()
        (<cpp_bool*>ptr)[0] = <cpp_bool>(arg.value)
    elif isinstance(arg, ctypes_int8):
        ptr = PyMem_Malloc(sizeof(int8_t))
        if ptr == NULL:
            raise MemoryError()
        (<int8_t*>ptr)[0] = <int8_t>(arg.value)
    elif isinstance(arg, ctypes_int16):
        ptr = PyMem_Malloc(sizeof(int16_t))
        if ptr == NULL:
            raise MemoryError()
        (<int16_t*>ptr)[0] = <int16_t>(arg.value)
    elif isinstance(arg, ctypes_int32):
        ptr = PyMem_Malloc(sizeof(int32_t))
        if ptr == NULL:
            raise MemoryError()
        (<int32_t*>ptr)[0] = <int32_t>(arg.value)
    elif isinstance(arg, ctypes_int64):
        ptr = PyMem_Malloc(sizeof(int64_t))
        if ptr == NULL:
            raise MemoryError()
        (<int64_t*>ptr)[0] = <int64_t>(arg.value)
    elif isinstance(arg, ctypes_uint8):
        ptr = PyMem_Malloc(sizeof(uint8_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint8_t*>ptr)[0] = <uint8_t>(arg.value)
    elif isinstance(arg, ctypes_uint16):
        ptr = PyMem_Malloc(sizeof(uint16_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint16_t*>ptr)[0] = <uint16_t>(arg.value)
    elif isinstance(arg, ctypes_uint32):
        ptr = PyMem_Malloc(sizeof(uint32_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint32_t*>ptr)[0] = <uint32_t>(arg.value)
    elif isinstance(arg, ctypes_uint64):
        ptr = PyMem_Malloc(sizeof(uint64_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint64_t*>ptr)[0] = <uint64_t>(arg.value)
    elif isinstance(arg, ctypes_float):
        ptr = PyMem_Malloc(sizeof(float))
        if ptr == NULL:
            raise MemoryError()
        (<float*>ptr)[0] = <float>(arg.value)
    elif isinstance(arg, ctypes_double):
        ptr = PyMem_Malloc(sizeof(double))
        if ptr == NULL:
            raise MemoryError()
        (<double*>ptr)[0] = <double>(arg.value)
    else:
        # Not a ctypes scalar this helper knows about.
        return 1

    # Common tail for every handled branch: publish the allocation.
    data_addresses[idx] = ptr
    data[idx] = ptr
    return 0


cdef inline int prepare_numpy_arg_isinstance(
        vector[void*]& data,
        vector[void*]& data_addresses,
        arg,
        const size_t idx) except -1:
    """Copy a numpy scalar into a newly allocated C slot.

    The type is matched with ``isinstance`` against the cached numpy scalar
    classes, so instances of subclasses of those classes are accepted too.
    There is no ``numpy.float16`` branch; such a value yields 1.

    Parameters
    ----------
    data, data_addresses
        Output vectors; on success the new allocation is stored at position
        ``idx`` of both.
    arg
        Candidate Python object.
    idx
        Slot to fill in both vectors.

    Returns
    -------
    0 if ``arg`` was handled (the caller is then responsible for releasing
    ``data[idx]`` with ``PyMem_Free``); 1 if ``arg`` matches none of the
    handled numpy classes, in which case nothing is allocated and the
    vectors are left untouched.

    Raises
    ------
    MemoryError
        If ``PyMem_Malloc`` fails.
    """
    cdef void* ptr

    # Each branch allocates a slot of the matching C width and stores the
    # converted value. The NULL check must precede the store: PyMem_Malloc
    # reports failure by returning NULL.
    if isinstance(arg, numpy_bool):
        ptr = PyMem_Malloc(sizeof(cpp_bool))
        if ptr == NULL:
            raise MemoryError()
        (<cpp_bool*>ptr)[0] = <cpp_bool>(arg)
    elif isinstance(arg, numpy_int8):
        ptr = PyMem_Malloc(sizeof(int8_t))
        if ptr == NULL:
            raise MemoryError()
        (<int8_t*>ptr)[0] = <int8_t>(arg)
    elif isinstance(arg, numpy_int16):
        ptr = PyMem_Malloc(sizeof(int16_t))
        if ptr == NULL:
            raise MemoryError()
        (<int16_t*>ptr)[0] = <int16_t>(arg)
    elif isinstance(arg, numpy_int32):
        ptr = PyMem_Malloc(sizeof(int32_t))
        if ptr == NULL:
            raise MemoryError()
        (<int32_t*>ptr)[0] = <int32_t>(arg)
    elif isinstance(arg, numpy_int64):
        ptr = PyMem_Malloc(sizeof(int64_t))
        if ptr == NULL:
            raise MemoryError()
        (<int64_t*>ptr)[0] = <int64_t>(arg)
    elif isinstance(arg, numpy_uint8):
        ptr = PyMem_Malloc(sizeof(uint8_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint8_t*>ptr)[0] = <uint8_t>(arg)
    elif isinstance(arg, numpy_uint16):
        ptr = PyMem_Malloc(sizeof(uint16_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint16_t*>ptr)[0] = <uint16_t>(arg)
    elif isinstance(arg, numpy_uint32):
        ptr = PyMem_Malloc(sizeof(uint32_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint32_t*>ptr)[0] = <uint32_t>(arg)
    elif isinstance(arg, numpy_uint64):
        ptr = PyMem_Malloc(sizeof(uint64_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint64_t*>ptr)[0] = <uint64_t>(arg)
    elif isinstance(arg, numpy_float32):
        ptr = PyMem_Malloc(sizeof(float))
        if ptr == NULL:
            raise MemoryError()
        (<float*>ptr)[0] = <float>(arg)
    elif isinstance(arg, numpy_float64):
        ptr = PyMem_Malloc(sizeof(double))
        if ptr == NULL:
            raise MemoryError()
        (<double*>ptr)[0] = <double>(arg)
    elif isinstance(arg, numpy_complex64):
        ptr = PyMem_Malloc(sizeof(cpp_single_complex))
        if ptr == NULL:
            raise MemoryError()
        # Complex values are rebuilt from their real / imaginary parts.
        (<cpp_single_complex*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
    elif isinstance(arg, numpy_complex128):
        ptr = PyMem_Malloc(sizeof(cpp_double_complex))
        if ptr == NULL:
            raise MemoryError()
        (<cpp_double_complex*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
    else:
        # Not a numpy scalar this helper knows about.
        return 1

    # Common tail for every handled branch: publish the allocation.
    data_addresses[idx] = ptr
    data[idx] = ptr
    return 0


# ============================================================================
# Version 2: Optimized implementation using type() is
# ============================================================================

cdef inline int prepare_ctypes_arg_type_is(
        vector[void*]& data,
        vector[void*]& data_addresses,
        arg,
        const size_t idx) except -1:
    """Copy a ctypes scalar's ``.value`` into a newly allocated C slot.

    The type is matched by identity (``type(arg) is <cached ctypes class>``),
    so only exact instances are accepted; an instance of a subclass of one of
    these ctypes classes yields 1.

    Parameters
    ----------
    data, data_addresses
        Output vectors; on success the new allocation is stored at position
        ``idx`` of both.
    arg
        Candidate Python object.
    idx
        Slot to fill in both vectors.

    Returns
    -------
    0 if ``arg`` was handled (the caller is then responsible for releasing
    ``data[idx]`` with ``PyMem_Free``); 1 if ``type(arg)`` is none of the
    handled ctypes classes, in which case nothing is allocated and the
    vectors are left untouched.

    Raises
    ------
    MemoryError
        If ``PyMem_Malloc`` fails.
    """
    cdef void* ptr
    # Fetch the type once; every branch below is a plain identity comparison.
    cdef object exact_type = type(arg)

    # Each branch allocates a slot of the matching C width and stores the
    # converted value. The NULL check must precede the store: PyMem_Malloc
    # reports failure by returning NULL.
    if exact_type is ctypes_bool:
        ptr = PyMem_Malloc(sizeof(cpp_bool))
        if ptr == NULL:
            raise MemoryError()
        (<cpp_bool*>ptr)[0] = <cpp_bool>(arg.value)
    elif exact_type is ctypes_int8:
        ptr = PyMem_Malloc(sizeof(int8_t))
        if ptr == NULL:
            raise MemoryError()
        (<int8_t*>ptr)[0] = <int8_t>(arg.value)
    elif exact_type is ctypes_int16:
        ptr = PyMem_Malloc(sizeof(int16_t))
        if ptr == NULL:
            raise MemoryError()
        (<int16_t*>ptr)[0] = <int16_t>(arg.value)
    elif exact_type is ctypes_int32:
        ptr = PyMem_Malloc(sizeof(int32_t))
        if ptr == NULL:
            raise MemoryError()
        (<int32_t*>ptr)[0] = <int32_t>(arg.value)
    elif exact_type is ctypes_int64:
        ptr = PyMem_Malloc(sizeof(int64_t))
        if ptr == NULL:
            raise MemoryError()
        (<int64_t*>ptr)[0] = <int64_t>(arg.value)
    elif exact_type is ctypes_uint8:
        ptr = PyMem_Malloc(sizeof(uint8_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint8_t*>ptr)[0] = <uint8_t>(arg.value)
    elif exact_type is ctypes_uint16:
        ptr = PyMem_Malloc(sizeof(uint16_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint16_t*>ptr)[0] = <uint16_t>(arg.value)
    elif exact_type is ctypes_uint32:
        ptr = PyMem_Malloc(sizeof(uint32_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint32_t*>ptr)[0] = <uint32_t>(arg.value)
    elif exact_type is ctypes_uint64:
        ptr = PyMem_Malloc(sizeof(uint64_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint64_t*>ptr)[0] = <uint64_t>(arg.value)
    elif exact_type is ctypes_float:
        ptr = PyMem_Malloc(sizeof(float))
        if ptr == NULL:
            raise MemoryError()
        (<float*>ptr)[0] = <float>(arg.value)
    elif exact_type is ctypes_double:
        ptr = PyMem_Malloc(sizeof(double))
        if ptr == NULL:
            raise MemoryError()
        (<double*>ptr)[0] = <double>(arg.value)
    else:
        # Not exactly one of the ctypes scalar classes handled here.
        return 1

    # Common tail for every handled branch: publish the allocation.
    data_addresses[idx] = ptr
    data[idx] = ptr
    return 0


cdef inline int prepare_numpy_arg_type_is(
        vector[void*]& data,
        vector[void*]& data_addresses,
        arg,
        const size_t idx) except -1:
    """Copy a numpy scalar into a newly allocated C slot.

    The type is matched by identity (``type(arg) is <cached numpy class>``),
    so only exact instances are accepted; an instance of a subclass of one of
    these numpy classes yields 1. There is no ``numpy.float16`` branch; such
    a value yields 1 as well.

    Parameters
    ----------
    data, data_addresses
        Output vectors; on success the new allocation is stored at position
        ``idx`` of both.
    arg
        Candidate Python object.
    idx
        Slot to fill in both vectors.

    Returns
    -------
    0 if ``arg`` was handled (the caller is then responsible for releasing
    ``data[idx]`` with ``PyMem_Free``); 1 if ``type(arg)`` is none of the
    handled numpy classes, in which case nothing is allocated and the
    vectors are left untouched.

    Raises
    ------
    MemoryError
        If ``PyMem_Malloc`` fails.
    """
    cdef void* ptr
    # Fetch the type once; every branch below is a plain identity comparison.
    cdef object exact_type = type(arg)

    # Each branch allocates a slot of the matching C width and stores the
    # converted value. The NULL check must precede the store: PyMem_Malloc
    # reports failure by returning NULL.
    if exact_type is numpy_bool:
        ptr = PyMem_Malloc(sizeof(cpp_bool))
        if ptr == NULL:
            raise MemoryError()
        (<cpp_bool*>ptr)[0] = <cpp_bool>(arg)
    elif exact_type is numpy_int8:
        ptr = PyMem_Malloc(sizeof(int8_t))
        if ptr == NULL:
            raise MemoryError()
        (<int8_t*>ptr)[0] = <int8_t>(arg)
    elif exact_type is numpy_int16:
        ptr = PyMem_Malloc(sizeof(int16_t))
        if ptr == NULL:
            raise MemoryError()
        (<int16_t*>ptr)[0] = <int16_t>(arg)
    elif exact_type is numpy_int32:
        ptr = PyMem_Malloc(sizeof(int32_t))
        if ptr == NULL:
            raise MemoryError()
        (<int32_t*>ptr)[0] = <int32_t>(arg)
    elif exact_type is numpy_int64:
        ptr = PyMem_Malloc(sizeof(int64_t))
        if ptr == NULL:
            raise MemoryError()
        (<int64_t*>ptr)[0] = <int64_t>(arg)
    elif exact_type is numpy_uint8:
        ptr = PyMem_Malloc(sizeof(uint8_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint8_t*>ptr)[0] = <uint8_t>(arg)
    elif exact_type is numpy_uint16:
        ptr = PyMem_Malloc(sizeof(uint16_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint16_t*>ptr)[0] = <uint16_t>(arg)
    elif exact_type is numpy_uint32:
        ptr = PyMem_Malloc(sizeof(uint32_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint32_t*>ptr)[0] = <uint32_t>(arg)
    elif exact_type is numpy_uint64:
        ptr = PyMem_Malloc(sizeof(uint64_t))
        if ptr == NULL:
            raise MemoryError()
        (<uint64_t*>ptr)[0] = <uint64_t>(arg)
    elif exact_type is numpy_float32:
        ptr = PyMem_Malloc(sizeof(float))
        if ptr == NULL:
            raise MemoryError()
        (<float*>ptr)[0] = <float>(arg)
    elif exact_type is numpy_float64:
        ptr = PyMem_Malloc(sizeof(double))
        if ptr == NULL:
            raise MemoryError()
        (<double*>ptr)[0] = <double>(arg)
    elif exact_type is numpy_complex64:
        ptr = PyMem_Malloc(sizeof(cpp_single_complex))
        if ptr == NULL:
            raise MemoryError()
        # Complex values are rebuilt from their real / imaginary parts.
        (<cpp_single_complex*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
    elif exact_type is numpy_complex128:
        ptr = PyMem_Malloc(sizeof(cpp_double_complex))
        if ptr == NULL:
            raise MemoryError()
        (<cpp_double_complex*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
    else:
        # Not exactly one of the numpy scalar classes handled here.
        return 1

    # Common tail for every handled branch: publish the allocation.
    data_addresses[idx] = ptr
    data[idx] = ptr
    return 0


# ============================================================================
# Benchmark functions
# ============================================================================

cdef inline int _simulate_launch_isinstance(kernel_args, const size_t n_args) except -1:
    """Prepare, then release, the arguments of one simulated launch (isinstance path)."""
    cdef size_t i
    cdef int not_prepared
    cdef voidptr slot
    cdef vector[voidptr] data
    cdef vector[voidptr] data_addresses

    # `data` starts out all-NULL so the cleanup loop can tell filled slots
    # from skipped ones.
    data = vector[voidptr](n_args, NULL)
    data_addresses = vector[voidptr](n_args)
    try:
        for i, arg in enumerate(kernel_args):
            # Python scalars are skipped without allocating anything.
            if isinstance(arg, int):
                continue
            elif isinstance(arg, float):
                continue
            elif isinstance(arg, complex):
                continue
            elif isinstance(arg, bool):
                continue

            # Try the numpy helper first; fall back to ctypes on a miss.
            not_prepared = prepare_numpy_arg_isinstance(data, data_addresses, arg, i)
            if not_prepared:
                not_prepared = prepare_ctypes_arg_isinstance(data, data_addresses, arg, i)
    finally:
        # Release every slot, including when a helper raised part-way through.
        for slot in data:
            if slot:
                PyMem_Free(slot)
    return 0


def benchmark_isinstance(kernel_args, int iterations):
    """Benchmark the isinstance() approach.

    Runs 100 untimed warm-up launches, then times ``iterations`` simulated
    launches over ``kernel_args`` with ``time.perf_counter``.

    Notes on what is actually exercised:

    * ``bool`` is a subclass of ``int``, so the ``isinstance(arg, bool)``
      check can never be the one that matches; the check sequence is kept as
      originally written.
    * ``numpy.float64`` is a subclass of Python ``float`` (likewise
      ``numpy.complex128`` of ``complex``), so such arguments are skipped by
      the Python-scalar checks here and never reach the prepare helpers.

    Parameters
    ----------
    kernel_args
        Sequence of argument objects processed on every simulated launch.
    iterations
        Number of timed launches.

    Returns
    -------
    float
        Elapsed seconds for all ``iterations`` timed launches together.
    """
    cdef size_t n_args = len(kernel_args)
    cdef int launch
    cdef double start, end

    # Warmup
    for launch in range(100):
        _simulate_launch_isinstance(kernel_args, n_args)

    # Actual benchmark
    start = time.perf_counter()
    for launch in range(iterations):
        _simulate_launch_isinstance(kernel_args, n_args)
    end = time.perf_counter()
    return end - start


cdef inline int _simulate_launch_type_is(kernel_args, const size_t n_args) except -1:
    """Prepare, then release, the arguments of one simulated launch (type() is path)."""
    cdef size_t i
    cdef int not_prepared
    cdef voidptr slot
    cdef object arg_type
    cdef vector[voidptr] data
    cdef vector[voidptr] data_addresses

    # `data` starts out all-NULL so the cleanup loop can tell filled slots
    # from skipped ones.
    data = vector[voidptr](n_args, NULL)
    data_addresses = vector[voidptr](n_args)
    try:
        for i, arg in enumerate(kernel_args):
            # Python scalars (exact types only) are skipped without
            # allocating anything.
            arg_type = type(arg)
            if arg_type is int:
                continue
            elif arg_type is float:
                continue
            elif arg_type is complex:
                continue
            elif arg_type is bool:
                continue

            # Try the numpy helper first; fall back to ctypes on a miss.
            not_prepared = prepare_numpy_arg_type_is(data, data_addresses, arg, i)
            if not_prepared:
                not_prepared = prepare_ctypes_arg_type_is(data, data_addresses, arg, i)
    finally:
        # Release every slot, including when a helper raised part-way through.
        for slot in data:
            if slot:
                PyMem_Free(slot)
    return 0


def benchmark_type_is(kernel_args, int iterations):
    """Benchmark the type() is approach.

    Runs 100 untimed warm-up launches, then times ``iterations`` simulated
    launches over ``kernel_args`` with ``time.perf_counter``.

    Because the Python-scalar checks compare exact types, a ``bool`` is
    matched by the ``bool`` check rather than the ``int`` one, and numpy
    scalars that subclass Python types (e.g. ``numpy.float64``) are not
    skipped; they go on to the prepare helpers.

    Parameters
    ----------
    kernel_args
        Sequence of argument objects processed on every simulated launch.
    iterations
        Number of timed launches.

    Returns
    -------
    float
        Elapsed seconds for all ``iterations`` timed launches together.
    """
    cdef size_t n_args = len(kernel_args)
    cdef int launch
    cdef double start, end

    # Warmup
    for launch in range(100):
        _simulate_launch_type_is(kernel_args, n_args)

    # Actual benchmark
    start = time.perf_counter()
    for launch in range(iterations):
        _simulate_launch_type_is(kernel_args, n_args)
    end = time.perf_counter()
    return end - start


def _time_runs(label, benchmark, kernel_args, iterations, num_runs):
    """Run ``benchmark`` ``num_runs`` times, print each timing, return (mean, stdev)."""
    print(f"Running {label} benchmark...")
    timings = []
    for run in range(num_runs):
        elapsed = benchmark(kernel_args, iterations)
        timings.append(elapsed)
        print(f"  Run {run+1}: {elapsed:.4f}s")
    average = mean(timings)
    spread = stdev(timings)
    print(f"  Mean: {average:.4f}s ± {spread:.6f}s")
    print()
    return average, spread


def run_benchmark():
    """Main benchmark runner.

    Times ``benchmark_isinstance`` and ``benchmark_type_is`` on the same list
    of numpy / ctypes scalars and prints per-run timings, the mean and
    standard deviation of each, the speedup ratio, a recommendation, and the
    per-launch cost.

    Each timing covers ``iterations`` simulated launches, so per-launch
    figures are derived by dividing a per-run mean by ``iterations`` (not by
    ``iterations * num_runs``, which would under-report by a factor of
    ``num_runs``).
    """
    rule = "=" * 70
    print(rule)
    print("Cython Benchmark: isinstance() vs type() is")
    print("Kernel Argument Handling Hot Path")
    print(rule)
    print()

    # Create realistic kernel arguments
    kernel_args = [
        numpy.int32(100),
        numpy.float32(2.5),
        numpy.float64(1.23),
        numpy.complex64(1+1j),
        numpy.int64(999),
        numpy.uint32(255),
        ctypes.c_int32(50),
        ctypes.c_float(1.5),
        ctypes.c_double(2.7),
        numpy.int8(10),
        numpy.int16(20),
        numpy.uint8(5),
        numpy.uint16(30),
        numpy.float32(0.5),
    ]

    iterations = 50000
    num_runs = 10

    print("Configuration:")
    print(f"  - Arguments per launch: {len(kernel_args)}")
    print(f"  - Simulated launches per run: {iterations:,}")
    print(f"  - Number of runs: {num_runs}")
    print(f"  - Total argument processing: {len(kernel_args) * iterations * num_runs:,}")
    print()

    isinstance_mean, isinstance_stdev = _time_runs(
        "isinstance()", benchmark_isinstance, kernel_args, iterations, num_runs)
    type_is_mean, type_is_stdev = _time_runs(
        "type() is", benchmark_type_is, kernel_args, iterations, num_runs)

    print(rule)
    print("RESULTS")
    print(rule)
    print(f"isinstance():  {isinstance_mean:.4f}s ± {isinstance_stdev:.6f}s")
    print(f"type() is:     {type_is_mean:.4f}s ± {type_is_stdev:.6f}s")
    print()

    speedup = isinstance_mean / type_is_mean
    time_saved = isinstance_mean - type_is_mean
    percent_faster = (speedup - 1) * 100

    # The means are seconds per run, and one run is `iterations` launches.
    saved_per_launch = time_saved / iterations

    # A +/-2% band around 1.0 is treated as "no measurable difference".
    if speedup > 1.02:
        print(f"✓ type() is is {speedup:.2f}x FASTER ({percent_faster:.1f}% improvement)")
        # seconds/launch -> seconds per 1M launches -> milliseconds
        print(f"  Time saved per 1M launches: {saved_per_launch * 1e6 * 1000:.2f}ms")
        print()
        print("RECOMMENDATION: Replace isinstance() with type() is")
    elif speedup < 0.98:
        print(f"⚠️  isinstance() is {1/speedup:.2f}x FASTER")
        print()
        print("RECOMMENDATION: Keep using isinstance()")
    else:
        print("≈ Performance is similar (difference < 2%)")
        print()
        print("RECOMMENDATION: Keep using isinstance() for clarity")
    print()

    # Per-launch cost, in microseconds
    print("Per-launch argument processing cost:")
    print(f"  isinstance(): {isinstance_mean / iterations * 1e6:.2f} µs")
    print(f"  type() is:    {type_is_mean / iterations * 1e6:.2f} µs")
    print()

I mainly used the compiler flags -O3 and -march=native and compiled and ran the above benchmark via this setup script setup_benchmark.py:

#!/usr/bin/env python3
"""
Setup script for building the Cython benchmark extension.
"""

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy

# The single C++ extension built from the benchmark's .pyx source, compiled
# with aggressive, host-specific optimisation flags.
_benchmark_extension = Extension(
    name="benchmark_isinstance_cython",
    sources=["benchmark_isinstance_cython.pyx"],
    include_dirs=[numpy.get_include()],
    extra_compile_args=["-O3", "-march=native"],
    language="c++",
)
extensions = [_benchmark_extension]

# Cython directives applied to every module passed to cythonize().
_compiler_directives = dict(
    language_level=3,
    boundscheck=False,
    wraparound=False,
    cdivision=True,
)

setup(
    name="benchmark_isinstance_cython",
    ext_modules=cythonize(extensions, compiler_directives=_compiler_directives),
)

The script was then run with python setup_benchmark.py build_ext --inplace

Checklist

New or existing tests cover these changes.
The documentation is up to date with these changes.

copy-pr-bot · 2025-12-01T06:05:56Z

This pull request requires additional validation before any workflows can run on NVIDIA's runners.

Pull request vetters can view their responsibilities here.

Contributors can view more details about this message here.

bharatr21 · 2025-12-01T06:32:10Z

/ok to test

mdboom · 2025-12-02T13:17:41Z

/ok to test 0db38d0

github-actions · 2025-12-02T13:27:20Z

Doc Preview CI
🚀 View preview at https://nvidia.github.io/cuda-python/pr-preview/pr-1292/
https://nvidia.github.io/cuda-python/pr-preview/pr-1292/cuda-core/
https://nvidia.github.io/cuda-python/pr-preview/pr-1292/cuda-bindings/
https://nvidia.github.io/cuda-python/pr-preview/pr-1292/cuda-pathfinder/
Preview will be ready when the GitHub Pages deployment is complete.

mdboom

This is great work. I downloaded your script and was able to reproduce a similar result (5.23x faster) on my laptop. The math looks sound. Just to make sure, I replaced your manual calculations with Python's builtin timeit.timeit and pyperf (the latter being the sort of "gold standard" for accurate perf timings in Python). But the result is all roughly the same, and 5x is large enough that that level of accuracy doesn't really matter -- it's an obvious big win.

My only concern with this PR is backward compatibility. It is technically possible to subclass either a numpy or ctypes datatype right now and it would be accepted and work here with the isinstance check but would no longer be accepted after this change. I don't know how often that actually happens in practice, and our test suite obviously doesn't do that. I'm not sure how to assess how much we care about this -- it seems hard to do a GitHub code search for, for example. @leofang, thoughts?

If we determine we do want to be strict about backward compatibility, we could probably do:

if arg_type is ctypes_bool:
   ...
elif ...
   ...
else:
    # If no exact types are found, fallback to slower `isinstance` check
    if isinstance(arg, ctypes_bool):
        ...
    elif ...:
        ...
    else:
        return 1

Note that the fallback cases are in a separate if/elif/else block so that Cython can still optimize the outer one to a C switch statement.

I suspect that would not have a significant impact on the benchmark (which doesn't exercise subclasses). If we go this route, we should also add a test that creates a subclass of a ctype and numpy type and confirms that it works and does the right thing.

chore: Replace isinstance(obj, T) with type(obj) is T comparisons

874b7f4

leofang requested a review from mdboom December 1, 2025 15:49

leofang added enhancement Any code-related improvements P1 Medium priority - Should do cuda.core Everything related to the cuda.core module labels Dec 1, 2025

leofang added this to the cuda.core beta 10 milestone Dec 1, 2025

Merge branch 'main' into isinstance-type

0db38d0

NVIDIA deleted a comment from copy-pr-bot bot Dec 2, 2025

mdboom reviewed Dec 2, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

chore: Replace isinstance(obj, T) with type(obj) is T comparisons #1292

chore: Replace isinstance(obj, T) with type(obj) is T comparisons #1292

bharatr21 commented Dec 1, 2025 •

edited

Loading

Uh oh!

copy-pr-bot bot commented Dec 1, 2025

Uh oh!

bharatr21 commented Dec 1, 2025

Uh oh!

mdboom commented Dec 2, 2025

Uh oh!

github-actions bot commented Dec 2, 2025

Preview will be ready when the GitHub Pages deployment is complete.

Uh oh!

mdboom left a comment •

edited

Loading

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

chore: Replace isinstance(obj, T) with type(obj) is T comparisons #1292

Are you sure you want to change the base?

chore: Replace isinstance(obj, T) with type(obj) is T comparisons #1292

Conversation

bharatr21 commented Dec 1, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Description

Additional Notes

Checklist

Uh oh!

copy-pr-bot bot commented Dec 1, 2025

Uh oh!

bharatr21 commented Dec 1, 2025

Uh oh!

mdboom commented Dec 2, 2025

Uh oh!

github-actions bot commented Dec 2, 2025

Preview will be ready when the GitHub Pages deployment is complete.

Uh oh!

mdboom left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

bharatr21 commented Dec 1, 2025 •

edited

Loading

mdboom left a comment •

edited

Loading