In [1]:
import numpy as np
from typing import Tuple, Union
import cProfile

from decision_mining.core.c45 import gain_ratio, find_threshold

# Profiling on commit [30729e7](https://github.com/MartijnKnegt/INNO21-Backend/commit/30729e74769d5d6da808c3da0c0bf302d42d3dcc)

In [2]:
# Fixed seed so the shuffled arrays used below are identical on every
# Restart & Run All, making the profiling/timing inputs reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

Profile with unordered array

In [3]:
# Shuffled attribute holding every integer in [0, 10000) exactly once.
X = np.arange(10_000)
rng.shuffle(X)
# Three-class target: one point for each cutoff (7500, 5000) that X exceeds.
y = sum((X > cutoff).astype(int) for cutoff in (7500, 5000))

# Profile a single find_threshold call on the unsorted attribute.
with cProfile.Profile() as profiler:
    find_threshold(X, y)

profiler.print_stats()
# The recorded run reported 49995 calls to np.unique.

         2159795 function calls (2069804 primitive calls) in 21.899 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    29997    0.060    0.000    0.421    0.000 <__array_function__ internals>:2(concatenate)
    39996    0.075    0.000    0.529    0.000 <__array_function__ internals>:2(count_nonzero)
    29997    0.056    0.000    0.748    0.000 <__array_function__ internals>:2(diff)
    29997    0.063    0.000    0.513    0.000 <__array_function__ internals>:2(nonzero)
        1    0.000    0.000    0.001    0.001 <__array_function__ internals>:2(sort)
    59994    0.120    0.000    1.407    0.000 <__array_function__ internals>:2(sum)
    49995    0.105    0.000   14.416    0.000 <__array_function__ internals>:2(unique)
   129988    0.104    0.000    0.222    0.000 _asarray.py:110(asanyarray)
    29997    0.026    0.000    0.053    0.000 _asarray.py:23(asarray)
    29997    0.871    0.000    2.331    0.000 _distn_infrastru

Profile with ordered array

In [4]:
# Already-sorted attribute: the integers 0..9999 in ascending order.
X = np.arange(10_000)
# Three-class target: one point for each cutoff (7500, 5000) that X exceeds.
y = sum((X > cutoff).astype(int) for cutoff in (7500, 5000))

# Profile a single find_threshold call on the pre-sorted attribute.
with cProfile.Profile() as profiler:
    find_threshold(X, y)

profiler.print_stats()

         2159795 function calls (2069804 primitive calls) in 16.595 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    29997    0.059    0.000    0.397    0.000 <__array_function__ internals>:2(concatenate)
    39996    0.074    0.000    0.528    0.000 <__array_function__ internals>:2(count_nonzero)
    29997    0.055    0.000    0.728    0.000 <__array_function__ internals>:2(diff)
    29997    0.063    0.000    0.497    0.000 <__array_function__ internals>:2(nonzero)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(sort)
    59994    0.118    0.000    1.346    0.000 <__array_function__ internals>:2(sum)
    49995    0.101    0.000   10.968    0.000 <__array_function__ internals>:2(unique)
   129988    0.101    0.000    0.213    0.000 _asarray.py:110(asanyarray)
    29997    0.025    0.000    0.051    0.000 _asarray.py:23(asarray)
    29997    0.819    0.000    2.217    0.000 _distn_infrastru

## Conclusion
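Pre-sorting the attribute array reduces the total runtime by roughly 3–5 seconds (21.9 s → 16.6 s in the runs above, with the cumulative time spent in `np.unique` dropping from 14.4 s to 11.0 s). The largest time sink is `np.unique`; this can be partially remedied by combining `split_info`, `gain` and `gain_ratio` into one function.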

## Swapping `np.unique(attribute).size == 1` with `(attribute == attribute[0]).all()`

In [5]:
# Benchmark input: the integers 0..9999, each exactly once, in random order.
attribute = np.arange(10_000)
rng.shuffle(attribute)

In [6]:
# Time the "compare everything to the first element" check on the shuffled array.
%timeit (attribute == attribute[0]).all()

22.2 µs ± 1.28 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [7]:
# Time the np.unique-based "only one distinct value" check on the shuffled array.
%timeit np.unique(attribute).size == 1

1.05 ms ± 26.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
# Benchmark input for the constant case: 10000 zeros, so every element is equal.
attribute = np.zeros(10_000)

In [9]:
# Time the "compare everything to the first element" check on the all-zeros array.
%timeit (attribute == attribute[0]).all()

16.8 µs ± 1.81 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [10]:
# Time the np.unique-based "only one distinct value" check on the all-zeros array.
%timeit np.unique(attribute).size == 1

327 µs ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Conclusion
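`(attribute == attribute[0]).all()` is significantly faster than `np.unique(attribute).size == 1`: about 47× on the shuffled array (22.2 µs vs. 1.05 ms) and about 19× on the constant array (16.8 µs vs. 327 µs).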