In [1]:
def find_outliers(data, threshold=2):
    """Return the index labels of the outliers in ``data``.

    An entry counts as an outlier when its absolute distance from the mean
    of ``data`` is strictly greater than ``threshold`` times ``data.std()``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to scan (the callers in this notebook pass a Series).
    threshold : float, optional
        Number of standard deviations beyond which a value is reported.
        Defaults to 2, the cut-off this function always used.

    Returns
    -------
    pandas.Index
        Labels of the outlying entries, in the order they appear in ``data``.
    """
    # Distance of every entry from the mean, sign removed.
    deviation = (data - data.mean()).abs()
    is_outlier = deviation > threshold * data.std()
    return data[is_outlier].index

In [2]:
# Run the outliers.py script from this cell.
# NOTE(review): presumably outliers.py defines find_outliers (a later cell imports it from that module) - confirm.
%run outliers.py

In [3]:
import numpy as np
import pandas  as pd

In [4]:
# Baseline sample: 10,000 random integers drawn from 50..59 (upper bound exclusive)
data = pd.Series(np.random.randint(low=50, high=60, size=10_000))

In [5]:
# Plant a value far below the 50..59 baseline at label 7
data.loc[7] = 3

In [6]:
# Plant a value far above the 50..59 baseline at label 1003
data.loc[1003] = 100
# The cell displays the labels that find_outliers reports for the series
find_outliers(data)

Index([7, 1003], dtype='int64')

In [8]:
# Time repeated calls of find_outliers on the series built above
%timeit find_outliers(data)

296 µs ± 21.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [9]:
%timeit?

[1;31mDocstring:[0m
Time execution of a Python statement or expression

Usage, in line mode:
  %timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] statement
or in cell mode:
  %%timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] setup_code
  code
  code...

Time execution of a Python statement or expression using the timeit
module.  This function can be used both as a line and cell magic:

- In line mode you can time a single-line statement (though multiple
  ones can be chained with using semicolons).

- In cell mode, the statement in the first line is used as setup code
  (executed but not timed) and the body of the cell is timed.  The cell
  body has access to any variables created in the setup code.

Options:
-n<N>: execute the given statement <N> times in a loop. If <N> is not
provided, <N> is determined so as to get sufficient accuracy.

-r<R>: number of repeats <R>, each consisting of <N> loops, and take the
average result.
Default: 7

-t: use time.time to measure the time, which is the default o

In [10]:
# Install pytest-benchmark (explicit %pip magic; the note sits on its own line so it is not part of the pip command)
%pip install pytest-benchmark

Collecting pytest-benchmark
  Downloading pytest_benchmark-4.0.0-py3-none-any.whl (43 kB)
     ---------------------------------------- 0.0/44.0 kB ? eta -:--:--
     ---------------------------------------- 44.0/44.0 kB 1.1 MB/s eta 0:00:00
Collecting pytest>=3.8 (from pytest-benchmark)
  Obtaining dependency information for pytest>=3.8 from https://files.pythonhosted.org/packages/df/d0/e192c4275aecabf74faa1aacd75ef700091913236ec78b1a98f62a2412ee/pytest-7.4.2-py3-none-any.whl.metadata
  Downloading pytest-7.4.2-py3-none-any.whl.metadata (7.9 kB)
Collecting py-cpuinfo (from pytest-benchmark)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting iniconfig (from pytest>=3.8->pytest-benchmark)
  Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Collecting pluggy<2.0,>=0.12 (from pytest>=3.8->pytest-benchmark)
  Obtaining dependency information for pluggy<2.0,>=0.12 from https://files.pythonhosted.org/packages/05/b8/42ed91898d4784546c5f06c60506400548db3f7a4b3fb441cba4e5c17

In [11]:
from outliers import find_outliers

In [12]:
def gen_data(size, num_outliers, seed=None):
    """Generate ``size`` data points, ``num_outliers`` of which are outliers.

    Regular values are random integers in 50..59. Outliers are random
    integers in 1..9 (the first ``num_outliers // 2`` of them) or in
    100..109 (the remainder). The combined values are shuffled before
    being returned.

    Parameters
    ----------
    size : int
        Total number of elements in the generated data.
    num_outliers : int
        How many of those elements are outliers; must satisfy
        ``0 <= num_outliers <= size``.
    seed : int, optional
        When given, the values come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When omitted, NumPy's global random state is used, as before.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The shuffled data, and the outlier values alone (low values first,
        then high values, not shuffled).

    Raises
    ------
    ValueError
        If ``num_outliers`` is negative or larger than ``size``.
    """
    if not 0 <= num_outliers <= size:
        raise ValueError(
            f'num_outliers must be between 0 and size ({size}), got {num_outliers}'
        )

    # A seeded call gets its own generator; an unseeded call keeps drawing
    # from the module-level np.random functions so np.random.seed() still applies.
    rng = np.random if seed is None else np.random.RandomState(seed)

    regular = rng.randint(50, 60, size - num_outliers)
    low = rng.randint(1, 10, num_outliers // 2)
    # Whatever is not a low outlier becomes a high one (covers odd counts).
    high = rng.randint(100, 110, num_outliers - len(low))

    data = np.concatenate([regular, low, high])
    rng.shuffle(data)
    return pd.Series(data), pd.Series(np.concatenate([low, high]))

In [13]:
def test_bench_outliers(benchmark):
    """Benchmark find_outliers on generated data and check what it finds.

    ``benchmark`` is called with ``find_outliers`` and the generated series;
    its return value is used as the index labels of the detected outliers.
    The values at those labels must match the outliers planted by gen_data.
    """
    n_points = 10_000  # Usual size of data
    n_planted = 5  # Usual number of outliers
    series, planted = gen_data(n_points, n_planted)

    found_labels = benchmark(find_outliers, series)
    found_values = series.loc[found_labels]

    # Compared as sets, so order and repeated values are ignored.
    assert set(planted) == set(found_values), 'bad result'

In [15]:
## Run this command in the terminal: python -m pytest
## It runs a benchmark that reports: "Name (time in us)", "Min", "Max", "Mean", "StdDev", "Median", "IQR", "Outliers", "OPS (Kops/s)", "Rounds" and "Iterations"