# Agenda

1. Threading exceptions
2. `asyncio` -- ideas + examples
3. Benchmarking
4. NumPy + `pandas`

# Benchmarking

In [10]:
def traditional_loop(count=10_000_000):
    """Build a list of squares using an explicit ``for`` loop and ``append``.

    This is intentionally written as a plain loop (not a comprehension) so
    that it can be benchmarked against ``comprehension()``.

    Parameters
    ----------
    count : int, optional
        How many integers, starting at 0, to square.  Defaults to
        10_000_000, the size used by the benchmark cells.

    Returns
    -------
    list of int
        The squares of 0, 1, ..., ``count - 1``, in that order.
    """
    numbers = range(count)

    output = []
    for one_number in numbers:
        output.append(one_number ** 2)

    return output

def comprehension(count=10_000_000):
    """Build a list of squares using a list comprehension.

    Produces the same result as ``traditional_loop()``; it exists so the two
    styles can be benchmarked against each other.

    Parameters
    ----------
    count : int, optional
        How many integers, starting at 0, to square.  Defaults to
        10_000_000, the size used by the benchmark cells.

    Returns
    -------
    list of int
        The squares of 0, 1, ..., ``count - 1``, in that order.
    """
    numbers = range(count)

    return [one_number ** 2
            for one_number in numbers]



In [11]:
import time
# Manual benchmark: read the clock, run the function once, read the clock again.
start_time = time.perf_counter()
traditional_loop()  # return value is discarded; only the elapsed time matters here
end_time = time.perf_counter()

# The difference is the elapsed time in seconds for this single run.
print(f'Took {end_time - start_time}')

Took 2.5332047143019736


In [12]:
import time
# Same manual benchmark as for traditional_loop(), applied to comprehension().
start_time = time.perf_counter()
comprehension()  # return value is discarded; only the elapsed time matters here
end_time = time.perf_counter()

# The difference is the elapsed time in seconds for this single run.
print(f'Took {end_time - start_time}')

Took 2.1925993440672755


In [16]:
# everything we pass to %timeit needs to be on one line!
%timeit traditional_loop()

SyntaxError: unmatched ')' (1912264951.py, line 2)

In [14]:
%timeit comprehension()

2.22 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Non-Jupyter magic command version of timeit
import timeit



In [22]:
# timeit.timeit() accepts the statement to time as a string of Python source.
# NOTE: these assignments rebind `traditional_loop` and `comprehension` from
# functions to strings, so the functions must be redefined before they can be
# called again.
traditional_loop = '''
numbers = range(10_000_000)

output = []
for one_number in numbers:
    output.append(one_number ** 2)
'''

comprehension = '''
numbers = range(10_000_000)

output = [one_number ** 2
       for one_number in numbers]
'''

# Run the loop source 10 times; the returned value is the total seconds for all runs.
timeit.timeit(traditional_loop, number=10)

24.426533837802708

In [23]:
timeit.timeit(comprehension, number=10)

22.13207621080801

In [24]:
help(timeit.timeit)

Help on function timeit in module timeit:

timeit(stmt='pass', setup='pass', timer=<built-in function perf_counter>, number=1000000, globals=None)
    Convenience function to create Timer object and call timeit method.



In [26]:
def traditional_loop(count=10_000_000):
    """Build a list of squares using an explicit ``for`` loop and ``append``.

    Redefined here because an earlier cell rebound the name
    ``traditional_loop`` to a source-code string.  The explicit loop is kept
    on purpose: it is the thing being benchmarked.

    Parameters
    ----------
    count : int, optional
        How many integers, starting at 0, to square.  Defaults to
        10_000_000, the size used by the benchmark cells.

    Returns
    -------
    list of int
        The squares of 0, 1, ..., ``count - 1``, in that order.
    """
    numbers = range(count)

    output = []
    for one_number in numbers:
        output.append(one_number ** 2)

    return output

def comprehension(count=10_000_000):
    """Build a list of squares using a list comprehension.

    Redefined here because an earlier cell rebound the name
    ``comprehension`` to a source-code string.

    Parameters
    ----------
    count : int, optional
        How many integers, starting at 0, to square.  Defaults to
        10_000_000, the size used by the benchmark cells.

    Returns
    -------
    list of int
        The squares of 0, 1, ..., ``count - 1``, in that order.
    """
    numbers = range(count)

    return [one_number ** 2
            for one_number in numbers]

# globals=globals() lets the timed statement look up names defined in this notebook;
# number=3 runs the call three times and the result is the total elapsed seconds.
timeit.timeit('traditional_loop()', number=3, globals=globals())

7.442414263263345

In [27]:
# globals=globals() lets the timed statement look up names defined in this notebook;
# number=3 runs the call three times and the result is the total elapsed seconds.
timeit.timeit('comprehension()', number=3, globals=globals())

6.650881435722113

# NumPy and Pandas

In [28]:
import sys
x = 0
sys.getsizeof(x)

24

# NumPy

NumPy gives us one data structure: `ndarray` (n-dimensional array). But don't call `np.ndarray` directly -- instead, call `np.array` to create a new NumPy array.



In [29]:
import numpy as np   # everyone uses this alias

a = np.array([10,20, 30, 40, 50])

In [30]:
type(a)

numpy.ndarray

In [31]:
a

array([10, 20, 30, 40, 50])

In [32]:
a[0]

10

In [33]:
a[1]

20

In [34]:
a[0] = 50

In [35]:
a

array([50, 20, 30, 40, 50])

In [36]:
a[0] = 10

In [37]:
a

array([10, 20, 30, 40, 50])

In [38]:
mylist = [10, 20, 30, 40, 50]
mylist + mylist  # what happens when we add a list to itself?

[10, 20, 30, 40, 50, 10, 20, 30, 40, 50]

In [39]:
a + a   # what happens when I add a + a?

array([ 20,  40,  60,  80, 100])

In [40]:
b = np.array([100, 200, 300])

In [41]:
a + b

ValueError: operands could not be broadcast together with shapes (5,) (3,) 

In [42]:
a

array([10, 20, 30, 40, 50])

In [44]:
a + 3    # broadcast

array([13, 23, 33, 43, 53])

In [45]:
a - 3

array([ 7, 17, 27, 37, 47])

In [47]:
a * 3

array([ 30,  60,  90, 120, 150])

In [48]:
a / 3

array([ 3.33333333,  6.66666667, 10.        , 13.33333333, 16.66666667])

In [49]:
a // 3

array([ 3,  6, 10, 13, 16])

In [50]:
a ** 3

array([  1000,   8000,  27000,  64000, 125000])

In [51]:
a % 3

array([1, 2, 0, 1, 2])

In [52]:
a[[2, 4]]   # fancy index

array([30, 50])

In [53]:
a[[2, 3, 3, 2]]  

array([30, 40, 40, 30])

# Other ways to create NumPy arrays

In [54]:
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [55]:
np.ones(10) * 5

array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.])

In [57]:
np.zeros(10) + 5     # zeros and not zeroes

array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.])

In [59]:
# get 6 random integers from 0 through 99 (the upper bound, 100, is excluded)
np.random.randint(0, 100, 6)

array([59, 32, 12, 43, 16, 41])

In [60]:
# get 10 random floats, all between 0 and 1
np.random.rand(10)

array([0.82098316, 0.48416498, 0.45431404, 0.83398218, 0.16524907,
       0.11614058, 0.53791655, 0.80648735, 0.79660693, 0.08790547])

In [61]:
np.random.rand(10) * 100

array([43.3447918 , 81.8750917 , 58.62072403, 51.38219414, 21.746913  ,
       59.14925806, 38.18954011, 94.98294293, 92.74554535, 19.6572262 ])

In [62]:
np.arange(10, 30, 3)   # np.array(range(10, 30, 3))

array([10, 13, 16, 19, 22, 25, 28])

# NumPy array methods

In [63]:
a

array([10, 20, 30, 40, 50])

In [64]:
a.sum() 

150

In [65]:
a.mean()

30.0

In [66]:
a.std()

14.142135623730951

In [67]:
a.min()

10

In [68]:
a.max()

50

In [72]:
a.size

5

In [73]:
a.shape

(5,)

In [74]:
a

array([10, 20, 30, 40, 50])

In [76]:
a[[0, 1, 2]]   # fancy index

array([10, 20, 30])

In [77]:
# boolean index -- must be of the same length as a
a[[True, False, False, False, True]]

array([10, 50])

In [78]:
a[[True, True, False, True, True]]

array([10, 20, 40, 50])

In [79]:
a

array([10, 20, 30, 40, 50])

In [80]:
a + 30

array([40, 50, 60, 70, 80])

In [81]:
a == 30    # use a comparison operator

array([False, False,  True, False, False])

In [83]:
# put a comparison on `a` inside the [] -- it produces a boolean array, which we then use as a boolean index
a[a==30]

array([30])

In [84]:
a[a<30]

array([10, 20])

In [85]:
a[a>a.mean()]

array([40, 50])

In [86]:
np.array([10, 20, 30])

array([10, 20, 30])

In [87]:
a

array([10, 20, 30, 40, 50])

In [88]:
list(a)

[10, 20, 30, 40, 50]

In [89]:
a.nbytes

40

In [90]:
a.tolist()

[10, 20, 30, 40, 50]

# NumPy exercises

1. Create an array of 20 random integers, from 0 to 100.
2. Find the largest even number.
3. Find the mean of the odd numbers.
4. Create a new array with 20 floats from 0-1,000.
5. Find the numbers that are < the mean.
6. Find the numbers that are < the mean - one standard deviation.

In [95]:
np.random.seed(0)
a = np.random.randint(0, 100, 20)
a

array([44, 47, 64, 67, 67,  9, 83, 21, 36, 87, 70, 88, 88, 12, 58, 65, 39,
       87, 46, 88])

In [100]:
# get the even numbers
a[a%2==0]

array([44, 64, 36, 70, 88, 88, 12, 58, 46, 88])

In [101]:
# get the largest even number
a[a%2==0].max()

88

In [105]:
# get the mean of the odd numbers
a[a%2==1].mean()

57.2

In [106]:
np.random.seed(0)
a = np.random.rand(20) * 1000
a

array([548.81350393, 715.18936637, 602.76337607, 544.883183  ,
       423.65479934, 645.89411307, 437.58721126, 891.77300078,
       963.6627605 , 383.44151883, 791.72503808, 528.89491975,
       568.04456109, 925.59663829,  71.0360582 ,  87.1292997 ,
        20.21839744, 832.61984555, 778.15675095, 870.01214825])

In [108]:
# find numbers less than the mean

a[a<a.mean()]

array([548.81350393, 544.883183  , 423.65479934, 437.58721126,
       383.44151883, 528.89491975, 568.04456109,  71.0360582 ,
        87.1292997 ,  20.21839744])

In [109]:
a.mean()

581.5548245225973

In [112]:
# find numbers < mean-std
a[a < a.mean()-a.std()]

array([71.0360582 , 87.1292997 , 20.21839744])

In [None]:
# find numbers < mean - 2*a.std()
a[a < a.mean()-(a.std() * 2)]

In [114]:
a.std()

275.91521991954784

# Next up

- complex queries
- dtypes
- `nan`

Resume at 11:50

In [115]:
a

array([548.81350393, 715.18936637, 602.76337607, 544.883183  ,
       423.65479934, 645.89411307, 437.58721126, 891.77300078,
       963.6627605 , 383.44151883, 791.72503808, 528.89491975,
       568.04456109, 925.59663829,  71.0360582 ,  87.1292997 ,
        20.21839744, 832.61984555, 778.15675095, 870.01214825])

In [116]:
np.random.seed(0)
a = np.random.randint(0, 100, 10)
a

array([44, 47, 64, 67, 67,  9, 83, 21, 36, 87])

In [118]:
# I want even numbers < the mean
b = a[a<a.mean()]
b

array([44, 47,  9, 21, 36])

In [120]:
b[b%2==0]

array([44, 36])

In [121]:
# how can I do this in one query?

a[a<a.mean()  and a%2==0]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [123]:
s = ''

if not s:
    print('Nothing there')

Nothing there
