In [0]:
import numpy as np
import pandas as pd
np.random.seed(2019111030)
import matplotlib.pyplot as plt

## Appendix A: Advanced NumPy
<img src="https://images-na.ssl-images-amazon.com/images/I/51cUNf8zukL._SX379_BO1,204,203,200_.jpg" width="200" height="300">




In [5]:
# Draw a 5x4 matrix of standard-normal samples, then average every element.
arr = np.random.randn(5, 4)
arr
arr.mean()

-0.3560274401073715

# NumPy dtype Hierarchy

<img src="https://learning.oreilly.com/library/view/python-for-data/9781449323592/httpatomoreillycomsourceoreillyimages2172220.png">

In [8]:
# Ten-element arrays of ones: one unsigned 16-bit integer, one 32-bit float.
ints = np.ones(10, dtype='uint16')
floats = np.ones(10, dtype='float32')
# Test each concrete dtype against its abstract parent class
# (only the last expression of the cell is displayed).
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

In [9]:
# List the method resolution order (the chain of parent classes) of np.float64.
np.float64.mro()


[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [0]:
# `ints` is the uint16 array defined in an earlier cell; check it against
# np.number, the common ancestor of the numeric dtypes.
np.issubdtype(ints.dtype, np.number)


# Advanced Array Manipulation

## Reshaping Arrays

In [0]:
# Start from the 1-D sequence 0..7 and view it as 4 rows by 2 columns.
arr = np.arange(8)
arr
arr.reshape(4, 2)

In [0]:
# Reshapes can be chained: (8,) -> (4, 2) -> (2, 4).
# Uses the 8-element `arr` from the previous cell.
arr.reshape((4, 2)).reshape((2, 4))

In [10]:
# Passing -1 for one dimension lets NumPy infer it from the data
# (15 elements / 5 rows = 3 columns).
arr = np.arange(15)
arr.reshape(5, -1)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [11]:
# Another array's shape tuple can be passed straight to reshape.
# `arr` is the 15-element range from the previous cell.
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [12]:
# Build a 5x3 array, then flatten it back to one dimension with ravel.
arr = np.arange(15).reshape(5, 3)
arr
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [13]:
# flatten also yields the 1-D form of `arr` (the 5x3 array from the previous cell).
arr.flatten()


array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

## C Versus Fortran Order

In [14]:
# Compare the two flattening orders on a 3x4 array.
arr = np.arange(12).reshape(3, 4)
arr
arr.ravel()           # default (C) order: row by row
arr.ravel(order='F')  # Fortran order: column by column

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

## Concatenating and Splitting Arrays

In [0]:
# Two 2x3 arrays joined along rows (axis 0) and then along columns (axis 1).
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate((arr1, arr2), axis=0)
np.concatenate((arr1, arr2), axis=1)

In [0]:
# Row-wise and column-wise stacking of `arr1` and `arr2` from the previous cell.
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))

In [0]:
# Cut a 5x2 array into three pieces at row indices 1 and 3:
# rows [0:1], rows [1:3] and rows [3:5].
arr = np.random.randn(5, 2)
arr
first, second, third = np.split(arr, indices_or_sections=[1, 3])
first
second
third

### Stacking helpers: `r_` and `c_`

In [15]:
# np.r_ stacks its arguments along rows, np.c_ along columns.
arr = np.arange(6)
arr1 = arr.reshape(3, 2)
arr2 = np.random.randn(3, 2)
np.r_[arr1, arr2]
# Append `arr` as a third column to the 6x2 row-stacked result.
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.        ,  1.        ,  0.        ],
       [ 2.        ,  3.        ,  1.        ],
       [ 4.        ,  5.        ,  2.        ],
       [ 0.00920315,  1.2576978 ,  3.        ],
       [ 0.74277949,  0.12316407,  4.        ],
       [ 0.76934687, -0.41757366,  5.        ]])

In [16]:
# Slices given to np.c_ are expanded into ranges and placed side by side as columns.
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

## Repeating Elements: tile and repeat

In [0]:
# Repeat every element of [0, 1, 2] three times, keeping the elements grouped.
arr = np.arange(3)
arr
np.repeat(arr, 3)

In [0]:
# One repeat count per element: `arr` is the 3-element range from the previous
# cell, so its elements are repeated 2, 3 and 4 times respectively.
arr.repeat([2, 3, 4])

In [0]:
# On a 2-D array, pass axis=0 to repeat whole rows instead of flattening first.
arr = np.random.randn(2, 2)
arr
np.repeat(arr, 2, axis=0)

In [0]:
# Per-slice repeat counts on the 2x2 `arr` from the previous cell:
# first along rows (axis 0), then along columns (axis 1).
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)

In [0]:
# np.tile repeats the whole array as a block; with a scalar count the copies
# are laid out side by side. `arr` is the 2x2 array defined above.
arr
np.tile(arr, 2)

In [0]:
# A tuple gives the tiling layout per axis: (2, 1) is two copies down and one
# across; (3, 2) is three copies down and two across.
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))

## Fancy Indexing Equivalents: take and put

In [0]:
# Select elements by an explicit list of integer positions (fancy indexing).
arr = 100 * np.arange(10)
inds = [7, 1, 2, 6]
arr[inds]

In [0]:
# take is the method form of the fancy indexing in the previous cell.
# put writes into `arr` in place at the same positions: first the scalar 42
# at every index, then one value per index.
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr

In [0]:
# Pick columns 2, 0, 2, 1 (in that order, repeats allowed) from a 2x4 array.
inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
arr
np.take(arr, inds, axis=1)

# Broadcasting

In [0]:
# The simplest broadcast: the scalar 4 is applied to every element.
arr = np.arange(5)
arr
4 * arr

In [0]:
# Subtract each column's mean: the (3,) vector of column means broadcasts
# across the 4 rows of the (4, 3) array.
arr = np.random.randn(4, 3)
arr.mean(axis=0)
demeaned = arr - arr.mean(axis=0)
demeaned
# The column means of the result should be numerically zero.
demeaned.mean(axis=0)

In [0]:
# Column means of the array demeaned in the previous cell. (This line
# previously referenced an undefined name `d`, which raised NameError on a
# fresh kernel.)
demeaned.mean(0)
arr
# Row means: one value per row of the (4, 3) `arr`, so the shape is (4,).
row_means = arr.mean(1)
row_means.shape
# Reshape to a (4, 1) column so it broadcasts across the 3 columns of `arr`.
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
# The row means of the result should be numerically zero.
demeaned.mean(1)

## Broadcasting Over Other Axes

In [0]:
# `arr` is the (4, 3) array from the cells above, so arr.mean(1) has shape
# (4,). The trailing dimensions (3 vs 4) do not match, so this subtraction
# cannot broadcast and raises ValueError. The error is the point of the
# demonstration; it is caught and shown so that "Run All" continues on to
# the working version in the next cell.
try:
    arr - arr.mean(1)
except ValueError as err:
    print("ValueError:", err)

In [0]:
# Reshaping the row means to a (4, 1) column gives them a length-1 trailing
# axis, which lets the subtraction broadcast across the columns of `arr`.
arr - arr.mean(1).reshape((4, 1))

In [0]:
# np.newaxis inserts a length-1 axis at the position where it appears.
arr = np.zeros(shape=(4, 4))
arr_3d = arr[:, np.newaxis]      # (4, 4) -> (4, 1, 4); trailing axes are kept whole
arr_3d.shape
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]            # column form, shape (3, 1)
arr_1d[np.newaxis]               # row form, shape (1, 3)

In [0]:
# Demean a 3-D array along its last axis: the (3, 4) means get a trailing
# length-1 axis so they broadcast against the (3, 4, 5) data.
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(axis=2)
depth_means
depth_means.shape
demeaned = arr - depth_means[..., np.newaxis]
demeaned.mean(axis=2)

## Setting Array Values by Broadcasting

In [0]:
# Assigning a scalar through a full slice fills every element in place.
arr = np.zeros(shape=(4, 3))
arr[...] = 5
arr

In [0]:
# `col` has 4 values, one per row of the (4, 3) `arr` from the previous cell.
# As a (4, 1) column it broadcasts across the 3 columns on assignment.
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
# A 2x1 nested list sets the first two rows, each value spread across its row.
arr[:2] = [[-1.37], [0.509]]
arr

# Advanced ufunc Usage

## ufunc Instance Methods

In [0]:
# Reducing with np.add is another way of spelling a sum.
arr = np.arange(10)
np.add.reduce(arr)
np.sum(arr)

In [0]:
np.random.seed(12346)  # for reproducibility
arr = np.random.randn(5, 5)
arr[::2].sort(axis=1)  # sort every other row (0, 2, 4) in place
# Compare each element with its right-hand neighbour ...
arr[:, :-1] < arr[:, 1:]
# ... and AND the comparisons across each row: True only for fully sorted rows.
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

In [0]:
# accumulate keeps every intermediate result: a running sum along each row.
arr = np.arange(15).reshape(3, 5)
np.add.accumulate(arr, axis=1)

In [0]:
# outer applies the ufunc to every pair of elements from the two inputs.
arr = np.repeat(np.arange(3), [1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))

In [0]:
# The outer result's shape is the two input shapes joined: (3, 4) + (5,).
x = np.random.randn(3, 4)
y = np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

In [0]:
# reduceat sums consecutive slices delimited by the given start indices:
# arr[0:5], arr[5:8] and arr[8:].
arr = np.arange(10)
bin_edges = [0, 5, 8]
np.add.reduceat(arr, bin_edges)

In [0]:
# Build a 4x5 multiplication table, then sum column groups [0:2], [2:4], [4:].
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, indices=[0, 2, 4], axis=1)

## Writing New ufuncs in Python

In [0]:
def add_elements(x, y):
    """Return ``x + y`` for a single pair of inputs."""
    total = x + y
    return total


# Wrap the Python function as a ufunc taking 2 inputs and producing 1 output.
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

In [0]:
# Rebind `add_them` to an np.vectorize wrapper around add_elements (defined in
# the previous cell); otypes requests float64 output.
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

In [0]:
# Time the Python-function-based `add_them` from the previous cell against the
# built-in np.add on the same 10,000-element input.
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

# Structured and Record Arrays

In [0]:
# A structured dtype given as a list of (field name, field type) pairs:
# 'x' is float64 and 'y' is int32. Each tuple in the data is one record.
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

In [0]:
# Index the first record of `sarr`, then the 'y' field within that record.
sarr[0]
sarr[0]['y']

In [0]:
# Indexing by field name selects the 'x' field across all records.
sarr['x']

## Nested dtypes and Multidimensional Fields

In [0]:
# A third element in a field spec gives the field a shape: here 'x' holds
# 3 int64 values per record. The array has 4 zero-initialised records.
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

In [0]:
# The 'x' field of the first record (`arr` is the structured array from the previous cell).
arr[0]['x']

In [0]:
# The 'x' field across all 4 records: one row of 3 values per record.
arr['x']

In [0]:
# Field 'x' is itself a structured type with sub-fields 'a' (f8) and 'b' (f4);
# 'y' is a plain int32. Each record is written as ((a, b), y).
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
# Nested fields are reached by chaining field names.
data['x']['a']

# More About Sorting

In [0]:
# The sort method sorts the array in place; the final `arr` shows the sorted contents.
arr = np.random.randn(6)
arr.sort()
arr

In [0]:
# Sorting through a slice: arr[:, 0] selects the first column, and the cell
# displays `arr` afterwards to show the effect on the original array.
arr = np.random.randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr

In [0]:
# np.sort returns a sorted result; `arr` is displayed again afterwards so it
# can be compared with its state before the call.
arr = np.random.randn(5)
arr
np.sort(arr)
arr

In [0]:
# Sort the values within each row (along axis 1) of a 3x5 array, in place.
arr = np.random.randn(3, 5)
arr
arr.sort(axis=1)
arr

In [0]:
# Reverse the column order of `arr` with a negative-step slice. Since the rows
# were sorted ascending in the previous cell, they now read in descending order.
arr[:, ::-1]

## Indirect Sorts: argsort and lexsort

In [0]:
# argsort returns the positions that would put `values` in ascending order;
# indexing with those positions yields the sorted values.
values = np.array([5, 0, 1, 3, 2])
indexer = np.argsort(values)
indexer
values[indexer]

In [0]:
# Overwrite the first row with `values` (from the previous cell), then reorder
# all columns of `arr` by the sort order of that first row.
arr = np.random.randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]

In [0]:
# lexsort sorts by several keys at once; the LAST key passed is the primary
# one, so this orders by last name and breaks ties by first name.
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter
# In Python 3, zip returns a lazy iterator whose repr is just "<zip object ...>",
# so materialise it as a list for the sorted (last, first) pairs to be displayed.
sorted_names = list(zip(last_name[sorter], first_name[sorter]))
sorted_names


## Alternative Sort Algorithms

In [17]:
# A stable sort keeps equal keys in their original relative order, so the
# ":first/:second/:third" labels stay in sequence within each key group.
values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = np.argsort(key, kind='mergesort')
indexer
values[indexer]

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

## Partially Sorting Arrays

In [0]:
# Partition around the element of rank 3: smaller values end up before it and
# larger values after it, without fully sorting either side.
np.random.seed(12345)
arr = np.random.randn(20)
arr
np.partition(arr, kth=3)

In [0]:
# argpartition returns the positions that would partition `arr` (the seeded
# 20-element array from the previous cell); take applies them to the data.
indices = np.argpartition(arr, 3)
indices
arr.take(indices)

## numpy.searchsorted: Finding Elements in a Sorted Array

In [0]:
# Binary search on a sorted array: the position where 9 would have to be
# inserted to keep the array sorted.
arr = np.array([0, 1, 7, 12, 15])
np.searchsorted(arr, 9)

In [0]:
# This cell was an exact copy of the previous one. Show the vectorised form
# instead: passing several values returns one insertion index per value.
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted([0, 8, 11, 16])

In [0]:
# With repeated values, the default side returns the left-most insertion
# point of each group; side='right' returns the position just past the group.
arr = np.array([0, 0, 0, 1, 1, 1, 1])
np.searchsorted(arr, [0, 1])
np.searchsorted(arr, [0, 1], side='right')

In [0]:
# 50 whole-number values drawn uniformly from [0, 10000), plus the bin
# boundaries used to bucket them in the following cells.
data = np.floor(np.random.uniform(low=0, high=10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

In [0]:
# For each value in `data`, find its insertion index among the `bins`
# boundaries (both from the previous cell); that index labels its interval.
labels = bins.searchsorted(data)
labels

In [0]:
# Group the values by the bin labels computed in the previous cell and take the mean of each group.
pd.Series(data).groupby(labels).mean()

# Advanced Array Input and Output

## Memory-Mapped Files

In [20]:
# Create a memory-mapped array backed by the file 'mymmap' in the current
# working directory, opened with mode 'w+'.
# NOTE: 10000 x 10000 float64 values is 800 MB of backing file.
mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [0]:
# Slice the first five rows of the memmap created above.
section = mmap[:5]

In [0]:
# Assign random data into the five-row slice, ask the memmap to flush its
# changes to the file, display it, then drop the `mmap` name.
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

In [0]:
# Reopen the same file; the dtype and shape are passed again explicitly.
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

In [0]:
# Clean up: delete the `mmap` variable from the kernel namespace, then remove
# the backing file with the shell's `rm` command.
%xdel mmap
!rm mymmap

# Performance Tips

## The Importance of Contiguous Memory

In [0]:
# Two arrays with the same values but different memory layouts:
# C order (row-major) and Fortran order (column-major).
arr_c = np.ones(shape=(1000, 1000), order='C')
arr_f = np.ones(shape=(1000, 1000), order='F')
# The flags attribute reports each array's layout.
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous

In [0]:
# Time the same row-wise sum on the C-ordered and the Fortran-ordered array
# defined in the previous cell.
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)

In [0]:
# Copy the Fortran-ordered array requesting C order, and inspect the copy's flags.
arr_f.copy('C').flags

In [0]:
# Compare the contiguity flags of two slices of `arr_c`:
# the first 50 rows versus the first 50 columns.
arr_c[:50].flags.contiguous
arr_c[:, :50].flags

In [22]:
# Remove the two arrays from the kernel namespace. The cell that defines
# arr_c and arr_f must have run in this kernel session; otherwise %xdel
# reports a NameError for each name.
%xdel arr_c
%xdel arr_f

NameError: name 'arr_c' is not defined
NameError: name 'arr_f' is not defined


In [0]:
# Restore pandas' row-display limit to its default. PREVIOUS_MAX_ROWS is never
# defined in this notebook, so assigning from it raised NameError on a fresh
# kernel; reset_option needs no saved value.
pd.reset_option("display.max_rows")

# Reference
[Materials and IPython notebooks for "Python for Data Analysis" by Wes McKinney, published by O'Reilly Media](https://www.cin.ufpe.br/~embat/Python%20for%20Data%20Analysis.pdf)

https://learning.oreilly.com/library/view/python-for-data/9781449323592/ch12.html

https://nbviewer.jupyter.org/github/pydata/pydata-book/tree/2nd-edition/

https://github.com/re4lfl0w/ipython/blob/master/books/python_data_analysis/ch04_Numpy.ipynb

https://notebooks.azure.com/wesm/projects/python-for-data-analysis

https://github.com/re4lfl0w/ipython/tree/master/books/python_data_analysis



`%timeit` 시간찍기 (measures how long a statement takes to run)
%timeit np.arange(10000)
%timeit range(10000)