In [1]:
# Structured Data: NumPy's Structured Arrays
# NumPy's structured arrays and record arrays offer efficient storage for
# compound, heterogeneous data. The patterns shown here are handy for simple
# operations; scenarios like this often lend themselves to Pandas DataFrames.

# Suppose we have several categories of data (name, age, and weight) for a
# number of people, and we want to keep those values in a Python program.
name = [
    "Alice",
    "Bob",
    "Cathy",
    "Doug",
]
age = [25, 45, 37, 19]
weight = [55.0, 85.5, 68.0, 61.5]

In [2]:
# This is a bit clumsy: there's nothing here that tells us the three lists are related.
# It would be more natural to use a single structure to store all the data.
# NumPy can handle this through structured arrays.

In [3]:
# We previously created simple arrays using expressions like this.
# NumPy has to be imported before `np` is used; without the import this
# cell fails with "NameError: name 'np' is not defined" on a fresh kernel.
import numpy as np
x = np.zeros(4, dtype=int)

In [4]:
# Plain (non-structured) arrays were created earlier with calls such as this one:
import numpy as np
x = np.zeros(shape=4, dtype=int)

In [5]:
# A structured array is created by passing a compound data type specification.
# Each of the 4 elements gets three named fields: 'name', 'age' and 'weight'.
data = np.zeros(
    4,
    dtype=dict(
        names=('name', 'age', 'weight'),
        formats=('U10', 'i4', 'f8'),
    ),
)
print(data.dtype)

[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]


In [7]:
# Reading the format codes used for the fields:
#   U10 -> Unicode string of maximum length 10
#   i4  -> 4-byte (i.e., 32-bit) integer
#   f8  -> 8-byte (i.e., 64-bit) float

# The container array starts out zero-filled; populate each field from the
# matching list of values.
data['name'], data['age'], data['weight'] = name, age, weight
print(data)

[('Alice', 25, 55. ) ('Bob', 45, 85.5) ('Cathy', 37, 68. )
 ('Doug', 19, 61.5)]


In [8]:
# As we had hoped, the data is now arranged together in one convenient block of memory

# You can refer to values either by index or by field name:

# Get all names
data['name']

array(['Alice', 'Bob', 'Cathy', 'Doug'], dtype='<U10')

In [9]:
# Get the name from the last row (index the row first, then the field)
data[-1]['name']

'Doug'

In [10]:
# Get the names where age is under 30 (Boolean mask built from the 'age' field)
data[data['age'] < 30]['name']

array(['Alice', 'Doug'], dtype='<U10')

In [12]:
# If you'd like to do operations that are any more complicated than these, consider the Pandas package.
# Pandas provides a DataFrame object, a structure built on NumPy arrays that offers a variety of
# useful data-manipulation functionality similar to what we've shown here.

# Creating Structured Arrays
# Structured array data types can be specified in a number of ways. Earlier, we saw the dictionary method:
np.dtype({'names':('name', 'age', 'weight'), 'formats':('U10', 'i4', 'f8')})

dtype([('name', '<U10'), ('age', '<i4'), ('weight', '<f8')])

In [13]:
# For clarity, numerical types can be specified with Python types or NumPy dtypes instead.
# Note: this does not reproduce the 'i4'/'f8' widths used earlier. In the output below, `int`
# became an 8-byte integer ('<i8'; the width of `int` is platform-dependent) and np.float32
# is a 4-byte float ('<f4').
np.dtype({'names':('name', 'age', 'weight'), 'formats':((np.str_, 10), int, np.float32)})

dtype([('name', '<U10'), ('age', '<i8'), ('weight', '<f4')])

In [14]:
# A compound type can also be specified as a list of (field name, format) tuples.
# Note: 'S10' is a byte string of length 10, not the Unicode 'U10' used in the earlier examples.
np.dtype([('name', 'S10'), ('age', 'i4'), ('weight', 'f8')])

dtype([('name', 'S10'), ('age', '<i4'), ('weight', '<f8')])

In [15]:
# If the names of the fields do not matter to you, you can specify the types alone in a comma-separated string
# (the fields then get default names, shown in the output as 'f0', 'f1', 'f2'):
np.dtype('S10,i4,f8')

dtype([('f0', 'S10'), ('f1', '<i4'), ('f2', '<f8')])

In [16]:
# More Advanced Compound Types
# It is possible to define even more advanced compound types.
# For example, you can create a type where each element contains an array or matrix of values.
# Here we create a type with a 'mat' component consisting of a 3x3 floating-point matrix.

# Every field must be its own (name, format[, shape]) tuple inside the list;
# dropping the opening parenthesis of the first tuple is a SyntaxError.
tp = np.dtype([('id', 'i8'), ('mat', 'f8', (3, 3))])
X = np.zeros(1, dtype=tp)
print(X[0])         # the single record: (id, mat)
print(X['mat'][0])  # only the 3x3 matrix of that record

In [17]:
# More Advanced Compound Types
# Compound types can go further still: a field may itself hold an array or
# matrix of values. Below, every element has a 64-bit integer 'id' plus a
# 'mat' field that is a 3x3 matrix of 64-bit floats.

tp = np.dtype([('id', np.int64), ('mat', np.float64, (3, 3))])
X = np.zeros((1,), dtype=tp)
print(X[0], X['mat'][0], sep='\n')

(0, [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [18]:
# Now each element in the X array consists of an id and a 3x3 matrix.
# A NumPy dtype maps directly onto a C structure definition, so the buffer containing the array
# content can be accessed directly within an appropriately written C program.

In [19]:
# RecordArrays: Structured Arrays with a Twist
# NumPy also provides the np.recarray class, which is almost identical to the structured arrays
# just described, but with one additional feature: fields can be accessed as attributes rather
# than as dictionary keys. With a structured array, we access the ages by key:
data['age']

array([25, 45, 37, 19], dtype=int32)

In [20]:
# If we view our data as a record array instead, we can access this with slightly fewer keystrokes:
data_rec = data.view(np.recarray)
data_rec.age

array([25, 45, 37, 19], dtype=int32)

In [21]:
# The downside of record arrays is that there is some extra overhead involved in accessing the fields,
# even when using the same dictionary-style syntax. The three timings below compare key access on the
# structured array, key access on the record array, and attribute access on the record array:
%timeit data['age']
%timeit data_rec['age']
%timeit data_rec.age

81.9 ns ± 0.0876 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
1.27 µs ± 2.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
2.15 µs ± 2.55 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
