In [2]:
# Numpy

import numpy as np

In [5]:
# Create an 1d array from a list

list1 = [0,1,2,3,4]
arr1d = np.array(list1)
arr1d

array([0, 1, 2, 3, 4])

In [6]:
# Add 2 to each element of arr1d
arr1d + 2

array([2, 3, 4, 5, 6])

In [7]:
# Create a 2d array from a list of lists
list2 = [[0,1,2], [3,4,5], [6,7,8]]
arr2d = np.array(list2)
arr2d

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [8]:
# Create a float 2d array
arr2d_f = np.array(list2, dtype='float')  # 'float', 'int', 'bool', 'str', 'object'
arr2d_f

array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

In [9]:
# Convert to 'int' datatype
arr2d_f.astype('int')

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [10]:
# Create a boolean array
arr2d_b = np.array([1, 0, 10], dtype='bool')
arr2d_b

array([ True, False,  True])

In [12]:
# Create an object array to hold numbers as well as strings
arr1d_obj = np.array([1, 'a'], dtype='object')
arr1d_obj

array([1, 'a'], dtype=object)

In [13]:
# Convert an array back to a list
arr1d_obj.tolist()

[1, 'a']

In [14]:
# Create a 2d array with 3 rows and 4 columns
list2 = [[1, 2, 3, 4],[3, 4, 5, 6], [5, 6, 7, 8]]
arr2 = np.array(list2, dtype='float')
arr2

array([[1., 2., 3., 4.],
       [3., 4., 5., 6.],
       [5., 6., 7., 8.]])

In [15]:
# shape
print('Shape: ', arr2.shape)

# dtype
print('Datatype: ', arr2.dtype)

# size
print('Size: ', arr2.size)

# ndim
print('Num Dimensions: ', arr2.ndim)

Shape:  (3, 4)
Datatype:  float64
Size:  12
Num Dimensions:  2


In [16]:
# Extract the first 2 rows and columns
arr2[:2, :2]

array([[1., 2.],
       [3., 4.]])

In [17]:
# Get the boolean output by applying the condition to each element.
b = arr2 > 4
b

array([[False, False, False, False],
       [False, False,  True,  True],
       [ True,  True,  True,  True]])

In [18]:
arr2[b]

array([5., 6., 5., 6., 7., 8.])

In [19]:
# Reverse only the row positions
arr2[::-1, ]

array([[5., 6., 7., 8.],
       [3., 4., 5., 6.],
       [1., 2., 3., 4.]])

In [20]:
# Reverse the row and column positions
arr2[::-1, ::-1]

array([[8., 7., 6., 5.],
       [6., 5., 4., 3.],
       [4., 3., 2., 1.]])

In [21]:
# Insert a nan and an inf
arr2[1,1] = np.nan  # not a number
arr2[1,2] = np.inf  # infinite
arr2

array([[ 1.,  2.,  3.,  4.],
       [ 3., nan, inf,  6.],
       [ 5.,  6.,  7.,  8.]])

In [22]:
# Replace nan and inf with -1.
# Don't test with `arr2 == np.nan`: NaN never compares equal to anything
# (itself included), so that mask would be all False.
missing_bool = np.isnan(arr2) | np.isinf(arr2)  # True wherever the value is nan or inf
arr2[missing_bool] = -1  # boolean-mask assignment modifies arr2 in place
arr2

array([[ 1.,  2.,  3.,  4.],
       [ 3., -1., -1.,  6.],
       [ 5.,  6.,  7.,  8.]])

In [23]:
# mean, max and min
print("Mean value is: ", arr2.mean())
print("Max value is: ", arr2.max())
print("Min value is: ", arr2.min())

Mean value is:  3.5833333333333335
Max value is:  8.0
Min value is:  -1.0


In [24]:
# Row wise and column wise min
print("Column wise minimum: ", np.amin(arr2, axis=0))
print("Row wise minimum: ", np.amin(arr2, axis=1))

Column wise minimum:  [ 1. -1. -1.  4.]
Row wise minimum:  [ 1. -1.  5.]


In [25]:
# Cumulative Sum
np.cumsum(arr2)

array([ 1.,  3.,  6., 10., 13., 12., 11., 17., 22., 28., 35., 43.])

In [26]:
# Assign portion of arr2 to arr2a. Doesn't really create a new array.
arr2a = arr2[:2,:2]  
arr2a[:1, :1] = 100  # 100 will reflect in arr2
arr2

array([[100.,   2.,   3.,   4.],
       [  3.,  -1.,  -1.,   6.],
       [  5.,   6.,   7.,   8.]])

In [29]:
# Copy portion of arr2 to arr2b
arr2b = arr2[:2, :2].copy()
arr2b[:1, :1] = 101  # 101 will not reflect in arr2
arr2

array([[100.,   2.,   3.,   4.],
       [  3.,  -1.,  -1.,   6.],
       [  5.,   6.,   7.,   8.]])

In [30]:
# Reshape a 3x4 array to 4x3 array
arr2.reshape(4, 3)

array([[100.,   2.,   3.],
       [  4.,   3.,  -1.],
       [ -1.,   6.,   5.],
       [  6.,   7.,   8.]])

In [31]:
# Flatten it to a 1d array
arr2.flatten()

array([100.,   2.,   3.,   4.,   3.,  -1.,  -1.,   6.,   5.,   6.,   7.,
         8.])

In [32]:
# Changing the flattened array does not change parent:
# flatten() hands back an independent copy of the data.
b1 = arr2.flatten()
# arr2[0, 0] already holds 100 at this point, so write a *different* value;
# re-assigning 100 would leave both arrays identical and demonstrate nothing.
b1[0] = 999  # changing b1 does not affect arr2
arr2

array([[100.,   2.,   3.,   4.],
       [  3.,  -1.,  -1.,   6.],
       [  5.,   6.,   7.,   8.]])

In [33]:
# Changing the raveled array changes the parent also.
b2 = arr2.ravel()  
b2[0] = 101  # changing b2 changes arr2 also
arr2

array([[101.,   2.,   3.,   4.],
       [  3.,  -1.,  -1.,   6.],
       [  5.,   6.,   7.,   8.]])

In [34]:
# Lower limit is 0 by default
print(np.arange(5))  

# 0 to 9
print(np.arange(0, 10))  

# 0 to 9 with step of 2
print(np.arange(0, 10, 2))  

# 10 to 1, decreasing order
print(np.arange(10, 0, -1))

[0 1 2 3 4]
[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]
[10  9  8  7  6  5  4  3  2  1]


In [35]:
# Start at 1 and end at 50
np.linspace(start=1, stop=50, num=10, dtype=int)

array([ 1,  6, 11, 17, 22, 28, 33, 39, 44, 50])

In [36]:
# Limit the number of digits after the decimal to 2
np.set_printoptions(precision=2)  

# Start at 10^1 and end at 10^50
np.logspace(start=1, stop=50, num=10, base=10)

array([1.00e+01, 2.78e+06, 7.74e+11, 2.15e+17, 5.99e+22, 1.67e+28,
       4.64e+33, 1.29e+39, 3.59e+44, 1.00e+50])

In [37]:
np.zeros([2,2])

array([[0., 0.],
       [0., 0.]])

In [38]:
np.ones([2,2])

array([[1., 1.],
       [1., 1.]])

In [39]:
a = [1,2,3] 

# Repeat whole of 'a' two times
print('Tile:   ', np.tile(a, 2))

# Repeat each element of 'a' two times
print('Repeat: ', np.repeat(a, 2))

Tile:    [1 2 3 1 2 3]
Repeat:  [1 1 2 2 3 3]


In [40]:
# Random numbers between [0,1) of shape 2,2
print(np.random.rand(2,2))

# Normal distribution with mean=0 and variance=1 of shape 2,2
print(np.random.randn(2,2))

# Random integers between [0, 10) of shape 2,2
print(np.random.randint(0, 10, size=[2,2]))

# One random number between [0,1)
print(np.random.random())

# Random numbers between [0,1) of shape 2,2
print(np.random.random(size=[2,2]))

# Pick 10 items from a given list, with equal probability
print(np.random.choice(['a', 'e', 'i', 'o', 'u'], size=10))  

# Pick 10 items from a given list with a predefined probability 'p'
print(np.random.choice(['a', 'e', 'i', 'o', 'u'], size=10, p=[0.3, .1, 0.1, 0.4, 0.1]))  # picks more o's

[[0.21 0.05]
 [0.06 0.02]]
[[0.9  0.61]
 [0.95 0.47]]
[[0 7]
 [0 1]]
0.8403022392655172
[[0.33 0.05]
 [0.14 0.25]]
['o' 'u' 'a' 'a' 'e' 'u' 'u' 'i' 'i' 'i']
['a' 'u' 'o' 'o' 'e' 'o' 'u' 'i' 'e' 'e']


In [41]:
# Create the random state
rn = np.random.RandomState(100)

# Create random numbers between [0,1) of shape 2,2
print(rn.rand(2,2))

[[0.54 0.28]
 [0.42 0.84]]


In [42]:
# Set the random seed
np.random.seed(100)

# Create random numbers between [0,1) of shape 2,2
print(np.random.rand(2,2))

[[0.54 0.28]
 [0.42 0.84]]


In [43]:
# Create random integers of size 10 between [0,10)
np.random.seed(100)
arr_rand = np.random.randint(0, 10, size=10)
print(arr_rand)

[8 8 3 7 7 0 4 2 5 2]


In [44]:
# Get the unique items and their counts
uniqs, counts = np.unique(arr_rand, return_counts=True)
print("Unique items : ", uniqs)
print("Counts       : ", counts)

Unique items :  [0 2 3 4 5 7 8]
Counts       :  [1 2 1 1 1 2 2]


In [46]:
# Numpy for Data Analysis

In [47]:
# Create an array
arr_rand = np.array([8, 8, 3, 7, 7, 0, 4, 2, 5, 2])
print("Array: ", arr_rand)

# Positions where value > 5
index_gt5 = np.where(arr_rand > 5)
print("Positions where value > 5: ", index_gt5)

Array:  [8 8 3 7 7 0 4 2 5 2]
Positions where value > 5:  (array([0, 1, 3, 4]),)


In [48]:
# Take items at given index.
# index_gt5 is the 1-tuple returned by np.where, not a plain index array, so
# take() produces a 2-D result of shape (1, 4) here.
# NOTE(review): pass index_gt5[0] if a flat 1-D result is wanted.
arr_rand.take(index_gt5)

array([[8, 8, 7, 7]])

In [49]:
# If value > 5, then yield 'gt5' else 'le5'
np.where(arr_rand > 5, 'gt5', 'le5')

array(['gt5', 'gt5', 'le5', 'gt5', 'gt5', 'le5', 'le5', 'le5', 'le5',
       'le5'], dtype='<U3')

In [50]:
# Location of the max
print('Position of max value: ', np.argmax(arr_rand))  

# Location of the min
print('Position of min value: ', np.argmin(arr_rand))

Position of max value:  0
Position of min value:  5


In [52]:
# Turn off scientific notation
np.set_printoptions(suppress=True)  

# Import data from csv file url
path = 'https://raw.githubusercontent.com/selva86/datasets/master/Auto.csv'
data = np.genfromtxt(path, delimiter=',', skip_header=1, filling_values=-999, dtype='float')
data[:3]  # see first 3 rows

array([[  18. ,    8. ,  307. ,  130. , 3504. ,   12. ,   70. ,    1. ,
        -999. ],
       [  15. ,    8. ,  350. ,  165. , 3693. ,   11.5,   70. ,    1. ,
        -999. ],
       [  18. ,    8. ,  318. ,  150. , 3436. ,   11. ,   70. ,    1. ,
        -999. ]])

In [54]:
# dtype=None lets genfromtxt pick a type for each column from its contents.
# The result is a 1-D structured array (one record per row, fields f0..f8)
# rather than a 2-D float array, and the text column comes back as bytes
# (note the b'...' values in the output).
# Alternative kept for reference:
# data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype='object')
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)
data2[:3]  # see first 3 rows

  


array([(18., 8, 307., 130, 3504, 12. , 70, 1, b'"chevrolet chevelle malibu"'),
       (15., 8, 350., 165, 3693, 11.5, 70, 1, b'"buick skylark 320"'),
       (18., 8, 318., 150, 3436, 11. , 70, 1, b'"plymouth satellite"')],
      dtype=[('f0', '<f8'), ('f1', '<i8'), ('f2', '<f8'), ('f3', '<i8'), ('f4', '<i8'), ('f5', '<f8'), ('f6', '<i8'), ('f7', '<i8'), ('f8', 'S38')])

In [58]:
# Save the array as a csv file
np.savetxt("out.csv", data, delimiter=",")

In [59]:
# Save single numpy array object as .npy file
np.save('myarray.npy', arr2d)  

# Save multiple numpy arrays as a .npz file (stored under the keys arr_0, arr_1, ...)
np.savez('array.npz', arr2d_f, arr2d_b)

In [60]:
# Load a .npy file
a = np.load('myarray.npy')
print(a)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [61]:
# Load a .npz file.
# np.load returns an NpzFile that keeps the archive open, so use it as a
# context manager to release the file handle as soon as we are done.
with np.load('array.npz') as b:
    print(b.files)
    arr_0 = b['arr_0']  # fetch the member array while the archive is still open
arr_0

['arr_0', 'arr_1']


array([[0., 1., 2.],
       [3., 4., 5.],
       [6., 7., 8.]])

In [62]:
a = np.zeros([4, 4])
b = np.ones([4, 4])
print(a)
print(b)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [63]:
# Vertical Stack Equivalents (Row wise)
np.concatenate([a, b], axis=0)  
np.vstack([a,b])  
np.r_[a,b]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [64]:
# Horizontal Stack Equivalents (Column wise)
np.concatenate([a, b], axis=1) 
np.hstack([a,b])  
np.c_[a,b]

array([[0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.]])

In [65]:
np.r_[[1,2,3], 0, 0, [4,5,6]]

array([1, 2, 3, 0, 0, 4, 5, 6])

In [6]:
arr = np.random.randint(1,6, size=[8, 4])
arr

array([[3, 4, 4, 2],
       [3, 5, 1, 3],
       [3, 2, 2, 2],
       [3, 5, 5, 3],
       [2, 2, 4, 5],
       [4, 3, 1, 4],
       [1, 5, 5, 5],
       [1, 4, 1, 4]])

In [69]:
# Sort each column of arr independently (rows are not kept together)
np.sort(arr, axis=0)

array([[1, 1, 1, 1],
       [2, 2, 2, 2],
       [2, 3, 4, 2],
       [2, 4, 4, 3],
       [3, 4, 4, 3],
       [3, 4, 4, 5],
       [3, 4, 5, 5],
       [4, 5, 5, 5]])

In [3]:
# Get the index positions that would sort the array
x = np.array([1, 10, 5, 2, 8, 9])
sort_index = np.argsort(x)
print(sort_index)

[0 3 2 4 5 1]


In [4]:
x[sort_index]

array([ 1,  2,  5,  8,  9, 10])

In [7]:
# Argsort the first column
sorted_index_1stcol = arr[:, 0].argsort()

# Sort 'arr' by first column without disturbing the integrity of rows
arr[sorted_index_1stcol]

array([[1, 5, 5, 5],
       [1, 4, 1, 4],
       [2, 2, 4, 5],
       [3, 4, 4, 2],
       [3, 5, 1, 3],
       [3, 2, 2, 2],
       [3, 5, 5, 3],
       [4, 3, 1, 4]])

In [8]:
# Descending sort
arr[sorted_index_1stcol[::-1]]

array([[4, 3, 1, 4],
       [3, 5, 5, 3],
       [3, 2, 2, 2],
       [3, 5, 1, 3],
       [3, 4, 4, 2],
       [2, 2, 4, 5],
       [1, 4, 1, 4],
       [1, 5, 5, 5]])

In [9]:
# Sort by column 0, then by column 1.
# np.lexsort treats the LAST key in the tuple as the primary sort key, so
# column 0 is passed last and column 1 (the tie-breaker) is passed first.
lexsorted_index = np.lexsort((arr[:, 1], arr[:, 0]))
arr[lexsorted_index]  # fancy-index with the row order to move whole rows

array([[1, 4, 1, 4],
       [1, 5, 5, 5],
       [2, 2, 4, 5],
       [3, 2, 2, 2],
       [3, 4, 4, 2],
       [3, 5, 1, 3],
       [3, 5, 5, 3],
       [4, 3, 1, 4]])

In [10]:
# Create a datetime64 object
date64 = np.datetime64('2018-02-04 23:10:10')
date64

numpy.datetime64('2018-02-04T23:10:10')

In [11]:
# Drop the time part from the datetime64 object
dt64 = np.datetime64(date64, 'D')
dt64

numpy.datetime64('2018-02-04')

In [12]:
# Create the timedeltas (individual units of time)
tenminutes = np.timedelta64(10, 'm')  # 10 minutes
tenseconds = np.timedelta64(10, 's')  # 10 seconds
tennanoseconds = np.timedelta64(10, 'ns')  # 10 nanoseconds

print('Add 10 days: ', dt64 + 10)
print('Add 10 minutes: ', dt64 + tenminutes)
print('Add 10 seconds: ', dt64 + tenseconds)
print('Add 10 nanoseconds: ', dt64 + tennanoseconds)

Add 10 days:  2018-02-14
Add 10 minutes:  2018-02-04T00:10
Add 10 seconds:  2018-02-04T00:00:10
Add 10 nanoseconds:  2018-02-04T00:00:00.000000010


In [13]:
# Convert np.datetime64 back to a string
np.datetime_as_string(dt64)

'2018-02-04'

In [14]:
print('Date: ', dt64)
print("Is it a business day?: ", np.is_busday(dt64))  
print("Add 2 business days, rolling forward to nearest biz day: ", np.busday_offset(dt64, 2, roll='forward'))  
print("Add 2 business days, rolling backward to nearest biz day: ", np.busday_offset(dt64, 2, roll='backward'))

Date:  2018-02-04
Is it a business day?:  False
Add 2 business days, rolling forward to nearest biz day:  2018-02-07
Add 2 business days, rolling backward to nearest biz day:  2018-02-06


In [15]:
# Create date sequence
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-10'))
print(dates)

# Check if its a business day
np.is_busday(dates)

['2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
 '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09']


array([ True,  True, False, False,  True,  True,  True,  True,  True])

In [16]:
# Convert np.datetime64 to a standard-library date object (dt64 has day precision, so tolist() gives datetime.date)
import datetime
dt = dt64.tolist()
dt

datetime.date(2018, 2, 4)

In [18]:
print('Year: ', dt.year)  
print('Day of month: ', dt.day)
print('Month of year: ', dt.month)  
print('Day of Week: ', dt.weekday())  # Sunday

Year:  2018
Day of month:  4
Month of year:  2
Day of Week:  6


In [19]:
# Define a scalar function
def foo(x):
    """Square ``x`` when ``x % 2 == 1``; otherwise return half of ``x``.

    The halving branch uses true division, so it yields a float even for
    integer input, while the squaring branch keeps the input's type.
    """
    is_odd = (x % 2 == 1)
    return x**2 if is_odd else x/2

# On a scalar
print('x = 10 returns ', foo(10))
print('x = 11 returns ', foo(11))

x = 10 returns  5.0
x = 11 returns  121


In [20]:
# Vectorize foo(). Make it work on vectors.
foo_v = np.vectorize(foo, otypes=[float])

print('x = [10, 11, 12] returns ', foo_v([10, 11, 12]))
print('x = [[10, 11, 12], [1, 2, 3]] returns ', foo_v([[10, 11, 12], [1, 2, 3]]))

x = [10, 11, 12] returns  [  5. 121.   6.]
x = [[10, 11, 12], [1, 2, 3]] returns  [[  5. 121.   6.]
 [  1.   1.   9.]]


In [21]:
# Create a 4x10 random array
np.random.seed(100)
arr_x = np.random.randint(1,10,size=[4,10])
arr_x

array([[9, 9, 4, 8, 8, 1, 5, 3, 6, 3],
       [3, 3, 2, 1, 9, 5, 1, 7, 3, 5],
       [2, 6, 4, 5, 5, 4, 8, 2, 2, 8],
       [8, 1, 3, 4, 3, 6, 9, 2, 1, 8]])

In [22]:
# Find the difference between the maximum and the minimum value along each row and each column

# Define func1d
def max_minus_min(x):
    """Return the spread of ``x``: its largest element minus its smallest."""
    largest = np.max(x)
    smallest = np.min(x)
    return largest - smallest

# Apply along the rows
print('Row wise: ', np.apply_along_axis(max_minus_min, 1, arr=arr_x))

# Apply along the columns
print('Column wise: ', np.apply_along_axis(max_minus_min, 0, arr=arr_x))

Row wise:  [8 8 6 8]
Column wise:  [7 8 2 7 6 5 8 5 5 5]


In [23]:
# example of searchsorted
x = np.arange(10)
print('Where should 5 be inserted?: ', np.searchsorted(x, 5))
print('Where should 5 be inserted (right)?: ', np.searchsorted(x, 5, side='right'))

Where should 5 be inserted?:  5
Where should 5 be inserted (right)?:  6


In [24]:
# Randomly choose an item from a list based on a predefined probability
lst = range(10000)  # candidate items (a range object, not a materialised list)
probs = np.random.random(10000); probs /= probs.sum()  # random weights, normalised in place to sum to 1

# Approach 1: draw a uniform number and locate it in the cumulative
# probabilities. Note that probs.cumsum() is recomputed inside the timed statement.
%timeit lst[np.searchsorted(probs.cumsum(), np.random.random())]
# Approach 2: let np.random.choice perform the weighted draw directly.
%timeit np.random.choice(lst, p=probs)

34.9 µs ± 435 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
1.44 ms ± 7.57 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [25]:
# Create a 1D array
x = np.arange(5)
print('Original array: ', x)

# Introduce a new column axis
x_col = x[:, np.newaxis]
print('x_col shape: ', x_col.shape)
print(x_col)

# Introduce a new row axis
x_row = x[np.newaxis, :]
print('x_row shape: ', x_row.shape)
print(x_row)

Original array:  [0 1 2 3 4]
x_col shape:  (5, 1)
[[0]
 [1]
 [2]
 [3]
 [4]]
x_row shape:  (1, 5)
[[0 1 2 3 4]]


In [26]:
# Create the array and bins
x = np.arange(10)
bins = np.array([0, 3, 6, 9])

# Get bin allotments
np.digitize(x, bins)

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4])

In [27]:
# Cap all elements of x to lie between 3 and 8
np.clip(x, 3, 8)

array([3, 3, 3, 3, 4, 5, 6, 7, 8, 8])

In [29]:
# Bincount example
x = np.array([1,1,2,2,2,4,4,5,6,6,6]) # doesn't need to be sorted
# NOTE(review): the bincount result below is neither printed nor the cell's
# last expression, so it never shows up in the cell output; wrap it in
# print() (or move it to its own cell) to display it.
np.bincount(x) # 0 occurs 0 times, 1 occurs 2 times, 2 occurs thrice, 3 occurs 0 times, ...

# Histogram example
# The second argument is the list of bin edges; np.histogram returns the
# per-bin counts together with the edges it used.
# NOTE(review): `counts` and `bins` rebind names assigned by earlier cells.
counts, bins = np.histogram(x, [0, 2, 4, 6, 8])
print('Counts: ', counts)
print('Bins: ', bins)

Counts:  [2 3 3 3]
Bins:  [0 2 4 6 8]
