In [None]:
import os

# Hard-coded Windows-style download directory; no cell visible here reads it.
DIR = r'c://downloads'

In [None]:
import numpy as np

# Arrays

In [None]:
# Start from an ordinary Python list...
a_list = [value for value in range(20)]
print(a_list, type(a_list), sep='\n')

# ...and convert it. This works, but it is not the preferred way to create NumPy arrays.
an_array = np.array(a_list)
print(an_array, type(an_array), sep='\n')

In [None]:
# Dedicated constructors build arrays directly, without going through a Python list.
print(np.arange(20), np.zeros(10), np.ones(10), sep='\n')

# np.empty skips initialization, so the array holds whatever garbage values were already
# in memory; not spending time on initialization makes it marginally faster.
print(np.empty(30))

In [None]:
# Indexing and slicing use the same syntax as Python lists.

data = np.arange(20)
data[5] = 77 # item assignment changes the array in place
print(data)

# A single index gives one element; start:stop:step gives a sub-array.
print(data[6], data[3:10:2], sep='\n')

# dtype

In [None]:
# Every array has one fixed data type shared by all of its elements.

print(data.dtype)

# Assigning a float does not change the dtype: `data` was built with np.arange(20) in the
# previous cell, so it holds integers and the fractional part of 0.2 is dropped.
data[5] = 0.2 # Will be converted to 0
print(data)

In [None]:
# Three ways to get a float64 array holding 0..19; each one then stores 0.2 at index 5
# to show that the fractional value is kept.

# Option 1 (not recommended)
data = np.array(range(20), dtype = np.float64) # builds the array from a Python range object (the "slow" route)
data[5] = 0.2
print(data)

# Option 2 (less recommended)
data = np.arange(20).astype(np.float64) # creates an integer array first, then a converted copy
data[5] = 0.2
print(data)

# Option 3 (recommended)
data = np.arange(20, dtype = np.float64) # creates the float array directly
data[5] = 0.2
print(data)

In [None]:
# We can pass a general type (Python's built-in float) and leave it to NumPy
# to choose the concrete dtype; the printed dtype shows what it picked.
print(np.arange(20, dtype = float).dtype)

In [None]:
# float64 is the standard type: np.zeros uses it when no dtype is given.
print(np.zeros(10).dtype)

# Shapes

In [None]:
# A tuple shape produces a multi-dimensional array: here a matrix with 5 rows and 3 columns.
z = np.zeros(shape = (5, 3))
print(z, z.shape, sep='\n')

In [None]:
# len() only gives the length of the first dimension; .size counts every element.
print(len(z), z.size, sep='\n')

In [None]:
# The same values can be arranged under any shape with the same total size.
print(z.reshape((3, 5)))
print(z.reshape((15,))) # flattened into a 1-D array

In [None]:
# Reshaping has to keep the total size: z holds 5 * 3 = 15 values, so asking for
# 4 * 5 = 20 raises a ValueError. The error is caught and printed so that the
# demonstration does not stop "Restart & Run All" at this cell.
try:
    print(z.reshape(4, 5))
except ValueError as error:
    print('ValueError: %s' % error)

In [None]:
# reshape supports any number of dimensions; here 2 * 3 * 4 * 5 = 120 values form a 4-D array.
print(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)))

In [None]:
# Multi-dimensional arrays take one index or slice per dimension, separated by commas.

data = np.arange(15).reshape((5, 3))
print(data, 20 * '*', sep='\n')

# row index and column index together select a specific "cell"
print(data[4, 2], 20 * '*', sep='\n')

# a bare ":" keeps a whole dimension: first the whole row, then the whole column
print(data[4, :], data[:, 2], 20 * '*', sep='\n')

# slices combine freely: rows 1-2 with their columns reversed
print(data[1:3, ::-1])

In [None]:
# Assigning to a slice overwrites that part of the array in place.
data = np.zeros(shape = (8, 3))
data[5] = np.arange(5, 8) # row 5 becomes [5, 6, 7]
print(data, 20 * '*', sep='\n')

# The same works in multiple dimensions: rows 0-2 receive a 3x3 matrix.
data[:3] = np.arange(9).reshape((3, 3))
print(data)

# Vectorized arithmetic

In [None]:
# Arithmetic operators act element by element on arrays of the same length.
a = np.array([1, 6, 9])
b = np.array([2, 1, 0.5])

print('{} + {} = {}'.format(a, b, a + b))
print('{} * {} = {}'.format(a, b, a * b))

In [None]:
# "*" multiplies matching entries of the two matrices (element-wise, unlike in Matlab).
A = np.arange(3 * 3).reshape((3, 3))
B = np.arange(3 * 3)[::-1].reshape((3, 3))
print('A:', A, sep='\n')
print('B:', B, sep='\n')
print('A * B:', A * B, sep='\n')

In [None]:
# Linear-algebra style matrix multiplication is available too: (2x3) times (3x2) gives (2x2).
A = np.arange(6).reshape((2, 3))
B = np.arange(6).reshape((3, 2))
print(np.dot(A, B))

In [None]:
# Broadcasting - performing arithmetic on arrays of different shapes.
# Shapes are compared from the last dimension backwards; two dimensions are
# compatible when they are equal or when one of them is 1 (missing leading
# dimensions count as 1). Divisibility is NOT enough: lengths 6 and 3 do not broadcast.

# a scalar combines with every element of an array
print(3.14 * np.ones(5))
print(3.14 + np.ones(5))
print('*' * 20)

# a length-3 vector matches the last dimension of a 3x3 matrix, so it is added to every row
A = np.arange(3 * 3).reshape(3, 3)
b = np.arange(3)
print('A:')
print(A)
print('b: %s' % str(b))
print('A + b:')
print(A + b)

# Vectorized operations are much faster!

In [None]:
# Two arrays of 10 ** 4 integers; b is the same range in reverse order.
a = np.arange(10 ** 4)
b = np.arange(10 ** 4)[::-1]

# Element-by-element addition in a Python-level loop...
%timeit [ai + bi for ai, bi in zip(a, b)]
# ...versus a single vectorized operation over the whole arrays.
%timeit a + b

__Conclusion__: Avoid loops when possible!

# Arrays are not lists!

In [None]:
# Arrays are made to behave like lists in many ways, but the operators differ.

list1 = [1, 2, 3]
list2 = [4, 5, 6]
array1 = np.array(list1)
array2 = np.array(list2)

# "+" concatenates lists but adds arrays element by element
print(list1 + list2, array1 + array2, sep='\n')
print(20 * '*')
# "*" repeats a list but multiplies every element of an array
print(list1 * 3, array1 * 3, sep='\n')

In [None]:
# NumPy equivalents of list "+" (concatenation) and list "*" (repetition).

print(np.concatenate((array1, array2)))
print(np.tile(array1, 3))

# Common mathematical methods
##### More efficient than the pure-Python versions

In [None]:
# The NumPy functions below accept a plain Python list as input.
data = [1, 6, 19, 22, 15, 6, 3, 1, 9]
print(np.mean(data), np.average(data), np.std(data)) # mean (computed two ways) and standard deviation
print(np.median(data))
print(np.sum(data), np.prod(data)) # sum and product of all elements
print(np.max(data), np.min(data))
print(np.argmax(data), np.argmin(data)) # index of the largest/smallest number

In [None]:
# Calculating z-values (z-scores) in one line: subtract the mean from every element,
# then divide by the standard deviation. The list is converted with np.array first.
print((np.array(data) - np.mean(data)) / np.std(data))

In [None]:
# Efficient vectorized functions: each call is applied to every element of data.
print(np.exp(data))

# natural log, base-2 log via change of base, then the dedicated log2 and log10
print(np.log(data), np.log(data) / np.log(2), np.log2(data), np.log10(data), sep='\n')

# sine and cosine, followed by the identity sin^2 + cos^2
print(np.sin(data), np.cos(data), np.sin(data) ** 2 + np.cos(data) ** 2, sep='\n')

In [None]:
# Element-wise absolute value; as the last expression of the cell, the result is displayed as the cell output.
np.abs([1, -2, -3])

In [None]:
# NumPy ships the usual mathematical constants.
print(np.pi, np.e, sep='\n')

# Axes

In [None]:
# Math operations generally reduce the whole array to one value,
# but an axis argument restricts the reduction to one dimension.

A = np.arange(3 * 5).reshape((3, 5))
print(A)
print(np.average(A)) # over all 15 elements
# axis = 0 averages down the rows (one value per column); axis = 1 averages along each row
print(np.average(A, axis = 0), np.average(A, axis = 1), sep='\n')

# Random

In [None]:
# NOTE(review): no random seed is set in the cells visible here, so the printed values differ between runs.
print(np.random.randint(3, 5)) # Unlike random.randint, it doesn't include 5!
print(np.random.randint(3, 5, 10)) # ten draws returned as a 1-D array
print(np.random.randint(3, 5, (4, 12))) # a tuple gives the shape: a 4x12 matrix of draws

In [None]:
# Uniform distribution
# NOTE(review): no random seed is set in the cells visible here, so the printed values differ between runs.
print(np.random.rand()) # a single draw
print(np.random.rand(4, 3)) # arguments for this function are shapes, not the number range...
print(np.random.uniform(-5.3, 19.4, 6)) # here the first two arguments are the range; 6 is the number of draws

In [None]:
# Gaussian/normal-distribution random
# NOTE(review): no random seed is set in the cells visible here, so the printed values differ between runs.
print(np.random.randn()) # a single standard-normal draw
print(3 * np.random.randn() + 6) # mean = 6, std = 3
print(np.random.randn(2, 3)) # arguments are the shape: a 2x3 matrix of draws

# Booleans & slicing

In [None]:
# Arrays can hold booleans; they get their own dtype.
bool_array = np.array([True, False, True])
print(bool_array, bool_array.dtype, sep='\n')

In [None]:
# Comparing an array with a scalar gives one boolean per element.
x = np.array([1, 5, 2, 1, 9, 1, -3])
print(x == 1, x <= 1, sep='\n')

In [None]:
# Boolean arrays combine element-wise with ~ (not), & (and) and | (or).
x = np.arange(10)
y = np.arange(10)[::-1]
print(
    ~(x == 5),              # not equal to five
    (x < 8) & (y < 8),      # and...
    (x >= 8) | (y >= 8),    # or...
    sep='\n',
)

In [None]:
# A boolean array of the same length works as an index: only positions marked True are kept.

x = np.arange(8)
mask = np.array([True, True, False, False, False, False, True, True])
print(x, mask, x[mask], sep='\n')

In [None]:
# Twenty standard-normal draws, then only the non-negative ones: the comparison
# builds the boolean mask inline.
# NOTE(review): no random seed is set in the cells visible here, so the printed values differ between runs.
x = np.random.randn(20)
print(x)
print(x[x >= 0])

In [None]:
# Filtering outliers (numbers that are more than 2 standard deviations away from the mean)
# NOTE(review): the leading "3,4" is parsed as two values, 3 and 4; every other entry uses a
# decimal point, so this may be a decimal-comma typo for 3.4 - confirm with the author.
data = np.array([3,4, 0.2, 1.7, 5.6, -19.3, 4.2, 20.4, 0.1, 2.5, 3.2, 4.3, -2.2, -1.1, 0.01])
# keep only the elements whose distance from the average is at most twice the standard deviation
print(data[np.abs(data - np.average(data)) <= 2 * np.std(data)])

In [None]:
# Instead of filtering values out, np.where() picks between two replacement values:
# the first where the mask is True, the second where it is False.

mask = np.array([True] * 2 + [False] * 4 + [True] * 2)
print(np.where(mask, 1.0, -np.pi))

In [None]:
# We cannot use Python's bool() on a NumPy boolean array with more than one element:
# the truth value is ambiguous, so NumPy raises a ValueError. The error is caught and
# printed so that the demonstration does not stop "Restart & Run All" at this cell.

a = np.array([True, False, True])
try:
    print(bool(a))
except ValueError as error:
    print('ValueError: %s' % error)

In [None]:
# Instead of bool(), state the intent explicitly:

print(np.any(a)) # True if at least one element is True
print(np.all(a)) # True only if every element is True

# nan & inf

In [None]:
# Plain Python refuses to divide by zero and raises ZeroDivisionError. The error is caught
# and printed so that the demonstration does not stop "Restart & Run All" at this cell.
try:
    print(2.2 / 0)
except ZeroDivisionError as error:
    print('ZeroDivisionError: %s' % error)

In [None]:
# numpy allows division by 0: instead of raising, it emits a RuntimeWarning and
# stores a special value (inf, -inf or nan) in the result.

a = np.array([1, 5, 9, 3, -4, 0])
b = np.array([1, 0, 0.5, 2, 0, 0])
print(a / b) # 5 / 0 -> inf, -4 / 0 -> -inf, 0 / 0 -> nan

In [None]:
# silence warnings...
# np.errstate is a context manager: inside the block, division-by-zero ("divide") and
# undefined-result ("invalid") warnings are ignored.

with np.errstate(divide = 'ignore', invalid = 'ignore'):
    print(a / b)

# Can also silence globally with np.seterr although not recommended

In [None]:
# we will get special values for all mathematically undefined operations
# (`a` is the integer array from the division example above and contains -4 and 0)

print(np.log(a)) # log of a negative number -> nan, log of 0 -> -inf
print(a ** 0.31) # fractional power of a negative number -> nan

In [None]:
# Special values propagate through later arithmetic: anything combined with nan stays nan,
# and inf - inf has no defined result either.

a = np.array([2, -1.414, np.nan, np.nan, np.inf, -np.inf])
b = np.array([np.nan, np.inf, -np.inf, 3.2, 1.2, np.inf])
print(a + b, a * b, sep='\n')

In [None]:
# isinf is True only for +/-inf and isnan only for nan; an ordinary number such as pi is neither.
print(*np.isinf([np.inf, -np.inf, np.nan, np.pi]))
print(*np.isnan([np.inf, -np.inf, np.nan, np.pi]))

In [None]:
# Filtering out nan (not a number) values: the log of a negative number is nan,
# and ~np.isnan(...) is a mask that keeps everything else.

a = np.log([1, -2, -3, 5, 4])
print(a, np.isnan(a), a[~np.isnan(a)], sep='\n')

In [None]:
# nan is not None: it neither compares equal to None nor is it the same object.

print(np.nan == None, np.nan is None)

In [None]:
# np.isnan only accepts numeric input, so passing None raises a TypeError. The error is caught
# and printed so that the demonstration does not stop "Restart & Run All" at this cell.
try:
    np.isnan(None)
except TypeError as error:
    print('TypeError: %s' % error)

In [None]:
# nan and the infinities are still numbers as far as the type system is concerned.

print(*[type(special) for special in (np.nan, np.inf, -np.inf)])

# Example - nucleotide probabilities

In [None]:
# First count the occurrences of each ordered pair of nucleotides,
# i.e. the number of times one nucleotide is directly followed by another.

dna_seq = 'AGTCCAGTGACGTGGGTGAGAGTGAGTGACACAGTGGAGTAGACCCAAAGTGATTAGTCACACAGTGATGACAGATGACGATAGAGA'

NT_OPTIONS = 'ACGT'
nt_to_index = dict(zip(NT_OPTIONS, range(len(NT_OPTIONS))))
pairs_count = np.zeros((len(NT_OPTIONS),) * 2)

# Pair every position with its successor; rows index the first nucleotide, columns the second.
for current_nt, next_nt in zip(dna_seq, dna_seq[1:]):
    pairs_count[nt_to_index[current_nt], nt_to_index[next_nt]] += 1

print(pairs_count)

In [None]:
# Divide every row by its total so that each row sums to 1: entry [i, j] becomes the
# probability that nucleotide j comes right after nucleotide i.

nt_prob = pairs_count / pairs_count.sum(axis = 1, keepdims = True)
print(nt_prob)

In [None]:
# so, e.g., there is a 7.14% chance of A coming after A
# but there is a 50% chance of G coming after an A