This notebook is inspired by:
[Jake VanderPlas - Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/index.html)

# Numpy

In [3]:
# Import NumPy under its conventional alias `np`.
import numpy as np

# Display the installed NumPy version.
np.__version__

'1.18.1'

## Creating Numpy Arrays

__From lists__

In [6]:
# build a 1-D array from a flat Python list (one float among the ints)
my_array1 = np.array([4, 1, 7, 13, 2.76])

In [7]:
# the shape of a 1-D array is a one-element tuple: (number_of_elements,)
# (the array created above is called my_array1; `my_array` was never defined)
my_array1.shape


(5,)

In [8]:
# one extra pair of brackets -> a 2-D array holding a single row
my_array2 = np.array([
    [4, 1, 7, 13, 2.76],
])
my_array2.shape

(1, 5)

In [9]:
# two extra pairs of brackets -> a 3-D array
my_array3 = np.array([
    [
        [4, 1, 7, 13, 2.76],
    ],
])
my_array3.shape

(1, 1, 5)

In [15]:
# A dtype can be requested explicitly, but every dtype has a limited range:
# 1e56 does not fit into a 32-bit integer.
# The failure is the point of this cell, so it is caught and shown instead of
# aborting a "Restart & Run All".
try:
    my_1d_array = np.array([13, 27, 33.3, 1e56], dtype='int32')
except OverflowError as overflow_err:
    print('OverflowError: {}'.format(overflow_err))

OverflowError: Python int too large to convert to C long

In [18]:
# assigning a float into an integer array silently drops the fractional part:
# 4.5 is stored as 4 because the array's dtype is int32
array1 = np.array([1, 2, 3, 4], dtype='int32')
array1[2] = 4.5
array1

array([1, 2, 4, 4], dtype=int32)

Unlike lists, arrays can be multidimensional

In [19]:
# a list of equal-length rows gives a 2-D array (3 rows x 4 columns)
multidim = np.array([
    [1, 2, 3, 12],
    [4, 5, 6, 11],
    [7, 8, 9, 10],
])

In [20]:
# shape is reported as (number of rows, number of columns)
multidim.shape

(3, 4)

__From Scratch__

In [21]:
# an array of any shape can be created pre-filled with zeros
np.zeros(shape=(6, 2))

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [25]:
# the dtype can be passed here as well; the zeros are then integers
np.zeros(shape=(2, 6), dtype=int)

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [23]:
# np.full builds an array of the given shape with every element set to one value
np.full(shape=(3, 7), fill_value=0.23)

array([[0.23, 0.23, 0.23, 0.23, 0.23, 0.23, 0.23],
       [0.23, 0.23, 0.23, 0.23, 0.23, 0.23, 0.23],
       [0.23, 0.23, 0.23, 0.23, 0.23, 0.23, 0.23]])

In [27]:
# a 3 x 5 integer array of ones; np.full((3, 5), 1) builds the same array
tester = np.ones((3, 5), dtype=int)

Other useful methods for creating arrays:

- `np.arange`

- `np.linspace`

- `np.random.random`

In [28]:
# overwrite a single element in place: row index 2, column index 2
tester[2,2] = 5
tester

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 5, 1, 1]])

In [29]:
# .T is the transpose: rows and columns are swapped, so (3, 5) becomes (5, 3)
tester.T

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 5],
       [1, 1, 1],
       [1, 1, 1]])

In [31]:
# 10 random numbers drawn from a uniform distribution
np.random.random(10)

array([0.54250685, 0.46903215, 0.43365438, 0.34346243, 0.64741838,
       0.6259192 , 0.11883143, 0.74424408, 0.92641591, 0.69751749])

In [32]:
# 10 draws from the standard normal distribution (same result as np.random.randn(10))
np.random.standard_normal(10)

array([ 0.00243473,  0.01336678, -1.35427955,  1.27300099,  1.08257173,
        1.9793423 , -1.62795075, -1.12163812,  2.38948793, -0.11054567])

In [33]:
# 10 draws from a normal distribution with mean 69 and standard deviation 2
np.random.normal(loc=69, scale=2, size=10)

array([71.39205118, 66.00420233, 67.14039805, 72.87239491, 69.55624993,
       73.66506792, 70.22551638, 68.24882608, 68.36607497, 68.56208317])

## Descriptive Statistics with Numpy

In [38]:
# Draw a sample of size 10 from a normally distributed population
# (mean 10, standard deviation 1).
# Random draws differ on every run, so the generator is seeded first to make
# the sample reproducible.
np.random.seed(60120)
sample1 = np.random.normal(loc=10, scale=1, size=10)  # loc = mean, scale = standard deviation
sample1

array([11.09688495,  9.82876882, 10.89658301,  7.64402997,  9.05907328,
       10.00666684,  9.72806703,  9.95520623, 12.27507952, 10.28825701])

In [40]:
## what is the mean of sample1?

In [41]:
# arithmetic mean of the sample
np.mean(sample1)

10.077861664932225

In [None]:
## what is the median of sample1?

In [43]:
# median of the sample
np.median(sample1)

9.980936535974632

In [45]:
## sorting sample1
# NOTE: sample1.sort() reorders sample1 itself, so every later cell sees the sorted values
sample1.sort()     # sorts in place: changes the original array
np.sort(sample1)   # returns a sorted copy and does not change the original - assign it to a variable to keep it

array([ 7.64402997,  9.05907328,  9.72806703,  9.82876882,  9.95520623,
       10.00666684, 10.28825701, 10.89658301, 11.09688495, 12.27507952])

In [None]:
## what is the 0.1 percentile of sample1?

In [50]:
# np.percentile expects q on a 0-100 scale, so the 0.1 quantile is q = 10
# (q = .1 would ask for the 0.1th percentile, i.e. practically the minimum).
# NOTE(review): `interpolation=` matches the NumPy 1.18 used here; newer NumPy
# releases call this keyword `method=` - confirm before upgrading.
np.percentile(sample1, q=10)                            # linear interpolation: usually not a value from the sample
np.percentile(sample1, q=10, interpolation='lower')     # nearest sample value at or below the percentile position
np.percentile(sample1, q=10, interpolation='higher')    # nearest sample value at or above it - the only result displayed

9.059073278428512

In [53]:
## Where is the max/min in sample1?
# argmin / argmax return the *position* of the extreme value (comparable to pandas' idxmin / idxmax)
# For 2-D arrays, pass axis=0 to get one result per column or axis=1 to get one result per row.
(
    sample1.min(),
    sample1.max(),
    sample1.argmin(),
    sample1.argmax(),
)

(7.6440299682270565, 12.275079515331011, 0, 9)

In [None]:
## The same values can be printed with different string-formatting styles
# printf-style `%` formatting: two decimal places
print('Maximum of sample1 is %.2f' % sample1.max())
# str.format: `{}` is replaced by the argument
print('The index of the max in sample1 is {}'.format(sample1.argmax()))

[Comparison between % and format](https://stackoverflow.com/questions/5082452/string-formatting-vs-format)

[Descriptive Statistics](https://www.hackerearth.com/blog/developers/descriptive-statistics-python-numpy/)