# Review: Data Types and Summary Statistics

|Aggregate Stat | Quantitative Continuous | Quantitative Discrete | Qualitative Ordinal | Qualitative Nominal |
|------|------|------|------|------|
| unique values | yes* | yes | yes | yes |
| min | yes | yes | yes | no |
| max | yes | yes | yes | no |
| range | Y | Y | y | N |
| mean |  Y|  Y|  N| N |
| median |Y | Y| Y| N |
| mode | Y |  Y|  Y| N |
| variance | Y | Y | N |N  |


## Copying numpy arrays

This is a reminder to use "deep copy" rather than "shallow copy"

In [1]:
# let's try the obvious thing
import numpy as np
nparray = np.array([[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33]])
nparray2 = nparray
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)
print(nparray.size)
print(nparray.ndim)
print(nparray.shape)

nparray
[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
nparray2
[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
16
2
(4, 4)


In [3]:
nparray[0,0] = 200
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

# whaaat just happened?  #nparray2 changed too, SAME MEMORY LOCATION

nparray
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]
nparray2
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]


In [4]:
# how do we stop that happening?? hint, what are we doing? we are *copying*
nparray2 = nparray.copy()
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

nparray
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]
nparray2
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]


In [5]:
nparray[0,0] = 0
print("nparray")
print(nparray)
print("nparray2")
print(nparray2)

nparray
[[ 0  1  2  3]
 [10 11 12 13]
 [20 21 22 23]
 [30 31 32 33]]
nparray2
[[200   1   2   3]
 [ 10  11  12  13]
 [ 20  21  22  23]
 [ 30  31  32  33]]


## Doing things to whole numpy arrays (broadcasting)

In [2]:
import numpy as np

nparray = np.array([[0, 1, 2, 3,3], [10, 11, 12, 13,3], [20, 21, 22, 23,3], [30, 31, 32, 33,3]])
print("nparray\n", nparray)
print("nparray shape\n", nparray.shape)
print(nparray.ndim)

nparray
 [[ 0  1  2  3  3]
 [10 11 12 13  3]
 [20 21 22 23  3]
 [30 31 32 33  3]]
nparray shape
 (4, 5)
2


In [5]:
# what if I want every element in nparray * 2?
print(nparray)
print(nparray[1,])
print(nparray*2)

[[ 0  1  2  3  3]
 [10 11 12 13  3]
 [20 21 22 23  3]
 [30 31 32 33  3]]
[10 11 12 13  3]
[[ 0  2  4  6  6]
 [20 22 24 26  6]
 [40 42 44 46  6]
 [60 62 64 66  6]]


In [15]:
# what if I want every element in nparray / 2?
print(nparray/2)
print(nparray.shape)
# watch out!!
nparray=nparray.astype(int)
print(nparray)

[[ 0.   0.5  1.   1.5  1.5]
 [ 5.   5.5  6.   6.5  1.5]
 [10.  10.5 11.  11.5  1.5]
 [15.  15.5 16.  16.5  1.5]]
(4, 5)
[[ 0  1  2  3  3]
 [10 11 12 13  3]
 [20 21 22 23  3]
 [30 31 32 33  3]]


In [16]:
# let's get some summary statistics

data = np.genfromtxt('data/vehiclesNumeric.csv', dtype=int, delimiter=',', skip_header=1, encoding='utf8')
print(data[0:10])   #row 0 to 10 and all colums
print(data.shape)
# no for loops!
print("max", data.max(axis=0), "min", data.min(axis=0), "mean", data.mean(axis=0, dtype=int))   #axis 0 columns colums, brroadcasting is columnwise, statstics per colum


FileNotFoundError: data/vehiclesNumeric.csv not found.

In [29]:
# (review!) how do we assign value(s) to a row or column?
print(nparray)
print(nparray.shape[1])
nparray[:1] = np.zeros(nparray.shape[1])   #up to 1st row exculsive 
print(nparray)

nparray[:,:0] =np.zeros(nparray.shape[0])

[[ 0  1  2  3  3]
 [10 11 12 13  3]
 [20 21 22 23  3]
 [30 31 32 33  3]]
5
[[ 0  0  0  0  0]
 [10 11 12 13  3]
 [20 21 22 23  3]
 [30 31 32 33  3]]


In [20]:
print(nparray[:,:1].shape)  #all arrays up to 1st column
print(nparray[:,:1])
print(nparray[:,0].shape)
print(nparray[:,0])



(4, 1)
[[ 0]
 [10]
 [20]
 [30]]
(4,)
[ 0 10 20 30]


In [21]:
# let's sum across each column
np.sum(nparray, axis=0)

array([60, 64, 68, 72, 12])

In [22]:
# how would we sum across each row?
np.sum(nparray, axis=1)

array([  9,  49,  89, 129])

In [23]:
# what if we had a tensor?
nptensorFloat = np.ones([3, 4, 5])
print(nptensorFloat)

np.sum(nptensorFloat, axis=2)
print(np.sum(nptensorFloat, axis=2).shape)

[[[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]

 [[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]

 [[1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]
  [1. 1. 1. 1. 1.]]]
(3, 4)


In [None]:
# what if we don't specify an axis?
print(np.sum(nptensorFloat, axis=2))

In [None]:
# what other functions can we apply across axes?

In [35]:
# let's take it up a notch

nparrayRandomInt = np.random.randint(low=0, high=10, size=(3,4))
print(nparrayRandomInt)

print(nparrayRandomInt - np.min(nparrayRandomInt, axis=0))

# whaaat just happened? let's look at the shapes


[[1 5 1 5]
 [9 8 1 2]
 [0 7 1 8]]
[[1 0 0 3]
 [9 3 0 0]
 [0 2 0 6]]


In [None]:
# what if we try to do the subtract-min thing across axis 1?
print(nparrayRandomInt - np.min(nparrayRandomInt, axis=1))


In [None]:
# how can we fix that? make the arrays shape-compatible!
print(nparrayRandomInt - np.min(nparrayRandomInt, axis=1)[:, np.newaxis])

In [None]:
# is there another way to achieve this?
print(nparrayRandomInt - np.min(nparrayRandomInt, axis=1, keepdims=True))


## Why numpy?

Numpy is space efficient
(reference: https://www.geeksforgeeks.org/python-lists-vs-numpy-arrays/)

- very space efficient because it's based on C
-more time efficietn

In [24]:
# importing numpy package
import numpy as np
  
# importing system module
import sys
  
# declaring a list of 1000 elements 
S= range(1000)
  
# printing size of each element of the list
print("Size of each element of list in bytes: ",sys.getsizeof(S))
  
# printing size of the whole list
print("Size of the whole list in bytes: ",sys.getsizeof(S)*len(S))
  
# declaring a Numpy array of 1000 elements 
D= np.arange(1000)
  
# printing size of each element of the Numpy array
print("Size of each element of the Numpy array in bytes: ",D.itemsize)
  
# printing size of the whole Numpy array
print("Size of the whole Numpy array in bytes: ",D.size*D.itemsize)

Size of each element of list in bytes:  48
Size of the whole list in bytes:  48000
Size of each element of the Numpy array in bytes:  8
Size of the whole Numpy array in bytes:  8000


Numpy *can be* more time efficient (reference: https://stackoverflow.com/questions/9708783/numpy-vs-list-comprehension-which-is-faster)

In [None]:
import sys, numpy
import timeit #times things, use numpy for fast programming

def numpysum(n):
    a = numpy.arange(n) ** 2
    b = numpy.arange(n) ** 3
    return a + b

def pythonsum(n):
    a = [i ** 2 for i in range(n)]
    b = [i ** 3 for i in range(n)]
    return [a[i] + b[i] for i in range(n)]

for size in [10, 100, 1000]:
    print("size", size)
    print("time with python", timeit.timeit(lambda: pythonsum(size)))
    print("time with numpy", timeit.timeit(lambda: numpysum(size)))