# Agenda

1. 2D NumPy
2. Pandas
    - Series (1D)
    - DataFrame (2D)
3. Reading from CSV

In [1]:
import numpy as np

In [5]:
np.random.seed(0)
a = np.random.randint(0, 100, 20)
a

array([44, 47, 64, 67, 67,  9, 83, 21, 36, 87, 70, 88, 88, 12, 58, 65, 39,
       87, 46, 88])

In [3]:
a.dtype

dtype('int64')

In [7]:
np.random.seed(0)
a = np.random.randint(0, 100, 20, dtype=np.int8)
a

array([44, 10, 12, 47, 42, 68, 23, 38, 22, 55, 64, 76, 33, 88, 67, 51, 78,
       26, 82, 34], dtype=int8)

In [8]:
b = np.array([10, 20, 30, 40])

In [9]:
# a has shape (20,) and b has shape (4,): the shapes are incompatible, so
# NumPy cannot broadcast them. The error is the point of this cell, so we
# catch and display it instead of letting it stop "Restart & Run All".
try:
    a + b
except ValueError as e:
    print(f'ValueError: {e}')

ValueError: operands could not be broadcast together with shapes (20,) (4,) 

In [10]:
a.shape

(20,)

In [11]:
b.shape

(4,)

In [12]:
a.shape = (5, 4)

In [13]:
a

array([[44, 10, 12, 47],
       [42, 68, 23, 38],
       [22, 55, 64, 76],
       [33, 88, 67, 51],
       [78, 26, 82, 34]], dtype=int8)

In [14]:
np.random.seed(0)
a = np.random.randint(0, 100, 20, dtype=np.int8).reshape(5, 4)

In [15]:
a

array([[44, 10, 12, 47],
       [42, 68, 23, 38],
       [22, 55, 64, 76],
       [33, 88, 67, 51],
       [78, 26, 82, 34]], dtype=int8)

In [16]:
np.random.seed(0)
a = np.random.randint(0, 100, [5,4], dtype=np.int8)
a

array([[44, 10, 12, 47],
       [42, 68, 23, 38],
       [22, 55, 64, 76],
       [33, 88, 67, 51],
       [78, 26, 82, 34]], dtype=int8)

In [19]:
# how can I get to 64?

a[2][2]    # don't do this, even if it works: it indexes twice (a[2] yields the whole row first, then [2] picks from that row)

64

In [21]:
a[2,2]    # this is how we should retrieve from NumPy

64

In [22]:
a[0, 2]

12

In [23]:
a[2, 3]  # this is a tuple 

76

In [24]:
a[(2,3)]      # this syntax works, but is ugly -- don't do it

76

In [25]:
# what if I want both 55 and 64 from index 2?

a[2, [1,2]]      # row index 2, columns 1+2

array([55, 64], dtype=int8)

In [26]:
a[2, 1:3]        # row index 2, columns 1 up to, not including 3

array([55, 64], dtype=int8)

In [27]:
a[1:3, 1:3]

array([[68, 23],
       [55, 64]], dtype=int8)

In [28]:
a[1:4:2, 1:3]   # rows 1+3, columns 1+2

array([[68, 23],
       [88, 67]], dtype=int8)

In [29]:
a

array([[44, 10, 12, 47],
       [42, 68, 23, 38],
       [22, 55, 64, 76],
       [33, 88, 67, 51],
       [78, 26, 82, 34]], dtype=int8)

In [30]:
a.sum()

960

In [31]:
a.sum(axis=0)   # sums all of the numbers in each column, gives us a new row-sized array

array([219, 247, 248, 246])

In [32]:
a.sum(axis=1)   # sums all of the numbers in each row, gives us a new column-sized array

array([113, 171, 217, 239, 220])

In [33]:
a.min(axis=0)

array([22, 10, 12, 34], dtype=int8)

In [34]:
a.max(axis=1)

array([47, 68, 76, 88, 82], dtype=int8)

In [35]:
a.std(axis=0)

array([18.78722971, 28.16806703, 27.13374283, 14.71597771])

In [36]:
a.mean(axis=0)

array([43.8, 49.4, 49.6, 49.2])

# Exercises: NumPy

1. Create a 2-dimensional NumPy array with 45 numbers from 0-100, with 5 rows and 9 columns.
2. Retrieve row index 2.
3. Retrieve column index 3.  (Think about a slice, and how you could get the items.)
4. Retrieve the rows at indexes 1 and 4.
5. Retrieve the columns at indexes 1 and 4.
6. Get the mean of the even-valued items in row index 4.
7. Get the mean of the odd-valued items in column index 3.

In [37]:
a

array([[44, 10, 12, 47],
       [42, 68, 23, 38],
       [22, 55, 64, 76],
       [33, 88, 67, 51],
       [78, 26, 82, 34]], dtype=int8)

In [38]:
a.transpose()

array([[44, 42, 22, 33, 78],
       [10, 68, 55, 88, 26],
       [12, 23, 64, 67, 82],
       [47, 38, 76, 51, 34]], dtype=int8)

In [39]:
a.T

array([[44, 42, 22, 33, 78],
       [10, 68, 55, 88, 26],
       [12, 23, 64, 67, 82],
       [47, 38, 76, 51, 34]], dtype=int8)

In [40]:
np.random.seed(0)
a = np.random.randint(0, 100, [5, 9], dtype=np.int8)
a

array([[44, 10, 12, 47, 42, 68, 23, 38, 22],
       [55, 64, 76, 33, 88, 67, 51, 78, 26],
       [82, 34, 91, 67, 11, 88,  9, 36, 83],
       [94, 33, 31, 21, 81, 89, 37, 86, 98],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [41]:
# row index 2
a[2]

array([82, 34, 91, 67, 11, 88,  9, 36, 83], dtype=int8)

In [42]:
# column index 3
a.T[3]

array([47, 33, 67, 21, 87], dtype=int8)

In [44]:
# column index 3, using a slice
a[0:5, 3]

array([47, 33, 67, 21, 87], dtype=int8)

In [45]:
# column index 3, using a different slice
a[:, 3]

array([47, 33, 67, 21, 87], dtype=int8)

In [46]:
# rows at index 1+4

a[[1,4]]

array([[55, 64, 76, 33, 88, 67, 51, 78, 26],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [50]:
# row indexes 1+4, using a slice
a[1:5:3]

array([[55, 64, 76, 33, 88, 67, 51, 78, 26],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [51]:
a[:, [1,4]]

array([[10, 42],
       [64, 88],
       [34, 11],
       [33, 81],
       [55, 58]], dtype=int8)

In [52]:
a[:, 1:5:3]

array([[10, 42],
       [64, 88],
       [34, 11],
       [33, 81],
       [55, 58]], dtype=int8)

In [56]:
# mean of even items in row index 4
a[4][a[4] % 2 == 0].mean()

60.0

In [60]:
# mean of odd items in column index 3
a[:, 3][a[:, 3] % 2 == 1].mean()

51.0

In [61]:
a

array([[44, 10, 12, 47, 42, 68, 23, 38, 22],
       [55, 64, 76, 33, 88, 67, 51, 78, 26],
       [82, 34, 91, 67, 11, 88,  9, 36, 83],
       [94, 33, 31, 21, 81, 89, 37, 86, 98],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [62]:
b = a.reshape(9,5)
b

array([[44, 10, 12, 47, 42],
       [68, 23, 38, 22, 55],
       [64, 76, 33, 88, 67],
       [51, 78, 26, 82, 34],
       [91, 67, 11, 88,  9],
       [36, 83, 94, 33, 31],
       [21, 81, 89, 37, 86],
       [98, 36, 55,  5, 87],
       [58, 43, 76, 70, 60]], dtype=int8)

In [63]:
# A 3x4 shape holds 12 elements, but a has 45, so reshape refuses.
# The error is the point of this cell, so we catch and display it instead
# of letting it stop "Restart & Run All".
try:
    a.reshape(3, 4)
except ValueError as e:
    print(f'ValueError: {e}')

ValueError: cannot reshape array of size 45 into shape (3,4)

In [64]:
a

array([[44, 10, 12, 47, 42, 68, 23, 38, 22],
       [55, 64, 76, 33, 88, 67, 51, 78, 26],
       [82, 34, 91, 67, 11, 88,  9, 36, 83],
       [94, 33, 31, 21, 81, 89, 37, 86, 98],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [65]:
b

array([[44, 10, 12, 47, 42],
       [68, 23, 38, 22, 55],
       [64, 76, 33, 88, 67],
       [51, 78, 26, 82, 34],
       [91, 67, 11, 88,  9],
       [36, 83, 94, 33, 31],
       [21, 81, 89, 37, 86],
       [98, 36, 55,  5, 87],
       [58, 43, 76, 70, 60]], dtype=int8)

In [66]:
id(a)

4926381136

In [67]:
id(b)

4926388720

In [68]:
a[0, 0] = 99

In [69]:
a

array([[99, 10, 12, 47, 42, 68, 23, 38, 22],
       [55, 64, 76, 33, 88, 67, 51, 78, 26],
       [82, 34, 91, 67, 11, 88,  9, 36, 83],
       [94, 33, 31, 21, 81, 89, 37, 86, 98],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [70]:
b

array([[99, 10, 12, 47, 42],
       [68, 23, 38, 22, 55],
       [64, 76, 33, 88, 67],
       [51, 78, 26, 82, 34],
       [91, 67, 11, 88,  9],
       [36, 83, 94, 33, 31],
       [21, 81, 89, 37, 86],
       [98, 36, 55,  5, 87],
       [58, 43, 76, 70, 60]], dtype=int8)

In [72]:
help(np.reshape)

Help on function reshape in module numpy:

reshape(a, newshape, order='C')
    Gives a new shape to an array without changing its data.
    
    Parameters
    ----------
    a : array_like
        Array to be reshaped.
    newshape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    order : {'C', 'F', 'A'}, optional
        Read the elements of `a` using this index order, and place the
        elements into the reshaped array using this index order.  'C'
        means to read / write the elements using C-like index order,
        with the last axis index changing fastest, back to the first
        axis index changing slowest. 'F' means to read / write the
        elements using Fortran-like index order, with the first index
        c

In [73]:
b = a.reshape(9, 5).copy()

In [75]:
help(np.copy)

Help on function copy in module numpy:

copy(a, order='K', subok=False)
    Return an array copy of the given object.
    
    Parameters
    ----------
    a : array_like
        Input data.
    order : {'C', 'F', 'A', 'K'}, optional
        Controls the memory layout of the copy. 'C' means C-order,
        'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
        'C' otherwise. 'K' means match the layout of `a` as closely
        as possible. (Note that this function and :meth:`ndarray.copy` are very
        similar, but have different default values for their order=
        arguments.)
    subok : bool, optional
        If True, then sub-classes will be passed-through, otherwise the
        returned array will be forced to be a base-class array (defaults to False).
    
        .. versionadded:: 1.19.0
    
    Returns
    -------
    arr : ndarray
        Array interpretation of `a`.
    
    See Also
    --------
    ndarray.copy : Preferred method for creating an arr

In [76]:
b.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [77]:
b= a.reshape(9, 5)
b.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [78]:
a

array([[99, 10, 12, 47, 42, 68, 23, 38, 22],
       [55, 64, 76, 33, 88, 67, 51, 78, 26],
       [82, 34, 91, 67, 11, 88,  9, 36, 83],
       [94, 33, 31, 21, 81, 89, 37, 86, 98],
       [36, 55,  5, 87, 58, 43, 76, 70, 60]], dtype=int8)

In [79]:
a.sort()

In [80]:
a

array([[10, 12, 22, 23, 38, 42, 47, 68, 99],
       [26, 33, 51, 55, 64, 67, 76, 78, 88],
       [ 9, 11, 34, 36, 67, 82, 83, 88, 91],
       [21, 31, 33, 37, 81, 86, 89, 94, 98],
       [ 5, 36, 43, 55, 58, 60, 70, 76, 87]], dtype=int8)

In [83]:
a.sort(axis=0)

In [84]:
a

array([[ 5, 11, 22, 23, 38, 42, 47, 68, 87],
       [ 9, 12, 33, 36, 58, 60, 70, 76, 88],
       [10, 31, 34, 37, 64, 67, 76, 78, 91],
       [21, 33, 43, 55, 67, 82, 83, 88, 98],
       [26, 36, 51, 55, 81, 86, 89, 94, 99]], dtype=int8)

In [85]:
help(a.sort)

Help on built-in function sort:

sort(...) method of numpy.ndarray instance
    a.sort(axis=-1, kind=None, order=None)
    
    Sort an array in-place. Refer to `numpy.sort` for full documentation.
    
    Parameters
    ----------
    axis : int, optional
        Axis along which to sort. Default is -1, which means sort along the
        last axis.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm. The default is 'quicksort'. Note that both 'stable'
        and 'mergesort' use timsort under the covers and, in general, the
        actual implementation will vary with datatype. The 'mergesort' option
        is retained for backwards compatibility.
    
        .. versionchanged:: 1.15.0
           The 'stable' option was added.
    
    order : str or list of str, optional
        When `a` is an array with fields defined, this argument specifies
        which fields to compare first, second, etc.  A single field can
        be specified as

In [86]:
import random

# Seed the stdlib generator so the same ten integers come out on every run.
random.seed(0)

# Draw ten integers from -50 through 50 (randint includes both endpoints).
numbers = []
for _ in range(10):
    numbers.append(random.randint(-50, 50))
numbers

[-1, 47, 3, -45, -17, 15, 12, 1, 50, -12]

In [87]:
sorted(numbers, key=abs)

[-1, 1, 3, 12, -12, 15, -17, -45, 47, 50]

In [89]:
# Take absolute values in one vectorized call, then sort ascending.
# (NumPy's sort has no `key=` argument, so the transformation has to be
# applied to the data itself before sorting.)
a = np.sort(np.abs(numbers))
a

array([ 1,  1,  3, 12, 12, 15, 17, 45, 47, 50])

In [91]:
import pandas as pd
from pandas import Series, DataFrame

# Pandas data structures:

- Series -- 1-dimensional data
- DataFrame -- 2-dimensional data

In [92]:
s = Series([10, 20, 30, 40, 50, 60])
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

In [93]:
s.mean()

35.0

In [94]:
s.std()

18.708286933869708

In [96]:
s.min()

10

In [97]:
s.max()

60

In [98]:
s.sum()

210

In [99]:
s.values

array([10, 20, 30, 40, 50, 60])

In [100]:
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

In [101]:
s[3]

40

In [102]:
s[[1, 5]]

1    20
5    60
dtype: int64

In [103]:
s[1:5:2]

1    20
3    40
dtype: int64

In [104]:
# I can set the index of my series!
s = Series([10, 20, 30, 40, 50], index=[2,4,6,8,10])

In [105]:
s

2     10
4     20
6     30
8     40
10    50
dtype: int64

In [106]:
# There are two ways to get data from our series:
# - .loc[i], using the index that we set
# - .iloc[i], using the numeric locations, starting with 0

In [107]:
s.loc[4]

20

In [108]:
s.iloc[4]

50

In [109]:
# I can set the index to be strings
s = Series([10, 20, 30, 40, 50], index=list('abcde'))
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [110]:
s.loc['b']

20

In [111]:
s.iloc[3]

40

In [112]:
s.loc[['b', 'd']]

b    20
d    40
dtype: int64

In [113]:
s.loc['b':'d']   # label-based slice: unlike a positional slice, the end label ('d') is included

b    20
c    30
d    40
dtype: int64

In [114]:
s.iloc[[2,3]]

c    30
d    40
dtype: int64

In [115]:
s.iloc[2:5:2]

c    30
e    50
dtype: int64

In [117]:
s.loc['a'] = 99   # 'a' is already in the index, so its value is replaced
s.loc[0] = 99     # 0 is NOT in the index, so .loc adds a new element with that label
s

a    99
b    20
c    30
d    40
e    50
0    99
dtype: int64

In [120]:
s.loc['b':'e':2] = 88
s

a    99
b    88
c    30
d    88
e    50
0    99
dtype: int64

In [121]:
s.index = list('abcdef')
s

a    99
b    88
c    30
d    88
e    50
f    99
dtype: int64

In [122]:
s = Series([10, 20, 30])
s

0    10
1    20
2    30
dtype: int64

In [123]:
s = Series(np.array([10, 20, 30]))
s

0    10
1    20
2    30
dtype: int64

In [124]:
s = Series(np.random.randint(0, 100, 10))
s

0    88
1    88
2    12
3    58
4    65
5    39
6    87
7    46
8    88
9    81
dtype: int64

# Exercise: Pandas

1. Create a series that has 10 random integers from 0-100, with the index a-j.
2. Retrieve the value at index b.
3. Retrieve the values at indexes c, d, and f.
4. Calculate the mean of the values at indexes a, e, g, and h.


In [125]:
np.random.seed(0)
s = Series(np.random.randint(0, 100, 10), index=list('abcdefghij'))
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [126]:
s.loc['b']

47

In [127]:
s['b']

47

In [128]:
s.loc[['c', 'd', 'f']]

c    64
d    67
f     9
dtype: int64

In [130]:
s.loc[['a', 'e', 'g', 'h']].mean()

53.75

In [131]:
s1 = Series([10, 20, 30, 40, 50], index=list('abcde'))
s2 = Series([100, 200, 300, 400, 500], index=list('abcde'))



In [132]:
s1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [133]:
s2

a    100
b    200
c    300
d    400
e    500
dtype: int64

In [134]:
s1 + s2

a    110
b    220
c    330
d    440
e    550
dtype: int64

In [135]:
s1 = Series([10, 20, 30, 40, 50], index=list('abcde'))
s2 = Series([100, 200, 300, 400, 500], index=list('edcba'))   # reverse

s1 + s2

a    510
b    420
c    330
d    240
e    150
dtype: int64

In [136]:
s1 = Series([10, 20, 30, 40, 50], index=list('abcde'))
s2 = Series([100, 200, 300, 400, 500], index=list('cdefg'))

s1 + s2

a      NaN
b      NaN
c    130.0
d    240.0
e    350.0
f      NaN
g      NaN
dtype: float64

In [137]:
s1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [139]:
s1.loc['f'] = 60    # adds a new element to s1, since 'f' didn't already exist in its index
s1

a    10
b    20
c    30
d    40
e    50
f    60
dtype: int64

In [141]:
s1.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [142]:
s2.index

Index(['c', 'd', 'e', 'f', 'g'], dtype='object')

In [143]:
s2.index = list('abcde')

In [144]:
s2

a    100
b    200
c    300
d    400
e    500
dtype: int64

In [145]:
s = Series([10, 20, 30, 40, 50], index=list('abcab'))
s


a    10
b    20
c    30
a    40
b    50
dtype: int64

In [146]:
s.loc['a']

a    10
a    40
dtype: int64

In [147]:
# .loc['a'] returns a Series here because the label 'a' is repeated.
# Take its first element by position with .iloc: a plain [0] on a
# string-labelled Series relies on the deprecated "integer key means
# position" fallback.
s.loc['a'].iloc[0]

10

In [148]:
s.loc['b']

b    20
b    50
dtype: int64

In [149]:
s.loc['c']

30

In [150]:
s1 = Series([10, 20, 30, 40, 50], index=list('abcab'))
s2 = Series([100, 200, 300, 400, 500], index=list('abcde'))

s1 + s2

a    110.0
a    140.0
b    220.0
b    250.0
c    330.0
d      NaN
e      NaN
dtype: float64

In [152]:
s1 = Series([10, 20, 30, 40, 50], index=list('abcab'))
s2 = Series([100, 200, 300, 400, 500], index=list('ababa'))

s1 + s2

a    110.0
a    310.0
a    510.0
a    140.0
a    340.0
a    540.0
b    220.0
b    420.0
b    250.0
b    450.0
c      NaN
dtype: float64

In [153]:
s = Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [154]:
s.dtype

dtype('int64')

In [155]:
s.astype(np.int8)

0    10
1    20
2    30
3    40
4    50
dtype: int8

In [156]:
np.random.seed(0)
s = Series(np.random.randint(0, 100, 10), index=list('abcdefghij'))
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [157]:
# descriptive statistics
s.describe()

count    10.000000
mean     52.500000
std      25.674241
min       9.000000
25%      38.000000
50%      55.500000
75%      67.000000
max      87.000000
dtype: float64

In [158]:
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [159]:
s.head()   # show me the 5 first

a    44
b    47
c    64
d    67
e    67
dtype: int64

In [160]:
s.head(3)  # show me the 3 first

a    44
b    47
c    64
dtype: int64

In [161]:
s.tail()

f     9
g    83
h    21
i    36
j    87
dtype: int64

In [162]:
s.tail(3)

h    21
i    36
j    87
dtype: int64

In [163]:
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [164]:
s.loc[['c', 'g']] = np.nan
s

a    44.0
b    47.0
c     NaN
d    67.0
e    67.0
f     9.0
g     NaN
h    21.0
i    36.0
j    87.0
dtype: float64

In [166]:
s.mean() # ignores NaN!

47.25

In [167]:
help(s.mean)

Help on method mean in module pandas.core.generic:

mean(axis: 'int | None | lib.NoDefault' = <no_default>, skipna=True, level=None, numeric_only=None, **kwargs) method of pandas.core.series.Series instance
    Return the mean of the values over the requested axis.
    
    Parameters
    ----------
    axis : {index (0)}
        Axis for the function to be applied on.
    skipna : bool, default True
        Exclude NA/null values when computing the result.
    level : int or level name, default None
        If the axis is a MultiIndex (hierarchical), count along a
        particular level, collapsing into a scalar.
    numeric_only : bool, default None
        Include only float, int, boolean columns. If None, will attempt to use
        everything, then use only numeric data. Not implemented for Series.
    **kwargs
        Additional keyword arguments to be passed to the function.
    
    Returns
    -------
    scalar or Series (if level specified)



In [None]:
# With skipna=False the NaN values are no longer ignored, so any missing
# value makes the mean itself NaN (compare with the default, skipna=True).
s.mean(skipna=False)