### Numpy Basics 

What you will find in NumPy:

- ndarray, an efficient multidimensional array providing fast array-oriented arithmetic operations and flexible broadcasting capabilities.
- Mathematical functions for fast operations on entire arrays of data without having to write for loops.
- Tools for reading/writing array data to disk and working with memory-mapped files.
- Linear algebra, random number generation, and Fourier transform capabilities.
- A C API for connecting NumPy with libraries written in C, C++, or FORTRAN.

### REMEMBER INDEXING STARTS AT 0

In [2]:
import numpy as np 

In [16]:
data = np.random.randn(2,3)

In [17]:
data

array([[ 1.30598025,  1.2604482 , -1.26504574],
       [ 0.72884391, -0.09399044, -0.3502273 ]])

In [18]:
data * 10

array([[ 13.05980248,  12.60448198, -12.65045741],
       [  7.28843905,  -0.93990441,  -3.50227305]])

In [19]:
data + data

array([[ 2.6119605 ,  2.5208964 , -2.53009148],
       [ 1.45768781, -0.18798088, -0.70045461]])

In [20]:
data.shape

(2, 3)

In [21]:
data.dtype

dtype('float64')

How to create a NumPy array

In [22]:
data1 = [6, 7.5, 8, 0, 1]

In [23]:
arr1 = np.array(data1)

In [25]:
arr1

array([6. , 7.5, 8. , 0. , 1. ])

Nested Sequences (like a list of equal length lists) will be converted into a multidimensional array: 

In [26]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]

In [27]:
arr2 = np.array(data2)

In [28]:
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

You can convert an array from one dtype to another using ndarray's astype

In [29]:
arr = np.array([1, 2, 3, 4, 5])

In [30]:
arr.dtype

dtype('int64')

In [31]:
float_arr = arr.astype(np.float64)

In [32]:
float_arr.dtype

dtype('float64')

If I cast floating-point numbers to an integer dtype, the decimal part is truncated (toward zero), not rounded:

In [33]:
arr = np.array ([3.7, -1.2, -2.6])

In [34]:
arr

array([ 3.7, -1.2, -2.6])

In [36]:
arr.astype(np.int32)

array([ 3, -1, -2], dtype=int32)

If you have an array of strings that represent numbers, you can use astype to convert them to numeric form:

In [39]:
numeric_strings = np.array(['1.25', '-9.6', 42], dtype=np.string_)

In [40]:
numeric_strings.astype(float)

array([ 1.25, -9.6 , 42.  ])

Arrays are important because they enable you to express batch operations on data without writing any for loops. Numpy users call this vectorization. 

In [41]:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])

In [42]:
arr

array([[1., 2., 3.],
       [4., 5., 6.]])

In [43]:
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [44]:
arr - arr

array([[0., 0., 0.],
       [0., 0., 0.]])

In [45]:
1/ arr

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])

In [46]:
arr ** 0.5

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

Comparisons between arrays of the same size yield boolean arrays: 

In [47]:
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])

In [48]:
arr2

array([[ 0.,  4.,  1.],
       [ 7.,  2., 12.]])

In [49]:
arr2 > arr

array([[False,  True, False],
       [ True, False,  True]])

### Basic Indexing and Slicing 

In [50]:
arr = np.arange(10)

In [51]:
arr


array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [52]:
arr[5]

5

In [57]:
arr[5:8] = 12

In [58]:
arr

array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

In [59]:
arr_slice = arr[5:8]

In [60]:
arr_slice

array([12, 12, 12])

In [61]:
arr_slice[1] = 12345

In [62]:
arr

array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,
           9])

In [63]:
arr_slice[:] = 64

In [64]:
arr

array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

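Array slices are views on the original array, so modifying a slice modifies the source array, as shown above. If you want a copy of a slice of an ndarray instead of a view, you need to copy it explicitly, for example arr[5:8].copy().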

In [65]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

In [66]:
arr2d

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [67]:
arr2d[2]

array([7, 8, 9])

In [68]:
arr2d[0][2]

3

In [69]:
arr2d[:2, 1:]

array([[2, 3],
       [5, 6]])

In [70]:
arr2d[:2, 1:] = 0

In [71]:
arr2d

array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

### Boolean Indexing 

In [72]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

In [73]:
data = np.random.randn(7, 4)

In [74]:
names

array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], dtype='<U4')

In [75]:
data

array([[ 0.63643105, -1.52295898,  1.13105541,  0.06121658],
       [ 0.06361788,  0.49769599,  1.67800017,  2.74559778],
       [-0.63765636, -1.40693924, -0.92822585,  1.04918251],
       [ 1.00625476,  0.68807245, -0.72484228,  0.76438984],
       [-0.58874641,  1.54936109,  1.04343258, -0.38628951],
       [ 0.59392693, -1.06952695,  0.63805728,  0.80179648],
       [-1.80121687,  1.10256566,  2.17254557, -2.04519312]])

In [76]:
names == 'Bob'

array([ True, False, False,  True, False, False, False])

In [77]:
data[names == 'Bob']

array([[ 0.63643105, -1.52295898,  1.13105541,  0.06121658],
       [ 1.00625476,  0.68807245, -0.72484228,  0.76438984]])

In [79]:
data[names == 'Bob', 2:]

array([[ 1.13105541,  0.06121658],
       [-0.72484228,  0.76438984]])

To select everything but 'Bob', you can either use != or negate the condition using ~

In [80]:
data[~(names == 'Bob')]

array([[ 0.06361788,  0.49769599,  1.67800017,  2.74559778],
       [-0.63765636, -1.40693924, -0.92822585,  1.04918251],
       [-0.58874641,  1.54936109,  1.04343258, -0.38628951],
       [ 0.59392693, -1.06952695,  0.63805728,  0.80179648],
       [-1.80121687,  1.10256566,  2.17254557, -2.04519312]])

In [81]:
cond = names == 'Bob'

In [82]:
data[~cond]

array([[ 0.06361788,  0.49769599,  1.67800017,  2.74559778],
       [-0.63765636, -1.40693924, -0.92822585,  1.04918251],
       [-0.58874641,  1.54936109,  1.04343258, -0.38628951],
       [ 0.59392693, -1.06952695,  0.63805728,  0.80179648],
       [-1.80121687,  1.10256566,  2.17254557, -2.04519312]])

In [83]:
mask = (names == 'Bob') | (names == 'Will')

In [84]:
mask

array([ True, False,  True,  True,  True, False, False])

In [85]:
data[mask]

array([[ 0.63643105, -1.52295898,  1.13105541,  0.06121658],
       [-0.63765636, -1.40693924, -0.92822585,  1.04918251],
       [ 1.00625476,  0.68807245, -0.72484228,  0.76438984],
       [-0.58874641,  1.54936109,  1.04343258, -0.38628951]])

Setting values with boolean arrays works in a common-sense way. To set all of the negative values in data to 0 we need only do:

In [86]:
data[data < 0] = 0

In [87]:
data

array([[0.63643105, 0.        , 1.13105541, 0.06121658],
       [0.06361788, 0.49769599, 1.67800017, 2.74559778],
       [0.        , 0.        , 0.        , 1.04918251],
       [1.00625476, 0.68807245, 0.        , 0.76438984],
       [0.        , 1.54936109, 1.04343258, 0.        ],
       [0.59392693, 0.        , 0.63805728, 0.80179648],
       [0.        , 1.10256566, 2.17254557, 0.        ]])

### Array Functions 

In [88]:
arr = np.arange(10)

In [89]:
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [90]:
np.sqrt(arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [91]:
np.exp(arr)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

These are referred to as unary ufuncs (universal functions): they take one array and apply an operation element-wise.

In [99]:
x = np.random.randn(8)
x

array([-0.22326232, -0.84013256,  0.28482067,  0.36435158, -0.07568938,
       -0.54549194, -1.70795271,  1.35976642])

In [100]:
y = np.random.randn(8)
y

array([ 0.39860623,  2.19170456,  0.89721904,  0.53532719, -2.25103231,
       -0.75700653, -0.89465968,  0.09596084])

In [101]:
np.maximum(x, y)

array([ 0.39860623,  2.19170456,  0.89721904,  0.53532719, -0.07568938,
       -0.54549194, -0.89465968,  1.35976642])

np.maximum is a binary ufunc: it takes two arrays and returns the element-wise maximum. Search for 'NumPy ufuncs' or 'Binary Universal Functions' for more options.

### Array-Oriented Programming with Arrays 

.where

In [108]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])

In [109]:
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])

In [110]:
cond = np.array([True, False, True, True, False])

Suppose you want to take a value from xarr whenever the corresponding value in cond is True, and otherwise take the value from yarr. A list comprehension doing this might look like:

In [113]:
result = [(x if c else y)
          for x, y, c in zip(xarr, yarr, cond)]

In [114]:
result

[1.1, 2.2, 1.3, 1.4, 2.5]

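The vectorized equivalent of the list comprehension above is np.where(cond, xarr, yarr), which is faster on large arrays and also works on multidimensional arrays. A typical use of where in data analysis is to produce a new array of values based on another array: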

In [115]:
arr = np.random.randn(4, 4)

In [116]:
arr

array([[-2.90149601e-01, -1.15494134e+00,  1.52473738e+00,
        -1.87802052e-01],
       [-3.42375842e-01,  1.51020636e+00,  1.48244599e-01,
         5.93584032e-01],
       [-2.78382367e-01, -1.09082026e+00, -1.96956988e+00,
         5.55264482e-01],
       [ 1.11312972e-03,  1.83555392e+00, -2.74625038e-01,
        -5.67330214e-01]])

In [117]:
arr > 0

array([[False, False,  True, False],
       [False,  True,  True,  True],
       [False, False, False,  True],
       [ True,  True, False, False]])

In [118]:
np.where(arr > 0, 2, -2)
# every True in (arr > 0) has been replaced with 2 and every False with -2

array([[-2, -2,  2, -2],
       [-2,  2,  2,  2],
       [-2, -2, -2,  2],
       [ 2,  2, -2, -2]])

Math and Stats Methods

In [119]:
arr = np.random.randn(5, 4)

In [120]:
arr

array([[-0.01390772,  2.42708522,  0.7722513 ,  0.37821441],
       [ 2.2401655 , -0.20082766,  0.05329887,  1.52824019],
       [ 0.25900656,  0.62590948,  0.36548946,  0.40700474],
       [ 0.75200386,  0.60635381,  1.02913584, -0.8308229 ],
       [-0.77249533,  0.74823203,  0.14858617, -1.63935256]])

In [121]:
arr.mean()

0.4441785631769026

In [122]:
arr.sum()

8.883571263538052

In [123]:
arr.mean(axis=1)
# compute the mean across the columns (one value per row)

array([ 0.8909108 ,  0.90521922,  0.41435256,  0.38916765, -0.37875742])

In [124]:
arr.sum(axis=0)
# compute the sum down the rows (one value per column)

array([ 2.46477287,  4.20675287,  2.36876164, -0.15671612])

Look up basic array statistical methods 

### Methods for Boolean Arrays 

Boolean values are coerced to 1 (True) and 0 (False) in the preceding methods. Thus, sum is often used as a means of counting True values in a boolean array:

In [125]:
arr = np.random.randn(100)

In [126]:
(arr > 0).sum()

46

In [128]:
bools = np.array([False, False, True, False])

In [129]:
bools.any()

True

In [130]:
bools.all()

False

### Sorting 

In [131]:
arr = np.random.randn(6)

In [132]:
arr

array([-0.21237372, -0.21833075, -4.02287055, -0.58650665, -0.05977331,
        0.53181432])

In [133]:
arr.sort()

In [134]:
arr

array([-4.02287055, -0.58650665, -0.21833075, -0.21237372, -0.05977331,
        0.53181432])

### Unique and Other Set Logic 

In [135]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])

In [136]:
np.unique(names)

array(['Bob', 'Joe', 'Will'], dtype='<U4')

In [137]:
ints = np.array([3, 3, 1, 2])

In [138]:
np.unique(ints)

array([1, 2, 3])

In [139]:
sorted(set(names))

['Bob', 'Joe', 'Will']

Another function, np.in1d, tests membership of the values in one array in another, returning a boolean array:

In [142]:
values = np.array([6, 0, 0, 3, 2, 5, 6])

In [144]:
tvalues = np.array([1, 2, 3, 4, 5, 6, 7])

In [141]:
np.in1d(values, [2, 3, 6])

array([ True, False, False,  True,  True, False,  True])

In [145]:
np.setdiff1d(values, tvalues)

array([0])

setdiff1d(x, y): set difference; the elements in x that are not in y.

In [147]:
np.setxor1d(values, tvalues)

array([0, 1, 4, 7])

setxor1d(x, y): set symmetric difference; the elements that are in either of the arrays but not both.

### Pandas Basics: 

In [150]:
import pandas as pd

### Series 

A Series is a one-dimensional array-like object containing a sequence of values and an associated array of data labels, called its index.

In [22]:
obj = pd.Series([4, 7, -5, 3])

In [23]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [24]:
obj.values

array([ 4,  7, -5,  3])

In [25]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [26]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [27]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [28]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [29]:
obj2['a']

-5

In [30]:
obj2['d'] = 6

In [31]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [32]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [33]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [34]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

Create a Pandas Series from a dictionary

In [35]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [36]:
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

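When you pass only a dict, the index of the resulting Series takes the dict's keys in insertion order, as the output above shows (older pandas versions sorted the keys instead). You can override this by passing the keys in the order you want them to appear in the resulting Series; keys with no matching entry in the dict get NaN: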

In [37]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [38]:
obj4 = pd.Series(sdata, index=states)

In [39]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [40]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [41]:
import pandas as pd

In [44]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [45]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [46]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [47]:
obj4.name = 'population'

In [48]:
obj4.index.name = 'state'

In [50]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [51]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [52]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [53]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index. Under the hood, the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays. The exact details of DataFrame's internals are outside the scope of these notes.

In [54]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]} 

In [55]:
frame = pd.DataFrame(data)

In [56]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [57]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [58]:
pd.DataFrame(data, columns=['year', 'state', 'pop',])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [61]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five', 'six'])

In [62]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [63]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [64]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [65]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [66]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [67]:
frame2['debt'] = 16.5

In [68]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [69]:
frame2['debt'] = np.arange(6.)

In [70]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [71]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [73]:
frame3 = pd.DataFrame(pop)

In [74]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [75]:
#transpose 
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5
