In [1]:
import pandas as pd

# Data Structure

# 1_Series

A Series is a one-dimensional array-like object containing a sequence of values (of
similar types to NumPy types) and an associated array of data labels, called its index.
The simplest Series is formed from only an array of data:

In [2]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
obj2 = pd.Series([4,7,-5,3], index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [7]:
obj2['a']

-5

In [8]:
obj2['d'] = 6

In [9]:
obj2['d', 'a'] = [6, 8]
obj2

d    6
b    7
a    8
c    3
dtype: int64

In [10]:
obj2[['c', 'a', 'd']]

c    3
a    8
d    6
dtype: int64

In [11]:
obj2[obj2 > 0]

d    6
b    7
a    8
c    3
dtype: int64

In [12]:
obj * 2

0     8
1    14
2   -10
3     6
dtype: int64

In [13]:
import numpy as np
np.exp(obj2)

d     403.428793
b    1096.633158
a    2980.957987
c      20.085537
dtype: float64

In [14]:
'b' in obj2

True

In [15]:
'e' in obj2

False

In [16]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [17]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index = states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [18]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [19]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [20]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [21]:
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [22]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [23]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [25]:
obj4.name = 'population'

In [26]:
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [27]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [28]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# 2_DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string,
boolean, etc.). The DataFrame has both a row and column index; it can be thought of
as a dict of Series all sharing the same index. Under the hood, the data is stored as one
or more two-dimensional blocks rather than a list, dict, or some other collection of
one-dimensional arrays.

In [29]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [30]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [31]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [32]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], 
                     index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [33]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [34]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [35]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [36]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [37]:
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [38]:
val = pd.Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [39]:
frame2['eastern'] = frame2.state == 'ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,False
two,2001,Ohio,1.7,-1.2,False
three,2002,Ohio,3.6,,False
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [40]:
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [41]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [42]:
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [43]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [44]:
pd.DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [45]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [46]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [47]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [48]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

# 3_Index_Objects

Pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels you use when
constructing a Series or DataFrame is internally converted to an Index:

In [49]:
obj = pd.Series(range(3), index = ['a', 'b', 'c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [50]:
index[1:]

Index(['b', 'c'], dtype='object')

In [51]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [52]:
obj2 = pd.Series([1.5, -2.5, 0], index = labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [53]:
obj2.index is labels

True

In [54]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [55]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [56]:
'Ohio' in frame3.columns

True

In [57]:
2003 in frame3.index

False

In [58]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

# 4_Essential Functionality

## 4.1_Reindexing

An important method on pandas objects is reindex, which means to create a new
object with the data conformed to a new index. Consider an example:

In [59]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [60]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling
of values when reindexing. The method option allows us to do this, using a
method such as ffill, which forward-fills the values:

In [61]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [62]:
obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [63]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), 
                    index=['a', 'c', 'd'], 
                    columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [64]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [65]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


## 4.2_Dropping Entries from an Axis

Dropping one or more entries from an axis is easy if you already have an index array
or list without those entries. As that can require a bit of munging and set logic, the
drop method will return a new object with the indicated value or values deleted from
an axis:

In [66]:
obj = pd.Series(np.arange(5,), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [67]:
new_obj = obj.drop('c')
new_obj

a    0
b    1
d    3
e    4
dtype: int32

In [68]:
obj.drop(['d', 'c'])

a    0
b    1
e    4
dtype: int32

In [69]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [70]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [71]:
data.drop('two', axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [72]:
data.drop(['two', 'four'], axis = 'columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


Many functions, like drop, which modify the size or shape of a Series or DataFrame,
can manipulate an object in-place without returning a new object:

In [73]:
obj.drop('c', inplace = True)
obj

a    0
b    1
d    3
e    4
dtype: int32

## 4.3_Indexing, Selection, and Filtering

Series indexing (obj[...]) works analogously to NumPy array indexing, except you
can use the Series’s index values instead of only integers. Here are some examples of
this:

In [74]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [75]:
obj['b']

1.0

In [76]:
obj[1]

1.0

In [77]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [78]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [79]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [80]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint
is inclusive:

In [81]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [82]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

Indexing into a DataFrame is for retrieving one or more columns either with a single
value or sequence:

In [83]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [84]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [85]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [86]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [87]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


The row selection syntax data[:2] is provided as a convenience. Passing a single element
or a list to the [] operator selects columns.

Another use case is in indexing with a boolean DataFrame, such as one produced by a
scalar comparison:

In [88]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [89]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## 4.4_Selection with loc and iloc

For DataFrame label-indexing on the rows, I introduce the special indexing operators
loc and iloc. They enable you to select a subset of the rows and columns from a
DataFrame with NumPy-like notation using either axis labels (loc) or integers
(iloc).

In [90]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [91]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [92]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [93]:
data.iloc[[1,2], [3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [94]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [95]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


## 4.5_Integer Indexes

Working with pandas objects indexed by integers is something that often trips up
new users due to some differences with indexing semantics on built-in Python data
structures like lists and tuples. For example, you might not expect the following code
to generate an error:

In [96]:
ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [97]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]

2.0

In [98]:
ser[:1]

0    0.0
dtype: float64

In [99]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [100]:
ser.iloc[:1]

0    0.0
dtype: float64

## 4.6_Arithmetic and Data Alignment

An important pandas feature for some applications is the behavior of arithmetic
between objects with different indexes. When you are adding together objects, if any
index pairs are not the same, the respective index in the result will be the union of the
index pairs. For users with database experience, this is similar to an automatic outer
join on the index labels. Let’s look at an example:

In [101]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [102]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [103]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [104]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

The internal data alignment introduces missing values in the label locations that don’t
overlap. Missing values will then propagate in further arithmetic computations.

In [105]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                             index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [106]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [107]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [108]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [109]:
df1 = pd.DataFrame({'A' : [1, 2]})

In [110]:
df2 = pd.DataFrame({'B' : [3, 4]})

In [111]:
df1

Unnamed: 0,A
0,1
1,2


In [112]:
df2

Unnamed: 0,B
0,3
1,4


In [113]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


## 4.6_Arithmetic methods with fill values

In arithmetic operations between differently indexed objects, you might want to fill
with a special value, like 0, when an axis label is found in one object but not the other:

In [114]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), 
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))

df2.loc[1, 'b'] = np.nan
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [115]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [116]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [117]:
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [118]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [119]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [120]:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


## 4.7_Operations between DataFrame and Series

As with NumPy arrays of different dimensions, arithmetic between DataFrame and
Series is also defined. First, as a motivating example, consider the difference between
a two-dimensional array and one of its rows:

In [121]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [122]:
arr[0]

array([0., 1., 2., 3.])

In [123]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [124]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [125]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [126]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [127]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [128]:
series3 = frame['d']
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [129]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [130]:
frame.sub(series3, axis = 'index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## 4.8_Function Application and Mapping

In [131]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.948558,-1.388561,-0.509855
Ohio,-1.48354,0.034738,-0.111894
Texas,0.185181,0.044817,0.194475
Oregon,-0.630653,0.11861,-0.112806


In [132]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.948558,1.388561,0.509855
Ohio,1.48354,0.034738,0.111894
Texas,0.185181,0.044817,0.194475
Oregon,0.630653,0.11861,0.112806


In [133]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    1.668720
d    1.507171
e    0.704331
dtype: float64

In [134]:
frame.apply(f, axis = 'columns')

Utah      0.878706
Ohio      1.518278
Texas     0.149658
Oregon    0.749263
dtype: float64

In [135]:
def f(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.48354,-1.388561,-0.509855
max,0.185181,0.11861,0.194475


In [136]:
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.95,-1.39,-0.51
Ohio,-1.48,0.03,-0.11
Texas,0.19,0.04,0.19
Oregon,-0.63,0.12,-0.11


In [137]:
frame['e'].map(format)

Utah      -0.51
Ohio      -0.11
Texas      0.19
Oregon    -0.11
Name: e, dtype: object

## 4.9_Sorting and Ranking

In [138]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [139]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [140]:
frame.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [141]:
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [142]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [143]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [144]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [145]:
frame.sort_values(by = 'b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [146]:
frame.sort_values(by = ['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [147]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [148]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [149]:
obj.rank(ascending = False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [150]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [151]:
frame.rank(axis = 'columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## 4.10_Axis Indexes with Duplicate Labels

In [153]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [154]:
obj.index.is_unique

False

In [155]:
obj['a']

a    0
a    1
dtype: int64

In [156]:
obj['c']

4

In [157]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,0.123856,-1.255315,-1.774863
a,1.016034,0.108184,1.15928
b,-1.259852,-0.282586,0.237398
b,-0.314988,-0.946154,-1.049276


In [158]:
df.loc['b']

Unnamed: 0,0,1,2
b,-1.259852,-0.282586,0.237398
b,-0.314988,-0.946154,-1.049276


# 5_Summarizing and Computing Descriptive Statistics

Pandas objects are equipped with a set of common mathematical and statistical methods.
Most of these fall into the category of reductions or summary statistics, methods
that extract a single value (like the sum or mean) from a Series or a Series of values
from the rows or columns of a DataFrame. Compared with the similar methods
found on NumPy arrays, they have built-in handling for missing data. Consider a
small DataFrame:

In [159]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [160]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [161]:
df.sum(axis = 'columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [162]:
df.mean(axis = 'columns', skipna = False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [165]:
df.idxmax()

one    b
two    d
dtype: object

In [166]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [167]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [168]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## 5.1_Correlation and Covariance

Some summary statistics, like correlation and covariance, are computed from pairs of
arguments. Let’s consider some DataFrames of stock prices and volumes obtained
from Yahoo! Finance using the add-on pandas-datareader package. If you don’t
have it installed already, it can be obtained via conda or pip:

In [170]:
import pandas_datareader.data as web

In [171]:
all_data = {ticker: web.get_data_yahoo(ticker)
           for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [173]:
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})

volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})

In [174]:
returns = price.pct_change()

In [175]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-08-17,-0.002611,-0.006626,0.006606,0.006798
2020-08-18,0.008333,0.003857,0.005754,0.026759
2020-08-19,0.001255,-0.008646,-0.006067,-0.007102
2020-08-20,0.02219,-0.005572,0.023271,0.022113
2020-08-21,0.038015,-0.000641,-0.006012,0.002124


In [176]:
returns['MSFT'].corr(returns['IBM'])

0.5861967685728635

In [178]:
returns['MSFT'].cov(returns['IBM'])

0.00016410387773140356

In [179]:
returns.MSFT.corr(returns.IBM)

0.5861967685728635

In [180]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.513439,0.707359,0.662543
IBM,0.513439,1.0,0.586197,0.540432
MSFT,0.707359,0.586197,1.0,0.784282
GOOG,0.662543,0.540432,0.784282,1.0


In [181]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000339,0.000152,0.000227,0.000202
IBM,0.000152,0.000258,0.000164,0.000144
MSFT,0.000227,0.000164,0.000304,0.000226
GOOG,0.000202,0.000144,0.000226,0.000274


In [182]:
returns.corrwith(returns.IBM)

AAPL    0.513439
IBM     1.000000
MSFT    0.586197
GOOG    0.540432
dtype: float64

In [183]:
returns.corrwith(volume)

AAPL   -0.077297
IBM    -0.099064
MSFT   -0.046476
GOOG   -0.139843
dtype: float64

## 5.2_Unique Values, Value Counts, and Membership

In [184]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [185]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [186]:
pd.value_counts(obj.values, sort = False)

c    3
d    1
a    3
b    2
dtype: int64

In [187]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [188]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [189]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [190]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])

In [192]:
unique_vals = pd.Series(['c', 'b', 'a'])

In [193]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [194]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [195]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


# Data Loading, Storage, and File Formats

Accessing data is a necessary first step for using most of the tools in this book. I’m
going to be focused on data input and output using pandas, though there are numerous
tools in other libraries to help with reading and writing data in various formats.
Input and output typically falls into a few main categories: reading text files and other
more efficient on-disk formats, loading data from databases, and interacting with network
sources like web APIs.

# 6_Reading and Writing Data in Text Format

Pandas features a number of functions for reading tabular data as a DataFrame
object. Tthough read_csv and read_table are
likely the ones you’ll use the most.

In [197]:
df = pd.read_csv('examples/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [199]:
pd.read_table('examples/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [202]:
pd.read_csv('examples/ex2.csv', header = None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [203]:
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [204]:
names = ['a', 'b', 'c', 'd', 'message']

In [205]:
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [206]:
parsed = pd.read_csv('examples/csv_mindex.csv', 
                    index_col = ['key1', 'key2'])

parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [207]:
list(open('examples/ex3.txt'))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [208]:
result = pd.read_table('examples/ex3.txt', sep='\s+')
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [209]:
result = pd.read_csv('examples/ex5.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [210]:
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [211]:
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [212]:
sentinels = {'messae': ['foo', 'NA'], 'something': ['two']}

In [213]:
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,foo


## 6.1_Reading Text Files in Pieces

When processing very large files or figuring out the right set of arguments to correctly
process a large file, you may only want to read in a small piece of a file or iterate
through smaller chunks of the file.

In [214]:
pd.options.display.max_rows = 10

In [215]:
result = pd.read_csv('examples/ex6.csv')
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [216]:
pd.read_csv('examples/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [217]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x17cbf0abbe0>

## 6.2_Writing Data to Text Format

In [218]:
data = pd.read_csv('examples/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [219]:
data.to_csv('examples/out.csv')

In [220]:
import sys

In [221]:
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [222]:
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [223]:
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [224]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [225]:
dates = pd.date_range('1/1/2000', periods=7)

In [226]:
ts = pd.Series(np.arange(7), index=dates)

In [227]:
ts.to_csv('examples/tseries.csv')

In [228]:
!cat examples/tseries.csv

'cat' is not recognized as an internal or external command,
operable program or batch file.


## 6.3_Working with Delimited Formats

In [230]:
import csv

In [231]:
f = open('examples/ex7.csv')
reader = csv.reader(f)

It’s possible to load most forms of tabular data from disk using functions like pan
das.read_table. In some cases, however, some manual processing may be necessary.
It’s not uncommon to receive a file with one or more malformed lines that trip up
read_table. To illustrate the basic tools, consider a small CSV file

In [232]:
for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [233]:
for line in reader:
    print(line)

In [234]:
with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))

In [235]:
header, values = lines[0], lines[1:]

In [237]:
data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

## 6.4_JSON Data

JSON (short for JavaScript Object Notation) has become one of the standard formats
for sending data by HTTP request between web browsers and other applications. It is
a much more free-form data format than a tabular text form like CSV. Here is an
example:

In [242]:
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
{"name": "Katie", "age": 38,
"pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [243]:
import json

In [244]:
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 30, 'pets': ['Zeus', 'Zuko']},
  {'name': 'Katie', 'age': 38, 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [245]:
asjson = json.dumps(result)

In [246]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [247]:
!cat examples/example.json

'cat' is not recognized as an internal or external command,
operable program or batch file.


In [248]:
data = pd.read_json('examples/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [249]:
print(data.to_json)

<bound method NDFrame.to_json of    a  b  c
0  1  2  3
1  4  5  6
2  7  8  9>


In [250]:
print(data.to_json(orient = 'records'))

[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


## 6.5_XML and HTML: Web Scraping

Python has many libraries for reading and writing data in the ubiquitous HTML and
XML formats. Examples include lxml, Beautiful Soup, and html5lib. While lxml is
comparatively much faster in general, the other libraries can better handle malformed
HTML or XML files.
pandas has a built-in function, read_html, which uses libraries like lxml and Beautiful
Soup to automatically parse tables out of HTML files as DataFrame objects. To
show how this works, I downloaded an HTML file (used in the pandas documentation)
from the United States FDIC government agency showing bank failures.1 First,
you must install some additional libraries used by read_html:

In [251]:
tables = pd.read_html('examples/fdic_failed_bank_list.html')
len(tables)

1

In [252]:
failures = tables[0]
failures.head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


In [253]:
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000      2
Name: Closing Date, Length: 15, dtype: int64

## 6.6_Parsing XML with lxml.objectify

XML (eXtensible Markup Language) is another common structured data format supporting
hierarchical, nested data with metadata. The book you are currently reading
was actually created from a series of large XML documents.
Earlier, I showed the pandas.read_html function, which uses either lxml or Beautiful
Soup under the hood to parse data from HTML. XML and HTML are structurally
similar, but XML is more general. Here, I will show an example of how to use lxml to
parse data from a more general XML format.
The New York Metropolitan Transportation Authority (MTA) publishes a number of
data series about its bus and train services. Here we’ll look at the performance data,
which is contained in a set of XML files. Each train or bus service has a different file
(like Performance_MNR.xml for the Metro-North Railroad) containing monthly data
as a series of XML records that look like this:

In [258]:
from lxml import objectify

In [259]:
perf = pd.DataFrame(data)

In [261]:
perf.head()

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [262]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

In [263]:
root

<Element a at 0x17cc09d3100>

In [264]:
root.get('href')

'http://www.google.com'

In [265]:
root.text

'Google'

# 7_Binary Data Formats

One of the easiest ways to store data (also known as serialization) efficiently in binary
format is using Python’s built-in pickle serialization. pandas objects all have a
to_pickle method that writes the data to disk in pickle format:

In [266]:
frame = pd.read_csv('examples/ex1.csv')
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [267]:
frame.to_pickle('examples/frame_pickle')

In [268]:
pd.read_pickle('examples/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## 7.1_Using HDF5 Format

HDF5 is a well-regarded file format intended for storing large quantities of scientific
array data. It is available as a C library, and it has interfaces available in many other
languages, including Java, Julia, MATLAB, and Python. The “HDF” in HDF5 stands
for hierarchical data format. Each HDF5 file can store multiple datasets and supporting
metadata. Compared with simpler formats, HDF5 supports on-the-fly compression
with a variety of compression modes, enabling data with repeated patterns to be
stored more efficiently. HDF5 can be a good choice for working with very large datasets
that don’t fit into memory, as you can efficiently read and write small sections of
much larger arrays.
While it’s possible to directly access HDF5 files using either the PyTables or h5py
libraries, pandas provides a high-level interface that simplifies storing Series and
DataFrame object. The HDFStore class works like a dict and handles the low-level
details:

In [269]:
frame = pd.DataFrame({'a': np.random.randn(100)})

In [271]:
store = pd.HDFStore('mydata.h5')

In [272]:
store['obj1'] = frame

In [273]:
store['obj1_col'] = frame['a']
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [274]:
store['obj1']

Unnamed: 0,a
0,-0.712055
1,0.148076
2,0.034380
3,0.261267
4,0.963669
...,...
95,-0.126203
96,0.258020
97,3.139202
98,-1.649423


In [275]:
store.put('obj2', frame, format = 'table')
store.select('obj2', where = ['index >= 10 and index <= 15'])
store.close()

In [276]:
frame.to_hdf('mydata.h5', 'obj3', format = 'table')

In [277]:
pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])

Unnamed: 0,a
0,-0.712055
1,0.148076
2,0.03438
3,0.261267
4,0.963669


## 7.2_Reading Microsoft Excel Files

Pandas also supports reading tabular data stored in Excel 2003 (and higher) files
using either the ExcelFile class or pandas.read_excel function. Internally these
tools use the add-on packages xlrd and openpyxl to read XLS and XLSX files, respectively.
You may need to install these manually with pip or conda.

In [278]:
xlsx = pd.ExcelFile('examples/ex1.xlsx')

In [279]:
pd.read_excel(xlsx, 'Sheet1')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [280]:
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [281]:
writer = pd.ExcelWriter('examples/ex2.xlsx')

In [282]:
frame.to_excel(writer, 'Sheet1')

In [283]:
writer.save()

In [284]:
frame.to_excel('examples/ex2.xlsx')

# 8_Interacting with Web APIs

Many websites have public APIs providing data feeds via JSON or some other format.
There are a number of ways to access these APIs from Python; one easy-to-use
method that I recommend is the requests package.

In [285]:
import requests

In [286]:
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

In [287]:
resp = requests.get(url)
resp

<Response [200]>

In [288]:
data = resp.json()

In [289]:
data[0]['title']



In [290]:
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

Unnamed: 0,number,title,labels,state
0,35845,CI: Revert 31323 for deprecation warning from ...,[],open
1,35843,REGR: Series.__repr__ is broken for SparseDtyp...,"[{'id': 32815646, 'node_id': 'MDU6TGFiZWwzMjgx...",open
2,35841,REF: simplify _cython_agg_blocks,[],open
3,35839,REF: remove unnecesary try/except,[],open
4,35838,"Fix Series construction from Sparse[""datetime6...","[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
...,...,...,...,...
25,35799,TST: resample does not yield empty groups (#10...,[],open
26,35798,Bump asv Python version,"[{'id': 732775912, 'node_id': 'MDU6TGFiZWw3MzI...",open
27,35797,Fix is_categorical_dtype for Sparse[category],"[{'id': 78527356, 'node_id': 'MDU6TGFiZWw3ODUy...",open
28,35796,ENH:dataframe columns dropping with column nam...,"[{'id': 76812, 'node_id': 'MDU6TGFiZWw3NjgxMg=...",open


# 9_Interacting with Databases

In a business setting, most data may not be stored in text or Excel files. SQL-based
relational databases (such as SQL Server, PostgreSQL, and MySQL) are in wide use,
and many alternative databases have become quite popular. The choice of database is
usually dependent on the performance, data integrity, and scalability needs of an
application.
Loading data from SQL into a DataFrame is fairly straightforward, and pandas has
some functions to simplify the process. As an example, I’ll create a SQLite database
using Python’s built-in sqlite3 driver:

In [291]:
import sqlite3

In [294]:
query = """
CREATE TABLE test1(a VARCHAR(20), b VARCHAR(20),c REAL, d INTEGER);"""

In [295]:
con = sqlite3.connect('mydata.sqlite')
con.execute(query)

<sqlite3.Cursor at 0x17cc1124e30>

In [296]:
con.commit()

In [297]:
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]

In [298]:
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

In [299]:
con.executemany(stmt, data)

<sqlite3.Cursor at 0x17cc10cc880>

In [300]:
con.commit()

In [301]:
cursor = con.execute('select * from test1')

In [302]:
rows = cursor.fetchall()
rows

[]

In [303]:
cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

In [304]:
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])

Unnamed: 0,a,b,c,d


In [305]:
import sqlalchemy as sqla

In [306]:
db = sqla.create_engine('sqlite:///mydata.sqlite')

In [307]:
pd.read_sql('select * from test', db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


# Data Cleaning and Preparation

# 10_Handling Missing Data

Missing data occurs commonly in many data analysis applications. One of the goals
of pandas is to make working with missing data as painless as possible. For example,
all of the descriptive statistics on pandas objects exclude missing data by default.
The way that missing data is represented in pandas objects is somewhat imperfect,
but it is functional for a lot of users. For numeric data, pandas uses the floating-point
value NaN (Not a Number) to represent missing data. We call this a sentinel value that
can be easily detected:

In [308]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [309]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [310]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## 10.1_Filtering Out Missing Data

There are a few ways to filter out missing data. While you always have the option to
do it by hand using pandas.isnull and boolean indexing, the dropna can be helpful.
On a Series, it returns the Series with only the non-null data and index values:

In [311]:
from numpy import nan as NA

In [312]:
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [314]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [315]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])

In [317]:
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [318]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [319]:
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [320]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [321]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [322]:
df = pd.DataFrame(np.random.randn(7, 3))

In [323]:
df.iloc[:4, 1] = NA

In [325]:
df.iloc[2, 2] = NA
df

Unnamed: 0,0,1,2
0,1.756833,,-0.620877
1,-1.018609,,0.892303
2,-0.645588,,
3,-0.147291,,0.211741
4,0.458434,0.982918,1.624429
5,-1.127536,-1.060043,-0.615365
6,0.634723,0.735118,0.61511


In [326]:
df.dropna()

Unnamed: 0,0,1,2
4,0.458434,0.982918,1.624429
5,-1.127536,-1.060043,-0.615365
6,0.634723,0.735118,0.61511


In [327]:
df.dropna(thresh = 2)

Unnamed: 0,0,1,2
0,1.756833,,-0.620877
1,-1.018609,,0.892303
3,-0.147291,,0.211741
4,0.458434,0.982918,1.624429
5,-1.127536,-1.060043,-0.615365
6,0.634723,0.735118,0.61511


## 10.2_Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways. For most purposes,
the fillna method is the workhorse function to use. Calling fillna with a
constant replaces missing values with that value:

In [328]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.756833,0.0,-0.620877
1,-1.018609,0.0,0.892303
2,-0.645588,0.0,0.0
3,-0.147291,0.0,0.211741
4,0.458434,0.982918,1.624429
5,-1.127536,-1.060043,-0.615365
6,0.634723,0.735118,0.61511


In [329]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,1.756833,0.5,-0.620877
1,-1.018609,0.5,0.892303
2,-0.645588,0.5,0.0
3,-0.147291,0.5,0.211741
4,0.458434,0.982918,1.624429
5,-1.127536,-1.060043,-0.615365
6,0.634723,0.735118,0.61511


In [330]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,1.756833,0.0,-0.620877
1,-1.018609,0.0,0.892303
2,-0.645588,0.0,0.0
3,-0.147291,0.0,0.211741
4,0.458434,0.982918,1.624429
5,-1.127536,-1.060043,-0.615365
6,0.634723,0.735118,0.61511


In [331]:
df = pd.DataFrame(np.random.randn(6, 3))

In [332]:
df.iloc[2:, 1] = NA

In [333]:
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.566479,1.491068,-1.700689
1,0.655457,-1.75248,-1.204699
2,-1.826244,,1.271903
3,0.263801,,0.265469
4,0.588067,,
5,-0.149191,,


In [334]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.566479,1.491068,-1.700689
1,0.655457,-1.75248,-1.204699
2,-1.826244,-1.75248,1.271903
3,0.263801,-1.75248,0.265469
4,0.588067,-1.75248,0.265469
5,-0.149191,-1.75248,0.265469


In [335]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.566479,1.491068,-1.700689
1,0.655457,-1.75248,-1.204699
2,-1.826244,-1.75248,1.271903
3,0.263801,-1.75248,0.265469
4,0.588067,,0.265469
5,-0.149191,,0.265469


In [336]:
data = pd.Series([1., NA, 3.5, NA, 7])

In [337]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 11_Data Transformation

## 11.1_Removing Duplicates

In [338]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [339]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [340]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [341]:
data['v1'] = range(7)

In [342]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [343]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 11.2_Transforming Data Using a Function or Mapping

In [344]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [345]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [346]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

## 11.3_Replacing Values

Filling in missing data with the fillna method is a special case of more general value
replacement. As you’ve already seen, map can be used to modify a subset of values in
an object but replace provides a simpler and more flexible way to do so. Let’s consider
this Series:

In [349]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [350]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [351]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [352]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [353]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 11.4_Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping
of some form to produce new, differently labeled objects. You can also modify
the axes in-place without creating a new data structure. Here’s a simple example:

In [354]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [355]:
transform = lambda x: x[:4].upper()

In [356]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [357]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [358]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [359]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [360]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 11.5_Discretization and Binning

Continuous data is often discretized or otherwise separated into “bins” for analysis.
Suppose you have data about a group of people in a study, and you want to group
them into discrete age buckets:

In [None]:
# book page 203