<a href="https://colab.research.google.com/github/smiledinisa/data_python_analysis/blob/master/pandas001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第五章 pandas 入门

![替代文字](https://img-blog.csdnimg.cn/20200807151248699.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

In [None]:
from pandas import Series, DataFrame
import pandas as pd


## pandas的数据结构介绍

### Series
由一组数据（numpy）以及一组与之相关的数据标签组成。

In [None]:
obj = Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [None]:
obj.values

array([ 4,  7, -5,  3])

In [None]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
obj2 = Series([4,7,5,3], index=['a', 'b', 'c', 'd'])

In [None]:
obj2

a    4
b    7
c    5
d    3
dtype: int64

In [None]:
obj2['d']

3

In [None]:
# 字典的转换。
sdata = {'ohi':3500, 'texa':7100, 'orgen':1900, 'utah':4000}
obj3 = Series(sdata)

In [None]:
obj3

ohi      3500
texa     7100
orgen    1900
utah     4000
dtype: int64

In [None]:
states = ['aiaang','ohi',  'orgen', 'utah']
obj4 = Series(sdata, index=states)

In [None]:
obj4

aiaang       NaN
ohi       3500.0
orgen     1900.0
utah      4000.0
dtype: float64

In [None]:
# NaN表示缺失。
# isnull和notnull 来检测缺失数据。


In [None]:
pd.isnull(obj4)

aiaang     True
ohi       False
orgen     False
utah      False
dtype: bool

In [None]:
pd.notnull(obj4)

aiaang    False
ohi        True
orgen      True
utah       True
dtype: bool

In [None]:
obj4.isnull()

aiaang     True
ohi       False
orgen     False
utah      False
dtype: bool

以上三种方法都是可以的。

In [None]:
obj3

ohi      3500
texa     7100
orgen    1900
utah     4000
dtype: int64

In [None]:
obj4

aiaang       NaN
ohi       3500.0
orgen     1900.0
utah      4000.0
dtype: float64

In [None]:
obj3+obj4

aiaang       NaN
ohi       7000.0
orgen     3800.0
texa         NaN
utah      8000.0
dtype: float64

Series 对象的name 属性。


In [None]:
obj4.name = 'population'
obj4.index.name = 'state'

In [None]:
obj4

state
aiaang       NaN
ohi       3500.0
orgen     1900.0
utah      4000.0
Name: population, dtype: float64

### DataFrame

In [None]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame
# 传入由等长列表组成的字典，来构建DataFrame
data = {'state': ['Ohio','Ohio','Ohio','Nevada','Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [None]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [None]:
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [None]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five'])

In [None]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [None]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [None]:
frame2.index

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

In [None]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [None]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [None]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [None]:
# the rows also can be retrieved by position or name. use attribute ..loc
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [None]:
# empyt colum 'debt' can be assigned
frame2['debt'] = 100
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,100
two,2001,Ohio,1.7,100
three,2002,Ohio,3.6,100
four,2001,Nevada,2.4,100
five,2002,Nevada,2.9,100


In [None]:
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [None]:
# note the length must match.
# if use a Series assign a colum , the index must be realigned exactly to DataFrame's

# if not , inserting some missing values in any holes. with NaN

val = pd.Series([1.5, 2.5, -3.6], index=['one', 'three', 'five'])
frame2['debt'] = val

frame2

# u can see the result like this:

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,1.5
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,2.5
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,-3.6


In [None]:
 # del keyword. and creat a new colum
 frame2['eastern'] = frame2.state == 'Ohio'
 frame2
 # note: new colums can not be created by frame2.estern

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,1.5,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,2.5,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,-3.6,False


In [None]:
frame2.columns

Index(['year', 'state', 'pop', 'debt', 'eastern'], dtype='object')

In [None]:
del frame2['eastern']

In [None]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [None]:
# note that delete a column aslo can not be use "del frame2.xxx"
# its not copy, its only view on the data



In [None]:
#anothe common form of data is nested dict of dicts:

pop = {'Nevada':{2001: 2.4, 2002: 2.9},
       'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}
       

In [None]:
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [None]:
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [None]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [None]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [None]:
frame3.index.name = 'year'
frame3.columns.name = 'state'

In [None]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [None]:
frame3.values # two dimensional ndarray.

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [None]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,1.5
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,2.5
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,-3.6


In [None]:
frame2.values

array([[2000, 'Ohio', 1.5, 1.5],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, 2.5],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, -3.6]], dtype=object)

所有可能传给DataFrame的值：

![替代文字](https://img-blog.csdnimg.cn/2020080722105763.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

### Index Objects 索引对象
pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels you use when
constructing a Series or DataFrame is internally converted to an Index:

In [None]:
frame3 = DataFrame({'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}})

In [None]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [None]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object')

In [None]:
# like set 
'Ohio' in frame3.columns

True

In [None]:
2001 in frame3.index

True

In [None]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels # but not like set, it can have duplicate lables.!!!

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

Index对象的各种方法和属性。

![替代文字](https://img-blog.csdnimg.cn/20200808095245189.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

In [None]:
dup_labels.delete(2)
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [None]:
dup_labels.delete(3)
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [None]:
print(frame3)
frame3.index.delete(2)
frame3

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5


Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


## Essential Functionality
基本的机制，fundamental mechanics of interacting with the 
data contained.

### Reindexing 索引重建


In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6],index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [None]:
#calling reindex on this series rearrange the data according to the new index:
obj2 = obj.reindex(['a','b','c','d','e'])

In [None]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [None]:
# iterpolation with method keyward.
obj3 = pd.Series(['blue', 'purple', 'yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [None]:
obj3.reindex(range(6), method='ffill') #range boject


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

AttributeError: ignored

In DataFrame it can beused to change the index,columns,both.

In [None]:
import numpy as np
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index=['a', 'b', 'c'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [None]:
frame2 = frame.reindex(['a','c','d','b','e'])

In [None]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
c,6.0,7.0,8.0
d,,,
b,3.0,4.0,5.0
e,,,


In [None]:
# use colums keywards 
frame3= frame.reindex(index=['a','c','d','b','e'], columns=['Texas', 'Ohio', 'California', 'Utah'])
frame3

Unnamed: 0,Texas,Ohio,California,Utah
a,1.0,0.0,2.0,
c,7.0,6.0,8.0,
d,,,,
b,4.0,3.0,5.0,
e,,,,


The function arguments:

frame.reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None)

![替代文字](https://img-blog.csdnimg.cn/20200808103429470.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

### Dropping Entries from an Axis
根据轴来删除条目

obj.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')

In [None]:
# keyward drop.
obj= pd.Series(np.arange(5.0),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [None]:
new_obj = obj.drop('a')

In [None]:
new_obj

b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [None]:
# with DataFrame
data = pd.DataFrame(np.arange(16).reshape(4,4), index=['Ohio', 'Colorada', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorada,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
data.drop(['Colorada', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
# axis = 0 if you drop the index.
#if you want drop columns , can use axis=1 or axis='columns'
data.drop(['one','two'], axis= 1)

Unnamed: 0,three,four
Ohio,2,3
Colorada,6,7
Utah,10,11
New York,14,15


In [None]:
# or like this.
data.drop(columns=['one', 'two'])

Unnamed: 0,three,four
Ohio,2,3
Colorada,6,7
Utah,10,11
New York,14,15


In [None]:
# note the drop function methed 'inplace=False', it return a copy of new one(series,dataframe)
# if we want modify the size of  a Series or DataFrame, we can set it implace=True
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorada,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
data.drop(index=['Ohio'], columns=['two'], inplace=True)


In [None]:
data

Unnamed: 0,one,three,four
Colorada,4,6,7
Utah,8,10,11
New York,12,14,15


### **Indexing, Selection, and Filtering**索引、选择和筛选


#### Indexing

In [None]:
# Series Indexing like numpy array indexing. 
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [None]:
obj['a']

0.0

In [None]:
obj['d']

3.0

In [None]:
obj[['a','c']]

a    0.0
c    2.0
dtype: float64

In [None]:
obj[[1,3]]

b    1.0
d    3.0
dtype: float64

In [None]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [None]:
# slicing like python, but the endpoint is include 
obj['a':'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [None]:
# use slicing method modify the value
obj['a':'c'] = 100

In [None]:
obj

a    100.0
b    100.0
c    100.0
d      3.0
dtype: float64

Indexing into a DataFrame is for retrieving one or more 
columns 

In [None]:
data = pd.DataFrame(np.arange(16).reshape(4,4), index=['Ohio', 'Colorada', 'Utah', 'NewYork'],
                    columns= ['one', 'two', 'trhee', 'four'])
data

Unnamed: 0,one,two,trhee,four
Ohio,0,1,2,3
Colorada,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [None]:
data.loc['Ohio']

one      0
two      1
trhee    2
four     3
Name: Ohio, dtype: int64

In [None]:
data['two']

Ohio         1
Colorada     5
Utah         9
NewYork     13
Name: two, dtype: int64

In [None]:
data[['trhee','two']]

Unnamed: 0,trhee,two
Ohio,2,1
Colorada,6,5
Utah,10,9
NewYork,14,13


In [None]:
data[:2]

Unnamed: 0,one,two,trhee,four
Ohio,0,1,2,3
Colorada,4,5,6,7


In [None]:
data[:1]

Unnamed: 0,one,two,trhee,four
Ohio,0,1,2,3


In [None]:
data[data['trhee'] > 5 ]

Unnamed: 0,one,two,trhee,four
Colorada,4,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


In [None]:
# 为了方便起见，提供了行选择语法数据[：2]


In [None]:
# boolean DataFrame
data < 5


Unnamed: 0,one,two,trhee,four
Ohio,True,True,True,True
Colorada,True,False,False,False
Utah,False,False,False,False
NewYork,False,False,False,False


In [None]:
data[data < 5] = 0 # like numpy case.

In [None]:
data

Unnamed: 0,one,two,trhee,four
Ohio,0,0,0,0
Colorada,0,5,6,7
Utah,8,9,10,11
NewYork,12,13,14,15


#### Selection with loc and iloc   **(loc,iloc)**


``.iloc[]`` is primarily integer position based (from ``0`` to
``length-1`` of the axis), but may also be used with a boolean
array.

Allowed inputs are:

- An integer, e.g. ``5``.
- A list or array of integers, e.g. ``[4, 3, 0]``.
- A slice object with ints, e.g. ``1:7``.
- A boolean array.
- A ``callable`` function with one argument (the calling Series or
  DataFrame) and that returns valid output for indexing (one of the above).
  This is useful in method chains, when you don't have a reference to the
  calling object, but would like to base your selection on some value.

``.loc[]`` is primarily label based, but may also be used with a
boolean array.

Allowed inputs are:

- A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
  interpreted as a *label* of the index, and **never** as an
  integer position along the index).
- A list or array of labels, e.g. ``['a', 'b', 'c']``.
- A slice object with labels, e.g. ``'a':'f'``.


In [None]:
# 从函数的说明可以大致看出iloc 应用于数字，loc应用字符索引。
data.loc['Colorada', ['two','trhee']]

two      5
trhee    6
Name: Colorada, dtype: int64

In [None]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [None]:
data.iloc[2]

one       8
two       9
trhee    10
four     11
Name: Utah, dtype: int64

In [None]:
# slice 
data.loc[:'Utah', 'two']

Ohio        0
Colorada    5
Utah        9
Name: two, dtype: int64

In [None]:
data.loc['Utah', 'two']

9

In [None]:
data.iloc[:,:3]

Unnamed: 0,one,two,trhee
Ohio,0,0,0
Colorada,0,5,6
Utah,8,9,10
NewYork,12,13,14


![替代文字](https://img-blog.csdnimg.cn/20200808120252118.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

In [None]:
import pandas as pd
from pandas import Series as Series
from pandas import DataFrame as df
import numpy as np


### Integer Indexes 整数索引


In [None]:
ser = pd.Series(np.arange(6.0))

In [None]:
ser

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
5    5.0
dtype: float64

In [None]:
ser[-1] # there willl some erro

KeyError: ignored

In [None]:
# but if change the idex to nointeger.
ser2 = pd.Series(np.arange(3.0), index=['a', 'b', 
                                        'c'])

In [None]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [None]:
ser2[-1] # its ok.

2.0

In [None]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [None]:
ser2.iloc[:2]

a    0.0
b    1.0
dtype: float64

In [None]:
ser.iloc[:1]

0    0.0
dtype: float64

#### Arithmetic and Data Alignment. 算法和数据对齐。

In [None]:
# An important pandas feature for some applications is the behavior of arithmetic
# between objects with different indexes. When you are adding together objects, if any
# index pairs are not the same, the respective index in the result will be the union of the
# index pairs. For users with database experience, this is similar to an automatic outer
# join on the index labels. Let’s look at an example:

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index= ['a', 'c', 'd', 'e'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [None]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [None]:
s1 + s2 # like out join

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [None]:
# In the case of DataFrame, alignment is performed on both the rows and the columns:
df1 = df(np.arange(9.).reshape((3,3)), columns=list('bcd'), index= ['Ohio', 'Texas', 'Colorado'])
df2 = df(np.arange(12.).reshape((4,3)), columns=list('bde'), index= ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [None]:
print(df1)
print(df2)

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


In [None]:
# Adding these together returns a DataFrame whose index and columns are the unions
# of the ones in each DataFrame:

df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [None]:
# since the c and e columns, colorada ,oregon, utah, rows not found in both df, so theres the missing reslut .a


In [None]:
# If you add DataFrame objects with no column or row labels in common, the result
# will contain all nulls:
df1 = df({"a":[1,2]})
df2 = df({'b':[3,4]})

In [None]:
df1 + df2

Unnamed: 0,a,b
0,,
1,,


#### Arithmetic methods with fill values 填充值的算数方法。

In [None]:
# In arithmetic operations between differently indexed objects, you might want to fill
# with a special value, like 0, when an axis label is found in one object but not the other:

df1 = df(np.arange(12.).reshape((3,4)), columns= list('abcd'))
df2 = df(np.arange(20.).reshape((4,5)), columns= list('abcde'))

print(df1)
print(df2)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [None]:
df2.loc[1, 'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [None]:
# Adding these together results in NA values in the locations that don’t overlap:

df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [None]:
# Using the  add method on  df1 , passing  df2 and set the argument to fill_value;
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


![替代文字](https://img-blog.csdnimg.cn/20200809100412249.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

In [None]:
df1/100

Unnamed: 0,a,b,c,d
0,0.0,0.01,0.02,0.03
1,0.04,0.05,0.06,0.07
2,0.08,0.09,0.1,0.11


In [None]:
df1.rdiv(1) # 1除以。

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [None]:
df1.div(1)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


#### Operations between DataFrame and Series 


In [None]:
# operations between dataframe and series a similar like brodcasting.

frame = df(np.arange(12.0).reshape((4,3)),
           columns= list('bde'),
           index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
print(frame)
print(series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


In [None]:
# By default, arithmetic between DataFrame and Series matches the index of the Series
# on the DataFrame’s columns, broadcasting down the rows:

frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [None]:
# If an index value is not found in either the DataFrame’s columns or the Series’s index,
# the objects will be reindexed to form the union:?

series2 = pd.Series(range(3), index= list('bef'))
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [None]:
frame.add(series2, axis=1 )

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


### Function Application and Mapping 函数应用与映射


Signature: **frame.applymap(func)** -> **'DataFrame'**
Docstring:
Apply a function to a Dataframe elementwise.

This method applies a function that accepts and returns a scalar
to every element of a DataFrame.






In [None]:
# NumPy ufuncs (element-wise array methods) also work with pandas objects:

frame = df(np.random.randn(4,3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.14465,0.436345,1.485189
Ohio,-0.032256,-0.167276,-1.569843
Texas,1.797647,2.885102,0.778924
Oregon,0.653553,-0.907005,1.209906


In [None]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.14465,0.436345,1.485189
Ohio,0.032256,0.167276,1.569843
Texas,1.797647,2.885102,0.778924
Oregon,0.653553,0.907005,1.209906


In [None]:
# Another frequent operation is applying a function on one-dimensional arrays to each
# column or row. DataFrame’s  apply method does exactly this:

f = lambda x: x.max() - x.min()
frame.apply(f)


b    1.829903
d    3.792107
e    3.055032
dtype: float64

In [None]:
# but if you want to apply it on colunmns:
frame.apply(f, axis = 'columns')

Utah      1.340539
Ohio      1.537587
Texas     2.106178
Oregon    2.116911
dtype: float64

In [None]:
frame.apply(f, axis= 1)

Utah      1.340539
Ohio      1.537587
Texas     2.106178
Oregon    2.116911
dtype: float64

In [None]:
# Many of the most common array statistics (like  sum and  mean ) are DataFrame meth‐
# ods, so using  apply is not necessary.
# The function passed to  apply need not return a scalar value; it can also return a Series
# with multiple values:

def f(x):
  return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)


Unnamed: 0,b,d,e
min,-0.032256,-0.907005,-1.569843
max,1.797647,2.885102,1.485189


In [None]:
# Element-wise Python functions can be used, too. Suppose you wanted to compute a
# formatted string from each floating-point value in  frame . You can do this with  apply
# map :

format = lambda x: '%2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.14465,0.436345,1.485189
Ohio,-0.032256,-0.167276,-1.569843
Texas,1.797647,2.885102,0.778924
Oregon,0.653553,-0.907005,1.209906


In [None]:
frame['e'].map(format)

Utah       1.485189
Ohio      -1.569843
Texas      0.778924
Oregon     1.209906
Name: e, dtype: object

In [None]:
format = lambda x: x**2
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.020924,0.190397,2.205787
Ohio,0.00104,0.027981,2.464407
Texas,3.231533,8.323812,0.606723
Oregon,0.427131,0.822659,1.463872


### Sorting and Ranking

**Signature**: **frame.sort_index**(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, ignore_index: bool=False)

**Docstring**:
Sort object by labels (along an axis).



**Signature**: **obj.sort_values**(axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False)

**Docstring**:
Sort by the values.



In [None]:
# Sorting a dataset by some criterion is another important built-in operation. To sort
# lexicographically(字典顺序) by row or column index, use the  sort_index method, which returns
# a new, sorted object:

obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [None]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [None]:
# With a DataFrame, you can sort by index on either axis:
frame = df(np.arange(8).reshape(2,4), index=['three', 'one'], columns=['d','a','b','c'])

frame.sort_values

<bound method DataFrame.sort_values of        d  a  b  c
three  0  1  2  3
one    4  5  6  7>

In [None]:
frame.sort_index()


Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [None]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [None]:
# The data is sorted in ascending order by default, but can be sorted in descending
# order, too:

frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [None]:
# To sort a Series by its values, use its  sort_values method:

obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [None]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [None]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [None]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [None]:
# When sorting a DataFrame, you can use the data in one or more columns as the sort
# keys. To do so, pass one or more column names to the  by option of  sort_values 

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [None]:
frame.sort_values(by = ['b'])

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [None]:
frame.sort_values(by = ['b', 'a'], ascending= False) # 优先按 b 排序，当b一样的，按a来进行排序。

Unnamed: 0,b,a
1,7,1
0,4,0
3,2,1
2,-3,0


In [None]:
# Ranking assigns ranks from one through the number of valid data points in an array.
# The  rank methods for Series and DataFrame are the place to look; by default  rank
# breaks ties by assigning each group the mean rank:

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [None]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [None]:
# You can rank in descending order, too:

obj.rank(ascending= False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

![替代文字](https://img-blog.csdnimg.cn/20200809114258765.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

### Axis Indexes with Duplicate Labels 具有重复标签的轴索隐


In [29]:
import pandas as pd
from pandas import DataFrame as DF
from pandas import Series as SR
import numpy as np

In [3]:
obj = sr(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [5]:
# use keyward : index_is_unique
obj.index.is_unique

False

In [6]:
# idnexing a label with multiple entries return a  Series
obj['a']

a    0
a    1
dtype: int64

In [7]:
obj['c']

4

In [14]:
# same situation in DataFrame
df2 = DF(np.random.randn(4,3), index=list('aabb'))
df2

Unnamed: 0,0,1,2
a,0.546921,0.609059,-0.792008
a,1.733854,-1.24415,-0.849407
b,0.219873,-0.306571,-0.754854
b,1.066338,1.39027,-0.638894


In [9]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.213814,0.083857,0.667793
b,-0.12013,1.505211,0.690022


## Summarizing and Computing Descriptive Statistics 汇总计算描述性统计

In [11]:
# pandas objects are equipped with a set of common mathematical and statistical meth‐
# ods. Most of these fall into the category of reductions or summary statistics, methods
# that extract a single value (like the sum or mean) from a Series or a Series of values
# from the rows or columns of a DataFrame. Compared with the similar methods
# found on NumPy arrays, they have built-in handling for missing data. Consider a
# small DataFrame:

df1 = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],index=['a', 'b', 'c', 'd'],columns=['one', 'two'])
df1

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [15]:
df1.sum()

one    9.25
two   -5.80
dtype: float64

In [16]:
df1.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [19]:
df1.mean(axis= 1,skipna= False) #不能忽略NA

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [20]:
df1.mean(axis= 1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [22]:
# cap the indirect statistics info:
df1.idxmax()

one    b
two    d
dtype: object

In [23]:
df1.idxmin()

one    d
two    b
dtype: object

In [24]:
df1

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [25]:
df1.cumsum() # 

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [27]:
# the keyward decribe():
print(df1)
df1.describe()

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [30]:
# On non-numeric data,  describe produces alternative summary statistics:
obj = SR(['a', 'a', 'b', 'c'] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [31]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

各种统计函数以及相应参数方法。

![替代文字](https://img-blog.csdnimg.cn/20200810103528495.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

#### Correlation and Covariance 相关以及协方差


In [32]:
# Let’s consider some DataFrames of stock prices and volumes obtained
# from Yahoo!

# we will use pandas-datareader package.


In [33]:
import pandas_datareader.data as web

In [34]:
all_data = {ticker:web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = DF({ticker: data['Adj Close'] for ticker, data in all_data.items()}) #收盘价
volume = DF({ticker: data['Volume'] for ticker,data in all_data.items()}) #成交量。

In [35]:
returns = price.pct_change()

In [37]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-08-03,0.025198,0.011144,0.056241,-0.005739
2020-08-04,0.006678,0.012308,-0.015009,-0.00643
2020-08-05,0.003625,-0.003099,-0.001641,0.005898
2020-08-06,0.034889,0.005341,0.016014,0.017976
2020-08-07,-0.022736,0.003775,-0.017888,-0.00374


In [40]:
# 计算不同股票涨跌的相关性。corr ，协方差 cov
returns['IBM'].corr(returns['GOOG'])

0.5454259521629349

In [42]:
returns['IBM'].cov(returns['GOOG'])

0.00014626407944867638

In [44]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.519489,0.713107,0.666846
IBM,0.519489,1.0,0.592286,0.545426
MSFT,0.713107,0.592286,1.0,0.787119
GOOG,0.666846,0.545426,0.787119,1.0


In [45]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.00034,0.000154,0.00023,0.000205
IBM,0.000154,0.00026,0.000167,0.000146
MSFT,0.00023,0.000167,0.000306,0.000229
GOOG,0.000205,0.000146,0.000229,0.000277


In [46]:
returns.corrwith(returns.IBM)

AAPL    0.519489
IBM     1.000000
MSFT    0.592286
GOOG    0.545426
dtype: float64

In [47]:
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-08-12,101217500.0,3560300.0,30181400.0,2936700
2015-08-13,48535800.0,2533400.0,22627200.0,1810700
2015-08-14,42929500.0,3226400.0,21473400.0,1072100
2015-08-17,40884700.0,2249600.0,21099700.0,1051700
2015-08-18,34560700.0,2018300.0,23574100.0,1456100
...,...,...,...,...
2020-08-03,77037800.0,3526100.0,78983000.0,2330200
2020-08-04,43267900.0,3466100.0,49280100.0,1903500
2020-08-05,30498000.0,3675400.0,28858600.0,1979500
2020-08-06,50607200.0,3417100.0,32656800.0,1995400


In [48]:
# 计算几个和成交的关系。
returns.corrwith(volume)

AAPL   -0.103123
IBM    -0.103913
MSFT   -0.058835
GOOG   -0.156423
dtype: float64

In [49]:
returns.corrwith(volume, axis=1)

Date
2015-08-12         NaN
2015-08-13    0.799512
2015-08-14    0.848508
2015-08-17    0.936865
2015-08-18    0.225860
                ...   
2020-08-03    0.847259
2020-08-04   -0.382892
2020-08-05   -0.071762
2020-08-06    0.804243
2020-08-07   -0.931684
Length: 1257, dtype: float64

#### Unique Values, Value Counts, and Membership 

![替代文字](https://img-blog.csdnimg.cn/2020081011252617.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2dhb2N1aTg4Mw==,size_16,color_FFFFFF,t_70)

In [50]:
obj = SR(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [51]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [52]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [53]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [54]:
pd.value_counts(obj.values, sort= False)


a    3
b    2
c    3
d    1
dtype: int64

In [55]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [56]:
mask = obj.isin(['b','c']) #用来过滤。

In [57]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [58]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [59]:
# Related to  isin is the  Index.get_indexer method, which gives you an index array
# from an array of possibly non-distinct values into another array of distinct values:

to_match = SR(['c', 'a', 'b', 'b', 'c', 'a'])

In [61]:
unique_vals = SR(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [62]:
# In some cases, you may want to compute a histogram on multiple related columns in
# a DataFrame. Here’s an example:

data = DF({'Qu1': [1, 3, 4, 3, 4],
      'Qu2': [2, 3, 1, 2, 3],
      'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [63]:
result = data.apply(pd.value_counts).fillna(0)

In [65]:
result  #Here, the row labels in the result are the distinct values occurring in all of the col‐
# umns. The values are the respective counts of these values in each column.

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
