In [None]:
#### -*- coding:utf-8 -*-
# 2 Pandas
from pandas import Series, DataFrame  # Series 和 DataFrame使用次数很多，故直接引入本地命名空间
import pandas as pd

In [4]:
# Series
# 类似一维数组，由一组数据与一组与之相关的数据标签组成
obj = Series([4, 7, -5, 3]) # 左为索引，右为值， 索引默认为0到N-1
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.values  # 获取值

array([ 4,  7, -5,  3], dtype=int64)

In [6]:
obj.index  # 获取索引

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
# 通过索引提取
obj2['a']

-5

In [9]:
# 通过列表索取
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [12]:
# 因为Series是索引值到数据值的一个映射，看作一个定长的有序字典，故可以直接传入字典创建Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [14]:
states = ['California', 'Ohio', 'Oregon', 'Texas']  # 其中那个California无法匹配，所以填的是NaN
obj4 = Series(sdata, index = states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [15]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [16]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [17]:
obj4.isnull()  # 相同的方法

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [19]:
obj3+obj4  # 自动对齐，6不6

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [20]:
obj4.name = 'population'  # Series有name属性
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [21]:
obj.index = ['Bob', 'steve', 'Jeff', 'Ryan']  # 索引可以通过赋值来修改
obj

Bob      4
steve    7
Jeff    -5
Ryan     3
dtype: int64

In [22]:
# DataFrame
# 1.可以直接传入字典
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
       'year':[2000,2001,2002,2001,2002],
       'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [39]:
frame2 = DataFrame(data, columns =['year','state','pop'], index = ['one','two','three','four','five'])  # 可以规定列名顺序,行名
frame2

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9


In [35]:
# 从DataFrame中提取Series
frame['state']  # 字典风格提取

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [37]:
frame.state  # 属性点缀风格提取

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

In [40]:
# 从DataFrame里提取行
frame2.ix['three']  # bad behaviour

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


year     2002
state    Ohio
pop       3.6
Name: three, dtype: object

In [43]:
frame2.loc['three']  # preference

year     2002
state    Ohio
pop       3.6
Name: three, dtype: object

In [45]:
# 添加列
frame2['debt'] = 16.5  #  如果是Series会匹配索引
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [48]:
# 删除列
frame2['eastern'] = frame2.state == 'Ohio'
del frame2['eastern']

In [49]:
# 2.通过嵌套索引   
pop = {'Nevada':{2001:2.4,2002:2.9},             # 外层字典的键为列，内层字典的键为行索引
      'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [50]:
frame3.T  # 可以转置哦

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [52]:
# 也可以用Series组成字典传入
pdata = {'Ohio':frame3['Ohio'][:-1],
        'Nevada':frame3['Nevada'][:2]}
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [59]:
# 设置DataFrame的name
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [61]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [63]:
# 索引对象
obj = Series(range(3), index = ['a','b','c'])
index = obj.index
index  # index不可修改

Index(['a', 'b', 'c'], dtype='object')

In [None]:
# 索引的函数
# append 连接两个索引对象
# diff , intersection, union, isin
# delelte, drop 删除索引i处元素，并得到新index, 删除传入值，并得到新的index
# insert 插入得到新index
# is_monotonic, is_unique, unique

In [64]:
# 基本功能
#1 重新索引
obj = Series([4.5, 7.2, -5.3, 3.6], index = ['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [65]:
obj.reindex(['a','b','c','d','e'], fill_value = 0) # fill_value 是插值处理
                                                   # method = ffill(pad)前向填充，bfill(backfill)后向填充
                                                   # 仅有一个序列就是默认重新行索引（index=...），指定列索引需要columns = ...,
                                                   # 并用下插值只用于行

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [68]:
# 利用ix的标签功能，重新索引任务可以变得更加简洁#############################################################################🐮
frame.ix[['a','b','c','d'],['Texas','Utah','California']]  # 显示行索引，后是列索引,填整数也可以的
                                                           # ix = loc + iloc

Unnamed: 0,Texas,Utah,California
a,,,
b,,,
c,,,
d,,,


In [71]:
#2 丢弃制定轴上的项
obj.drop(['d','c'])

b    7.2
a   -5.3
dtype: float64

In [75]:
frame3.drop([2001,2002])  #  DataFrame可以删除任意轴

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5


In [88]:
#3 索引的选取与过滤
# 标签索引是末端包含的
obj['b':'a']

b    7.2
a   -5.3
dtype: float64

In [93]:
# 对DataFrame索引只是获取列
# 想获取行，还得ix
frame3.loc[2000]  

state
Nevada    NaN
Ohio      1.5
Name: 2000, dtype: float64

In [95]:
frame3.loc[2000, 'Ohio']  # 先提取行再提取列

1.5

In [98]:
# icol,irow --根据整数位置提取列/行
# get_value -- 根据行标签和列标签提取单个值
frame3.get_value(2000,'Ohio')

1.5

In [110]:
#4 算术运算与数据对齐
# 填充 fill_value = 
# 看看broadcasting的力量 
frame = DataFrame(np.arange(12).reshape((4,3)),columns = list('bde'),index = ['Utah','Ohio','Texas','Oregon'])
series = frame.iloc[0]
frame - series  # 行上广播

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [111]:
# 想在列上广播，必须用算术运算法  -- add, sub,div,mul
frame.sub(series, axis = 0)

Unnamed: 0,b,d,e
Ohio,,,
Oregon,,,
Texas,,,
Utah,,,
b,,,
d,,,
e,,,


In [113]:
# apply
def f(x):
    return Series([x.min(), x.max()], index = ['min','max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,0,1,2
max,9,10,11


In [114]:
# applymap -- 克隆Series的map
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [115]:
frame['e'].map(format)

Utah       2.00
Ohio       5.00
Texas      8.00
Oregon    11.00
Name: e, dtype: object

In [117]:
#5 排序与排名
# 按索引排序
frame.sort_index(axis = 1,ascending = False)  # 默认axis = 0, ascending = True

Unnamed: 0,e,d,b
Utah,2,1,0
Ohio,5,4,3
Texas,8,7,6
Oregon,11,10,9


In [124]:
# 按值排序
frame.sort_values(by = 'b')# by = 列名

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [126]:
obj.sort_values

<bound method Series.sort_values of d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64>

In [127]:
#  排名
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()  # method = average(默认)，max/min.first 
            # 对于frame 有axis

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [128]:
obj.rank(ascending = False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [129]:
# 重复索引
obj.index.is_unique

True

In [130]:
#6 汇总和计算描述统计
# count    describe    min, max,   argmin, argmax--索引位置   idxmin idxmax--索引值    quantile   sum   mean
# median   mad--平均绝对离差   var  std   skew   kurt    cumsum   cummin,cummax--累计最大值，累计最小值  comprod--累计积  
# diff      pct_change-- 计算百分数变化
# corr()   cov()
# series与series  A.corrwith(B)
# frame 与 series  A.corrwith(B)   frame按列    A.corrwith(B, axis = 1)  frame按行

In [132]:
# 唯一值，值计数与成员资格
obj.unique()

array([ 7, -5,  4,  2,  0], dtype=int64)

In [133]:
obj.value_counts()

 7    2
 4    2
-5    1
 2    1
 0    1
dtype: int64

In [134]:
obj.value_counts(sort = False)

 0    1
 2    1
-5    1
 4    2
 7    2
dtype: int64

In [136]:
obj.isin([2])  # 2是否在obj中

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [2]:
#7 处理缺失数据
# pandas中使用浮点值NaN表示浮点和非浮点数组中的缺失数据
# Python内置的None值也会被当作NA处理
# dropna  fillna  isnull  notnull
from numpy import nan as NA
data = Series([1,NA,3.5,NA,7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [41]:
data = DataFrame([[1, 6.5, 3],[1,NA,NA],
                 [NA,NA,NA],[NA,6.5,3]])
cleand = data.dropna()
cleand

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data.dropna(how = 'all')  # 只丢弃全为NA的那些行

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
3,,6.5,3.0,


In [13]:
data[4] = NA

In [18]:
data.dropna(axis = 1, how = 'all')  # 删掉了第四列，因为其全部元素都是NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
# 填充缺失数据
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [26]:
data.fillna({1:0.5,4:5})  # 对不同列填充不同值--用字典的方法

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,5.0
1,1.0,0.5,,5.0
2,,0.5,,5.0
3,,6.5,3.0,5.0


In [27]:
# 就地修改不生成副本
_ = data.fillna(0, inplace = True)
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [43]:
# reindex中fillna的应用
data.fillna(data.mean())

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,6.5,3.0


In [48]:
data.dropna(how = 'all').fillna(data.sum(axis = 0))  # 巧妙的运用 axis = 0是列，1是行

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,13.0,6.0
3,2.0,6.5,3.0


In [46]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [49]:
data.sum(axis = 1)

0    10.5
1     1.0
2     NaN
3     9.5
dtype: float64

In [51]:
# 层次化索引
# 可以在一个轴上拥有多个索引 -- 用低纬度形式处理高纬度数据
import numpy as np
data = Series(np.random.randn(10),index = [['a','a','a','b','b','b','c','c','d','d'],[1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data

a  1    1.176224
   2   -0.402358
   3    0.166712
b  1    0.483971
   2   -0.528979
   3   -0.926673
c  1    0.587399
   2    0.604816
d  2    1.023289
   3    0.895870
dtype: float64

In [52]:
data.index  # MultiIndex索引

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [53]:
data['b']  # 提取外层索引

1    0.483971
2   -0.528979
3   -0.926673
dtype: float64

In [55]:
data[:,2]  # 提取内层索引

a   -0.402358
b   -0.528979
c    0.604816
d    1.023289
dtype: float64

In [58]:
# 层次化索引在数据重塑与分组操作中扮演着重要角色

In [60]:
# unstack方法： 将层次化索引数据转化为DataFrame
data.unstack()

Unnamed: 0,1,2,3
a,1.176224,-0.402358,0.166712
b,0.483971,-0.528979,-0.926673
c,0.587399,0.604816,
d,,1.023289,0.89587


In [61]:
data.unstack().stack()

a  1    1.176224
   2   -0.402358
   3    0.166712
b  1    0.483971
   2   -0.528979
   3   -0.926673
c  1    0.587399
   2    0.604816
d  2    1.023289
   3    0.895870
dtype: float64

In [62]:
# 对于DataFrame每条轴都可以有分层索引呢
frame = DataFrame(np.arange(12).reshape((4, 3)),index = [['a','a','b','b'],[1,2,2,1]],
                 columns = [['Ohio','Ohio','Colorado'],
                           ['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,2,6,7,8
b,1,9,10,11


In [63]:
# 还可以给人赋予索引名字呢
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,2,6,7,8
b,1,9,10,11


In [64]:
frame['Ohio']  # 轻松选取分组

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,2,6,7
b,1,9,10


In [65]:
# 重排分级顺序
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
2,b,6,7,8
1,b,9,10,11


In [67]:
frame.sort_index(level = 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,9,10,11
a,2,3,4,5
b,2,6,7,8


In [68]:
frame.swaplevel(0, 1).sort_index(level = 0)  # 一般都是对外层索引进行排序（从外到内排序）

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,9,10,11
2,a,3,4,5
2,b,6,7,8


In [71]:
frame.T.swaplevel('state','color').T  # ...

Unnamed: 0_level_0,color,Green,Red,Green
Unnamed: 0_level_1,state,Ohio,Ohio,Colorado
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,2,6,7,8
b,1,9,10,11


In [73]:
frame.swaplevel('state','color',axis = 1)  # 巧妙

Unnamed: 0_level_0,color,Green,Red,Green
Unnamed: 0_level_1,state,Ohio,Ohio,Colorado
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,2,6,7,8
b,1,9,10,11


In [74]:
# 根据级别汇总统计
frame.sum(level = 'key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,9,11,13
2,9,11,13


In [75]:
frame.sum(level = 'color', axis = 1)  

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,2,14,7
b,1,20,10


In [76]:
# 将DataFrame里面的列转化为索引
frame2 = DataFrame({'a':range(7),'b':range(7,0,-1),
                   'c':['one','one','one','two','two','two','two'],
                   'd':[0,1,2,0,1,2,3]})
frame2

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [77]:
frame2.set_index(['c','d'])  # 列为索引的列从DataFrame中剔除

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [82]:
frame2.set_index(['c','d'], drop = False)  # 保留充做索引的列

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3
