In [174]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# A Series is a one-dimensional, array-like object made up of a sequence of
# values plus an associated set of data labels (its index).
obj = Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [175]:
# Access the underlying values array and the index object of the Series.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(obj.values)
print(obj.index)

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [176]:
# Supply a custom index: one label per data point.
obj2 = Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])
# print() with a single argument behaves the same on Python 2 and Python 3.
print(obj2)
print(obj2.index)

a    4
b    7
c   -5
d    3
dtype: int64
Index([u'a', u'b', u'c', u'd'], dtype='object')


In [177]:
# Select a single value or a group of values by index label, or assign a new
# value through a label.
# obj2 is rebuilt here so the cell gives the same result every time it is run,
# even though it mutates obj2 below.
obj2 = Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])
print(obj2['b'])
print(obj2[['a', 'b', 'c']])
obj2['c'] = 9
# The heading and the Series are printed with two calls: a single
# print(heading, obj2) would insert a separator space after the newline on
# Python 3 (and print a tuple on Python 2).
print('更改后')
print(obj2)

7
a    4
b    7
c   -5
dtype: int64
更改后
a    4
b    7
c    9
d    3
dtype: int64


In [178]:
# numpy (np) is imported once in the import cell at the top of the notebook.
print(obj2)
# Filtering with a boolean Series, scalar multiplication and NumPy math
# functions all keep the index labels attached to the values.
# The heading is printed on its own so the output is identical on Python 2
# and Python 3.
print("满足大于3的")
print(obj2[obj2 > 3])
print(obj2 * 2)
print(np.exp(obj2))

a    4
b    7
c    9
d    3
dtype: int64
满足大于3的
a    4
b    7
c    9
dtype: int64
a     8
b    14
c    18
d     6
dtype: int64
a      54.598150
b    1096.633158
c    8103.083928
d      20.085537
dtype: float64


In [179]:
# A Series can also be viewed as a fixed-length, ordered dict, because it maps
# index labels to data values; `in` therefore tests the labels, not the values.
print('b' in obj2)
print('e' in obj2)

True
False


In [180]:
# Data that already lives in a Python dict can be turned into a Series
# directly: the dict keys become the index labels.
sdata = dict(tom=4000, texa=2000, organ=5000, sara=6000)
obj3 = Series(data=sdata)
obj3

organ    5000
sara     6000
texa     2000
tom      4000
dtype: int64

In [181]:
# Build a Series from sdata restricted to (and ordered by) the given labels;
# a label with no entry in sdata ('Calif') gets NaN.
states = ['Calif', 'tom', 'texa', 'organ']
obj4 = Series(sdata, index=states)
print(obj4)
# pandas' isnull() and notnull() functions detect missing data.
print(pd.isnull(obj4))
print(pd.notnull(obj4))

# Series offers the same check as an instance method.
print(obj4.isnull())

Calif       NaN
tom      4000.0
texa     2000.0
organ    5000.0
dtype: float64
Calif     True
tom      False
texa     False
organ    False
dtype: bool
Calif    False
tom       True
texa      True
organ     True
dtype: bool
Calif     True
tom      False
texa     False
organ    False
dtype: bool


In [182]:
# One of the most important Series features: arithmetic automatically aligns
# the operands by index label. Labels present in only one operand (and labels
# whose value is NaN) yield NaN in the result.
obj3 + obj4

Calif        NaN
organ    10000.0
sara         NaN
texa      4000.0
tom       8000.0
dtype: float64

In [183]:
# Both the Series object and its index carry a `name` attribute, which ties in
# closely with other key pieces of pandas functionality.
obj4.index.name = 'state'
obj4.name = 'population'
obj4

state
Calif       NaN
tom      4000.0
texa     2000.0
organ    5000.0
Name: population, dtype: float64

In [184]:
# DataFrame是一个表格型的数据结构，它含有一组有序的列，每列
# 可以是不同的值类型（数值、字符串、布尔值等）。DataFrame既有
# 行索引也有列索引，它可以被看作是由Series组成的字典（共用同一
# 个索引）。DataFrame是以二维结构保存数据的。

In [185]:
# Build a DataFrame from a dict of equal-length lists (one list per column).
data = {
    'state': ['tom', 'tom', 'tom', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
# As with Series, the DataFrame gets a default integer index automatically.
# NOTE(review): the recorded output shows the columns in sorted order; newer
# pandas/Python versions are expected to keep the dict's insertion order
# instead -- confirm against the installed version.
print(frame)

   pop   state  year
0  1.5     tom  2000
1  1.7     tom  2001
2  3.6     tom  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002


In [186]:
# When a column sequence is given, the DataFrame columns follow exactly that
# order.
print(DataFrame(data, columns=['year', 'state', 'pop']))

   year   state  pop
0  2000     tom  1.5
1  2001     tom  1.7
2  2002     tom  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9


In [187]:
# As with Series, a requested column that is not found in the data ('debt')
# is created and filled with NaN.
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2)
print(frame2.columns)

       year   state  pop debt
one    2000     tom  1.5  NaN
two    2001     tom  1.7  NaN
three  2002     tom  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
Index([u'year', u'state', u'pop', u'debt'], dtype='object')


In [188]:
# A DataFrame column can be retrieved as a Series either by attribute access
# or by dict-style key lookup.
print(frame2.year)
print(frame2['state'])

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
one         tom
two         tom
three       tom
four     Nevada
five     Nevada
Name: state, dtype: object


In [189]:
# Fetch a single row by its index label with the label-based indexer .loc
# (the original note mentioned `ix`, but the code uses .loc).
frame2.loc['two']

year     2001
state     tom
pop       1.7
debt      NaN
Name: two, dtype: object

In [190]:
# Show frame2 in full; the 'debt' column is still present and entirely empty.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,tom,1.5,
two,2001,tom,1.7,
three,2002,tom,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [191]:
del frame2['debt']  # the del keyword removes a column from the frame in place

In [192]:
# Show frame2 again: the 'debt' column is gone after the del statement.
frame2

Unnamed: 0,year,state,pop
one,2000,tom,1.5
two,2001,tom,1.7
three,2002,tom,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9


In [193]:
# Another common input form is a nested dict (a dict of dicts).
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
# Passed to DataFrame, the outer keys become the columns and the inner keys
# become the row index.
frame3 = DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [194]:
# The result can also be transposed (rows and columns swapped).
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [195]:
# 3x4 frame holding 0..11, with columns labelled a-d.
df1 = DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [196]:
# 4x5 frame holding 0..19, with columns labelled a-e.
df2 = DataFrame(np.arange(20).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [197]:
# Addition aligns on both row and column labels; any cell that exists in only
# one of the two frames becomes NaN in the result.
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [198]:
df1.add(df2, fill_value=0)   # add() method: a cell missing from one frame is treated as 0 before adding

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [199]:
# Reindex df1 onto df2's columns; the column df1 lacks ('e') is filled with 0.
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


In [200]:
# Function application and mapping (apply).
# The sample is drawn from a dedicated, seeded generator so that
# Restart & Run All reproduces the same frame, without touching NumPy's
# global random state. The numbers recorded in earlier outputs came from an
# unseeded run and will differ.
frame4 = DataFrame(np.random.RandomState(42).randn(4, 3),
                   columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame4

Unnamed: 0,b,d,e
Utah,0.582053,0.295957,0.995202
Ohio,0.429904,-1.508232,-0.377954
Texas,-2.187789,0.824834,-0.69868
Oregon,0.7212,-1.5725,-0.897148


In [201]:
# NumPy ufuncs (element-wise array functions) also work on pandas objects.
np.abs(frame4)

Unnamed: 0,b,d,e
Utah,0.582053,0.295957,0.995202
Ohio,0.429904,1.508232,0.377954
Texas,2.187789,0.824834,0.69868
Oregon,0.7212,1.5725,0.897148


In [202]:
def f(x):
    """Return the spread (maximum minus minimum) of ``x``."""
    return x.max() - x.min()

# With the default axis, apply calls f once per column.
frame4.apply(f)

b    2.908990
d    2.397333
e    1.892350
dtype: float64

In [203]:
# axis=1 applies f across each row instead of down each column.
frame4.apply(f, axis=1)

Utah      0.699245
Ohio      1.938135
Texas     3.012623
Oregon    2.293700
dtype: float64

In [204]:
# Besides a scalar, the function passed to apply may also return a Series
# holding several values.
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min'/'max'."""
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=['min', 'max'])

frame4.apply(f)

Unnamed: 0,b,d,e
min,-2.187789,-1.5725,-0.897148
max,0.7212,0.824834,0.995202


In [205]:
# f (the min/max Series helper) is already defined by the previous cell, so it
# is reused here rather than defined a second time. axis=1 applies it to each
# row, giving one 'min'/'max' pair per row.
frame4.apply(f, axis=1)

Unnamed: 0,min,max
Utah,0.295957,0.995202
Ohio,-1.508232,0.429904
Texas,-2.187789,0.824834
Oregon,-1.5725,0.7212


In [206]:
#排序和排名

In [207]:
# To sort by row or column labels (lexicographically), use sort_index.
obj = Series(range(4), index=list('dabc'))
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [211]:
# A DataFrame can be sorted by the labels of either axis; the default sorts
# the row index.
frame5 = DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
frame5.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [215]:
# Sort by column labels (axis=1). The order is ascending by default; pass
# ascending=False for descending order.
frame5.sort_index(axis=1)  

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [218]:
# Sort the rows by the values of one column.
# To sort by several columns, pass a list of names instead: by=['a', 'b'].
frame6 = DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
frame6.sort_values('b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [219]:
# Sixteen string values: the pattern a, a, b, c repeated four times.
obj = Series(list('aabc') * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object