## 第5章 pandas入门
pandas是基于NumPy构建的，让以NumPy为中心的应用变得更加简单
---
- 具备按轴自动或显式数据对齐功能的数据结构
- 集成时间序列功能
- 灵活处理缺失数据
- 2个主要数据结构：Series和DataFrame

In [47]:
"""Series

A Series is made of a set of data (any NumPy dtype) plus a set of index
labels associated with that data.
"""
import numpy as np
import pandas as pd

# With no explicit index an integer index 0..N-1 is created automatically.
# An index may also be passed up front, e.g.
# pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']).
arr1 = pd.Series([1, 2, 3, 4, 5])
arr1.index = [1, 2, 3, 4, 5]  # replace the index in place

print(
    arr1,
    '----------',
    arr1[[3, 4]],  # fetch several values with a list of index labels
    '----------',
    arr1.values,
    '----------',
    arr1.index,
    sep='\n',
)

1    1
2    2
3    3
4    4
5    5
dtype: int64
----------
3    3
4    4
dtype: int64
----------
[1 2 3 4 5]
----------
Int64Index([1, 2, 3, 4, 5], dtype='int64')


In [48]:
# A Series can be viewed as a fixed-length, ordered dict.
data1 = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
index1 = ['California', 'Ohio', 'Oregon', 'Texas']
# 'California' has no entry in data1, so its value shows up as NaN below;
# 'Utah' is not in index1, so it does not appear in the result.
arr2 = pd.Series(data1, index1)
print(arr2)
print('----------')
# isnull() and notnull() detect missing data.
missing_mask = pd.isnull(arr2)
present_mask = arr2.notnull()
print(missing_mask, '----------', present_mask, sep='\n')

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
----------
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
----------
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


In [50]:
"""DataFrame

A tabular data structure containing an ordered set of columns; each column
may hold a different value type.
"""
data2 = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
arr3 = pd.DataFrame(
    data2,
    index=[1, 2, 3, 4, 5],
    columns=['state', 'year', 'pop', 'debt'],
)
arr3['debt'] = np.arange(5)  # assign values to a whole column
del arr3['debt']  # delete a column
print(arr3)
print('----------')
# Two ways to select a column by its name: attribute access and [] lookup.
print(arr3.year)
print('----------')
print(arr3['state'])
print('----------')
# Select a row by its index label. The original used the .ix indexer, which
# was deprecated in pandas 0.20 and removed in pandas 1.0; .loc is the
# label-based replacement and returns the same row here.
print(arr3.loc[3])

    state  year  pop
1    Ohio  2000  1.5
2    Ohio  2001  1.7
3    Ohio  2002  3.6
4  Nevada  2001  2.4
5  Nevada  2002  2.9
----------
1    2000
2    2001
3    2002
4    2001
5    2002
Name: year, dtype: int64
----------
1      Ohio
2      Ohio
3      Ohio
4    Nevada
5    Nevada
Name: state, dtype: object
----------
state    Ohio
year     2002
pop       3.6
Name: 3, dtype: object


In [51]:
"""Converting a nested dict into a DataFrame

Keys of the outer dict become the columns; keys of the inner dicts become
the row index.
"""
data3 = {
    'Ohio': {2001: 2.1, 2002: 2.2},
    'Nevada': {2000: 2, 2001: 24, 2002: 26},
    'Ocean': {2004: 50, 2007: 60},
}
arr4 = pd.DataFrame(data3)
# .T is the transpose: rows and columns swap places.
print(arr4, '----------', arr4.T, sep='\n')

      Nevada  Ocean  Ohio
2000     2.0    NaN   NaN
2001    24.0    NaN   2.1
2002    26.0    NaN   2.2
2004     NaN   50.0   NaN
2007     NaN   60.0   NaN
----------
        2000  2001  2002  2004  2007
Nevada   2.0  24.0  26.0   NaN   NaN
Ocean    NaN   NaN   NaN  50.0  60.0
Ohio     NaN   2.1   2.2   NaN   NaN


In [52]:
"""A DataFrame's Index objects are immutable, which is what lets an Index be
shared safely between several data structures."""
data4 = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
state1 = ['year', 'pop', 'California']
arr5 = pd.DataFrame(data4)
print(arr5)
print('----------')
# reindex() rearranges both the rows and the columns; labels that did not
# exist before (row 5, column 'California') come out as NaN.
print(arr5.reindex(index=[1, 2, 3, 4, 5], columns=state1))

   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002
----------
     year  pop  California
1  2001.0  1.7         NaN
2  2002.0  3.6         NaN
3  2001.0  2.4         NaN
4  2002.0  2.9         NaN
5     NaN  NaN         NaN


In [53]:
# The `method` option of reindex().
arr6 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(arr6)
# 'ffill' (or 'pad') fills forward; 'bfill' (or 'backfill') fills backward.
for fill_method in ('ffill', 'bfill'):
    print('----------')
    print(arr6.reindex(range(6), method=fill_method))

0      blue
2    purple
4    yellow
dtype: object
----------
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
----------
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object


In [54]:
"""drop(): remove entries from the given axis."""
data5 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Rows are dropped by default; axis=1 drops columns instead.
without_rows = data5.drop(['Ohio', 'Utah'])
without_cols = data5.drop(['one', 'three'], axis=1)
print(data5, '----------', without_rows, '----------', without_cols, sep='\n')

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
----------
          one  two  three  four
Colorado    4    5      6     7
New York   12   13     14    15
----------
          two  four
Ohio        1     3
Colorado    5     7
Utah        9    11
New York   13    15


In [55]:
"""Indexing a Series."""
arr7 = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])
print(arr7)
print('----------')
# Label slices differ from ordinary Python slices: the end point of a label
# slice is included, the end point of a positional slice is not.
by_position = arr7[2:4]
by_label = arr7['b':'d']
print(by_position, '----------', by_label, sep='\n')

a    0
b    1
c    2
d    3
dtype: int32
----------
c    2
d    3
dtype: int32
----------
b    1
c    2
d    3
dtype: int32


In [56]:
"""Indexing a DataFrame."""
data6 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
over_five = data6['three'] > 5
print(
    data6,
    '----------',
    data6[:2],  # a slice picks rows
    '----------',
    data6[over_five],  # a boolean Series keeps the rows where it is True
    '----------',
    data6 < 10,  # boolean indexing: element-wise comparison
    sep='\n',
)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
----------
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
----------
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
----------
            one    two  three   four
Ohio       True   True   True   True
Colorado   True   True   True   True
Utah       True   True  False  False
New York  False  False  False  False


In [61]:
"""Selecting a subset of a DataFrame's rows and columns by label.

The original cell used the .ix indexer, which was deprecated in pandas 0.20
and removed in pandas 1.0. .loc is the label-based replacement and gives
the same result for these label lists.
"""
data7 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# First list selects rows, second list selects columns.
print(data7.loc[['Ohio', 'Utah'], ['two', 'four']])

      two  four
Ohio    1     3
Utah    9    11


In [63]:
"""Data alignment: NA values are introduced wherever the indexes do not
overlap."""
arr8 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
arr9 = pd.Series([1, 2, 3], index=['a', 'b', 'f'])
aligned_sum = arr8 + arr9
print(aligned_sum)
# For a DataFrame, alignment happens on the rows and the columns at once.

a    2.0
b    4.0
c    NaN
d    NaN
e    NaN
f    NaN
dtype: float64


In [57]:
"""Arithmetic methods."""
data8 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list('abcd'))
data9 = pd.DataFrame(np.arange(20).reshape(4, 5), columns=list('abcde'))
# Plain + leaves NaN wherever only one operand has a value.
plain_sum = data8 + data9
# add(..., fill_value=0) substitutes 0 for the value missing on one side.
filled_sum = data8.add(data9, fill_value=0)
# reindex() accepts a fill_value for newly created labels as well.
widened = data8.reindex(columns=data9.columns, fill_value=0)
print(plain_sum, '----------', filled_sum, '----------', widened, sep='\n')

      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0  11.0  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN
----------
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0  11.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
----------
   a  b   c   d  e
0  0  1   2   3  0
1  4  5   6   7  0
2  8  9  10  11  0


In [59]:
"""Operations between a DataFrame and a Series."""
data10 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(data10)
print('----------')
# Broadcasting: the first row is subtracted from every row. The original
# used the .ix indexer (deprecated in pandas 0.20, removed in 1.0); .loc[0]
# selects the same row, the one whose index label is 0.
print(data10 - data10.loc[0])

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
----------
   0  1  2  3
0  0  0  0  0
1  4  4  4  4
2  8  8  8  8


In [60]:
"""Sorting and ranking a Series."""
arr10 = pd.Series(range(4), index=['d', 'a', 'c', 'b'])
print(
    arr10,
    '----------',
    arr10.sort_index(),  # order by index label
    '----------',
    arr10.sort_values(),  # order by value
    sep='\n',
)

d    0
a    1
c    2
b    3
dtype: int32
----------
a    1
b    3
c    2
d    0
dtype: int32
----------
d    0
a    1
c    2
b    3
dtype: int32


In [61]:
"""Sorting a DataFrame."""
data11 = pd.DataFrame(
    np.array([3, 4, 7, 0, 5, 1, 6, 2]).reshape(2, 4),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
print(
    data11,
    '----------',
    data11.sort_index(),  # by row labels
    '----------',
    data11.sort_index(axis=1),  # by column labels
    '----------',
    data11.sort_values(by=['b', 'd']),  # by the values of the chosen columns
    sep='\n',
)

       d  a  b  c
three  3  4  7  0
one    5  1  6  2
----------
       d  a  b  c
one    5  1  6  2
three  3  4  7  0
----------
       a  b  c  d
three  4  7  0  3
one    1  6  2  5
----------
       d  a  b  c
one    5  1  6  2
three  3  4  7  0


In [8]:
arr11 = pd.Series([4, 3, 5, 12, 1, 0])
# rank() turns each value into its rank.
ranks = arr11.rank(ascending=True, method='max')
print(ranks)

0    4.0
1    3.0
2    5.0
3    6.0
4    2.0
5    1.0
dtype: float64


In [62]:
"""An index with duplicate labels."""
arr12 = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(
    arr12,
    '----------',
    arr12['a'],  # a repeated label returns a Series
    '----------',
    arr12['c'],  # a unique label returns a scalar
    sep='\n',
)

a    0
a    1
b    2
b    3
c    4
dtype: int32
----------
a    0
a    1
dtype: int32
----------
4


In [63]:
"""Summary statistics."""
data12 = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
print(
    data12,
    '----------',
    data12.sum(),  # sum() works column by column by default
    '----------',
    data12.sum(axis=1),  # row by row
    '----------',
    # skipna controls whether missing values are excluded; default is True
    data12.mean(axis=1, skipna=False),
    '----------',
    # idxmax() / idxmin() return the label where the max / min value sits
    data12.idxmax(),
    sep='\n',
)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
----------
one    9.25
two   -5.80
dtype: float64
----------
a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64
----------
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
----------
one    b
two    d
dtype: object


In [68]:
"""Computing correlation coefficients."""
data13 = pd.DataFrame(
    [[1, 4, 2, 1], [5, 4, 4, 3], [4.5, 4, 4, 3.5], [3, 3, 5, 2]],
    index=['12-24', '12-25', '12-26', '12-27'],
    columns=['John', 'LiLi', 'Lisa', 'Mike'],
)
lisa = data13['Lisa']
print(data13)
print('----------')
# corr() computes the correlation between two Series; see its documentation
# for the exact correlation measure used.
print(data13['Mike'].corr(lisa))
print('----------')
print(data13['LiLi'].corr(lisa))
print('----------')
# corrwith() computes the correlation between a DataFrame and one Series.
print(data13.corrwith(lisa))
# Add axis=1 to compute row-wise instead.

       John  LiLi  Lisa  Mike
12-24   1.0     4     2   1.0
12-25   5.0     4     4   3.0
12-26   4.5     4     4   3.5
12-27   3.0     3     5   2.0
----------
0.567480306535
----------
-0.662266178533
----------
John    0.644949
LiLi   -0.662266
Lisa    1.000000
Mike    0.567480
dtype: float64


In [46]:
"""Unique values and value counts."""
arr13 = pd.Series(['b', 'a', 'c', 'd', 'a', 'c', 'a'])
# isin() yields a boolean array that can then be used to pick out a subset.
arr14 = arr13.isin(['a'])
print(
    arr13.unique(),  # the distinct values
    '----------',
    arr13.value_counts(),  # how often each value occurs in the Series
    '----------',
    arr14,
    '----------',
    arr13[arr14],
    sep='\n',
)

['b' 'a' 'c' 'd']
----------
a    3
c    2
b    1
d    1
dtype: int64
----------
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
----------
1    a
4    a
6    a
dtype: object


In [70]:
"""Handling missing data."""
from numpy import nan as NA

arr15 = pd.Series([1, NA, 3.5, NA, 7])
data14 = pd.DataFrame([
    [1, 6.5, 3],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3],
])
print(arr15.isnull(), '----------', arr15.dropna(), sep='\n')
print('----------')
# By default dropna() discards every row that contains any NA; pass axis=1
# to drop columns instead.
print(data14.dropna())
print('----------')
# how='all' discards only the rows that are entirely NA.
print(data14.dropna(how='all'))
print('----------')
# thresh=2 keeps the rows holding at least 2 non-NA values.
print(data14.dropna(thresh=2))
print('----------')
# fillna(0) would put 0 in place of every NA; a dict instead gives one fill
# value per column label.
per_column_fill = {1: 100, 2: 200}
print(data14.fillna(per_column_fill))

0    False
1     True
2    False
3     True
4    False
dtype: bool
----------
0    1.0
2    3.5
4    7.0
dtype: float64
----------
     0    1    2
0  1.0  6.5  3.0
----------
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
----------
     0    1    2
0  1.0  6.5  3.0
3  NaN  6.5  3.0
----------
     0      1      2
0  1.0    6.5    3.0
1  1.0  100.0  200.0
2  NaN  100.0  200.0
3  NaN    6.5    3.0


In [85]:
"""Hierarchical indexing."""
data15 = pd.DataFrame(
    np.random.randn(10),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 2, 3, 1, 2, 2, 3],
    ],
)
print(data15)
print('----------')
print(data15.index)
print('----------')
print(data15.unstack())  # unstack() reshapes a DataFrame
print('----------')
print(data15.unstack().stack())
print('----------')
data15.index.names = ['key1', 'key2']
print(data15.swaplevel('key1', 'key2'))  # swaplevel() reorders the levels
print('----------')
# Sum per value of one index level. The original called
# data15.sum(level='key2'); that `level` argument was deprecated in pandas
# 1.3 and removed in pandas 2.0, and grouping by the level is the
# documented replacement.
print(data15.groupby(level='key2').sum())

            0
a 1 -0.094555
  2 -0.720497
  3  0.840033
b 1 -1.038809
  2  0.813479
  3  0.601762
c 1 -0.044930
  2 -0.368784
d 2  0.773070
  3  0.310715
----------
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
----------
          0                    
          1         2         3
a -0.094555 -0.720497  0.840033
b -1.038809  0.813479  0.601762
c -0.044930 -0.368784       NaN
d       NaN  0.773070  0.310715
----------
            0
a 1 -0.094555
  2 -0.720497
  3  0.840033
b 1 -1.038809
  2  0.813479
  3  0.601762
c 1 -0.044930
  2 -0.368784
d 2  0.773070
  3  0.310715
----------
                  0
key2 key1          
1    a    -0.094555
2    a    -0.720497
3    a     0.840033
1    b    -1.038809
2    b     0.813479
3    b     0.601762
1    c    -0.044930
2    c    -0.368784
     d     0.773070
3    d     0.310715
----------
             0
key2          
1    -1.178295
2     0.497268
3     1.752510

In [88]:
"""Turning DataFrame columns into the row index."""
data16 = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3],
})
index_cols = ['c', 'd']
print(data16)
print('----------')
# By default the columns moved into the index are removed from the frame.
print(data16.set_index(index_cols))
print('----------')
# drop=False keeps them as ordinary columns too.
print(data16.set_index(index_cols, drop=False))
# reset_index() performs the opposite operation.

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
----------
       a  b
c   d      
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
----------
       a  b    c  d
c   d              
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
