## 第7章 数据规整化：清理、转换、合并、重塑


In [7]:
import numpy as np
import pandas as pd

In [17]:
'''合并数据集
    pandas.merge()'''
left1=pd.DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
right1=pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
#两个DataFrame中有相同的行'a'和'b'
print(left1)
print(right1)
print(pd.merge(left1,right1,on='key')) #指定key列

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d
   data1 key  data2
0      0   b      1
1      1   b      1
2      6   b      1
3      2   a      0
4      4   a      0
5      5   a      0


## pd.merge()函数的参数列表
![](http://i2.muimg.com/567571/bb56c8b988ef3e23.png)

In [16]:
left2=pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],'data3':range(7)})
right2=pd.DataFrame({'rkey':['a','b','d'],'data4':range(3)})
#column name不同，可以进行分别指定
print(pd.merge(left2,right2,left_on='lkey',right_on='rkey'))

   data3 lkey  data4 rkey
0      0    b      1    b
1      1    b      1    b
2      6    b      1    b
3      2    a      0    a
4      4    a      0    a
5      5    a      0    a


In [18]:
#默认情况下，merge()做的是交集（inner），通过how属性指定。
# 多对多产生的是行的笛卡尔积，data1有3行，data2有2行，对应产生的有6行
print (pd.merge(left1,right1,how='outer'))
print('-------------------')
print (pd.merge(left1,right1,how='left'))
print('-------------------')
print (pd.merge(left1,right1,how='right'))

   data1 key  data2
0    0.0   b    1.0
1    1.0   b    1.0
2    6.0   b    1.0
3    2.0   a    0.0
4    4.0   a    0.0
5    5.0   a    0.0
6    3.0   c    NaN
7    NaN   d    2.0
-------------------
   data1 key  data2
0      0   b    1.0
1      1   b    1.0
2      2   a    0.0
3      3   c    NaN
4      4   a    0.0
5      5   a    0.0
6      6   b    1.0
-------------------
   data1 key  data2
0    0.0   b      1
1    1.0   b      1
2    6.0   b      1
3    2.0   a      0
4    4.0   a      0
5    5.0   a      0
6    NaN   d      2


In [19]:
#对于重复的列名，通过merge()的suffixes属性设置(添加后缀)
left3 = pd.DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
right3 = pd.DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
print(pd.merge(left3,right3,on='key1',suffixes=('_left','_right'))) #对于重复的列名key2，分别加上后缀以区分

  key1 key2_left  lval key2_right  rval
0  foo       one     1        one     4
1  foo       one     1        one     5
2  foo       two     2        one     4
3  foo       two     2        one     5
4  bar       one     3        one     6
5  bar       one     3        two     7


In [20]:
'''DataFrame的索引作为连接键,设置left_index或right_index属性'''
left4=pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
right4=pd.DataFrame({'group_val':[3.5,7]},index=['a','b'])
print(left4)
print(right4)
print('-------------------')
print(pd.merge(left4,right4,left_on='key',right_index=True))

  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0
-------------------
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0


In [21]:
'''对于层次化索引的数据做merge
    将要合并的列以list形式传入'''
left5=pd.DataFrame({'data':range(5),'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],'key2':[2000,2001,20002,2001,2002]})
right5=pd.DataFrame(np.arange(12).reshape(6,2),columns=['event1','event2'],index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2001,2000,2000,2000,2001,2002]])
print(left5)
print('-------------------')
print(right5)
print('-------------------')
print(pd.merge(left5,right5,left_on=['key1','key2'],right_index=True))

   data    key1   key2
0     0    Ohio   2000
1     1    Ohio   2001
2     2    Ohio  20002
3     3  Nevada   2001
4     4  Nevada   2002
-------------------
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11
-------------------
   data    key1  key2  event1  event2
0     0    Ohio  2000       4       5
0     0    Ohio  2000       6       7
1     1    Ohio  2001       8       9
3     3  Nevada  2001       0       1


In [33]:
'''DataFrame的join()更方便的实现按索引合并(不管有没有相同的column name)'''
left6=pd.DataFrame([[1,2],[3,4],[5,6]],index=['a','c','e'],columns=['Ohio','Nevada'])
right6=pd.DataFrame([[7,8],[9,10],[11,12],[13,14]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
another=pd.DataFrame(np.arange(8).reshape(4,2),columns=['New York','Oregon'],index=['a','c','e','f'])
print(left6)
print(right6)
print(another)
print('-------------------')
print(left6.join(right6,how='outer'))
print('-------------------')
print(left6.join([right6,another])) #向join传入DataFrame

   Ohio  Nevada
a     1       2
c     3       4
e     5       6
   Missouri  Alabama
b         7        8
c         9       10
d        11       12
e        13       14
   New York  Oregon
a         0       1
c         2       3
e         4       5
f         6       7
-------------------
   Ohio  Nevada  Missouri  Alabama
a   1.0     2.0       NaN      NaN
b   NaN     NaN       7.0      8.0
c   3.0     4.0       9.0     10.0
d   NaN     NaN      11.0     12.0
e   5.0     6.0      13.0     14.0
-------------------
   Ohio  Nevada  Missouri  Alabama  New York  Oregon
a     1       2       NaN      NaN         0       1
c     3       4       9.0     10.0         2       3
e     5       6      13.0     14.0         4       5


In [37]:
'''NumPy的conccatenate()函数：合并原始的NumPy数组'''
arr1=np.arange(12).reshape(3,4)
print(np.concatenate([arr1,arr1]))
print('-------------------')
print(np.concatenate([arr1,arr1],axis=1))

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
-------------------
[[ 0  1  2  3  0  1  2  3]
 [ 4  5  6  7  4  5  6  7]
 [ 8  9 10 11  8  9 10 11]]


In [59]:
'''pandas的concat()函数'''
arr2=pd.Series([0,1],index=['a','b'])
arr3=pd.Series([2,3,4],index=['c','d','e'])
arr4=pd.Series([5,6],index=['f','g'])
arr5=pd.concat([arr2*3,arr4])
print(arr5)
print('-------------------')
print(pd.concat([arr2,arr3,arr4])) #默认是按行合并
print('-------------------')
print(pd.concat([arr2,arr3,arr4],axis=1)) #axis=1按列合并，形成一个DataFrame
print('-------------------')
print(pd.concat([arr2,arr5],axis=1,join='inner')) #join参数默认取outer
print('-------------------')
print(pd.concat([arr2,arr5],axis=1,join_axes=[['a','c','b','e']])) #join_axes参数另设置index

a    0
b    3
f    5
g    6
dtype: int64
-------------------
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
-------------------
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
f  NaN  NaN  5.0
g  NaN  NaN  6.0
-------------------
   0  1
a  0  0
b  1  3
-------------------
     0    1
a  0.0  0.0
c  NaN  NaN
b  1.0  3.0
e  NaN  NaN




### concat()函数的参数
![](http://i2.muimg.com/567571/fbabe87eb45483b2.png)

In [62]:
#keys参数创建一个层次化索引
print(pd.concat([arr2,arr2,arr4],keys=['one','two','three'])) 
print('-------------------')
#axis=1，keys参数设置cloumn name，如果已经有column name，则设置2级column name
print(pd.concat([arr2,arr2,arr4],axis=1,keys=['one','two','three']))

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
-------------------
   one  two  three
a  0.0  0.0    NaN
b  1.0  1.0    NaN
f  NaN  NaN    5.0
g  NaN  NaN    6.0


In [68]:
data1=pd.DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
data2=pd.DataFrame(np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
print(pd.concat([data1,data2],axis=1,keys=['level1','level2']))
print('-------------------')
#如果传入的是不是list而是一个dict，则字典的key会被当做keys参数的值
print(pd.concat({'level1':data1,'level2':data2},axis=1))
print('-------------------')
print(pd.concat([data1,data2],ignore_index=True)) #忽略原轴的index

  level1     level2     
     one two  three four
a      0   1    0.0  1.0
b      2   3    NaN  NaN
c      4   5    2.0  3.0
-------------------
  level1     level2     
     one two  three four
a      0   1    0.0  1.0
b      2   3    NaN  NaN
c      4   5    2.0  3.0
-------------------


   four  one  three  two
0   NaN  0.0    NaN  1.0
1   NaN  2.0    NaN  3.0
2   NaN  4.0    NaN  5.0
3   1.0  NaN    0.0  NaN
4   3.0  NaN    2.0  NaN


In [74]:
'''使用NumPy的where()索引部分重叠的数据'''
arr6=pd.Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['a','b','c','d','e','f'])
arr7=pd.Series(np.arange(len(arr6),dtype=np.float64),index=['a','b','c','d','e','f'])
arr7[-1]=np.nan
print(arr6)
print(arr7)
print('-------------------')
print(np.where(pd.isnull(arr6),arr7,arr6))
#Series的combine_first()实现相同的功能，还是进行index排序对齐
print(arr6[:-2].combine_first(arr7[2:]))
print('-------------------')
#DataFrame的Combine_first
data3=pd.DataFrame({'a':[1,np.nan,5,np.nan],'b':[np.nan,2,np.nan,6],'c':range(2,18,4)})
data4=pd.DataFrame({'a':[5,4,np.nan,3,7],'b':[np.nan,3,4,6,8]})
print(data3.combine_first(data4))

a    NaN
b    2.5
c    NaN
d    3.5
e    4.5
f    NaN
dtype: float64
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
f    NaN
dtype: float64
-------------------
[ 0.   2.5  2.   3.5  4.5  nan]
a    NaN
b    2.5
c    2.0
d    3.5
e    4.0
f    NaN
dtype: float64
-------------------
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN


## 重塑(reshape)和旋转(pivot)
重新排列表格型数据的函数
- stack：将数据的列转换为行
- unstack：将数据的行转换为列

In [77]:
data5=pd.DataFrame(np.arange(6).reshape(2,3),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
print(data5)
print('-------------------')
print(data5.stack()) #stack()将column name转换为行，生成层次化索引
print('-------------------')
print(data5.stack().unstack())

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
-------------------
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
-------------------
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5


In [82]:
'''移除重复数据'''
data6=pd.DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]})
print(data6)
print('-------------------')
print(data6.duplicated())
print('-------------------')
print(data6.drop_duplicates()) #drop_duplicates()删除重复数据
print('-------------------')
print(data6.drop_duplicates(['k1']))
print(data6.drop_duplicates(['k2']))

    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
-------------------
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
-------------------
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4
-------------------
    k1  k2
0  one   1
3  two   3
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4


In [85]:
'''Series的map()：映射进行数据转换'''
data7=pd.DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],'ouces':[4,3,12,6,7.5,8,3,5,6]})
print(data7)
print('-------------------')
#现在要添加肉类对应的动物类型到数据中
meat_to_animal={'bacon':'pig','pulled pork':'pig','pastrami':'cow','corned beef':'cow','honey ham':'pig','nova lox':'salmon'}
data7['animal']=data7['food'].map(str.lower).map(meat_to_animal)
print(data7)

          food  ouces
0        bacon    4.0
1  pulled pork    3.0
2        bacon   12.0
3     Pastrami    6.0
4  corned beef    7.5
5        Bacon    8.0
6     pastrami    3.0
7    honey ham    5.0
8     nova lox    6.0
-------------------
          food  ouces  animal
0        bacon    4.0     pig
1  pulled pork    3.0     pig
2        bacon   12.0     pig
3     Pastrami    6.0     cow
4  corned beef    7.5     cow
5        Bacon    8.0     pig
6     pastrami    3.0     cow
7    honey ham    5.0     pig
8     nova lox    6.0  salmon


In [90]:
'''replace()替换value'''
data8=pd.Series([1,-999,2,-999,-1000,3])
print(data8)
print('-------------------')
print(data8.replace(-999,np.nan))
print('-------------------')
print(data8.replace([-999,-1000],np.nan)) #替换多个值，传入一个list

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64
-------------------
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
-------------------
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64


In [100]:
'''rename():重命名轴索引'''
data9=pd.DataFrame(np.arange(12).reshape(3,4),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
print(data9)
print('-------------------')
data9.index=data9.index.map(str.upper) #注意upper不要加括号
print(data9)
print('-------------------')
print(data9.rename(index=str.title,columns=str.upper)) #注意，rename()函数并不是修改原数据，而是创建数据集的转换版
print('-------------------')
print(data9.rename(index={'OHIO':'INDIANA'},columns={'four':'five'})) #修改单行/列
print('-------------------')
print(data9.rename(index={'OHIO':'INDIANA'},columns={'four':'five'},inplace=True)) #如果想修改原数据就设置参数inplace=True

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
-------------------
          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11
-------------------
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11
-------------------
          one  two  three  five
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11
-------------------


None


In [118]:
'''pandas的cut():连续数据离散化拆分成'面元'(bin)    '''
arr8=[20,22,25,27,21,23,37,31,61,45,41,32] #年龄数据
cut_arr=[18,25,35,60,100] #划分年龄
arr9=pd.cut(arr8,cut_arr) #cut()函数返回一个Categorical对象
print(arr9)
print('-------------------')
print(arr9.codes) #label属性对年龄段数据进行标号
print('-------------------')
print(pd.value_counts(arr9))
print('-------------------')
#cut()的参数
arr10=pd.cut(arr8,cut_arr,right=False) #默认分组是左开右闭，right=False设置左闭右开
print(arr10)
print('-------------------')
print(pd.cut(arr8,cut_arr,labels=['小伙子','年轻人','中年人','老年人'])) #labels设置面元的名称
print('-------------------')
print(pd.cut(arr8,4)) #如果传入面元的数量而不是面元的边界，函数会根据最小值和最大值计算等长面元
print('-------------------')
print(pd.cut(arr8,4,precision=1)) #precision参数设置精确度，默认小数点后保留3位

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
-------------------
[0 0 0 1 0 0 2 1 3 2 2 1]
-------------------
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
-------------------
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, object): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
-------------------
[小伙子, 小伙子, 小伙子, 年轻人, 小伙子, ..., 年轻人, 老年人, 中年人, 中年人, 年轻人]
Length: 12
Categories (4, object): [小伙子 < 年轻人 < 中年人 < 老年人]
-------------------
[(19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], (19.959, 30.25], ..., (30.25, 40.5], (50.75, 61], (40.5, 50.75], (40.5, 50.75], (30.25, 40.5]]
Length: 12
Categories (4, object): [(19.959, 30.25] < (30.25, 40.5] < (40.5, 50.75] < (50.75, 61]]
-------------------
[(20, 30.2], (20, 30.2], (20,

In [122]:
'''pandas的qcut()函数：与cut()很相似。qcut()还可以根据样本分位数进行过面元划分，可以得到大小基本相等的面元 '''
arr11=np.random.randn(1000)
arr12=pd.cut(arr11,5,precision=2)
print(arr12)
print(pd.value_counts(arr12))
print('-------------------')
arr13=pd.qcut(arr11,5,precision=2)
print(arr13)
print(pd.value_counts(arr13))
print('-------------------')

[(-0.62, 0.6], (-0.62, 0.6], (-0.62, 0.6], (0.6, 1.83], (0.6, 1.83], ..., (0.6, 1.83], (-0.62, 0.6], (-0.62, 0.6], (-0.62, 0.6], (-1.85, -0.62]]
Length: 1000
Categories (5, object): [(-3.076, -1.85] < (-1.85, -0.62] < (-0.62, 0.6] < (0.6, 1.83] < (1.83, 3.051]]
(-0.62, 0.6]       447
(0.6, 1.83]        247
(-1.85, -0.62]     242
(-3.076, -1.85]     41
(1.83, 3.051]       23
dtype: int64
-------------------
[(-0.27, 0.2], (-0.27, 0.2], (0.2, 0.83], (0.83, 3.051], (0.83, 3.051], ..., (0.83, 3.051], (-0.27, 0.2], (-0.27, 0.2], (0.2, 0.83], [-3.069, -0.91]]
Length: 1000
Categories (5, object): [[-3.069, -0.91] < (-0.91, -0.27] < (-0.27, 0.2] < (0.2, 0.83] < (0.83, 3.051]]
(0.83, 3.051]      200
(0.2, 0.83]        200
(-0.27, 0.2]       200
(-0.91, -0.27]     200
[-3.069, -0.91]    200
dtype: int64
-------------------


In [136]:
'''检测和过滤异常值'''
data10=pd.DataFrame(np.random.randn(1000,4))
print(data10.describe)
print('-------------------')
print(data10[(np.abs(data10)>3).any(1)]) #含有value>3的数据行
print('-------------------')
arr14=data10[3]
print(arr14[(np.abs(arr14)>3)]) #选出某列中value>3的数据

<bound method DataFrame.describe of             0         1         2         3
0    0.287240 -0.430738  0.331949  0.384516
1    1.572674  0.910432 -1.356146  0.983332
2    0.660219  0.189345 -0.139494  0.086400
3   -0.487755 -0.989152  1.383950 -0.373748
4   -0.397469  1.350276  1.530688 -0.398101
5    0.273168  0.219817 -0.265810  0.502540
6   -0.672474 -0.730212 -0.200615 -1.315972
7   -1.529583 -0.293518 -0.799729 -0.044376
8    0.179019  1.792138  1.317383 -1.346169
9    1.313557 -0.439941 -0.977049  1.414809
10  -0.112460  0.114610  0.074789 -0.536046
11   0.650535 -1.072981  0.527290 -0.304948
12   0.412234 -1.360951 -0.131575  0.822279
13   0.757350  0.362794  1.517306 -0.040360
14  -0.029886  0.095837  0.561448 -0.454664
15   0.279548  0.253806  0.683898  0.369121
16  -0.484580  0.731680 -0.233117  1.228052
17  -1.342085  0.314506  1.263326 -0.519255
18  -1.091740 -0.033414  1.080969  1.434657
19   0.163717 -0.990361  1.081009 -0.655894
20   1.574490 -0.132031 -0.878985 -1.041

In [138]:
'''take():排列和随机采样'''
data11=pd.DataFrame(np.arange(5*4).reshape(5,4))
arr15=np.random.permutation(5)
print(arr15)
print('-------------------')
print(data11.take(arr15))

[4 3 0 2 1]
-------------------
    0   1   2   3
4  16  17  18  19
3  12  13  14  15
0   0   1   2   3
2   8   9  10  11
1   4   5   6   7


In [142]:
'''计算指标/哑变量
    get_dummies():DataFrame某一列中含有k个不同的值，可以派生出一个k列矩阵或DataFrame(value全都是0和1)'''
data12=pd.DataFrame({'key':['b','b','a','c','a','b'],'data':range(6)})
print(data12)
print('-------------------')
print(pd.get_dummies(data12['key'],prefix='key')) #prefix参数加前缀

   data key
0     0   b
1     1   b
2     2   a
3     3   c
4     4   a
5     5   b
-------------------
   key_a  key_b  key_c
0      0      1      0
1      0      1      0
2      1      0      0
3      0      0      1
4      1      0      0
5      0      1      0


In [149]:
'''字符串操作'''
arr16='a,b,  guido'
print(arr16.split(','))
print([i.strip( )for i in arr16.split(',')]) #strip()修剪空白符
print('::'.join([i.strip( )for i in arr16.split(',')]))
'''子串定位：关键字in/index/find'''
print(arr16.index(','))
print(arr16.find(':'))
print(arr16.count(','))

['a', 'b', '  guido']
['a', 'b', 'guido']
a::b::guido
1
-1
2


## Python内置的字符串方法
![](http://i1.piimg.com/567571/abf407822f279e2a.png)
![](http://i1.piimg.com/567571/87e16c70e5ec72e9.png)