# 合并数据集

## 数据库风格的 DataFrame 合并

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Left-hand table for the first merge examples: 'key' contains repeated
# labels and 'data1' simply numbers the seven rows.
df1 = DataFrame(
    data={
        'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)

In [4]:
# Right-hand table: every 'key' label occurs exactly once here.
df2 = DataFrame(
    data={
        'key': ['a', 'b', 'd'],
        'data2': range(3),
    }
)

In [5]:
# Display df1 (the left table) before merging.
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [6]:
# Display df2 (the right table) before merging.
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [7]:
# No join key is named, so merge joins on the column the two frames share
# ('key'). Labels present on only one side ('c', 'd') do not appear in the
# result.
pd.merge(left=df1, right=df2)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [8]:
# Same join as the previous cell, with the key column named explicitly.
pd.merge(
    left=df1,
    right=df2,
    on='key',
)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [9]:
# Same rows as df1, but the key column is called 'lkey' so that the two
# sides of the join use different column names.
df3 = DataFrame(
    data={
        'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
        'data1': range(7),
    }
)

In [14]:
# Same rows as df2, but the key column is called 'rkey'.
df4 = DataFrame(
    data={
        'rkey': ['a', 'b', 'd'],
        'data2': range(3),
    }
)

In [15]:
# Display df3 (left table keyed by 'lkey').
df3

Unnamed: 0,data1,lkey
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [16]:
# Display df4 (right table keyed by 'rkey').
df4

Unnamed: 0,data2,rkey
0,0,a
1,1,b
2,2,d


In [17]:
# The key columns are named differently on each side, so each one is given
# separately. Both 'lkey' and 'rkey' are kept in the result.
pd.merge(
    left=df3,
    right=df4,
    left_on='lkey',
    right_on='rkey',
)

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [22]:
# Outer join: keys from both sides are kept. 'c' (left only) gets a missing
# data2 and 'd' (right only) gets a missing data1, which is why both data
# columns are shown as floats in the result.
pd.merge(
    left=df1,
    right=df2,
    how='outer',
)

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [23]:
# Rebind df1 to a six-row table for the many-to-many examples; 'b' and 'a'
# each occur more than once in 'key'.
df1 = DataFrame(
    data={
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)

In [24]:
# Rebind df2 to a five-row table whose 'key' column now also repeats
# labels ('a' and 'b' twice each).
df2 = DataFrame(
    data={
        'key': ['a', 'b', 'a', 'b', 'd'],
        'data2': range(5),
    }
)

In [25]:
# Display the six-row df1.
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [26]:
# Display the five-row df2.
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,a
3,3,b
4,4,d


In [27]:
# Re-create df1 with the same contents as the earlier six-row definition.
df1 = DataFrame(
    data={
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)

In [32]:
# Re-create df2 with the same contents as the earlier five-row definition.
df2 = DataFrame(
    data={
        'key': ['a', 'b', 'a', 'b', 'd'],
        'data2': range(5),
    }
)

In [29]:
# Display the re-created df1.
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [33]:
# Display the re-created df2.
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,a
3,3,b
4,4,d


In [34]:
# Left join with repeated keys on both sides: every df1 row is paired with
# each df2 row that has the same key, and 'c' (no match in df2) is kept
# with a missing data2.
pd.merge(
    left=df1,
    right=df2,
    on='key',
    how='left',
)

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,0,b,3.0
2,1,b,1.0
3,1,b,3.0
4,2,a,0.0
5,2,a,2.0
6,3,c,
7,4,a,0.0
8,4,a,2.0
9,5,b,1.0
10,5,b,3.0


In [35]:
# Inner join with repeated keys: only keys found in both frames survive,
# and each matching left/right pair of rows yields one result row.
pd.merge(
    left=df1,
    right=df2,
    how='inner',
)

Unnamed: 0,data1,key,data2
0,0,b,1
1,0,b,3
2,1,b,1
3,1,b,3
4,5,b,1
5,5,b,3
6,2,a,0
7,2,a,2
8,4,a,0
9,4,a,2


In [37]:
# Left table for the multi-key examples: (key1, key2) identifies a row and
# 'lval' is the value carried along.
left = DataFrame(
    data={
        'key1': ['foo', 'foo', 'bar'],
        'key2': ['one', 'two', 'one'],
        'lval': [1, 2, 3],
    }
)

In [38]:
# Right table for the multi-key examples.
# NOTE(review): the value column is named 'lval', the same as in `left`
# (possibly meant to be 'rval' -- confirm). Because the name overlaps, the
# merges that follow label it with _x/_y (or custom) suffixes.
right = DataFrame(
    data={
        'key1': ['foo', 'foo', 'bar', 'bar'],
        'key2': ['one', 'two', 'one', 'two'],
        'lval': [4, 5, 6, 7],
    }
)

In [39]:
# Display the left table of the multi-key examples.
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [40]:
# Display the right table of the multi-key examples.
right

Unnamed: 0,key1,key2,lval
0,foo,one,4
1,foo,two,5
2,bar,one,6
3,bar,two,7


In [41]:
# Outer join on the (key1, key2) pair. The non-key column 'lval' exists on
# both sides, so it comes back as lval_x (from left) and lval_y (from
# right); ('bar', 'two') exists only in `right`, so its lval_x is missing.
pd.merge(
    left=left,
    right=right,
    on=['key1', 'key2'],
    how='outer',
)

Unnamed: 0,key1,key2,lval_x,lval_y
0,foo,one,1.0,4
1,foo,two,2.0,5
2,bar,one,3.0,6
3,bar,two,,7


In [47]:
# Row labels of `left`: the default integer range 0..2, since no index was
# given when the frame was built.
left.index

RangeIndex(start=0, stop=3, step=1)

In [48]:
# Column labels of `left`.
left.columns

Index(['key1', 'key2', 'lval'], dtype='object')

In [50]:
# Selecting a single column by name yields a pandas Series.
type(left['key1'])

pandas.core.series.Series

In [56]:
# False here, because 'foo' occurs twice in key1.
left['key1'].is_unique

False

In [57]:
# Join on 'key1' only. The other columns the two frames share (key2, lval)
# are returned once per side, distinguished by the default _x/_y suffixes.
pd.merge(
    left=left,
    right=right,
    on='key1',
)

Unnamed: 0,key1,key2_x,lval_x,key2_y,lval_y
0,foo,one,1,one,4
1,foo,one,1,two,5
2,foo,two,2,one,4
3,foo,two,2,two,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [60]:
# Same join as the previous cell, but the overlapping column names are
# tagged with '_left' / '_right' instead of the default suffixes.
pd.merge(
    left=left,
    right=right,
    on='key1',
    suffixes=('_left', '_right'),
)

Unnamed: 0,key1,key2_left,lval_left,key2_right,lval_right
0,foo,one,1,one,4
1,foo,one,1,two,5
2,foo,two,2,one,4
3,foo,two,2,two,5
4,bar,one,3,one,6
5,bar,one,3,two,7


## 索引上的合并

In [61]:
# Left table for the index-join examples: the join key lives in an ordinary
# column named 'key'.
left1 = DataFrame(
    data={
        'key': ['a', 'b', 'a', 'a', 'b', 'c'],
        'value': range(6),
    }
)

In [65]:
# Right table for the index-join examples: here the join key is the row
# index ('a', 'b') rather than a column.
right1 = DataFrame(
    data={'group_val': [3.5, 7]},
    index=['a', 'b'],
)

In [66]:
# Display left1 (join key stored in the 'key' column).
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [67]:
# Display right1 (join key stored in the row index).
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [68]:
# Match left1's 'key' column against right1's row index. 'c' has no entry
# in right1's index, so that row is absent from the result.
pd.merge(
    left=left1,
    right=right1,
    left_on='key',
    right_index=True,
)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [69]:
# Same column-to-index join as the previous cell, but as an outer join:
# the 'c' row is kept and its group_val is missing.
pd.merge(
    left=left1,
    right=right1,
    left_on='key',
    right_index=True,
    how='outer',
)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [73]:
# Left table for the two-level index example, with its keys held in two
# ordinary columns.
# NOTE(review): 'ley1' looks like a typo for 'key1' (its sibling column is
# 'key2') -- confirm before renaming; the displayed output of this frame
# uses 'ley1' as well.
lefth = DataFrame(
    data={
        'ley1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'key2': [2000, 2001, 2002, 2001, 2002],
        'data': np.arange(5.),
    }
)

In [71]:
# Right table for the two-level index example: a 6x2 block of the integers
# 0..11 whose row index has two levels, built from the two parallel lists.
righth = DataFrame(
    data=np.arange(12).reshape((6, 2)),
    index=[
        ['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
        [2001, 2000, 2000, 2000, 2001, 2002],
    ],
    columns=['event1', 'event2'],
)

In [74]:
# Display lefth (keys stored in columns).
lefth

Unnamed: 0,data,key2,ley1
0,0.0,2000,Ohio
1,1.0,2001,Ohio
2,2.0,2002,Ohio
3,3.0,2001,Nevada
4,4.0,2002,Nevada


In [76]:
# Display righth; its two index levels are rendered as the first two
# (unnamed) columns of the output.
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11
