# Merging pandas dataframes

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Left-hand frame for the merge examples: the 'key' column holds repeated
# labels ('a', 'b', 'c'), and 'data1' simply numbers the rows 0..6.
df1 = pd.DataFrame({
    'key': list('bbacaab'),
    'data1': range(7),
})
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [3]:
# Right-hand frame for the merge examples: every key appears exactly once.
# 'd' does not occur in df1, and df1's 'c' does not occur here.
df2 = pd.DataFrame({
    'key': list('abd'),
    'data2': range(3),
})
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [4]:
# merge defaults to an inner join, i.e. the intersection of the keys: only
# rows whose key occurs in both frames ('a' and 'b') are kept.
# 'key' is the only column the two frames share, so naming it explicitly
# selects the same join column that merge would infer on its own.
df1.merge(df2, on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [5]:
# An outer join keeps the union of the keys from both frames; where one side
# has no matching row, its columns are filled with NaN.
df1.merge(df2, on='key', how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


### Concatenation

In [6]:
# Three two-element Series whose index labels do not overlap (a-b, c-d, e-f).
ser1 = pd.Series([1, 2], index=list('ab'))
ser2 = pd.Series([3, 4], index=list('cd'))
ser3 = pd.Series([5, 6], index=list('ef'))

In [7]:
# Along axis 0 (the default) concat stacks the inputs end to end, so the
# result is one longer Series carrying every original label.
pd.concat([ser1, ser2, ser3], axis=0)

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [8]:
# Along the column axis each Series becomes its own column; rows are the
# union of all index labels, with NaN where a Series has no such label.
pd.concat([ser1, ser2, ser3], axis='columns')

Unnamed: 0,0,1,2
a,1.0,,
b,2.0,,
c,,3.0,
d,,4.0,
e,,,5.0
f,,,6.0


In [9]:
# The row labels of the column-wise result can be chosen explicitly.
# concat's `join_axes` keyword was deprecated in pandas 0.25 and removed in
# 1.0 (passing it now raises TypeError), so the labels are selected by
# reindexing the concatenated frame instead; the result is the same.
pd.concat([ser1, ser2, ser3], axis=1).reindex(['a', 'b', 'e'])

Unnamed: 0,0,1,2
a,1.0,,
b,2.0,,
e,,,5.0


In [10]:
# The `keys` argument tags each input with a label, which becomes the outer
# level of a hierarchical (two-level) index on the result.
x = pd.concat(
    [ser1, ser2, ser3],
    keys=['one', 'two', 'three'],
)
x

one    a    1
       b    2
two    c    3
       d    4
three  e    5
       f    6
dtype: int64

In [11]:
# Pivot the innermost index level (level=-1, the default) into columns;
# outer/inner label pairs that never occur in x show up as NaN.
x.unstack(level=-1)

Unnamed: 0,a,b,c,d,e,f
one,1.0,2.0,,,,
two,,,3.0,4.0,,
three,,,,,5.0,6.0


In [12]:
# Look up a single cell of the unstacked frame by row and column label.
x.unstack().at['one', 'a']

1.0

### Combining data with overlap

In [13]:
# NOTE: this rebinds df1 (previously the left frame of the merge examples).
# Columns 'a' and 'b' have NaN gaps in alternating rows; 'c' is fully
# populated with 2, 6, 10, 14.
df1 = pd.DataFrame({
    'a': [1., np.nan, 5., np.nan],
    'b': [np.nan, 2., np.nan, 6.],
    'c': [2, 6, 10, 14],
})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [14]:
# NOTE: this rebinds df2 (previously the right frame of the merge examples).
# It has one more row than the df1 defined for this section, shares columns
# 'a' and 'b' with it, and has no 'c' column.
df2 = pd.DataFrame({
    'a': [5., 4., np.nan, 3., 7.],
    'b': [np.nan, 3., 4., 6., 8.],
})
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [15]:
# combine_first gives priority to the calling frame: where both frames hold a
# value, df1's is kept, and df2 is used only to fill df1's NaN cells and to
# supply rows that df1 does not have.
df1.combine_first(other=df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


In [16]:
# With the roles swapped, df2's values win wherever both frames have data;
# df1 only fills df2's NaN cells and contributes the 'c' column df2 lacks.
df2.combine_first(other=df1)

Unnamed: 0,a,b,c
0,5.0,,2.0
1,4.0,3.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,
