In [1]:
# Now let's look at merging datasets using the index of a DataFrame as the merge key
import pandas as pd
from pandas import DataFrame
import numpy as np

In [2]:
# Sometimes the key used to merge two datasets lives in the index of one of them
# rather than in a regular column: df2 below is keyed by its index labels.

df1 = DataFrame({
    'keys': ['a', 'c', 'b', 'c', 'a', 'b', 'd'],
    'data1': np.arange(7),
})
df2 = DataFrame({'values': np.arange(3)}, index=['a', 'b', 'c'])

In [4]:
# Pass right_index=True to use df2's index as its merge key, paired with
# the 'keys' column of df1 (left_on='keys'). The default join type is inner.

df1.merge(df2, left_on='keys', right_index=True)

Unnamed: 0,keys,data1,values
0,a,0,0
4,a,4,0
1,c,1,2
3,c,3,2
2,b,2,1
5,b,5,1


In [6]:
# The same works with the frames swapped: left_index=True takes df2's index as the left key
df2.merge(df1, left_index=True, right_on='keys')

Unnamed: 0,values,keys,data1
0,0,a,0
4,0,a,4
2,1,b,2
5,1,b,5
1,2,c,1
3,2,c,3


In [7]:
# how='outer' keeps the union of the keys instead of only their intersection
df1.merge(df2, how='outer', left_on='keys', right_index=True)

Unnamed: 0,keys,data1,values
0,a,0,0.0
4,a,4,0.0
1,c,1,2.0
3,c,3,2.0
2,b,2,1.0
5,b,5,1.0
6,d,6,


In [8]:
# Hierarchical case: data2 carries a two-level index (name, number),
# while data1 holds the corresponding keys in two ordinary columns.
data1 = DataFrame({
    'key1': ['Adam', 'Jones', 'Ross', 'Taylor'],
    'key2': [21, 45, 23, 65],
    'data1': np.arange(4),
})

data2 = DataFrame(
    np.arange(10).reshape(5, 2),
    index=[
        ['Adam', 'Ross', 'Jones', 'Taylor', 'Adam'],
        [21, 23, 54, 66, 23],
    ],
    columns=['event1', 'event2'],
)

In [9]:
# Display data1: key1 and key2 are ordinary columns in this frame
data1

Unnamed: 0,key1,key2,data1
0,Adam,21,0
1,Jones,45,1
2,Ross,23,2
3,Taylor,65,3


In [10]:
# Display data2: its rows are labelled by a two-level index
data2

Unnamed: 0,Unnamed: 1,event1,event2
Adam,21,0,1
Ross,23,2,3
Jones,54,4,5
Taylor,66,6,7
Adam,23,8,9


In [11]:
# Merge on both index levels: list the two key columns of data1 in left_on
data1.merge(data2, right_index=True, left_on=['key1', 'key2'])

Unnamed: 0,key1,key2,data1,event1,event2
0,Adam,21,0,0,1
2,Ross,23,2,2,3


In [12]:
# For the union of the keys from both sides, use how='outer'
data1.merge(data2, how='outer', right_index=True, left_on=['key1', 'key2'])

Unnamed: 0,key1,key2,data1,event1,event2
0,Adam,21,0.0,0.0,1.0
1,Jones,45,1.0,,
2,Ross,23,2.0,2.0,3.0
3,Taylor,65,3.0,,
3,Jones,54,,4.0,5.0
3,Taylor,66,,6.0,7.0
3,Adam,23,,8.0,9.0


In [13]:
# Two frames can also be merged purely on their indexes
data3 = DataFrame(
    np.arange(6).reshape(3, 2),
    index=['a', 'b', 'd'],
    columns=['first', 'second'],
)
data4 = DataFrame(
    np.arange(7, 17).reshape(5, 2),
    index=['b', 'c', 'd', 'e', 'f'],
    columns=['third', 'fourth'],
)

In [14]:
# Display data3 (index labels a, b, d)
data3

Unnamed: 0,first,second
a,0,1
b,2,3
d,4,5


In [15]:
# Display data4 (index labels b, c, d, e, f)
data4

Unnamed: 0,third,fourth
b,7,8
c,9,10
d,11,12
e,13,14
f,15,16


In [16]:
# Merge on the index of both frames (inner join by default)
data3.merge(data4, left_index=True, right_index=True)

Unnamed: 0,first,second,third,fourth
b,2,3,7,8
d,4,5,11,12


In [17]:
# Union of the index labels from both frames
data3.merge(data4, how='outer', left_index=True, right_index=True)

Unnamed: 0,first,second,third,fourth
a,0.0,1.0,,
b,2.0,3.0,7.0,8.0
c,,,9.0,10.0
d,4.0,5.0,11.0,12.0
e,,,13.0,14.0
f,,,15.0,16.0


In [18]:
# DataFrame.join is the more convenient method for index-on-index merges;
# the outer merge above can be written as:

data3.join(other=data4, how='outer')

Unnamed: 0,first,second,third,fourth
a,0.0,1.0,,
b,2.0,3.0,7.0,8.0
c,,,9.0,10.0
d,4.0,5.0,11.0,12.0
e,,,13.0,14.0
f,,,15.0,16.0


In [19]:
# join can also match a column of the calling frame against the other frame's index
df1.join(df2, on='keys', how='left')

Unnamed: 0,keys,data1,values
0,a,0,0.0
1,c,1,2.0
2,b,2,1.0
3,c,3,2.0
4,a,4,0.0
5,b,5,1.0
6,d,6,
