In [1]:
#Now let's take a look at the methods used to wrangle, clean and arrange data into a processable form
#The first operation to perform in such scenarios is the merge operation
#Merging combines the data of two datasets into a single dataset

import pandas as pd
from pandas import DataFrame
import numpy as np

In [2]:
# One-to-one merge: build two small frames that both carry a 'keys' column.

df1 = DataFrame({
    'keys': ['b', 'c', 'a'],
    'data1': np.arange(3),
})
df2 = DataFrame({
    'keys': ['d', 'g', 'e', 'f'],
    'data2': np.arange(4),
})

In [3]:
# Combine the two dataframes with pandas' merge function.
# df1 is the left frame and df2 is the right frame that gets merged into it.
pd.merge(left=df1, right=df2)

Unnamed: 0,keys,data1,data2


In [4]:
# The previous result was an empty frame: merge only keeps rows whose key is present in both dataframes,
# and the two frames had no key in common.
# Rebuild df2 so that it shares some keys with df1, giving a proper one-to-one merge example.

df2 = DataFrame({
    'keys': ['c', 'g', 'e', 'a'],
    'data2': np.arange(4),
})
pd.merge(left=df1, right=df2)

Unnamed: 0,keys,data1,data2
0,c,1,0
1,a,2,3


In [5]:
# Only the keys 'c' and 'a' were present in both frames, so only their rows came out of the merge.
# By default, merge discards the rows whose key has no partner in the other dataset.

# One-to-many merge: keys are unique in df1 but repeat in the new df2.
df2 = DataFrame({
    'keys': ['b', 'a', 'd', 'e', 'd', 'b', 'a', 'a'],
    'data2': np.arange(8),
})
pd.merge(left=df1, right=df2)

Unnamed: 0,keys,data1,data2
0,b,0,0
1,b,0,5
2,a,2,1
3,a,2,6
4,a,2,7


In [6]:
# Many-to-one merge: the same two frames with the left and right sides swapped.
pd.merge(left=df2, right=df1)

Unnamed: 0,keys,data2,data1
0,b,0,0
1,b,5,0
2,a,1,2
3,a,6,2
4,a,7,2


In [7]:
# One-to-many and many-to-one differ only in the order of the data columns in the result.
# merge needs a column to join on. Here 'keys' was chosen by default because it is the only
# column name the two frames have in common, so rows are paired only when their 'keys' values match.

# Many-to-many merge: the keys now repeat in both frames.
df1 = DataFrame({
    'keys': ['a', 'b', 'c', 'a', 'b', 'b', 'a'],
    'data1': np.arange(7),
})
pd.merge(left=df1, right=df2)

Unnamed: 0,keys,data1,data2
0,a,0,1
1,a,0,6
2,a,0,7
3,a,3,1
4,a,3,6
5,a,3,7
6,a,6,1
7,a,6,6
8,a,6,7
9,b,1,0


In [8]:
# In a many-to-many merge every 'a' row of one frame is paired with every 'a' row of the other,
# and the same pairing is then done for 'b'.
# With three 'a' keys on each side, the merge therefore produces 3 x 3 = 9 rows for 'a'.
pd.merge(left=df2, right=df1)

Unnamed: 0,keys,data2,data1
0,b,0,1
1,b,0,4
2,b,0,5
3,b,5,1
4,b,5,4
5,b,5,5
6,a,1,0
7,a,1,3
8,a,1,6
9,a,6,0


In [9]:
# merge falls back to the column names shared by both dataframes,
# but it is better to state the join column explicitly.
# The `on` argument takes a single column name or a list of column names to merge on.
pd.merge(left=df1, right=df2, on='keys')

Unnamed: 0,keys,data1,data2
0,a,0,1
1,a,0,6
2,a,0,7
3,a,3,1
4,a,3,6
5,a,3,7
6,a,6,1
7,a,6,6
8,a,6,7
9,b,1,0


In [11]:
# The remaining merge arguments are demonstrated on a many-to-one merge:
# keys repeat in df3 and are unique in df4.
df3 = DataFrame({
    'keys': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b'],
    'data3': np.arange(8),
})
df4 = DataFrame({
    'keys': ['a', 'b', 'd'],
    'data4': np.arange(3),
})

In [12]:
# Inner merge of df3 and df4 on their shared 'keys' column.
pd.merge(left=df3, right=df4, on='keys')

Unnamed: 0,keys,data3,data4
0,a,0,0
1,a,3,0
2,a,6,0
3,b,1,1
4,b,4,1
5,b,7,1


In [14]:
# When the key column is named differently in each frame, we can still merge on those columns.
# Rename the key columns to set up that situation.
df3 = df3.rename({'keys': 'lkeys'}, axis='columns')
df4 = df4.rename({'keys': 'rkeys'}, axis='columns')

In [15]:
# left_on names the join column of the left frame and right_on the join column of the right frame.
pd.merge(
    left=df3,
    right=df4,
    left_on='lkeys',
    right_on='rkeys',
)

Unnamed: 0,lkeys,data3,rkeys,data4
0,a,0,a,0
1,a,3,a,0
2,a,6,a,0
3,b,1,b,1
4,b,4,b,1
5,b,7,b,1


In [17]:
# merge performs an inner join by default, keeping only the keys found in both datasets (their intersection).
# Passing how='outer' keeps the keys of both datasets instead; values missing on one side come out empty.
df3 = DataFrame({
    'keys': ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b'],
    'data3': np.arange(8),
})
df4 = DataFrame({
    'keys': ['a', 'b', 'd'],
    'data4': np.arange(3),
})

pd.merge(left=df3, right=df4, how='outer')

Unnamed: 0,keys,data3,data4
0,a,0.0,0.0
1,a,3.0,0.0
2,a,6.0,0.0
3,b,1.0,1.0
4,b,4.0,1.0
5,b,7.0,1.0
6,c,2.0,
7,c,5.0,
8,d,,2.0


In [18]:
# Two more join types are available: left and right joins.
# A left join keeps every row of the left frame (df3).
pd.merge(left=df3, right=df4, how='left')

Unnamed: 0,keys,data3,data4
0,a,0,0.0
1,b,1,1.0
2,c,2,
3,a,3,0.0
4,b,4,1.0
5,c,5,
6,a,6,0.0
7,b,7,1.0


In [19]:
# A right join keeps every key of the right frame (df4).
pd.merge(left=df3, right=df4, how='right')

Unnamed: 0,keys,data3,data4
0,a,0.0,0
1,a,3.0,0
2,a,6.0,0
3,b,1.0,1
4,b,4.0,1
5,b,7.0,1
6,d,,2


In [20]:
# As stated above, a many-to-many merge results in a cartesian product of the matching rows.
# NOTE: how='inner' is already merge's default, so passing it explicitly does not remove any of
# those combinations; the result is the same as pd.merge(df1, df2).
# To end up with a single row per key, the repeated keys would have to be dropped before merging.

pd.merge(left=df1, right=df2, how='inner')

Unnamed: 0,keys,data1,data2
0,a,0,1
1,a,0,6
2,a,0,7
3,a,3,1
4,a,3,6
5,a,3,7
6,a,6,1
7,a,6,6
8,a,6,7
9,b,1,0


In [28]:
# Now suppose the data carries two different key columns that we can merge on.
# Note that the key2 values are one-character strings, not integers.

data1 = DataFrame({
    'key1': ['a', 'b', 'c'],
    'key2': ['1', '2', '3'],
    'data1': np.arange(3),
})
data2 = DataFrame({
    'key1': ['b', 'b', 'c', 'd'],
    'key2': ['1', '2', '2', '3'],
    'data2': np.arange(4),
})

In [29]:
# Display data1 (columns key1, key2, data1) to inspect it before merging
data1

Unnamed: 0,key1,key2,data1
0,a,1,0
1,b,2,1
2,c,3,2


In [30]:
# Display data2 (columns key1, key2, data2) to inspect it before merging
data2

Unnamed: 0,key1,key2,data2
0,b,1,0
1,b,2,1
2,c,2,2
3,d,3,3


In [31]:
# To merge the datasets on both key1 and key2, pass the two column names to `on` as a list.
# Rows are then paired only when both key values match; how='outer' keeps the unmatched rows too.

pd.merge(
    left=data1,
    right=data2,
    on=['key1', 'key2'],
    how='outer',
)

Unnamed: 0,key1,key2,data1,data2
0,a,1,0.0,
1,b,2,1.0,1.0
2,c,3,2.0,
3,b,1,,0.0
4,c,2,,2.0
5,d,3,,3.0


In [34]:
# Columns that exist in both dataframes but are not used as join keys get a suffix in the result
# so they can be told apart ('_x' for the left frame and '_y' for the right frame by default).
# The suffixes argument lets us choose those names ourselves.

pd.merge(left=data1, right=data2, on='key1')

Unnamed: 0,key1,key2_x,data1,key2_y,data2
0,b,2,1,1,0
1,b,2,1,2,1
2,c,3,2,2,2


In [36]:
# Same merge, but the overlapping key2 columns are labelled with the name of the frame they came from.
pd.merge(
    left=data1,
    right=data2,
    on='key1',
    suffixes=('_data1', '_data2'),
)

Unnamed: 0,key1,key2_data1,data1,key2_data2,data2
0,b,2,1,1,0
1,b,2,1,2,1
2,c,3,2,2,2
