# Combining & Merging Datasets in Pandas

In [1]:
import pandas as pd
import numpy as np

## Joining DataFrames

In [2]:
# Two small frames that share a "key" column. d1 repeats the key "c", and
# each frame holds keys the other lacks, so every join type below yields a
# visibly different result.
d1 = pd.DataFrame({
    "key": list("abccde"),
    "num1": range(6),
})
d2 = pd.DataFrame({
    "key": list("bcef"),
    "num2": range(4),
})

In [3]:
# Show both input frames as plain text, one after the other.
print(d1, d2, sep="\n")

  key  num1
0   a     0
1   b     1
2   c     2
3   c     3
4   d     4
5   e     5
  key  num2
0   b     0
1   c     1
2   e     2
3   f     3


In [4]:
# Method form of pd.merge(d1, d2). With no `on`, the join uses the column the
# two frames have in common ("key") and keeps only keys present in both.
d1.merge(d2)

Unnamed: 0,key,num1,num2
0,b,1,0
1,c,2,1
2,c,3,1
3,e,5,2


In [5]:
# Same join as above, but naming the join column explicitly.
d1.merge(d2, on="key")

Unnamed: 0,key,num1,num2
0,b,1,0
1,c,2,1
2,c,3,1
3,e,5,2


In [6]:
# Same data as d1/d2, but the join column is named differently in each
# frame ("key1" vs "key2").
d3 = pd.DataFrame(dict(
    key1=["a", "b", "c", "c", "d", "e"],
    num1=range(6),
))
d4 = pd.DataFrame(dict(
    key2=["b", "c", "e", "f"],
    num2=range(4),
))

In [7]:
# When the key columns have different names, name one per side; both key
# columns are kept in the result.
d3.merge(d4, left_on="key1", right_on="key2")

Unnamed: 0,key1,num1,key2,num2
0,b,1,b,0
1,c,2,c,1
2,c,3,c,1
3,e,5,e,2


In [8]:
# Outer join: keep keys from either frame; cells with no match become NaN.
d1.merge(d2, how="outer")

Unnamed: 0,key,num1,num2
0,a,0.0,
1,b,1.0,0.0
2,c,2.0,1.0
3,c,3.0,1.0
4,d,4.0,
5,e,5.0,2.0
6,f,,3.0


In [9]:
# Left join: keep every row of d1; num2 is NaN where d2 has no matching key.
d1.merge(d2, how="left")

Unnamed: 0,key,num1,num2
0,a,0,
1,b,1,0.0
2,c,2,1.0
3,c,3,1.0
4,d,4,
5,e,5,2.0


In [10]:
# Right join: keep every key of d2; num1 is NaN where d1 has no matching key.
d1.merge(d2, how="right")

Unnamed: 0,key,num1,num2
0,b,1.0,0
1,c,2.0,1
2,c,3.0,1
3,e,5.0,2
4,f,,3


In [11]:
# Inner join spelled out explicitly: only keys found in both frames survive.
d1.merge(d2, how="inner")

Unnamed: 0,key,num1,num2
0,b,1,0
1,c,2,1
2,c,3,1
3,e,5,2


In [12]:
# Frames that overlap on two columns ("key" and "count"), used to show
# multi-column joins and suffix handling.
df1 = pd.DataFrame({
    "key": list("abccde"),
    "num1": range(6),
    "count": ["one", "three", "two", "one", "one", "two"],
})
df2 = pd.DataFrame({
    "key": list("bcef"),
    "num2": range(4),
    "count": ["one", "two", "two", "two"],
})

In [13]:
# Join on two columns at once: rows pair up only when both "key" and
# "count" agree; everything else is kept unmatched by the outer join.
df1.merge(df2, on=["key", "count"], how="outer")

Unnamed: 0,key,num1,count,num2
0,a,0.0,one,
1,b,1.0,three,
2,c,2.0,two,1.0
3,c,3.0,one,
4,d,4.0,one,
5,e,5.0,two,2.0
6,b,,one,0.0
7,f,,two,3.0


In [14]:
# Joining on "key" alone leaves "count" present on both sides, so the
# result carries it twice as count_x (from df1) and count_y (from df2).
df1.merge(df2, on="key", how="outer")

Unnamed: 0,key,num1,count_x,num2,count_y
0,a,0.0,one,,
1,b,1.0,three,0.0,one
2,c,2.0,two,1.0,two
3,c,3.0,one,1.0,two
4,d,4.0,one,,
5,e,5.0,two,2.0,two
6,f,,,3.0,two


In [15]:
# Replace the default _x/_y suffixes on the duplicated "count" column with
# more descriptive ones.
df1.merge(df2, on="key", suffixes=("_data1", "_data2"))

Unnamed: 0,key,num1,count_data1,num2,count_data2
0,b,1,three,0,one
1,c,2,two,1,two
2,c,3,one,1,two
3,e,5,two,2,two


## Merging on Index

In [16]:
# NOTE: df1/df2 are rebound here and replace the frames defined earlier.
# df1 carries the join key as a column; df2 carries it as its index.
df1 = pd.DataFrame({
    "letter": list("aabbac"),
    "num": range(6),
})
df2 = pd.DataFrame({"value": [3, 5, 7]}, index=list("abe"))

In [17]:
# Show both frames as plain text; note that df2's keys live in its index.
print(df1, df2, sep="\n")

  letter  num
0      a    0
1      a    1
2      b    2
3      b    3
4      a    4
5      c    5
   value
a      3
b      5
e      7


In [18]:
# Match df1's "letter" column against df2's index. The default inner join
# drops letter "c" (absent from df2) and leaves index label "e" unused.
df1.merge(df2, left_on="letter", right_index=True)

Unnamed: 0,letter,num,value
0,a,0,3
1,a,1,3
4,a,4,3
2,b,2,5
3,b,3,5


In [19]:
# Two frames keyed purely by their index labels; only "a" is shared.
right = pd.DataFrame(
    data=[[1, 2], [3, 4], [5, 6]],
    index=list("acd"),
    columns=["Tom", "Tim"],
)
left = pd.DataFrame(
    data=[[7, 8], [9, 10], [11, 12], [13, 14]],
    index=list("abef"),
    columns=["Sam", "Kim"],
)

In [20]:
# Outer join on both indexes: the result index is the union of the labels.
# Note the frame named `right` is the left-hand operand here, which is why
# its columns (Tom, Tim) come first in the output.
right.merge(left, left_index=True, right_index=True, how="outer")

Unnamed: 0,Tom,Tim,Sam,Kim
a,1.0,2.0,7.0,8.0
b,,,9.0,10.0
c,3.0,4.0,,
d,5.0,6.0,,
e,,,11.0,12.0
f,,,13.0,14.0


In [21]:
# DataFrame.join aligns on the index and defaults to a left join, so the
# result keeps exactly `left`'s index labels.
left.join(right, how="left")

Unnamed: 0,Sam,Kim,Tom,Tim
a,7,8,1.0,2.0
b,9,10,,
e,11,12,,
f,13,14,,


In [22]:
# Outer join on the index: keep labels from both frames.
left.join(other=right, how="outer")

Unnamed: 0,Sam,Kim,Tom,Tim
a,7.0,8.0,1.0,2.0
b,9.0,10.0,,
c,,,3.0,4.0
d,,,5.0,6.0
e,11.0,12.0,,
f,13.0,14.0,,


In [23]:
# A third index-keyed frame, joined together with `right` in a single call.
data = pd.DataFrame(
    [[1, 3], [5, 7], [9, 11]],
    index=list("abf"),
    columns=["Alex", "Keta"],
)
# Passing a list joins several frames onto `left` by index at once.
left.join([right, data])

Unnamed: 0,Sam,Kim,Tom,Tim,Alex,Keta
a,7.0,8.0,1.0,2.0,1.0,3.0
b,9.0,10.0,,,5.0,7.0
e,11.0,12.0,,,,
f,13.0,14.0,,,9.0,11.0


## Concatenating Along an Axis

In [24]:
# 4x5 grid holding 0..19 in row-major order.
seq = np.arange(20).reshape(4, 5)

In [25]:
# axis=1: glue the two copies side by side (4 rows, 10 columns).
np.concatenate((seq, seq), axis=1)

array([[ 0,  1,  2,  3,  4,  0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19, 15, 16, 17, 18, 19]])

In [26]:
# axis=0: stack the two copies on top of each other (8 rows, 5 columns).
np.concatenate((seq, seq), axis=0)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]])

In [27]:
# Three Series with disjoint index labels, used for the concat examples.
data1 = pd.Series([0, 1], index=list("ab"))
data2 = pd.Series([2, 3, 4], index=list("cde"))
data3 = pd.Series([5, 6], index=list("fg"))

In [28]:
# Default axis (0): append the Series end to end into one longer Series.
pd.concat([data1, data2, data3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [29]:
# Along the columns each Series becomes its own column (0, 1, 2), aligned
# on the union of the index labels; gaps are filled with NaN.
pd.concat([data1, data2, data3], axis="columns")

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [30]:
# data4 shares labels "a" and "b" with data1 and adds "c".
data4 = pd.Series([10, 11, 12], index=list("abc"))
# join="inner" keeps only the labels present in every input.
pd.concat([data1, data4], axis="columns", join="inner")

Unnamed: 0,0,1
a,0,10
b,1,11


In [31]:
# `keys` labels each input; stacked along the rows they form the outer
# level of a hierarchical index.
x = pd.concat(
    objs=[data1, data2, data4],
    keys=["one", "two", "three"],
)
x

one    a     0
       b     1
two    c     2
       d     3
       e     4
three  a    10
       b    11
       c    12
dtype: int64

In [32]:
# Along the columns the same `keys` become the column names instead.
x = pd.concat(
    objs=[data1, data2, data4],
    axis="columns",
    keys=["one", "two", "three"],
)
x

Unnamed: 0,one,two,three
a,0.0,,10.0
b,1.0,,11.0
c,,2.0,12.0
d,,3.0,
e,,4.0,


In [33]:
# NOTE: df1/df2 are rebound again for the DataFrame concat example.
# df2 lacks row "b", so the column-wise concat below has to fill gaps.
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list("abc"),
    columns=["one", "two"],
)
df2 = pd.DataFrame(
    np.arange(10, 14).reshape(2, 2),
    index=list("ac"),
    columns=["three", "four"],
)

In [34]:
# Column-wise concat of two frames: `keys` become the top level of a
# two-level column index, with the original column names underneath.
pd.concat(
    objs=[df1, df2],
    axis="columns",
    keys=["s1", "s2"],
    sort=False,
)

Unnamed: 0_level_0,s1,s1,s2,s2
Unnamed: 0_level_1,one,two,three,four
a,0,1,10.0,11.0
b,2,3,,
c,4,5,12.0,13.0


In [35]:
# NOTE: data1/data2 are rebound here from Series to DataFrames.
# Draw the sample values from a seeded generator so that a fresh
# "Restart & Run All" reproduces the same frames (the original used the
# unseeded global np.random state, so every run displayed different numbers).
rng = np.random.default_rng(42)

# data2 has no "c" column and lists its columns in a different order, which
# makes the row-wise concat in the next cell align columns by name.
data1 = pd.DataFrame(
    rng.standard_normal((3, 4)),
    columns=['a', 'b', 'c', 'd'])
data2 = pd.DataFrame(
    rng.standard_normal((2, 3)),
    columns=['b', 'd', 'a'])

In [36]:
# Stack the rows and renumber them 0..n-1. Columns are matched by name, so
# data2's rows get NaN in "c", the column it lacks.
pd.concat(objs=[data1, data2], axis=0, ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.443128,1.033878,-0.081062,0.720712
1,1.249823,1.695462,-1.911692,-2.135979
2,0.970119,0.152867,0.21075,0.736984
3,-0.930846,-1.478824,,0.084256
4,-0.420467,1.158122,,0.501372
