In [10]:
#Another method for combining data from two datasets into one is the concatenation/binding/stacking method

import pandas as pd
from pandas import DataFrame, Series
import numpy as np

In [2]:
# NumPy joins arrays with np.concatenate; build a small 4x3 array to demonstrate it.

arr = np.arange(0, 12).reshape((4, 3))

In [7]:
# axis=1 joins column-wise: the second copy of arr is placed to the right of the first.
new_arr = np.concatenate((arr, arr), axis=1)

In [8]:
# np.concatenate returned a new array, so `arr` itself is unchanged.
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [9]:
# Each row of the result is the original row repeated twice (4 rows x 6 columns).
new_arr

array([[ 0,  1,  2,  0,  1,  2],
       [ 3,  4,  5,  3,  4,  5],
       [ 6,  7,  8,  6,  7,  8],
       [ 9, 10, 11,  9, 10, 11]])

In [12]:
# pandas offers pd.concat as its counterpart to np.concatenate.

# Start with three Series whose index labels do not overlap.

series1 = Series([0, 1], index=['a', 'b'])
series2 = Series(np.arange(1, 4), index=['c', 'd', 'e'])
series3 = Series(np.arange(4, 7), index=['f', 'g', 'h'])

In [13]:
# With axis=0 (the default) concat stacks the Series row-wise into one longer Series.

pd.concat((series1, series2, series3), axis=0)

a    0
b    1
c    1
d    2
e    3
f    4
g    5
h    6
dtype: int64

In [14]:
# With axis=1 (column-wise) each Series becomes its own column, so the result is a DataFrame.

pd.concat((series1, series2, series3), axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,1.0,
d,,2.0,
e,,3.0,
f,,,4.0
g,,,5.0
h,,,6.0


In [15]:
# The axis=1 result above is an outer join: its index is the union of all the Series labels.
# Passing join='inner' keeps only the labels shared by every input instead.
# Build a Series that shares the labels 'a' and 'b' with series1 to demonstrate this.

series4 = pd.concat([5 * series1, series3], axis=0)

In [16]:
# join='outer' is the default: labels missing from series1 ('f', 'g', 'h') are filled with NaN.
pd.concat([series1, series4], join='outer', axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,4
g,,5
h,,6


In [18]:
# join='inner' keeps only the labels present in both Series ('a' and 'b').
pd.concat((series1, series4), join='inner', axis=1)

Unnamed: 0,0,1
a,0,0
b,1,5


In [25]:
# The keys argument labels each input, giving the concatenated Series a hierarchical index.
result = pd.concat((series1, series2, series3), keys=['one', 'two', 'three'], axis=0)

In [26]:
# Outer index level: the keys passed to concat; inner level: each Series' own labels.
result

one    a    0
       b    1
two    c    1
       d    2
       e    3
three  f    4
       g    5
       h    6
dtype: int64

In [27]:
# unstack pivots the innermost index level into columns, turning the Series into a DataFrame.
result.unstack(level=-1)

Unnamed: 0,a,b,c,d,e,f,g,h
one,0.0,1.0,,,,,,
two,,,1.0,2.0,3.0,,,
three,,,,,,4.0,5.0,6.0


In [28]:
# When axis=1 is used, the keys become the column headers of the resulting DataFrame.
pd.concat((series1, series2, series3), keys=['one', 'two', 'three'], axis=1)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,1.0,
d,,2.0,
e,,3.0,
f,,,4.0
g,,,5.0
h,,,6.0


In [29]:
# The same concat options apply to DataFrames; build two small frames to show this.
df1 = DataFrame(np.arange(6).reshape((3, 2)), columns=['one', 'two'], index=['a', 'b', 'c'])
df2 = DataFrame(np.arange(4).reshape((2, 2)) * 2, columns=['three', 'four'], index=['a', 'c'])

In [30]:
# 3x2 frame with row labels 'a', 'b', 'c'.
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [31]:
# 2x2 frame with row labels 'a' and 'c' only -- there is no row 'b'.
df2

Unnamed: 0,three,four
a,0,2
c,4,6


In [32]:
# keys adds an outer column level naming the source frame; row 'b' is NaN in df2's columns.
pd.concat((df1, df2), keys=['data1', 'data2'], axis=1)

Unnamed: 0_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,2.0
b,2,3,,
c,4,5,4.0,6.0


In [33]:
# Alternatively, pass a dict to concat: its keys are used as the outer column headers.
pd.concat(dict(data1=df1, data2=df2), axis=1)

Unnamed: 0_level_0,data1,data1,data2,data2
Unnamed: 0_level_1,one,two,three,four
a,0,1,0.0,2.0
b,2,3,,
c,4,5,4.0,6.0


In [34]:
# The names argument labels the column levels: 'upper' for the dict keys,
# 'lower' for the frames' original column names.
pd.concat(dict(data1=df1, data2=df2), names=['upper', 'lower'], axis=1)

upper,data1,data1,data2,data2
lower,one,two,three,four
a,0,1,0.0,2.0
b,2,3,,
c,4,5,4.0,6.0


In [35]:
# Sometimes, when concatenating DataFrames, only the columns matter and the row labels
# carry no meaning. The ignore_index argument of concat handles that case.

# Draw the sample values from a seeded, local generator so that re-running the notebook
# reproduces the same frames (the unseeded global np.random.randn did not), while
# leaving NumPy's global random state untouched.
# NOTE: outputs recorded before this change came from an unseeded run; re-run to refresh them.
rng = np.random.RandomState(42)

df3 = DataFrame(rng.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df4 = DataFrame(rng.randn(2, 3), columns=['b', 'd', 'a'])

In [38]:
# 3 rows of random values under columns a, b, c, d, with the default integer row index.
df3

Unnamed: 0,a,b,c,d
0,-0.088904,0.68166,1.097001,0.141071
1,-1.118354,0.919827,0.262501,-1.230957
2,-0.857386,1.388915,-0.672969,0.120959


In [39]:
# 2 rows under columns b, d, a: a different column order from df3, and no column 'c'.
df4

Unnamed: 0,b,d,a
0,-0.862324,-0.772278,-0.697424
1,-0.655657,-0.129983,-0.400069


In [40]:
# Concatenate while ignoring the original row labels: the rows are renumbered 0-4,
# columns are aligned by name, and df4's missing column 'c' is filled with NaN.
pd.concat((df3, df4), axis=0, ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.088904,0.68166,1.097001,0.141071
1,-1.118354,0.919827,0.262501,-1.230957
2,-0.857386,1.388915,-0.672969,0.120959
3,-0.697424,-0.862324,,-0.772278
4,-0.400069,-0.655657,,-0.129983
