# <b>Combining Datasets: concat and append</b>

In [1]:
import pandas as pd
import numpy as np

In [7]:
# A list holding one record (dict) produces a single row; `index` labels that row 'b'.
Cities = [dict(Tehran=2333, Rasth=8888, Lahijan=777776)]
pd.DataFrame(data=Cities, index=['b'])

Unnamed: 0,Tehran,Rasth,Lahijan
b,2333,8888,777776


In [8]:
# Same single record, but with no `index` given the row gets the default label 0.
Cities = [dict(Tehran=2333, Rasth=8888, Lahijan=777776)]
pd.DataFrame(data=Cities)

Unnamed: 0,Tehran,Rasth,Lahijan
0,2333,8888,777776


In [9]:
# Column-oriented input: each key is a column, each list holds that column's values.
Cities = dict(Tehran=[2333], Rasth=[8888], Lahijan=[777776])
pd.DataFrame(data=Cities)

Unnamed: 0,Tehran,Rasth,Lahijan
0,2333,8888,777776


In [43]:
# Two values per column give a two-row frame (default row labels 0 and 1).
Cities = dict(Tehran=[2333, 999], Rasth=[8888, 99], Lahijan=[777776, 0])
pd.DataFrame(data=Cities)

Unnamed: 0,Tehran,Rasth,Lahijan
0,2333,8888,777776
1,999,99,0


In [10]:
# A string is iterable: each pass of the loop receives one character.
for i in 'ABS':
    print(i)

A
B
S


In [11]:
# `+` applied to two strings concatenates them.
'A'+'B'

'AB'

In [46]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Every cell holds the column label followed by the row label, so the
    column ``'A'`` at row ``0`` contains ``'A0'``.

    Parameters
    ----------
    cols : iterable
        Column labels. A string such as ``'ABC'`` gives one column per
        character.
    ind : iterable
        Row labels, also used as the frame's index. It is iterated once per
        column, so it must be re-iterable (a list or ``range``, not a
        one-shot generator).

    Returns
    -------
    pandas.DataFrame
        Frame with one column per entry of ``cols`` and one row per entry
        of ``ind``.
    """
    cells = {}
    for col in cols:
        # Column label + row label, both coerced to text.
        cells[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)

In [17]:
# One column per character of 'ABC', one row per value of range(3).
make_df(cols='ABC', ind=range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [18]:
class display:
    """Render several objects side by side, each captioned with its source expression.

    Every argument is a string containing a Python expression, such as
    ``"df1"`` or ``"pd.concat([df1, df2])"``. Nothing is evaluated at
    construction time; the expressions are evaluated each time a
    representation is requested.

    Notes
    -----
    * The expressions are executed with ``eval`` -- pass only strings you
      wrote yourself, never untrusted input.
    * Expressions are evaluated against this module's global namespace only,
      so the helper's own local variables cannot shadow a notebook variable.
    * Defining this class rebinds the name ``display`` in the namespace where
      it is defined.
    """

    # {0} is the expression text shown as a caption, {1} the object's HTML.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}</div>"""

    def __init__(self, *args):
        # Keep the expression strings; they are evaluated lazily on display.
        self.args = args

    def _repr_html_(self):
        """Return one floated ``<div>`` per expression, joined by newlines."""
        from html import escape  # local import keeps this cell self-contained

        blocks = []
        for a in self.args:
            value = eval(a, globals())
            render = getattr(value, '_repr_html_', None)
            body = render() if callable(render) else None
            if body is None:
                # No rich representation available (e.g. a plain Python
                # object): fall back to the escaped text repr.
                body = '<pre>' + escape(repr(value)) + '</pre>'
            # Escape the caption so expressions containing <, > or & do not
            # corrupt the markup; quotes are left exactly as typed.
            blocks.append(self.template.format(escape(a, quote=False), body))
        return '\n'.join(blocks)

    def __repr__(self):
        """Return the plain-text form: each expression followed by its repr."""
        return '\n\n'.join(
            a + '\n' + repr(eval(a, globals()))
            for a in self.args
        )


In [19]:
# Draw from a seeded local generator so "Restart & Run All" reproduces the
# same numbers every time (unseeded draws differ on each run).
rng = np.random.default_rng(0)
df1 = pd.DataFrame(rng.random((3, 3)), columns=['A', 'B', 'C'])
df2 = pd.DataFrame(rng.random((3, 3)), columns=['X', 'Y', 'Z'])

# Show both frames side by side, captioned with their variable names.
display("df1", "df2")


Unnamed: 0,A,B,C
0,0.320455,0.032797,0.933497
1,0.30063,0.80653,0.601321
2,0.463549,0.991388,0.498791

Unnamed: 0,X,Y,Z
0,0.876523,0.038105,0.160379
1,0.656391,0.778274,0.184444
2,0.174063,0.832618,0.712918


In [28]:
# df1 (A, B, C) and df2 (X, Y, Z) share no columns, so the row-wise result is
# padded with missing values; `keys` adds an outer index level naming each source.
pd.concat([df1, df2], keys=['group1', 'group2'])


Unnamed: 0,Unnamed: 1,A,B,C,X,Y,Z
group1,0,0.320455,0.032797,0.933497,,,
group1,1,0.30063,0.80653,0.601321,,,
group1,2,0.463549,0.991388,0.498791,,,
group2,0,,,,0.876523,0.038105,0.160379
group2,1,,,,0.656391,0.778274,0.184444
group2,2,,,,0.174063,0.832618,0.712918


In [34]:
# Two single-column frames to stack on top of each other.
ddf1 = pd.DataFrame(dict(A=[1, 2]))
ddf2 = pd.DataFrame(dict(A=[3, 4]))

# One list feeds both `keys` and `levels`, so every key is guaranteed to be
# present among the level values.
group_labels = ['Group 1', 'Group 2']
result = pd.concat(
    [ddf1, ddf2],
    keys=group_labels,
    levels=[group_labels],
    names=['Group Label'],  # name of the outer MultiIndex level
)

print(result)

               A
Group Label     
Group 1     0  1
            1  2
Group 2     0  3
            1  4


## <b>Recall: Concatenation of NumPy Arrays</b>

In [24]:
# Three consecutive runs of integers: 1-3, 4-6 and 7-9 (plain Python lists).
x = list(range(1, 4))
y = list(range(4, 7))
z = list(range(7, 10))
print('\n', x, '\n', y, '\n', z)


 [1, 2, 3] 
 [4, 5, 6] 
 [7, 8, 9]


In [25]:
# Join the three sequences end to end into a single 1-D array.
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [26]:
x = [[1, 2], [3, 4]]
# axis=1 places the second copy to the right of the first:
# the row count stays at 2 while the column count doubles.
np.concatenate((x, x), axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

## <b>Pandas pd.concat()</b>

In [41]:
# Two Series with disjoint integer labels, so the concatenated index stays unique.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
pd.concat(objs=[ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [40]:
# ignore_index=True discards the original labels and renumbers the result from 0.
pd.concat([ser1, ser2], ignore_index= True)

0    A
1    B
2    C
3    D
4    E
5    F
dtype: object

In [48]:
# Two frames with identical columns and disjoint row labels.
df1 = make_df(cols='AB', ind=[1, 2])
df2 = make_df(cols='AB', ind=[3, 4])
# Show both inputs next to their row-wise concatenation.
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [49]:
# Same row labels, different columns: the inputs for a column-wise concatenation.
df3 = make_df(cols='AB', ind=[0, 1])
df4 = make_df(cols='CD', ind=[0, 1])
display('df3', 'df4', "pd.concat([df3, df4], axis='columns')")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [50]:
x = make_df(cols='AB', ind=[0, 1])
y = make_df(cols='AB', ind=[2, 3])
# Give y the same row labels as x so the concatenation contains duplicates.
y.index = x.index
# pd.concat keeps the original labels, so 0 and 1 each appear twice.
display('x', 'y', 'pd.concat([x, y])')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [51]:
x = make_df(cols='AB', ind=[0, 1])
y = make_df(cols='AB', ind=[2, 3])
# This time x takes y's row labels (2 and 3), again forcing duplicates.
x.index = y.index
# pd.concat keeps the original labels, so 2 and 3 each appear twice.
display('x', 'y', 'pd.concat([x, y])')

Unnamed: 0,A,B
2,A0,B0
3,A1,B1

Unnamed: 0,A,B
2,A2,B2
3,A3,B3

Unnamed: 0,A,B
2,A0,B0
3,A1,B1
2,A2,B2
3,A3,B3


In [52]:
# verify_integrity=True makes pd.concat raise on overlapping index labels
# instead of silently duplicating them; the error is caught and printed so
# the notebook keeps running.
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print("ValueError:", err)

ValueError: Indexes have overlapping values: Index([2, 3], dtype='int64')


In [53]:
# ignore_index=True drops the clashing labels and renumbers the rows 0..3.
display('x', 'y', 'pd.concat([x, y], ignore_index=True)')

Unnamed: 0,A,B
2,A0,B0
3,A1,B1

Unnamed: 0,A,B
2,A2,B2
3,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [54]:
# `keys` keeps the duplicated labels but adds an outer index level ('x' / 'y')
# that records which frame each row came from.
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")

Unnamed: 0,A,B
2,A0,B0
3,A1,B1

Unnamed: 0,A,B
2,A2,B2
3,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,2,A0,B0
x,3,A1,B1
y,2,A2,B2
y,3,A3,B3


In [56]:
# Keep the keyed (two-level index) result so it can be reshaped in the next cells.
v = pd.concat([x, y], keys=['x', 'y'])
v

Unnamed: 0,Unnamed: 1,A,B
x,2,A0,B0
x,3,A1,B1
y,2,A2,B2
y,3,A3,B3


In [59]:
# Move the column labels (A, B) into a new innermost index level; the result is a Series.
v.stack()

x  2  A    A0
      B    B0
   3  A    A1
      B    B1
y  2  A    A2
      B    B2
   3  A    A3
      B    B3
dtype: object

In [60]:
# Pivot the outer index level (x / y) into the columns, leaving rows labelled 2 and 3.
v.unstack(level=0)

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,x,y,x,y
2,A0,A2,B0,B2
3,A1,A3,B1,B3


In [61]:
# The two frames overlap only on columns B and C.
df5 = make_df(cols='ABC', ind=[1, 2])
df6 = make_df(cols='BCD', ind=[3, 4])
# By default every column from either frame is kept; cells with no source
# value are left missing.
display('df5', 'df6', 'pd.concat([df5, df6])')

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [62]:
# join='inner' keeps only the columns present in both frames (B and C).
display('df5', 'df6',
"pd.concat([df5, df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [63]:
# join='outer' keeps the union of the columns (A, B, C, D); cells with no
# source value are left missing.
display('df5', 'df6',
"pd.concat([df5, df6], join='outer')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [64]:
# Show df5 (columns A, B, C) for reference.
df5

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2


In [65]:
# Show df6 (columns B, C, D) for reference.
df6

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4


In [66]:
# Conform df6 to df5's columns: A is added with missing values and D is dropped.
df6.reindex(df5.columns, axis=1)

Unnamed: 0,A,B,C
3,,B3,C3
4,,B4,C4


In [67]:
# Concatenate using only df5's columns by conforming df6 to them first.
pd.concat([df5, df6.reindex(df5.columns, axis=1)])

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4
