<a href="https://colab.research.google.com/github/KryssyCo/DS-Unit-1-Sprint-2-Data-Wrangling-and-Storytelling/blob/master/Data_Science_Handbook_Chapter_3_6%2C_Combining_Datasets_Concat_and_Append.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Science Handbook Chapter 3.6, Combining Datasets: Concat and Append**

*Exercise concatenation of Series and DataFames with the pd.concat function.*

In [0]:
## Standard Imports

import pandas as pd
import numpy as np

In [15]:
## Define function which creates a DataFrame of a particular form
## Quikly make a Dataframe

def make_df(cols, ind):
  data = {c:[str(c) + str(i) for i in ind] for c in cols}
  return pd.DataFrame(data, ind)

## Example DataFrame

make_df('ABC', range(3))


Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [0]:
## Create a class that allows us to display multiple DataFrame's side
## by side using method _repr_html_. (Used for rich object display)
class display(object):
  """Display HTML representation of multiple objects"""
  template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
  def __init__(self, *args):
        self.args = args
        
  def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
  def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [17]:
## Just a reminder on how to concatenate NumPy Arrays
## This is a list or tuple of arrays to concatenate.
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]

np.concatenate([x , y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [18]:
## The argument above needs an axis keyword that allows you to 
## specify the axis alone which the result will be concatenated.

x = [[1, 2], [3,4]]
np.concatenate([x, x], axis =1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [19]:
# Pandas has a concatenation function pd.concat().
# This is a similar syntax to np.concatenate but 
# offers more options. It can be used for a simple 
# concatenation of Series or DataFrame objects, just 
# like np.concatenate() can be used for simple concatenations
# of arrays.

# Signature in Pandas v0.18

pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)


NameError: ignored

In [20]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [21]:
# It also works to concatenate higher dimensional objects, such as DataFrames

ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [22]:
# Concatenating higher-dimensional objects such as dataframes

df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [23]:
# By default, concatenation takes place row-wise within the DataFrame(i.e,
# axis=0). pd.concat allows specification of an axis along which concatenation 
# will take place. Example:
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
display('df3', 'df4', "pd.concat([df3, df4], axis= 1)")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [24]:
# An important difference between np.concatenate and pd concat is that Pandas
# concatenation preserves indices, even if the result will have duplicate
# indices!
# Example:

x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
display('x', 'y', 'pd.concat([x, y])')


Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
2,A2,B2
3,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [0]:
# Catching the repeats as an error!
# To ensure that the indices do not overlap, you can specify the verify_integrity 
# flag. With this set to True, the concatenation will raise an exception if there
# are duplicate indices.
# Example:

try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
    
# ValueError: Indexes have overlapping values: [0, 1]

In [26]:
# Ignore the Index

# With the ignore_index flag set to true, the concatenation will create a new 
# integer index for the resulting Series

display('x', 'y', 'pd.concat([x, y], ignore_index=True)')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
2,A2,B2
3,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [27]:
# Adding MultiIndex keys
# Another option is to use the keys option to specify a label for the data 
# sources; the result will be a hierarchically indexed series containing the 
# data:

#The result is a multiply indexed DataFrame

display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
2,A2,B2
3,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,2,A2,B2
y,3,A3,B3


In [28]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
display('df5', 'df6', 'pd.concat([df5, df6])')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [29]:
# To change NA Values, we can specify one of several options for the join and 
# join_axes parameters of the concatenate function. By default, the join is a 
# union of the input columns (join='outer'),but we can change this to an 
# intersection of the columns using join='inner':

display('df5', 'df6', "pd.concat([df5, df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [30]:
# Or you can directly specify the index of the remaining columns
# use the join_axes argument, which takes a list of index objects. In the
# following code, we specify that the returned columns should be the same as those
# of the first input.

display('df5', 'df6', "pd.concat([df5, df6], join_axes=[df5.columns])")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


In [32]:
# The append() method
# Series and DataFrame objects have an append method that can accomplish the same
# thing in fewer keystrokes. Instead of calling pd.concat([df1, df2]), you can simply call df1.append(df2):
# Example:

display('df1', 'df2', 'df1.append(df2)')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [0]:
# # Unlike the append() and extend() methods of Python lists, the append() method
# in Pandas does not modify the original object–instead it creates a new object 
# with the combined data. It also is not a very efficient method, because it 
# involves creation of a new index and data buffer. Thus, if you plan to do 
# multiple append operations, it is generally better to build a list of DataFrames
# and pass them all at once to the concat() function.