# DataFrame: Merging

In [1]:
import pandas as pd

## pandas.concat()

- Use `pd.concat()` to concatenate 2 or more DataFrames.
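- `pd.concat()` aligns the data on the *other* axis: with `axis=0` rows are stacked and columns are matched by name, with `axis=1` columns are placed side by side and rows are matched by index label.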

In [None]:
# First example frame: two integer columns on the string index "a".."c".
df1 = pd.DataFrame(
    data={"col_1": [1, 2, 3], "col_2": [4, 5, 6]},
    index=list("abc"),
)
df1

In [None]:
# Second example frame.
# Note: It shares only "col_2" and the index labels "b" / "c" with df1,
# so both the columns and the index overlap just partially.
df2 = pd.DataFrame(
    data={"col_2": [1, 2, 3], "col_3": [4, 5, 6]},
    index=list("bcd"),
)
df2

In [None]:
# Stack the frames on top of each other (axis=0, a.k.a. "index").
# Comparable to SQL UNION ALL.
#
# Note:
# - Columns that exist in only one frame are filled with missing values.
# - Index labels are kept as-is, so "b" and "c" show up twice.
pd.concat([df1, df2], axis="index")

In [None]:
# Place the frames side by side (axis=1, a.k.a. "columns").
#
# Note:
# - Rows are aligned on their index labels.
# - Labels that exist in only one frame get missing values.
# - "col_2" exists in both frames and therefore appears twice.
pd.concat([df1, df2], axis="columns")

In [None]:
# Switch the join mode to "inner" to keep only the rows
# whose index label is present in both frames.
pd.concat([df1, df2], axis="columns", join="inner")

## DataFrame.join()

- Use `join()` to merge 2 DataFrames.
- By default, `join()` matches records on the indices of both DataFrames.

In [None]:
# Joining df1 and df2 on their indices, similar to concat with axis=1.
#
# Note: This raises a ValueError, because both frames contain "col_2"
# and no suffix was specified to tell the two columns apart.
# The error is caught and printed on purpose, so the demonstration stays
# visible while "Restart Kernel -> Run All" can still run to completion.
try:
    df1.join(df2)
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

In [None]:
# Supplying a suffix for the right-hand frame renames its overlapping
# column ("col_2"), which avoids the error on duplicate column names.
df1.join(
    df2,
    rsuffix="_right",
)

### Join Types

Available join types via `how` parameter:

|Join|Result|
|---|---|
|`"left"`|Left DataFrame determines row set (default).|
|`"right"`|Right DataFrame determines row set.|
|`"inner"`|Only rows shared between both DataFrames.|
|`"outer"`|All rows from both DataFrames.|
|`"cross"`|Cartesian product of all rows.|

Note: Similar to the SQL JOIN types.

In [None]:
# Left join: the index of df1 decides which rows end up in the result.
df1.join(
    df2,
    how="left",
    rsuffix="_right",
)

In [None]:
# Outer join: the indices of both df1 and df2 decide which rows
# end up in the result.
df1.join(
    df2,
    how="outer",
    rsuffix="_right",
)

In [None]:
# Cross join: every row of df1 is combined with every row of df2.
#
# Note:
# - The result contains all possible combinations and gets a new index.
# - No missing values are introduced, so the data type stays int.
df1.join(
    df2,
    how="cross",
    rsuffix="_right",
)

## DataFrame.merge()

- The `merge()` method is similar to `join()`, but provides more options.
- Prefer `merge()` when joining on columns; `join()` is the convenient shortcut when joining on indices.

In [None]:
# Left-hand example frame: one row per person / city with an age.
# Note: The name "Henk" occurs twice, in two different cities.
left = pd.DataFrame(
    data=dict(
        name=["Henk", "Ingrid", "Henk"],
        city=["Amsterdam", "Amsterdam", "Rotterdam"],
        age=[24, 56, 33],
    )
)
left

In [None]:
# Right-hand example frame: one row per person / city with a score.
# Note: Contains an extra person ("Sanne") that is absent from `left`.
right = pd.DataFrame(
    data=dict(
        name=["Henk", "Ingrid", "Henk", "Sanne"],
        city=["Amsterdam", "Amsterdam", "Rotterdam", "Rotterdam"],
        score=[4, 5, 6, 7],
    )
)
right

In [None]:
# Merge on a single shared column ("name").
#
# Note:
# - "Henk" occurs more than once on both sides, so every left "Henk" row
#   is paired with every right "Henk" row, which yields duplicate rows.
# - Overlapping non-key columns (here: "city") automatically receive
#   the suffixes _x and _y.
left.merge(
    right,
    how="left",
    on="name",
)

In [None]:
# Merge on several columns at once (name + city).
# Note: With both columns as the key, the duplicate rows and the
# duplicated (suffixed) columns disappear.
left.merge(
    right,
    how="left",
    on=["name", "city"],
)

In [None]:
# Optional: Let pandas check the relationship between the merge keys
# via the validate parameter.
# Valid options are "1:1", "1:m", "m:1" and "m:m".
left.merge(
    right,
    on=["name", "city"],
    validate="1:1",
)