In [2]:
# Emit the notebook's provenance banner: file name, author/editor, creation
# time, contact e-mail and a one-line description.
print(
    """
@File         : merging_dataframes_with_pd.merge.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2025-01-02 21:36:23
@Email        : cuixuanstephen@gmail.com
@Description  : Merging DataFrames with pd.merge
"""
)


@File         : merging_dataframes_with_pd.merge.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2025-01-02 21:36:23
@Email        : cuixuanstephen@gmail.com
@Description  : Merging DataFrames with pd.merge



In [3]:
import pandas as pd

重塑数据的另一项常见任务称为合并（merging），在某些情况下也称为连接（joining），后者在数据库术语中经常使用。拼接（concatenation）是将对象上下“堆叠”或左右并排放置，而合并则是在两个实体之间找到一个公共键（或一组键），并使用它将两个实体中的其他列组合在一起：

![Merging two pd.DataFrame objects](../../IMAGES/FIG7-3.png)

In [4]:
# First-quarter table: one row per ticker with its shares, low and high values.
# convert_dtypes() swaps the inferred dtypes for pandas' NumPy-backed nullable
# extension dtypes.
df_q1 = pd.DataFrame(
    data=[
        ["AAPL", 100., 50., 75.],
        ["MSFT", 80., 42., 62.],
        ["AMZN", 60., 100., 120.],
    ],
    columns=["ticker", "shares", "low", "high"],
).convert_dtypes(dtype_backend="numpy_nullable")
df_q1

Unnamed: 0,ticker,shares,low,high
0,AAPL,100,50,75
1,MSFT,80,42,62
2,AMZN,60,100,120


In [5]:
# Second-quarter table: a different set of tickers than df_q1 (IBM and GE are
# new, AMZN is absent) plus an extra "close" column.
df_q2 = pd.DataFrame(
    data=[
        ["AAPL", 80., 70., 80., 77.],
        ["MSFT", 90., 50., 60., 55.],
        ["IBM", 100., 60., 70., 64.],
        ["GE", 42., 30., 50., 44.],
    ],
    columns=["ticker", "shares", "low", "high", "close"],
).convert_dtypes(dtype_backend="numpy_nullable")
df_q2

Unnamed: 0,ticker,shares,low,high,close
0,AAPL,80,70,80,77
1,MSFT,90,50,60,55
2,IBM,100,60,70,64
3,GE,42,30,50,44


使用 `pd.merge`，可以通过向 `on=` 传递一个参数来阐明希望 pandas 用于对齐的列：

In [6]:
# No how= given, so pd.merge falls back to its default inner merge on "ticker".
pd.merge(
    left=df_q1,
    right=df_q2,
    on=["ticker"],
)

Unnamed: 0,ticker,shares_x,low_x,high_x,shares_y,low_y,high_y,close
0,AAPL,100,50,75,80,70,80,77
1,MSFT,80,42,62,90,50,60,55


默认情况下，`pd.merge` 执行内部合并；如果我们想要一个更类似于 `pd.concat` 示例的结果，我们可以传递 `how="outer"`：

In [8]:
# Outer merge: keep every ticker found in either frame; values missing on one
# side show up as <NA>.
pd.merge(
    left=df_q1,
    right=df_q2,
    on="ticker",
    how="outer",
)

Unnamed: 0,ticker,shares_x,low_x,high_x,shares_y,low_y,high_y,close
0,AAPL,100.0,50.0,75.0,80.0,70.0,80.0,77.0
1,AMZN,60.0,100.0,120.0,,,,
2,GE,,,,42.0,30.0,50.0,44.0
3,IBM,,,,100.0,60.0,70.0,64.0
4,MSFT,80.0,42.0,62.0,90.0,50.0,60.0,55.0


虽然 `pd.concat` 仅允许执行 inner 或 outer 合并，但 `pd.merge` 还支持 left 合并：它保留第一个 pd.DataFrame 中的所有数据，并在关键字段能够匹配的地方合并第二个 pd.DataFrame 中的数据：

In [9]:
# Left merge: every row of df_q1 is kept; df_q2 columns are filled in only
# where the ticker matches.
pd.merge(
    left=df_q1,
    right=df_q2,
    on="ticker",
    how="left",
)

Unnamed: 0,ticker,shares_x,low_x,high_x,shares_y,low_y,high_y,close
0,AAPL,100,50,75,80.0,70.0,80.0,77.0
1,MSFT,80,42,62,90.0,50.0,60.0,55.0
2,AMZN,60,100,120,,,,


`how="right"` 则相反，它确保第二个 pd.DataFrame 中的每一行都出现在输出中：

In [10]:
# Right merge: every row of df_q2 is kept; df_q1 columns are filled in only
# where the ticker matches.
pd.merge(
    left=df_q1,
    right=df_q2,
    on=["ticker"],
    how="right",
)

Unnamed: 0,ticker,shares_x,low_x,high_x,shares_y,low_y,high_y,close
0,AAPL,100.0,50.0,75.0,80,70,80,77
1,MSFT,80.0,42.0,62.0,90,50,60,55
2,IBM,,,,100,60,70,64
3,GE,,,,42,30,50,44


使用 `how="outer"` 时的另一个功能是可以提供 `indicator=` 参数，它会通过新增的 `_merge` 列标明生成的 pd.DataFrame 中每一行的来源：

In [11]:
# indicator=True appends a "_merge" column recording whether each row came
# from the left frame only, the right frame only, or both.
pd.merge(
    left=df_q1,
    right=df_q2,
    on=["ticker"],
    how="outer",
    indicator=True,
)

Unnamed: 0,ticker,shares_x,low_x,high_x,shares_y,low_y,high_y,close,_merge
0,AAPL,100.0,50.0,75.0,80.0,70.0,80.0,77.0,both
1,AMZN,60.0,100.0,120.0,,,,,left_only
2,GE,,,,42.0,30.0,50.0,44.0,right_only
3,IBM,,,,100.0,60.0,70.0,64.0,right_only
4,MSFT,80.0,42.0,62.0,90.0,50.0,60.0,55.0,both


`pd.concat` 输出与 `pd.merge` 输出之间的另一个区别是，前者在列中生成 `pd.MultiIndex`，从根本上防止了同时出现在两个 pd.DataFrame 对象中的列标签发生冲突。相比之下，`pd.merge` 会为同时出现在两个 `pd.DataFrame` 对象中的列添加后缀以消除歧义。来自左侧 pd.DataFrame 的列将以 `_x` 为后缀，而 `_y` 后缀表示该列来自右侧 `pd.DataFrame`。

为了更好地控制此后缀，可以将元组参数传递给 `suffixes=`：

In [12]:
# Replace the default _x/_y suffixes on clashing column names with
# quarter-specific ones.
pd.merge(
    left=df_q1,
    right=df_q2,
    on="ticker",
    how="outer",
    suffixes=("_q1", "_q2"),
)

Unnamed: 0,ticker,shares_q1,low_q1,high_q1,shares_q2,low_q2,high_q2,close
0,AAPL,100.0,50.0,75.0,80.0,70.0,80.0,77.0
1,AMZN,60.0,100.0,120.0,,,,
2,GE,,,,42.0,30.0,50.0,44.0
3,IBM,,,,100.0,60.0,70.0,64.0
4,MSFT,80.0,42.0,62.0,90.0,50.0,60.0,55.0


但是，应该知道，只有当列名出现在两个 pd.DataFrame 对象中时，才会应用后缀。如果某一列仅出现在其中一个对象中，而不是两个对象中，则不会应用后缀。

如果我们的关键列在两个 pd.DataFrame 对象中有不同的名称，这会是个问题吗？

In [13]:
# Rename the key column so the two frames no longer share a key label.
# NOTE: df_q2 is rebound here, so the earlier cells that merge on "ticker"
# will no longer work if re-run after this cell without recreating df_q2.
df_q2 = df_q2.rename({"ticker": "SYMBOL"}, axis="columns")
df_q2

Unnamed: 0,SYMBOL,shares,low,high,close
0,AAPL,80,70,80,77
1,MSFT,90,50,60,55
2,IBM,100,60,70,64
3,GE,42,30,50,44


使用 pd.merge，唯一改变的是，现在需要向 `left_on=` 和 `right_on=` 传递两个不同的参数，而不是只向 `on=` 传递一个参数：

In [14]:
# The key is labelled differently in each frame, so name it per side instead
# of using on=.
pd.merge(
    left=df_q1,
    right=df_q2,
    left_on=["ticker"],
    right_on=["SYMBOL"],
    how="outer",
)

Unnamed: 0,ticker,shares_x,low_x,high_x,SYMBOL,shares_y,low_y,high_y,close
0,AAPL,100.0,50.0,75.0,AAPL,80.0,70.0,80.0,77.0
1,AMZN,60.0,100.0,120.0,,,,,
2,,,,,GE,42.0,30.0,50.0,44.0
3,,,,,IBM,100.0,60.0,70.0,64.0
4,MSFT,80.0,42.0,62.0,MSFT,90.0,50.0,60.0,55.0


In [15]:
# Long-format table of "low" values: one row per (ticker, quarter) pair.
lows = pd.DataFrame(
    data=[
        ["AAPL", "Q1", 50.],
        ["MSFT", "Q1", 42.],
        ["AMZN", "Q1", 100.],
        ["AAPL", "Q2", 70.],
        ["MSFT", "Q2", 50.],
        ["IBM", "Q2", 60.],
        ["GE", "Q2", 30.],
    ],
    columns=["ticker", "quarter", "low"],
).convert_dtypes(dtype_backend="numpy_nullable")
lows

Unnamed: 0,ticker,quarter,low
0,AAPL,Q1,50
1,MSFT,Q1,42
2,AMZN,Q1,100
3,AAPL,Q2,70
4,MSFT,Q2,50
5,IBM,Q2,60
6,GE,Q2,30


In [16]:
# Long-format table of "high" values; same (ticker, quarter) pairs as `lows`
# but with the key columns labelled SYMBOL and QTR.
highs = pd.DataFrame(
    data=[
        ["AAPL", "Q1", 75.],
        ["MSFT", "Q1", 62.],
        ["AMZN", "Q1", 120.],
        ["AAPL", "Q2", 80.],
        ["MSFT", "Q2", 60.],
        ["IBM", "Q2", 70.],
        ["GE", "Q2", 50.],
    ],
    columns=["SYMBOL", "QTR", "high"],
).convert_dtypes(dtype_backend="numpy_nullable")
highs

Unnamed: 0,SYMBOL,QTR,high
0,AAPL,Q1,75
1,MSFT,Q1,62
2,AMZN,Q1,120
3,AAPL,Q2,80
4,MSFT,Q2,60
5,IBM,Q2,70
6,GE,Q2,50


按照这些 pd.DataFrame 对象的布局，我们的关键字段现在变成了股票代码（ticker）和季度的组合。通过将相应的列标签作为参数传递给 `left_on=` 和 `right_on=`，pandas 仍然能够执行此合并：

In [18]:
# Composite key: pair ticker with SYMBOL and quarter with QTR, position by
# position in the two lists.
pd.merge(
    left=lows,
    right=highs,
    left_on=["ticker", "quarter"],
    right_on=["SYMBOL", "QTR"],
)

Unnamed: 0,ticker,quarter,low,SYMBOL,QTR,high
0,AAPL,Q1,50,AAPL,Q1,75
1,MSFT,Q1,42,MSFT,Q1,62
2,AMZN,Q1,100,AMZN,Q1,120
3,AAPL,Q2,70,AAPL,Q2,80
4,MSFT,Q2,50,MSFT,Q2,60
5,IBM,Q2,60,IBM,Q2,70
6,GE,Q2,30,GE,Q2,50


尝试合并数据时需要额外考虑的是 pd.DataFrame 对象中键的唯一性。如果对此理解不充分或不正确，则会导致应用程序中出现难以检测的错误。幸运的是，`pd.merge` 可以帮助提前检测这些问题。

In [19]:
# Monthly sales figures; every row belongs to the same salesperson ("John").
sales = pd.DataFrame(
    data=[
        ["Jan", "John", 10],
        ["Feb", "John", 20],
        ["Mar", "John", 30],
    ],
    columns=["month", "salesperson", "sales"],
).convert_dtypes(dtype_backend="numpy_nullable")
sales

Unnamed: 0,month,salesperson,sales
0,Jan,John,10
1,Feb,John,20
2,Mar,John,30


In [20]:
# Lookup table mapping each salesperson to a region; "salesperson" is unique
# here.
regions = pd.DataFrame(
    data=[
        ["John", "Northeast"],
        ["Jane", "Southwest"],
    ],
    columns=["salesperson", "region"],
).convert_dtypes(dtype_backend="numpy_nullable")
regions

Unnamed: 0,salesperson,region
0,John,Northeast
1,Jane,Southwest


In [21]:
# Attach each sale's region via the shared "salesperson" column.
pd.merge(
    left=sales,
    right=regions,
    on="salesperson",
)

Unnamed: 0,month,salesperson,sales,region
0,Jan,John,10,Northeast
1,Feb,John,20,Northeast
2,Mar,John,30,Northeast


In [22]:
# Total of the "sales" column after the merge.
pd.merge(
    left=sales,
    right=regions,
    on="salesperson",
)["sales"].sum()

60

In [23]:
# Keep a handle on the original table before `regions` is rebound; later
# cells use both versions.
regions_orig = regions
# New version of the lookup table: adds a last_name column and a second
# "John" row, so "salesperson" is no longer unique.
regions = pd.DataFrame(
    data=[
        ["John", "Smith", "Northeast"],
        ["Jane", "Doe", "Southwest"],
        ["John", "Newhire", "Southeast"],
    ],
    columns=["salesperson", "last_name", "region"],
).convert_dtypes(dtype_backend="numpy_nullable")
regions

Unnamed: 0,salesperson,last_name,region
0,John,Smith,Northeast
1,Jane,Doe,Southwest
2,John,Newhire,Southeast


In [24]:
# Same merge as before, now against the table with two "John" rows.
pd.merge(
    left=sales,
    right=regions,
    on=["salesperson"],
)

Unnamed: 0,month,salesperson,sales,last_name,region
0,Jan,John,10,Smith,Northeast
1,Jan,John,10,Newhire,Southeast
2,Feb,John,20,Smith,Northeast
3,Feb,John,20,Newhire,Southeast
4,Mar,John,30,Smith,Northeast
5,Mar,John,30,Newhire,Southeast


这样一来，两个 pd.DataFrame 对象之间的关系突然变成了多对多（或 n 对 n），这会重复我们的大部分数据并产生错误的销售数量：

In [25]:
# Total of the "sales" column after merging against the duplicated key.
pd.merge(
    left=sales,
    right=regions,
    on=["salesperson"],
)["sales"].sum()

120

为了使用 pandas 提前捕捉这些意外情况，可以为 `pd.merge` 提供一个 `validate=` 参数，该参数建立了两个对象之间合并键的预期关系。使用我们原始的 pd.DataFrame 对象验证 `many_to_one` 就可以了：

In [26]:
# validate="many_to_one" asks pandas to check that the merge key is unique in
# the right-hand frame; the original regions table satisfies that.
pd.merge(
    left=sales,
    right=regions_orig,
    on=["salesperson"],
    validate="many_to_one",
)

Unnamed: 0,month,salesperson,sales,region
0,Jan,John,10,Northeast
1,Feb,John,20,Northeast
2,Mar,John,30,Northeast


In [28]:
# Same validated merge against the updated `regions`, where "John" appears
# twice; the MergeError is caught so that only its message is printed.
try:
    pd.merge(
        left=sales,
        right=regions,
        on=["salesperson"],
        validate="many_to_one",
    )
except pd.errors.MergeError as merge_error:
    print(merge_error)

Merge keys are not unique in right dataset; not a many-to-one merge


> 在现实世界中，检测此类问题并不那么简单。你可能要合并数千甚至数百万行数据，因此即使有大量行受到关系问题的影响，它们也很容易被忽略。手动检测此类问题无异于大海捞针，因此我强烈建议使用此数据验证功能以避免意外。