In [1]:
# Echo the notebook's metadata banner. Joining the lines with "\n" (with an
# empty first and last element) rebuilds the same text, including its
# leading and trailing newline.
print("\n".join([
    "",
    "@Description: Reshaping and pivoting",
    "@Author(s): Stephen CUI",
    "@LastEditor(s): Stephen CUI",
    "@CreatedTime: 2023-07-02 22:54:41",
    "",
]))


@Description: Reshaping and pivoting
@Author(s): Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime: 2023-07-02 22:54:41



# 数据集的重塑和透视

This chapter covers:
1. Comparing wide and narrow data
2. Generating a pivot table from a DataFrame
3. Aggregating values by sum, average, count, and more
4. Stacking and unstacking DataFrame index levels
5. Melting a DataFrame

## 宽数据和窄数据

## Creating a pivot table from a DataFrame

In [2]:
import pandas as pd

In [3]:
# Peek at the first five rows of the raw file, read with default options
# (no date parsing yet).
pd.read_csv(
    'sales_by_employee.csv',
).head(n=5)

Unnamed: 0,Date,Name,Customer,Revenue,Expenses
0,1/1/20,Oscar,Logistics XYZ,5250,531
1,1/1/20,Oscar,Money Corp.,4406,661
2,1/2/20,Oscar,PaperMaven,8661,1401
3,1/3/20,Oscar,PaperGenius,7075,906
4,1/4/20,Oscar,Paper Pound,2524,1767


In [4]:
# Load the data set used for the rest of the notebook, parsing the Date
# column with an explicit month/day/two-digit-year format.
sales = pd.read_csv(
    'sales_by_employee.csv',
    parse_dates=['Date'],
    date_format='%m/%d/%y',
)
# Show the last five rows.
sales.tail(n=5)

Unnamed: 0,Date,Name,Customer,Revenue,Expenses
21,2020-01-01,Creed,Money Corp.,4430,548
22,2020-01-02,Creed,Average Paper Co.,8026,1906
23,2020-01-02,Creed,Average Paper Co.,5188,1768
24,2020-01-04,Creed,PaperMaven,3144,1314
25,2020-01-05,Creed,Money Corp.,938,1053


### The pivot_table method

In [5]:
# No aggfunc is passed, so pivot_table uses its default aggregation
# (the mean) for Revenue and Expenses on each date.
sales.pivot_table(
    values=['Revenue', 'Expenses'],
    index='Date',
)

Unnamed: 0_level_0,Expenses,Revenue
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,637.5,4293.5
2020-01-02,1244.4,7303.0
2020-01-03,1313.666667,4865.833333
2020-01-04,1450.6,3948.0
2020-01-05,1196.25,4834.75


In [6]:
# Same table as the default call, with the mean aggregation spelled out.
sales.pivot_table(
    values=['Revenue', 'Expenses'],
    index='Date',
    aggfunc='mean',
)

Unnamed: 0_level_0,Expenses,Revenue
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,637.5,4293.5
2020-01-02,1244.4,7303.0
2020-01-03,1313.666667,4865.833333
2020-01-04,1450.6,3948.0
2020-01-05,1196.25,4834.75


In [7]:
# Total Revenue and Expenses per date.
sales.pivot_table(
    values=['Revenue', 'Expenses'],
    index='Date',
    aggfunc='sum',
)

Unnamed: 0_level_0,Expenses,Revenue
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,3825,25761
2020-01-02,6222,36515
2020-01-03,7882,29195
2020-01-04,7253,19740
2020-01-05,4785,19339


In [8]:
# Restrict the aggregation to a single value column: total Revenue per date.
sales.pivot_table(
    values='Revenue',
    index='Date',
    aggfunc='sum',
)

Unnamed: 0_level_0,Revenue
Date,Unnamed: 1_level_1
2020-01-01,25761
2020-01-02,36515
2020-01-03,29195
2020-01-04,19740
2020-01-05,19339


In [9]:
# Spread the salespeople across the column axis: one Revenue total per
# (Date, Name) pair. Pairs with no rows show up as NaN.
sales.pivot_table(
    values='Revenue',
    index='Date',
    columns='Name',
    aggfunc='sum',
)

Name,Creed,Dwight,Jim,Michael,Oscar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,4430.0,2639.0,1864.0,7172.0,9656.0
2020-01-02,13214.0,,8278.0,6362.0,8661.0
2020-01-03,,11912.0,4226.0,5982.0,7075.0
2020-01-04,3144.0,,6155.0,7917.0,2524.0
2020-01-05,938.0,7771.0,,7837.0,2793.0


In [10]:
# Same Date x Name revenue totals, with missing combinations filled with 0
# instead of NaN.
sales.pivot_table(
    values='Revenue',
    index='Date',
    columns='Name',
    aggfunc='sum',
    fill_value=0,
)

Name,Creed,Dwight,Jim,Michael,Oscar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,4430,2639,1864,7172,9656
2020-01-02,13214,0,8278,6362,8661
2020-01-03,0,11912,4226,5982,7075
2020-01-04,3144,0,6155,7917,2524
2020-01-05,938,7771,0,7837,2793


In [11]:
# margins=True appends an "All" row and an "All" column holding the
# subtotals for each salesperson and each date.
sales.pivot_table(
    values='Revenue',
    index='Date',
    columns='Name',
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

Name,Creed,Dwight,Jim,Michael,Oscar,All
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 00:00:00,4430,2639,1864,7172,9656,25761
2020-01-02 00:00:00,13214,0,8278,6362,8661,36515
2020-01-03 00:00:00,0,11912,4226,5982,7075,29195
2020-01-04 00:00:00,3144,0,6155,7917,2524,19740
2020-01-05 00:00:00,938,7771,0,7837,2793,19339
All,21726,22322,20523,35270,30709,130550


In [12]:
# margins_name relabels the subtotal row/column from the default "All"
# to "Total".
sales.pivot_table(
    values='Revenue',
    index='Date',
    columns='Name',
    aggfunc='sum',
    fill_value=0,
    margins=True,
    margins_name='Total',
)

Name,Creed,Dwight,Jim,Michael,Oscar,Total
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 00:00:00,4430,2639,1864,7172,9656,25761
2020-01-02 00:00:00,13214,0,8278,6362,8661,36515
2020-01-03 00:00:00,0,11912,4226,5982,7075,29195
2020-01-04 00:00:00,3144,0,6155,7917,2524,19740
2020-01-05 00:00:00,938,7771,0,7837,2793,19339
Total,21726,22322,20523,35270,30709,130550


### 数据透视表的其他选项

In [13]:
# Count the number of sales rows for each (Date, Name) pair.
sales.pivot_table(
    values='Revenue', index='Date', columns='Name', aggfunc='count'
)

Name,Creed,Dwight,Jim,Michael,Oscar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,1.0,1.0,1.0,1.0,2.0
2020-01-02,2.0,,1.0,1.0,1.0
2020-01-03,,3.0,1.0,1.0,1.0
2020-01-04,1.0,,2.0,1.0,1.0
2020-01-05,1.0,1.0,,1.0,1.0


In [14]:
# A list of aggregations adds an outer column level: one group of columns
# per aggregation ("count", then "sum").
sales.pivot_table(
    values='Revenue', index='Date', columns='Name',
    aggfunc=['count', 'sum'],
)

Unnamed: 0_level_0,count,count,count,count,count,sum,sum,sum,sum,sum
Name,Creed,Dwight,Jim,Michael,Oscar,Creed,Dwight,Jim,Michael,Oscar
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2020-01-01,1.0,1.0,1.0,1.0,2.0,4430.0,2639.0,1864.0,7172.0,9656.0
2020-01-02,2.0,,1.0,1.0,1.0,13214.0,,8278.0,6362.0,8661.0
2020-01-03,,3.0,1.0,1.0,1.0,,11912.0,4226.0,5982.0,7075.0
2020-01-04,1.0,,2.0,1.0,1.0,3144.0,,6155.0,7917.0,2524.0
2020-01-05,1.0,1.0,,1.0,1.0,938.0,7771.0,,7837.0,2793.0


In [16]:
# A dict applies a different aggregation to each value column:
# smallest Revenue and largest Expenses per (Date, Name) pair.
sales.pivot_table(
    values=['Revenue', 'Expenses'],
    index='Date',
    columns='Name',
    aggfunc={
        'Revenue': 'min',
        'Expenses': 'max',
    },
)

Unnamed: 0_level_0,Expenses,Expenses,Expenses,Expenses,Expenses,Revenue,Revenue,Revenue,Revenue,Revenue
Name,Creed,Dwight,Jim,Michael,Oscar,Creed,Dwight,Jim,Michael,Oscar
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2020-01-01,548.0,368.0,1305.0,412.0,661.0,4430.0,2639.0,1864.0,7172.0,4406.0
2020-01-02,1906.0,,462.0,685.0,1401.0,5188.0,,8278.0,6362.0,8661.0
2020-01-03,,1321.0,1923.0,1772.0,906.0,,2703.0,4226.0,5982.0,7075.0
2020-01-04,1314.0,,1889.0,1857.0,1767.0,3144.0,,2287.0,7917.0,2524.0
2020-01-05,1053.0,1475.0,,1633.0,624.0,938.0,7771.0,,7837.0,2793.0


In [17]:
# Two index keys build a MultiIndex on the row axis: Name is the outer
# level and Date the inner level.
sales.pivot_table(
    values='Revenue',
    index=['Name', 'Date'],
    aggfunc='sum',
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue
Name,Date,Unnamed: 2_level_1
Creed,2020-01-01,4430
Creed,2020-01-02,13214
Creed,2020-01-04,3144
Creed,2020-01-05,938
Dwight,2020-01-01,2639
Dwight,2020-01-03,11912
Dwight,2020-01-05,7771
Jim,2020-01-01,1864
Jim,2020-01-02,8278
Jim,2020-01-03,4226


In [19]:
# Swap the order of the index keys so Date becomes the outer level,
# then show the first ten rows.
sales.pivot_table(
    values='Revenue',
    index=['Date', 'Name'],
    aggfunc='sum',
).head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue
Date,Name,Unnamed: 2_level_1
2020-01-01,Creed,4430
2020-01-01,Dwight,2639
2020-01-01,Jim,1864
2020-01-01,Michael,7172
2020-01-01,Oscar,9656
2020-01-02,Creed,13214
2020-01-02,Jim,8278
2020-01-02,Michael,6362
2020-01-02,Oscar,8661
2020-01-03,Dwight,11912


## 对索引级别进行堆叠和取消堆叠

In [20]:
# Reminder of the source frame's layout (first five rows).
sales.head(n=5)

Unnamed: 0,Date,Name,Customer,Revenue,Expenses
0,2020-01-01,Oscar,Logistics XYZ,5250,531
1,2020-01-01,Oscar,Money Corp.,4406,661
2,2020-01-02,Oscar,PaperMaven,8661,1401
3,2020-01-03,Oscar,PaperGenius,7075,906
4,2020-01-04,Oscar,Paper Pound,2524,1767


In [22]:
# Revenue totals with salespeople on the row axis and dates on the
# column axis; this frame is the input to the stack example below.
by_name_and_date = sales.pivot_table(
    values='Revenue',
    index='Name',
    columns='Date',
    aggfunc='sum',
)
by_name_and_date

Date,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Creed,4430.0,13214.0,,3144.0,938.0
Dwight,2639.0,,11912.0,,7771.0
Jim,1864.0,8278.0,4226.0,6155.0,
Michael,7172.0,6362.0,5982.0,7917.0,7837.0
Oscar,9656.0,8661.0,7075.0,2524.0,2793.0


The stack method moves an index level from the column axis to the row axis.

In [24]:
# Move the Date column level onto the row axis, producing a Series indexed
# by (Name, Date).
# The legacy stack implementation drops NaN entries implicitly, while the
# newer implementation (future_stack=True) keeps them. The explicit dropna()
# makes the NaN-free result independent of which implementation is active.
by_name_and_date.stack().dropna()

Name     Date      
Creed    2020-01-01     4430.0
         2020-01-02    13214.0
         2020-01-04     3144.0
         2020-01-05      938.0
Dwight   2020-01-01     2639.0
         2020-01-03    11912.0
         2020-01-05     7771.0
Jim      2020-01-01     1864.0
         2020-01-02     8278.0
         2020-01-03     4226.0
         2020-01-04     6155.0
Michael  2020-01-01     7172.0
         2020-01-02     6362.0
         2020-01-03     5982.0
         2020-01-04     7917.0
         2020-01-05     7837.0
Oscar    2020-01-01     9656.0
         2020-01-02     8661.0
         2020-01-03     7075.0
         2020-01-04     2524.0
         2020-01-05     2793.0
dtype: float64

Notice that the DataFrame’s NaNs are absent from the Series. Pandas kept cells with NaNs in the by_name_and_date pivot table to maintain the structural integrity of the rows and columns. The shape of this MultiIndex Series allows pandas to discard the NaN values.

The complementary unstack method moves an index level from the row axis to the column axis. 

In [26]:
# Revenue totals keyed by a two-level row index: Customer (outer) and
# Name (inner). Used as the input to the unstack example.
sales_by_customer = sales.pivot_table(
    values='Revenue',
    index=['Customer', 'Name'],
    aggfunc='sum',
)
sales_by_customer.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Revenue
Customer,Name,Unnamed: 2_level_1
Average Paper Co.,Creed,13214
Average Paper Co.,Jim,2287
Best Paper Co.,Dwight,2703
Best Paper Co.,Michael,15754
Logistics XYZ,Dwight,9209


The unstack method moves the innermost level of the row index to the column index.

In [27]:
# Pivot the innermost row level (Name) onto the column axis.
# level=-1 is unstack's default, written out here for clarity.
sales_by_customer.unstack(level=-1)

Unnamed: 0_level_0,Revenue,Revenue,Revenue,Revenue,Revenue
Name,Creed,Dwight,Jim,Michael,Oscar
Customer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Average Paper Co.,13214.0,,2287.0,,
Best Paper Co.,,2703.0,,15754.0,
Logistics XYZ,,9209.0,,7172.0,5250.0
Money Corp.,5368.0,,8278.0,,4406.0
Paper Pound,,7771.0,4226.0,,5317.0
PaperGenius,,2639.0,1864.0,12344.0,7075.0
PaperMaven,3144.0,,3868.0,,8661.0


## 融合数据

In [28]:
# A single row is enough to recall the shape of the sales data.
sales.head(n=1)

Unnamed: 0,Date,Name,Customer,Revenue,Expenses
0,2020-01-01,Oscar,Logistics XYZ,5250,531


In [29]:
# Load the video game data set: one row per game with one sales column
# per region (NA, EU, JP, Other).
video_game_sales = pd.read_csv(
    'video_game_sales.csv',
)
video_game_sales.head(n=1)

Unnamed: 0,Name,NA,EU,JP,Other
0,Wii Sports,41.49,29.02,3.77,8.46


Pandas melts a DataFrame with the melt method. (Melting is the process of converting a wide data set to a narrow one.) The method accepts two primary parameters:

1. The id_vars parameter sets the identifier column, the column for which the wide data set aggregates data. Name is the identifier column in video_game_sales. The data set aggregates sales per video game.
2. The value_vars parameter accepts the column(s) whose values pandas will melt and store in a new column.

In [31]:
# Melt a single region: Name stays as the identifier, and the NA column's
# values move into the generic "variable"/"value" column pair.
video_game_sales.melt(
    id_vars='Name',
    value_vars='NA',
).head(n=5)

Unnamed: 0,Name,variable,value
0,Wii Sports,,41.49
1,Super Mario Bros.,,29.08
2,Mario Kart Wii,,15.85
3,Wii Sports Resort,,15.75
4,Pokemon Red/Pokemon Blue,,11.27


In [33]:
# Region columns to melt; reused by the later melt cell.
regional_sales_columns = [
    "NA",
    "EU",
    "JP",
    "Other",
]
# Melt every regional column at once: one output row per (game, region).
video_game_sales.melt(id_vars='Name', value_vars=regional_sales_columns)

Unnamed: 0,Name,variable,value
0,Wii Sports,,41.49
1,Super Mario Bros.,,29.08
2,Mario Kart Wii,,15.85
3,Wii Sports Resort,,15.75
4,Pokemon Red/Pokemon Blue,,11.27
...,...,...,...
66259,Woody Woodpecker in Crazy Castle 5,Other,0.00
66260,Men in Black II: Alien Escape,Other,0.00
66261,SCORE International Baja 1000: The Official Game,Other,0.00
66262,Know How 2,Other,0.00


In [35]:
# Same melt, with descriptive names replacing the default
# "variable"/"value" column labels.
video_game_sales_by_region = video_game_sales.melt(
    id_vars='Name',
    value_vars=regional_sales_columns,
    var_name='Region',
    value_name='Sales',
)
video_game_sales_by_region.head(n=5)

Unnamed: 0,Name,Region,Sales
0,Wii Sports,,41.49
1,Super Mario Bros.,,29.08
2,Mario Kart Wii,,15.85
3,Wii Sports Resort,,15.75
4,Pokemon Red/Pokemon Blue,,11.27


In [37]:
# The narrow layout is easy to aggregate: total Sales per game across
# all regions.
video_game_sales_by_region.pivot_table(
    values='Sales',
    index='Name',
    aggfunc='sum',
).head(n=5)

Unnamed: 0_level_0,Sales
Name,Unnamed: 1_level_1
'98 Koshien,0.4
.hack//G.U. Vol.1//Rebirth,0.17
.hack//G.U. Vol.2//Reminisce,0.23
.hack//G.U. Vol.3//Redemption,0.17
.hack//Infection Part 1,1.26


## Exploding a list of values (展开值列表)

In [38]:
# Load the recipes data set; Ingredients holds a comma-separated string
# per recipe.
recipes = pd.read_csv(
    'recipes.csv',
)
recipes.head(n=5)

Unnamed: 0,Recipe,Ingredients
0,Cashew Crusted Chicken,"Apricot preserves, Dijon mustard, curry powder..."
1,Tomato Basil Salmon,"Salmon filets, basil, tomato, olive oil, Parme..."
2,Parmesan Cheese Chicken,"Bread crumbs, Parmesan cheese, Italian seasoni..."


In [40]:
# Turn each comma-separated Ingredients string into a list of ingredients.
# The source text separates items with ", ", so splitting on a bare ","
# would leave a leading space on every item after the first, and those
# padded values would carry through to explode(). Stripping the cell and
# splitting on a comma with optional surrounding whitespace yields clean
# ingredient names.
# NOTE(review): this overwrites the column in place, so the cell expects
# Ingredients to still hold strings - re-run from the read_csv cell rather
# than executing this cell twice.
recipes['Ingredients'] = (
    recipes['Ingredients']
    .str.strip()
    .str.split(r'\s*,\s*', regex=True)
)

In [41]:
# Display the frame after the split: Ingredients now holds a list per recipe.
recipes

Unnamed: 0,Recipe,Ingredients
0,Cashew Crusted Chicken,"[Apricot preserves, Dijon mustard, curry pow..."
1,Tomato Basil Salmon,"[Salmon filets, basil, tomato, olive oil, ..."
2,Parmesan Cheese Chicken,"[Bread crumbs, Parmesan cheese, Italian seas..."


In [46]:
# Expand each list element into its own row; the other columns and the
# original index label are repeated for every ingredient.
recipes.explode(column='Ingredients')

Unnamed: 0,Recipe,Ingredients
0,Cashew Crusted Chicken,Apricot preserves
0,Cashew Crusted Chicken,Dijon mustard
0,Cashew Crusted Chicken,curry powder
0,Cashew Crusted Chicken,chicken breasts
0,Cashew Crusted Chicken,cashews
1,Tomato Basil Salmon,Salmon filets
1,Tomato Basil Salmon,basil
1,Tomato Basil Salmon,tomato
1,Tomato Basil Salmon,olive oil
1,Tomato Basil Salmon,Parmesan cheese
