Grouping and Aggregating Data
- Grouping in Pandas means splitting the rows of a DataFrame into groups that share the same value in one or more columns.
- Aggregating means summarizing each group with a single value, like adding up sales (sum), finding the average price (mean), or counting rows (count).

In [None]:
# ============================== Grouping and Aggregating Data in Pandas ==============================

# --- Grouping Data ---
# Group by a single column
# df.groupby('column')

# Group by multiple columns
# df.groupby(['col1', 'col2'])

# --- Aggregation Functions ---
# Get sum of each group
# df.groupby('column').sum()

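# Get mean of each group (numeric columns only: pandas >= 2.0 raises TypeError
# on non-numeric columns unless numeric_only=True is passed)
# df.groupby('column').mean(numeric_only=True)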

# Get minimum value of each group
# df.groupby('column').min()

# Get maximum value of each group
# df.groupby('column').max()

# Count non-null values per column in each group (NaNs are not counted)
# df.groupby('column').count()

# Number of rows in each group (rows containing NaN values are counted too)
# df.groupby('column').size()

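# Standard deviation for each group (pass numeric_only=True when the frame has non-numeric columns)
# df.groupby('column').std(numeric_only=True)

# Variance of each group (pass numeric_only=True when the frame has non-numeric columns)
# df.groupby('column').var(numeric_only=True)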

# --- Custom Aggregation using agg() ---
# Aggregate using a single function
# df.groupby('column').agg('sum')

# Aggregate with multiple functions
# df.groupby('column').agg(['sum', 'mean'])

# Aggregate different columns with different functions
# df.groupby('column').agg({'col1': 'sum', 'col2': 'mean'})

# --- Resetting Index after Grouping ---
# Reset index after aggregation to get flat DataFrame
# df.groupby('column').sum().reset_index()

# --- Filtering Groups ---
# Keep only groups with more than 2 rows
# df.groupby('column').filter(lambda x: len(x) > 2)


# --- Groupby with Sorting ---
# Group without sorting group labels
# df.groupby('column', sort=False).sum()


In [2]:
import pandas as pd

In [8]:
# Load the coffee-shop transactions workbook; the relative path is resolved
# against the notebook's working directory.
df = pd.read_excel(io="Coffee.xlsx")

In [9]:
# Preview the first five rows to check the column names and value formats.
df.head(n=5)

Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,product_id,unit_price,product_category,product_type,product_detail
0,1,2023-01-01,07:06:11,2,5,Lower Manhattan,32,3.0,Coffee,Gourmet brewed coffee,Ethiopia Rg
1,2,2023-01-01,07:08:56,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg
2,3,2023-01-01,07:14:04,2,5,Lower Manhattan,59,4.5,Drinking Chocolate,Hot chocolate,Dark chocolate Lg
3,4,2023-01-01,07:20:24,1,5,Lower Manhattan,22,2.0,Coffee,Drip coffee,Our Old Time Diner Blend Sm
4,5,2023-01-01,07:22:41,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg


In [None]:
# Group by a single column.
# On its own, groupby() only returns a DataFrameGroupBy object; an aggregation
# (sum, mean, ...) has to be applied to it to get a result table.
df.groupby(by='product_category')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025EBE346E10>

In [32]:
# Group by several columns: one group per (store_location, product_category) pair.
group_keys = ['store_location', 'product_category']
df.groupby(group_keys)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025EC3844440>

In [33]:
# Total quantity sold per product category.
(
    df
    .groupby('product_category')['transaction_qty']
    .sum()
)

product_category
Bakery                23214
Branded                 776
Coffee                89250
Coffee beans           1828
Drinking Chocolate    17457
Flavours              10511
Loose Tea              1210
Packaged Chocolate      487
Tea                   69737
Name: transaction_qty, dtype: int64

In [34]:
# Average unit price per product category.
(
    df
    .groupby('product_category')['unit_price']
    .mean()
)

product_category
Bakery                 3.551682
Branded               17.720214
Coffee                 3.023646
Coffee beans          21.018397
Drinking Chocolate     4.148827
Flavours               0.800000
Loose Tea              9.267438
Packaged Chocolate     9.050595
Tea                    2.817121
Name: unit_price, dtype: float64

In [35]:
# Lowest unit price within each product category.
(
    df
    .groupby('product_category')['unit_price']
    .min()
)

product_category
Bakery                 2.65
Branded               12.00
Coffee                 2.00
Coffee beans          10.00
Drinking Chocolate     3.50
Flavours               0.80
Loose Tea              8.95
Packaged Chocolate     6.40
Tea                    2.50
Name: unit_price, dtype: float64

In [36]:
# Highest unit price within each product category.
(
    df
    .groupby('product_category')['unit_price']
    .max()
)

product_category
Bakery                 5.63
Branded               28.00
Coffee                 4.25
Coffee beans          45.00
Drinking Chocolate     4.75
Flavours               0.80
Loose Tea             10.95
Packaged Chocolate    13.33
Tea                    4.00
Name: unit_price, dtype: float64

In [37]:
# Number of non-null transaction_id values in each product category.
# count() skips NaN, so this equals the row count only when transaction_id
# has no missing values.
(
    df
    .groupby('product_category')['transaction_id']
    .count()
)

product_category
Bakery                22796
Branded                 747
Coffee                58416
Coffee beans           1753
Drinking Chocolate    11468
Flavours               6790
Loose Tea              1210
Packaged Chocolate      487
Tea                   45449
Name: transaction_id, dtype: int64

In [38]:
# Number of rows in each product category. Unlike count(), size() does not
# skip rows that hold NaN in other columns.
(
    df
    .groupby('product_category')
    .size()
)

product_category
Bakery                22796
Branded                 747
Coffee                58416
Coffee beans           1753
Drinking Chocolate    11468
Flavours               6790
Loose Tea              1210
Packaged Chocolate      487
Tea                   45449
dtype: int64

In [39]:
# Spread of unit_price within each product category (standard deviation).
(
    df
    .groupby('product_category')['unit_price']
    .std()
)

product_category
Bakery                0.393963
Branded               6.807042
Coffee                0.683111
Coffee beans          9.040836
Drinking Chocolate    0.511951
Flavours              0.000000
Loose Tea             0.597210
Packaged Chocolate    2.870465
Tea                   0.392372
Name: unit_price, dtype: float64

In [40]:
# Variance of unit_price within each product category.
(
    df
    .groupby('product_category')['unit_price']
    .var()
)

product_category
Bakery                 0.155207
Branded               46.335824
Coffee                 0.466641
Coffee beans          81.736712
Drinking Chocolate     0.262094
Flavours               0.000000
Loose Tea              0.356660
Packaged Chocolate     8.239569
Tea                    0.153956
Name: unit_price, dtype: float64

In [42]:
# Aggregate by naming the function as a string: agg('sum') totals each selected column.
# NOTE: the unit_price total is a plain sum of listed prices; it is not revenue,
# because it is not multiplied by transaction_qty.
summed_columns = ['transaction_qty', 'unit_price']
df.groupby('product_category')[summed_columns].agg('sum')

Unnamed: 0_level_0,transaction_qty,unit_price
product_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Bakery,23214,80964.14
Branded,776,13237.0
Coffee,89250,176629.3
Coffee beans,1828,36845.25
Drinking Chocolate,17457,47578.75
Flavours,10511,5432.0
Loose Tea,1210,11213.6
Packaged Chocolate,487,4407.64
Tea,69737,128035.35


In [43]:
# Several aggregations at once: one output column per function name in the list.
(
    df
    .groupby('product_category')['unit_price']
    .agg(['sum', 'mean'])
)

Unnamed: 0_level_0,sum,mean
product_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Bakery,80964.14,3.551682
Branded,13237.0,17.720214
Coffee,176629.3,3.023646
Coffee beans,36845.25,21.018397
Drinking Chocolate,47578.75,4.148827
Flavours,5432.0,0.8
Loose Tea,11213.6,9.267438
Packaged Chocolate,4407.64,9.050595
Tea,128035.35,2.817121


In [44]:
# Apply a different aggregation to each column: total units sold and
# average unit price per product category.
per_column_aggs = {'transaction_qty': 'sum', 'unit_price': 'mean'}
df.groupby('product_category').agg(per_column_aggs)

Unnamed: 0_level_0,transaction_qty,unit_price
product_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Bakery,23214,3.551682
Branded,776,17.720214
Coffee,89250,3.023646
Coffee beans,1828,21.018397
Drinking Chocolate,17457,4.148827
Flavours,10511,0.8
Loose Tea,1210,9.267438
Packaged Chocolate,487,9.050595
Tea,69737,2.817121


In [45]:
# reset_index() moves the group labels out of the index into a regular
# product_category column, giving a flat DataFrame.
(
    df
    .groupby('product_category')['transaction_qty']
    .sum()
    .reset_index()
)

Unnamed: 0,product_category,transaction_qty
0,Bakery,23214
1,Branded,776
2,Coffee,89250
3,Coffee beans,1828
4,Drinking Chocolate,17457
5,Flavours,10511
6,Loose Tea,1210
7,Packaged Chocolate,487
8,Tea,69737


In [46]:
# Keep only the rows whose product category has more than 2 transactions.
# filter() returns the original rows (not an aggregate) of every group that passes.
# NOTE: every category in this dataset has far more than 2 rows, so no rows
# are dropped here.
def has_more_than_two_rows(group):
    """Return True when the group contains more than 2 rows."""
    return len(group) > 2

df.groupby('product_category').filter(has_more_than_two_rows)

Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,product_id,unit_price,product_category,product_type,product_detail
0,1,2023-01-01,07:06:11,2,5,Lower Manhattan,32,3.00,Coffee,Gourmet brewed coffee,Ethiopia Rg
1,2,2023-01-01,07:08:56,2,5,Lower Manhattan,57,3.10,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg
2,3,2023-01-01,07:14:04,2,5,Lower Manhattan,59,4.50,Drinking Chocolate,Hot chocolate,Dark chocolate Lg
3,4,2023-01-01,07:20:24,1,5,Lower Manhattan,22,2.00,Coffee,Drip coffee,Our Old Time Diner Blend Sm
4,5,2023-01-01,07:22:41,2,5,Lower Manhattan,57,3.10,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg
...,...,...,...,...,...,...,...,...,...,...,...
149111,149452,2023-06-30,20:18:41,2,8,Hell's Kitchen,44,2.50,Tea,Brewed herbal tea,Peppermint Rg
149112,149453,2023-06-30,20:25:10,2,8,Hell's Kitchen,49,3.00,Tea,Brewed Black tea,English Breakfast Lg
149113,149454,2023-06-30,20:31:34,1,8,Hell's Kitchen,45,3.00,Tea,Brewed herbal tea,Peppermint Lg
149114,149455,2023-06-30,20:57:19,1,8,Hell's Kitchen,40,3.75,Coffee,Barista Espresso,Cappuccino


In [47]:
# sort=False keeps the categories in order of first appearance in the data
# instead of sorting the group labels.
(
    df
    .groupby('product_category', sort=False)['transaction_qty']
    .sum()
)

product_category
Coffee                89250
Tea                   69737
Drinking Chocolate    17457
Bakery                23214
Flavours              10511
Loose Tea              1210
Coffee beans           1828
Packaged Chocolate      487
Branded                 776
Name: transaction_qty, dtype: int64