https://towardsdatascience.com/all-about-pandas-groupby-explained-with-25-examples-494e04a8ef56

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the sample sales data straight from the public GitHub repository.
sales = pd.read_csv(
    "https://raw.githubusercontent.com/SonerYldrm/datasets/main/sales.csv"
)

In [3]:
# Preview the first rows to see which columns are available for grouping.
sales.head()

Unnamed: 0,id,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,1,4187,PG2,498,420.76,569.91,13,58
1,2,4195,PG2,473,545.64,712.41,16,58
2,3,4204,PG2,968,640.42,854.91,22,88
3,4,4219,PG2,241,869.69,1034.55,14,45
4,5,4718,PG2,1401,12.54,26.59,50,285


Example 1: Single aggregation

We can calculate the average stock quantity for each product group as follows:

In [4]:
# Single aggregation: mean stock quantity within each product group.
(
    sales
    .groupby("product_group")["stock_qty"]
    .mean()
)

product_group
PG1      660.666667
PG2     2142.013333
PG3     2460.025641
PG4     1332.532951
PG5    17823.850980
PG6     2963.008230
Name: stock_qty, dtype: float64

Example 2: Multiple aggregations

We can do multiple aggregations in a single operation. Here is how we can calculate the average stock quantity and price for each product group.

In [5]:
# Same aggregation (mean) applied to two columns at once; the double brackets
# select a DataFrame, so the result has one column per selected field.
(
    sales
    .groupby("product_group")[["stock_qty", "price"]]
    .mean()
)

Unnamed: 0_level_0,stock_qty,price
product_group,Unnamed: 1_level_1,Unnamed: 2_level_1
PG1,660.666667,303.612821
PG2,2142.013333,124.980133
PG3,2460.025641,83.571538
PG4,1332.532951,31.033381
PG5,17823.85098,87.09498
PG6,2963.00823,39.300165


Example 3: Multiple aggregations — 2

We can also use the agg function for calculating multiple aggregate values.

In [6]:
# Several aggregation functions on one column: agg() takes a list of names and
# returns one output column per function.
(
    sales
    .groupby("product_group")["stock_qty"]
    .agg(["mean", "max"])
)

Unnamed: 0_level_0,mean,max
product_group,Unnamed: 1_level_1,Unnamed: 2_level_1
PG1,660.666667,1805
PG2,2142.013333,44996
PG3,2460.025641,14955
PG4,1332.532951,10722
PG5,17823.85098,4104542
PG6,2963.00823,52748


Example 4: Named aggregations

In [7]:
# Named aggregation: each keyword is the output column name and its value is a
# (source column, aggregation function) pair.
sales.groupby("product_group").agg(
    avg_stock_qty=("stock_qty", "mean"),
    max_stock_qty=("stock_qty", "max"),
)

Unnamed: 0_level_0,avg_stock_qty,max_stock_qty
product_group,Unnamed: 1_level_1,Unnamed: 2_level_1
PG1,660.666667,1805
PG2,2142.013333,44996
PG3,2460.025641,14955
PG4,1332.532951,10722
PG5,17823.85098,4104542
PG6,2963.00823,52748


Example 5: Multiple aggregations and multiple functions

In [8]:
# Two columns x two functions: the result carries a two-level column index
# (column name on top, aggregation name below).
(
    sales
    .groupby("product_group")[["stock_qty", "price"]]
    .agg(["mean", "max"])
)

Unnamed: 0_level_0,stock_qty,stock_qty,price,price
Unnamed: 0_level_1,mean,max,mean,max
product_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
PG1,660.666667,1805,303.612821,1500.05
PG2,2142.013333,44996,124.980133,1034.55
PG3,2460.025641,14955,83.571538,593.66
PG4,1332.532951,10722,31.033381,949.91
PG5,17823.85098,4104542,87.09498,740.91
PG6,2963.00823,52748,39.300165,712.41


Example 6: Named aggregations using different columns

In [9]:
# Named aggregations can draw on different source columns, giving flat,
# descriptive output column names.
sales.groupby("product_group").agg(
    avg_stock_qty=("stock_qty", "mean"),
    avg_price=("price", "mean"),
)

Unnamed: 0_level_0,avg_stock_qty,avg_price
product_group,Unnamed: 1_level_1,Unnamed: 2_level_1
PG1,660.666667,303.612821
PG2,2142.013333,124.980133
PG3,2460.025641,83.571538
PG4,1332.532951,31.033381
PG5,17823.85098,87.09498
PG6,2963.00823,39.300165


Example 7: as_index parameter

In [10]:
# as_index=False keeps the grouping key as an ordinary column instead of
# moving it into the index of the result.
sales.groupby("product_group", as_index=False).agg(
    avg_stock_qty=("stock_qty", "mean"),
    avg_price=("price", "mean"),
)

Unnamed: 0,product_group,avg_stock_qty,avg_price
0,PG1,660.666667,303.612821
1,PG2,2142.013333,124.980133
2,PG3,2460.025641,83.571538
3,PG4,1332.532951,31.033381
4,PG5,17823.85098,87.09498
5,PG6,2963.00823,39.300165


In [11]:
# Show the first rows again as a reminder of the columns used in the
# multi-key grouping examples that follow.
sales.head()

Unnamed: 0,id,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
0,1,4187,PG2,498,420.76,569.91,13,58
1,2,4195,PG2,473,545.64,712.41,16,58
2,3,4204,PG2,968,640.42,854.91,22,88
3,4,4219,PG2,241,869.69,1034.55,14,45
4,5,4718,PG2,1401,12.54,26.59,50,285


Example 8: Multiple columns for grouping

In [12]:
# Group on two keys by passing a list; only the first rows are displayed.
(
    sales
    .groupby(["product_group", "id"], as_index=False)
    .agg(avg_sales=("last_week_sales", "mean"))
    .head()
)

Unnamed: 0,product_group,id,avg_sales
0,PG1,125,100.0
1,PG1,347,30.0
2,PG1,369,21.0
3,PG1,405,24.0
4,PG1,406,36.0


Example 9: Sorting the output

In [13]:
# Same two-key aggregation, ordered so the largest averages come first.
(
    sales
    .groupby(["product_group", "id"], as_index=False)
    .agg(avg_sales=("last_week_sales", "mean"))
    .sort_values(by="avg_sales", ascending=False)
    .head()
)

Unnamed: 0,product_group,id,avg_sales
755,PG5,992,3222.0
817,PG6,340,2690.0
777,PG6,116,2036.0
855,PG6,414,1883.0
113,PG2,999,1772.0


Example 10: Largest n values

#### largest 2 values

In [14]:
# Two largest last-week sales figures inside each product group; the result
# keeps the original row label as a second index level.
(
    sales
    .groupby("product_group")["last_week_sales"]
    .nlargest(n=2)
)

product_group     
PG1            524     160
               469     106
PG2            998    1772
               997     331
PG3            231     947
               632     344
PG4            211     429
               298     421
PG5            991    3222
               264     638
PG6            339    2690
               115    2036
Name: last_week_sales, dtype: int64

Example 11: Smallest n values

In [15]:
# Two smallest last-week sales figures inside each product group.
(
    sales
    .groupby("product_group")["last_week_sales"]
    .nsmallest(n=2)
)

product_group     
PG1            710    12
               590    13
PG2            21     12
               23     12
PG3            634    17
               312    18
PG4            75     12
               81     12
PG5            130    12
               215    12
PG6            20     12
               52     12
Name: last_week_sales, dtype: int64

Example 12: The nth value

In [16]:
# Order rows by product group and then by monthly sales, both descending, and
# renumber the index so positions within a group follow the sort order.
sales_sorted = sales.sort_values(
    by=["product_group", "last_month_sales"],
    ascending=False,
    ignore_index=True,
)

In [17]:
# nth() is zero-based: position 4 is the fifth row of each group, i.e. the
# fifth-highest last_month_sales given the descending sort of sales_sorted.
sales_sorted.groupby("product_group").nth(n=4)

Unnamed: 0_level_0,id,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
product_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PG1,459,5167,980,63.25,106.31,64,203
PG2,249,7335,4269,3.3,10.44,268,941
PG3,91,3521,2923,46.7,56.99,208,850
PG4,213,7450,8504,2.23,5.69,314,1211
PG5,262,4230,5163,29.04,26.59,547,1902
PG6,339,889,21569,12.98,16.14,808,2990


Example 13: The nth with negative index

In [18]:
# A negative position counts from the end of each group: -2 is the
# second-to-last row.
sales_sorted.groupby("product_group").nth(n=-2)

Unnamed: 0_level_0,id,product_code,stock_qty,cost,price,last_week_sales,last_month_sales
product_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
PG1,723,4150,22,405.1,612.66,14,29
PG2,308,9372,350,1.41,1.42,16,40
PG3,313,9800,1398,64.32,99.66,18,65
PG4,781,2279,525,1.34,1.23,14,18
PG5,773,2987,2266,7.15,28.49,13,20
PG6,357,4982,884,21.12,41.79,12,19


Example 14: Unique values

In [19]:
# Collect the distinct values found in each group.
# The output column name is spelled in plain ASCII ("unique_values"); a dotless
# "ı" in the keyword would create a column that cannot be selected by typing
# the normal spelling.
sales_sorted.groupby("product_group", as_index=False).agg(
    unique_values=("product_group", "unique")
)


Unnamed: 0,product_group,unıque_values
0,PG1,[PG1]
1,PG2,[PG2]
2,PG3,[PG3]
3,PG4,[PG4]
4,PG5,[PG5]
5,PG6,[PG6]


Example 15: Number of unique values


In [20]:
# Count the distinct values found in each group.
# The output column name is spelled in plain ASCII; a dotless "ı" in the
# keyword would create a column that cannot be selected by typing
# "number_of_unique_values".
sales_sorted.groupby("product_group", as_index=False).agg(
    number_of_unique_values=("product_group", "nunique")
)

Unnamed: 0,product_group,number_of_unıque_values
0,PG1,1
1,PG2,1
2,PG3,1
3,PG4,1
4,PG5,1
5,PG6,1


Example 16: Lambda expressions

In [21]:
# A lambda can serve as the aggregation function: here the monthly sales of
# each group are summed and expressed in thousands.
sales.groupby("product_group").agg(
    total_sales_in_thousands=(
        "last_month_sales",
        lambda monthly: monthly.sum() / 1000,
    )
)

Unnamed: 0_level_0,total_sales_in_thousands
product_group,Unnamed: 1_level_1
PG1,4.794
PG2,23.84
PG3,14.929
PG4,56.352
PG5,79.125
PG6,102.96


Example 17: Lambda expressions with apply

In [23]:
# Per group: mean difference between last week's sales and a quarter of last
# month's sales (an approximate weekly figure).
# Only the two columns the lambda needs are selected, so apply() never
# operates on the grouping column itself; recent pandas versions deprecate
# that behaviour and the warning would otherwise be hidden by the blanket
# warnings filter at the top of the notebook.
(
    sales
    .groupby("product_group")[["last_week_sales", "last_month_sales"]]
    .apply(
        lambda group: (
            group["last_week_sales"] - group["last_month_sales"] / 4
        ).mean()
    )
)

product_group
PG1    4.705128
PG2    2.306667
PG3    3.557692
PG4    5.661891
PG5    6.124510
PG6    9.814815
dtype: float64

Example 18: The dropna parameter

In [27]:
# Count missing values per column before looking at how groupby treats them.
(
    sales
    .isnull()
    .sum()
)

id                  0
product_code        0
product_group       0
stock_qty           0
cost                0
price               0
last_week_sales     0
last_month_sales    0
dtype: int64

In [None]:
# Default behaviour (dropna=True): rows whose grouping key is missing are left
# out of the result.
# `sales` has no "store" column, so the example groups by "product_group".
# That column has no missing values (see the isnull() summary above), so every
# row belongs to one of the listed groups.
sales.groupby("product_group")["price"].mean()


In [None]:
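# Output
# (Illustrative only, not produced by this notebook: it appears to come from a
# dataset that has a "store" column, which `sales` does not have.)
# store
# Daisy     69.327426
# Rose      60.513700
# Violet    67.808727
# Name: price, dtype: float64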

In [None]:
# With dropna=False, rows whose grouping key is missing are kept and form
# their own NaN group.
# `sales` has no "store" column, so the example groups by "product_group".
# That column has no missing values (see the isnull() summary above), so here
# the result matches the default dropna=True output.
sales.groupby("product_group", dropna=False)["price"].mean()

# Output
# (Illustrative only, not produced by this notebook: it appears to come from a
# dataset that has a "store" column containing missing values. Note the extra
# NaN row that dropna=False adds.)
# store
# Daisy     69.327426
# Rose      60.513700
# Violet    67.808727
# NaN       96.000000
# Name: price, dtype: float64

Example 19: How many groups

In [28]:
# Number of distinct (product_group, id) combinations in the data.
sales.groupby(by=["product_group", "id"]).ngroups

1000

Example 20: Getting a particular group



In [41]:
# Pull out the rows of a single group; with two grouping keys the group is
# addressed by a (product_group, id) tuple.
pg1 = (
    sales
    .groupby(["product_group", "id"])
    .get_group(("PG1", 723))
)

In [42]:
# Display the rows retrieved for the ("PG1", 723) group.
pg1 

Unnamed: 0,id,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales
722,723,4150,PG1,22,405.1,612.66,14,29


In [53]:
# Rank `id` within each product group: descending order, "dense" method (ties
# share a rank and no ranks are skipped). This adds a "rank" column to `sales`.
sales["rank"] = (
    sales
    .groupby("product_group")["id"]
    .rank(method="dense", ascending=False)
)

In [55]:
# Check the newly added "rank" column on the first rows.
sales.head()

Unnamed: 0,id,product_code,product_group,stock_qty,cost,price,last_week_sales,last_month_sales,rank
0,1,4187,PG2,498,420.76,569.91,13,58,75.0
1,2,4195,PG2,473,545.64,712.41,16,58,74.0
2,3,4204,PG2,968,640.42,854.91,22,88,73.0
3,4,4219,PG2,241,869.69,1034.55,14,45,72.0
4,5,4718,PG2,1401,12.54,26.59,50,285,71.0


Example 22: Cumulative sum

In [56]:
# Small demo frame for the cumulative examples: eight consecutive days, two
# categories of four rows each, and a random integer value in [10, 30).
# The generator is seeded so the values (and every cumulative column derived
# from them) are the same on each Restart & Run All.
df = pd.DataFrame(
    {
        "date": pd.date_range(start="2022-08-01", periods=8, freq="D"),
        "category": list("AAAABBBB"),
        "value": np.random.default_rng(seed=42).integers(10, 30, size=8),
    }
)
df

Unnamed: 0,date,category,value
0,2022-08-01,A,28
1,2022-08-02,A,19
2,2022-08-03,A,21
3,2022-08-04,A,23
4,2022-08-05,B,15
5,2022-08-06,B,10
6,2022-08-07,B,18
7,2022-08-08,B,11


In [60]:
# Running total of `value` that restarts at the beginning of each category.
df["cum_sum"] = (
    df
    .groupby("category")["value"]
    .cumsum()
)

In [61]:
# Display the frame with the new "cum_sum" column.
df

Unnamed: 0,date,category,value,cum_sum
0,2022-08-01,A,28,28
1,2022-08-02,A,19,47
2,2022-08-03,A,21,68
3,2022-08-04,A,23,91
4,2022-08-05,B,15,15
5,2022-08-06,B,10,25
6,2022-08-07,B,18,43
7,2022-08-08,B,11,54


Example 23: Cumulative sum with expanding

In [62]:
# Same running total via an expanding window. The grouped expanding result is
# indexed by (category, original row label); dropping the category level lets
# pandas align the values on the original index. Assigning `.values` instead
# would be positional and only correct while df is sorted by category.
df["cum_sum_2"] = (
    df.groupby("category")["value"]
    .expanding()
    .sum()
    .droplevel("category")
)

In [63]:
# Display the frame; "cum_sum_2" should match "cum_sum" (as floats).
df

Unnamed: 0,date,category,value,cum_sum,cum_sum_2
0,2022-08-01,A,28,28,28.0
1,2022-08-02,A,19,47,47.0
2,2022-08-03,A,21,68,68.0
3,2022-08-04,A,23,91,91.0
4,2022-08-05,B,15,15,15.0
5,2022-08-06,B,10,25,25.0
6,2022-08-07,B,18,43,43.0
7,2022-08-08,B,11,54,54.0


Example 24: Cumulative mean

In [64]:
# Running mean of `value` within each category, using an expanding window.
# The category level is dropped so the result aligns on df's original index;
# assigning `.values` would be positional and only correct while df is sorted
# by category.
df["cum_mean"] = (
    df.groupby("category")["value"]
    .expanding()
    .mean()
    .droplevel("category")
)

In [65]:
# Display the frame with the new "cum_mean" column.
df

Unnamed: 0,date,category,value,cum_sum,cum_sum_2,cum_mean
0,2022-08-01,A,28,28,28.0,28.0
1,2022-08-02,A,19,47,47.0,23.5
2,2022-08-03,A,21,68,68.0,22.666667
3,2022-08-04,A,23,91,91.0,22.75
4,2022-08-05,B,15,15,15.0,15.0
5,2022-08-06,B,10,25,25.0,12.5
6,2022-08-07,B,18,43,43.0,14.333333
7,2022-08-08,B,11,54,54.0,13.5


Example 25: Current highest with expanding

In [66]:
# Highest `value` seen so far within each category (expanding maximum).
# The category level is dropped so the result aligns on df's original index;
# assigning `.values` would be positional and only correct while df is sorted
# by category.
df["current_highest"] = (
    df.groupby("category")["value"]
    .expanding()
    .max()
    .droplevel("category")
)
df

Unnamed: 0,date,category,value,cum_sum,cum_sum_2,cum_mean,current_highest
0,2022-08-01,A,28,28,28.0,28.0,28.0
1,2022-08-02,A,19,47,47.0,23.5,28.0
2,2022-08-03,A,21,68,68.0,22.666667,28.0
3,2022-08-04,A,23,91,91.0,22.75,28.0
4,2022-08-05,B,15,15,15.0,15.0,15.0
5,2022-08-06,B,10,25,25.0,12.5,15.0
6,2022-08-07,B,18,43,43.0,14.333333,18.0
7,2022-08-08,B,11,54,54.0,13.5,18.0
