## 4.8 Grouping Data & Aggregating Variables

### This notebook contains the following sections:

- 1.0 Preparatory Steps
- 2.0 Working along the Course Material

## 1.0 Preparatory Steps

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder. The original location stays the default, but it can be
# overridden through the INSTACART_PROJECT_DIR environment variable so the
# notebook also runs on machines where this user-specific path does not exist.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\chris\OneDrive\Desktop\Data Analytics CF\202203_Instacart Basket Analysis',
)

In [3]:
# Read the prepared orders/products data set (pickle) from the project's
# '02 Data/Prepared Data' folder.
merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merged_new_variables.pkl')
ords_prods_merged = pd.read_pickle(merged_pickle_file)

In [4]:
# Work on the first million rows only, so the grouping examples below run fast.
subset_rows = 1000000
df = ords_prods_merged.iloc[:subset_rows]

In [5]:
# Check the subset's dimensions as (rows, columns).
df.shape

(1000000, 19)

In [6]:
# Preview the first 10 rows of the subset.
df.head(10)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,regular days,Average orders
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both,Mid-range product,Regularly busy,regular days,Average orders
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,regular days,Average orders
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,regular days,Average orders
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both,Low-range product,Regularly busy,regular days,Average orders
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,slowest days,Average orders
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117.0,19.0,3.0,both,Low-range product,Regularly busy,slowest days,Average orders
7,2398795,1,2,3,7,15.0,False,12427,3,1,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,slowest days,Average orders
8,2398795,1,2,3,7,15.0,False,13176,4,0,Bag of Organic Bananas,24.0,4.0,10.3,both,Mid-range product,Regularly busy,slowest days,Average orders
9,2398795,1,2,3,7,15.0,False,26088,5,1,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,slowest days,Average orders


## 2.0 Working along the Course Material

### 2.1 Grouping Data with pandas

In [7]:
# Group df by product_name. Without an aggregation the cell only displays a
# DataFrameGroupBy object rather than a table; an aggregation has to be
# applied to the grouped object to get values.
df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D2928C6EB0>

### 2.2 Performing a Single Aggregation

In [8]:
# Mean order_number per department. Passing the statistic as a list keeps the
# two-level (column, statistic) header on the result.
mean_spec = {'order_number': ['mean']}
df.groupby('department_id').agg(mean_spec)

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1.0,14.800024
2.0,17.091743
3.0,17.913544
4.0,17.893092
5.0,15.21427
6.0,15.382135
7.0,17.694027
8.0,16.458105
9.0,15.957363
10.0,20.091818


In [9]:
# Same per-department mean, selecting the column with bracket notation;
# the result is a Series indexed by department_id.
(
    df
    .groupby('department_id')['order_number']
    .mean()
)

department_id
1.0     14.800024
2.0     17.091743
3.0     17.913544
4.0     17.893092
5.0     15.214270
6.0     15.382135
7.0     17.694027
8.0     16.458105
9.0     15.957363
10.0    20.091818
11.0    16.482026
12.0    15.615061
13.0    16.484023
14.0    17.524632
15.0    15.691875
16.0    18.014071
17.0    16.150593
18.0    19.602850
19.0    17.631340
20.0    17.138607
21.0    21.956893
Name: order_number, dtype: float64

In [11]:
# Same per-department mean once more, this time selecting the column with
# attribute (dot) notation; the output matches the bracket-notation cell.
df.groupby('department_id').order_number.mean()

department_id
1.0     14.800024
2.0     17.091743
3.0     17.913544
4.0     17.893092
5.0     15.214270
6.0     15.382135
7.0     17.694027
8.0     16.458105
9.0     15.957363
10.0    20.091818
11.0    16.482026
12.0    15.615061
13.0    16.484023
14.0    17.524632
15.0    15.691875
16.0    18.014071
17.0    16.150593
18.0    19.602850
19.0    17.631340
20.0    17.138607
21.0    21.956893
Name: order_number, dtype: float64

### 2.3 Performing Multiple Aggregations 

In [13]:
# Several statistics of order_number per department in one pass:
# mean, minimum and maximum, each returned as its own column.
order_stats_spec = {'order_number': ['mean', 'min', 'max']}
df.groupby('department_id').agg(order_stats_spec)

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1.0,14.800024,1,99
2.0,17.091743,1,98
3.0,17.913544,1,99
4.0,17.893092,1,99
5.0,15.21427,1,99
6.0,15.382135,1,99
7.0,17.694027,1,99
8.0,16.458105,1,91
9.0,15.957363,1,99
10.0,20.091818,1,99


### 2.4 Aggregating Data with transform()

In [16]:
# Highest order_number per user, written back onto every row of that user
# (transform keeps the frame's original length, unlike agg).
# The string alias 'max' replaces np.max: pandas 2.1+ raises a FutureWarning
# when a NumPy callable is passed here and will stop mapping it to the
# built-in group-wise max in a future version.
ords_prods_merged['max_order'] = ords_prods_merged.groupby(['user_id'])['order_number'].transform('max')

In [17]:
# Preview the first 15 rows to check the new max_order column.
ords_prods_merged.head(15)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,regular days,Average orders,10
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both,Mid-range product,Regularly busy,regular days,Average orders,10
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,regular days,Average orders,10
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,regular days,Average orders,10
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both,Low-range product,Regularly busy,regular days,Average orders,10
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,slowest days,Average orders,10
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117.0,19.0,3.0,both,Low-range product,Regularly busy,slowest days,Average orders,10
7,2398795,1,2,3,7,15.0,False,12427,3,1,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,slowest days,Average orders,10
8,2398795,1,2,3,7,15.0,False,13176,4,0,Bag of Organic Bananas,24.0,4.0,10.3,both,Mid-range product,Regularly busy,slowest days,Average orders,10
9,2398795,1,2,3,7,15.0,False,26088,5,1,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,slowest days,Average orders,10


In [18]:
# Lift pandas' row-display limit so longer previews are shown in full.
# This is a session-wide setting and affects every later cell.
pd.set_option('display.max_rows', None)

In [21]:
# Preview the first 100 rows (shown untruncated because display.max_rows
# was set to None in the previous cell).
ords_prods_merged.head(100)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,regular days,Average orders,10
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both,Mid-range product,Regularly busy,regular days,Average orders,10
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,regular days,Average orders,10
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,regular days,Average orders,10
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both,Low-range product,Regularly busy,regular days,Average orders,10
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,slowest days,Average orders,10
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117.0,19.0,3.0,both,Low-range product,Regularly busy,slowest days,Average orders,10
7,2398795,1,2,3,7,15.0,False,12427,3,1,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,slowest days,Average orders,10
8,2398795,1,2,3,7,15.0,False,13176,4,0,Bag of Organic Bananas,24.0,4.0,10.3,both,Mid-range product,Regularly busy,slowest days,Average orders,10
9,2398795,1,2,3,7,15.0,False,26088,5,1,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,slowest days,Average orders,10


### 2.5 Deriving New Columns with loc

In [26]:
# Users whose highest order number exceeds 40 are flagged as loyal.
is_loyal = ords_prods_merged['max_order'] > 40
ords_prods_merged.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'

In [27]:
# Users with more than 10 and at most 40 orders are flagged as regular.
max_order_col = ords_prods_merged['max_order']
is_regular = max_order_col.le(40) & max_order_col.gt(10)
ords_prods_merged.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [28]:
# Users with 10 orders or fewer are flagged as new.
is_new = ords_prods_merged['max_order'].le(10)
ords_prods_merged.loc[is_new, 'loyalty_flag'] = 'New customer'

In [29]:
# Preview the first 20 rows to check the new loyalty_flag column.
ords_prods_merged.head(20)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,True,196,1,0,...,77.0,7.0,9.0,both,Mid-range product,Regularly busy,regular days,Average orders,10,New customer
1,2539329,1,1,2,8,,True,14084,2,0,...,91.0,16.0,12.5,both,Mid-range product,Regularly busy,regular days,Average orders,10,New customer
2,2539329,1,1,2,8,,True,12427,3,0,...,23.0,19.0,4.4,both,Low-range product,Regularly busy,regular days,Average orders,10,New customer
3,2539329,1,1,2,8,,True,26088,4,0,...,23.0,19.0,4.7,both,Low-range product,Regularly busy,regular days,Average orders,10,New customer
4,2539329,1,1,2,8,,True,26405,5,0,...,54.0,17.0,1.0,both,Low-range product,Regularly busy,regular days,Average orders,10,New customer
5,2398795,1,2,3,7,15.0,False,196,1,1,...,77.0,7.0,9.0,both,Mid-range product,Regularly busy,slowest days,Average orders,10,New customer
6,2398795,1,2,3,7,15.0,False,10258,2,0,...,117.0,19.0,3.0,both,Low-range product,Regularly busy,slowest days,Average orders,10,New customer
7,2398795,1,2,3,7,15.0,False,12427,3,1,...,23.0,19.0,4.4,both,Low-range product,Regularly busy,slowest days,Average orders,10,New customer
8,2398795,1,2,3,7,15.0,False,13176,4,0,...,24.0,4.0,10.3,both,Mid-range product,Regularly busy,slowest days,Average orders,10,New customer
9,2398795,1,2,3,7,15.0,False,26088,5,1,...,23.0,19.0,4.7,both,Low-range product,Regularly busy,slowest days,Average orders,10,New customer


In [30]:
# Count rows per loyalty flag. dropna=False also lists rows that received no
# flag at all (NaN), which the default call would hide; when every row is
# flagged the result is the same as before.
ords_prods_merged['loyalty_flag'].value_counts(dropna=False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [31]:
# Show only the columns needed to check the flag against each user's orders.
preview_columns = ['user_id', 'loyalty_flag', 'order_number']
ords_prods_merged[preview_columns].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,1
2,1,New customer,1
3,1,New customer,1
4,1,New customer,1
5,1,New customer,2
6,1,New customer,2
7,1,New customer,2
8,1,New customer,2
9,1,New customer,2
