# Grouping Data & Aggregating Variables

### This script will address the following:
1. Importing libraries
2. Importing data
3. Grouping & aggregating for Task 4.8

### 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

### 2. Importing Data

In [2]:
# Root folder of the project. The original local path is kept as the default, but it can be
# overridden by setting the INSTACART_PROJECT_DIR environment variable, so the notebook can be
# run on another machine without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\keely\Documents\Courses\CareerFoundry\Immersion\Achievement 4 - Python\01-2023 Instacart Basket Analysis',
)

In [3]:
# Load the prepared orders/products dataframe from the project's 'Prepared Data' folder.
# NOTE: unpickling can execute arbitrary code, so only load pickle files produced by this project.
ords_prods_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merged_busy_updates.pkl')
df_ords_prods_merged = pd.read_pickle(ords_prods_pickle)

In [4]:
# Check the dimensions (rows, columns) of the dataframe that was just loaded.
df_ords_prods_merged.shape

(32404859, 18)

In [5]:
# Preview the first rows to confirm the data loaded with the expected columns.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Most orders
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Average orders
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Most orders
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Average orders
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Most orders


### 3. Grouping & Aggregating for Task 4.8

In [6]:
# 2) The Exercise computed the aggregated mean of the “order_number” column grouped by
# “department_id” for a subset of the dataframe only. Here the same aggregation is repeated
# for the entire dataframe.

(
    df_ords_prods_merged
    .groupby('department_id')
    .agg({'order_number': ['mean']})
)

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
2,17.27792
3,17.170395
4,17.811403
5,15.215751
6,16.439806
7,17.225802
8,15.34065
9,15.895474
10,20.197148


##### 3) Run on the entire dataframe, the groupby and aggregate functions above return a mean for every department, whereas the same groupby and aggregation run on only the first 1 million rows included just some of the departments.

In [7]:
# 4) Follow the instructions for creating a loyalty flag for existing customers using the transform and
# loc() functions.
#
# 'max_order' holds, on every row, the highest 'order_number' found for that row's 'user_id'.
# The aggregation is named by string ('max') instead of passing np.max: pandas 2.1+ emits a
# FutureWarning for NumPy callables in transform() and will change how they are applied.

df_ords_prods_merged['max_order'] = df_ords_prods_merged.groupby(['user_id'])['order_number'].transform('max')

In [8]:
# Preview the dataframe to confirm the new 'max_order' column was added.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Most orders,10
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Average orders,10
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Most orders,10
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Average orders,10
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Most orders,10


In [9]:
# Loyalty flag, part 1 of 3 (this cell and the two that follow):
# users whose highest order number is above 40 are labelled 'Loyal customer'.

df_ords_prods_merged.loc[df_ords_prods_merged['max_order'].gt(40), 'loyalty_flag'] = 'Loyal customer'

In [10]:
# Users whose highest order number is above 10 and at most 40 are labelled 'Regular customer'.
df_ords_prods_merged.loc[
    df_ords_prods_merged['max_order'].gt(10) & df_ords_prods_merged['max_order'].le(40),
    'loyalty_flag',
] = 'Regular customer'

In [11]:
# Users whose highest order number is 10 or lower are labelled 'New customer'.
df_ords_prods_merged.loc[df_ords_prods_merged['max_order'].le(10), 'loyalty_flag'] = 'New customer'

In [12]:
# Preview the dataframe to confirm the 'loyalty_flag' column was populated.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Most orders,10,New customer
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Average orders,10,New customer
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy days,Most orders,10,New customer


In [13]:
# 5) The marketing team at Instacart wants to know whether there’s a difference between the
# spending habits of the three types of customers identified above: loyal customer, regular
# customer, and new customer. Summarise 'prices' (min, max, mean, sum) per loyalty flag.
#
# NOTE(review): the displayed max of 99999.0 in every group looks like a placeholder/outlier
# price; if so, it inflates the mean and sum shown here - confirm prices were cleaned upstream.

(
    df_ords_prods_merged
    .groupby('loyalty_flag')
    .agg({'prices': ['min', 'max', 'mean', 'sum']})
)

Unnamed: 0_level_0,prices,prices,prices,prices
Unnamed: 0_level_1,min,max,mean,sum
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Loyal customer,1.0,99999.0,10.386335,106814040.0
New customer,1.0,99999.0,13.29467,83011784.0
Regular customer,1.0,99999.0,12.495717,198391696.0


In [14]:
# 6) Create spending flags for each user based on the average price across all orders. If average price spent is lower
# than 10, label them a 'Low spender.' If equal to or greater than 10, label them a 'High spender.'
#
# 'avg_spent' holds, on every row, the mean of 'prices' over all rows of that row's 'user_id'.
# The aggregation is named by string ('mean') instead of passing np.mean: pandas 2.1+ emits a
# FutureWarning for NumPy callables in transform() and will change how they are applied.

df_ords_prods_merged['avg_spent'] = df_ords_prods_merged.groupby(['user_id'])['prices'].transform('mean')

In [15]:
# Preview the dataframe to confirm the new 'avg_spent' column was added.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_spent
0,2539329,1,1,2,8,,True,196,1,0,...,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Most orders,10,New customer,6.367797
1,2398795,1,2,3,7,15.0,False,196,1,1,...,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797
2,473747,1,3,3,12,21.0,False,196,1,1,...,77,7,9.0,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797
3,2254736,1,4,4,7,29.0,False,196,1,1,...,77,7,9.0,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797
4,431534,1,5,4,15,28.0,False,196,1,1,...,77,7,9.0,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797


In [16]:
# Spending flag, part 1 of 2: users whose average price is below 10 are labelled 'Low spender'.

df_ords_prods_merged.loc[df_ords_prods_merged['avg_spent'].lt(10), 'spending_flag'] = 'Low spender'

In [17]:
# Users whose average price is 10 or higher are labelled 'High spender'.
df_ords_prods_merged.loc[df_ords_prods_merged['avg_spent'].ge(10), 'spending_flag'] = 'High spender'

In [18]:
# Preview the dataframe to confirm the 'spending_flag' column was populated.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_spent,spending_flag
0,2539329,1,1,2,8,,True,196,1,0,...,7,9.0,Mid-range product,Regularly busy,Regularly busy,Most orders,10,New customer,6.367797,Low spender
1,2398795,1,2,3,7,15.0,False,196,1,1,...,7,9.0,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender
2,473747,1,3,3,12,21.0,False,196,1,1,...,7,9.0,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender
3,2254736,1,4,4,7,29.0,False,196,1,1,...,7,9.0,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender
4,431534,1,5,4,15,28.0,False,196,1,1,...,7,9.0,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender


In [19]:
# Re-check the dimensions after adding the loyalty and spending columns.
df_ords_prods_merged.shape

(32404859, 22)

In [20]:
# 7) Create an order frequency flag that marks the regularity of a user’s ordering behavior according to the median in the
# “days_since_prior_order” column. If the median of days_since_prior_order is greater than 20: Non-frequent customer.
# Higher than 10 and lower than or equal to 20: Regular customer.
# Lower than or equal to 10: Frequent customer.
#
# The aggregation is named by string ('median') instead of passing np.median. pandas currently
# maps np.median to its own NaN-skipping median, but pandas 2.1+ warns that the callable will be
# applied directly in a future version. np.median returns NaN when a group contains a missing
# 'days_since_prior_order' value (the previews show one on a user's first order), so the string
# alias keeps the current NaN-skipping result.

df_ords_prods_merged['med_days_since_prior_order'] = df_ords_prods_merged.groupby(['user_id'])['days_since_prior_order'].transform('median')

In [21]:
# Preview the dataframe to confirm the new 'med_days_since_prior_order' column was added.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_spent,spending_flag,med_days_since_prior_order
0,2539329,1,1,2,8,,True,196,1,0,...,9.0,Mid-range product,Regularly busy,Regularly busy,Most orders,10,New customer,6.367797,Low spender,20.5
1,2398795,1,2,3,7,15.0,False,196,1,1,...,9.0,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5
2,473747,1,3,3,12,21.0,False,196,1,1,...,9.0,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5
3,2254736,1,4,4,7,29.0,False,196,1,1,...,9.0,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5
4,431534,1,5,4,15,28.0,False,196,1,1,...,9.0,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5


In [22]:
# Users whose median days between orders is above 20 are labelled 'Non-frequent customer'.
df_ords_prods_merged.loc[
    df_ords_prods_merged['med_days_since_prior_order'].gt(20),
    'order_frequency_flag',
] = 'Non-frequent customer'

In [23]:
# Users whose median days between orders is above 10 and at most 20 are labelled 'Regular customer'.
df_ords_prods_merged.loc[
    df_ords_prods_merged['med_days_since_prior_order'].le(20)
    & df_ords_prods_merged['med_days_since_prior_order'].gt(10),
    'order_frequency_flag',
] = 'Regular customer'

In [24]:
# Users whose median days between orders is 10 or lower are labelled 'Frequent customer'.
# NOTE(review): rows whose median is NaN match none of the three frequency conditions and are
# left without a flag - confirm whether such users exist and how they should be handled.
df_ords_prods_merged.loc[
    df_ords_prods_merged['med_days_since_prior_order'].le(10),
    'order_frequency_flag',
] = 'Frequent customer'

In [25]:
# Preview the dataframe to confirm the 'order_frequency_flag' column was populated.
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,...,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_spent,spending_flag,med_days_since_prior_order,order_frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Mid-range product,Regularly busy,Regularly busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,False,196,1,1,...,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,False,196,1,1,...,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,False,196,1,1,...,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,False,196,1,1,...,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [26]:
# Export the dataframe, including the new flag columns, to the 'Prepared Data' folder.
flags_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_flags.pkl')
df_ords_prods_merged.to_pickle(flags_pickle)

In [27]:
# Inspect each column's data type to see whether any should be changed before proceeding.
# This is to prevent CPU/RAM issues during later processing.

df_ords_prods_merged.dtypes

order_id                       object
user_id                        object
order_number                    int16
orders_day_of_week               int8
order_hour_of_day                int8
days_since_prior_order        float16
new_customer                     bool
product_id                      int32
add_to_cart_order               int16
reordered                        int8
product_name                   object
aisle_id                        int16
department_id                    int8
prices                        float32
price_label                    object
busiest_day                    object
busiest_days                   object
busiest_period_of_day          object
max_order                       int16
loyalty_flag                   object
avg_spent                     float32
spending_flag                  object
med_days_since_prior_order    float16
order_frequency_flag           object
dtype: object