# 4.8 Instacart grouping data & aggregating variables

## This script contains the following points:
### 1. Import and verify 'orders_products_merged.pkl' dataframe
### 2. Limit dataframe to first 1000000 rows
### 3. Creating a pandas object by using groupby
### 4. Grouping average number of orders per department ID using the aggregate function
### 5. Grouping average number of orders per department ID using the mean function
### 6. Performing multiple aggregations including mean, min, max number of orders per department id
### 7. Using transform() and .loc[] to identify loyal customers on the full dataframe
### 8. Creating loyalty labels by user id based on their maximum order number ('Loyal customer', 'Regular customer', 'New customer')
### 9. Export as 'orders_products_merged.pkl'

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

### 1. Import and verify 'orders_products_merged.pkl' dataframe

In [2]:
# Project root folder as an absolute, machine-specific path; the import and export cells join sub-folders onto it
path = r'C:\Users\howl6\OneDrive\Certificates\CareerFoundry\Coursework\Data_Immersion\Chapter 4\Instacart Basket Analysis'

In [3]:
# Load the merged orders/products dataframe from the prepared-data folder
merged_pickle_path = os.path.join(path, '02_Data', 'Prepared_Data', 'orders_products_merged.pkl')
ords_prods_merge = pd.read_pickle(merged_pickle_path)

In [4]:
# Preview the first five rows to verify that the import worked

ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_time_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Regularly busy,Most orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest day,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Regularly busy,Slowest days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Slowest days,Most orders


In [5]:
# Check the dataframe dimensions as (number of rows, number of columns)

ords_prods_merge.shape

(32404859, 19)

### 2. Limit dataframe to first 1000000 rows

In [8]:
# Work on a positional slice of the first million rows to keep the grouping examples fast
SUBSET_ROWS = 1_000_000
df = ords_prods_merge.iloc[:SUBSET_ROWS]

In [9]:
# Confirm the subset's dimensions as (number of rows, number of columns)
df.shape

(1000000, 19)

### 3. Creating a pandas object using groupby

In [10]:
# Grouping alone only builds a DataFrameGroupBy object; nothing is aggregated yet
df.groupby(by='product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000027417CE4FA0>

### 4. Grouping average number of orders per department ID using the aggregate function.

In [11]:
# Mean order_number per department via agg(); the list form produces a two-level column header
department_groups = df.groupby('department_id')
department_groups.agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.577493
2,17.320781
3,16.084944
4,17.530458
5,14.763075
6,16.658449
7,17.03159
8,15.076662
9,15.44758
10,18.681852


### 5. Grouping average number of orders per department ID using the mean function

In [12]:
# Same average as above, but selecting the column first and calling mean() returns a plain Series
department_groups = df.groupby('department_id')
order_numbers_by_department = department_groups['order_number']
order_numbers_by_department.mean()

department_id
1     15.577493
2     17.320781
3     16.084944
4     17.530458
5     14.763075
6     16.658449
7     17.031590
8     15.076662
9     15.447580
10    18.681852
11    15.447411
12    14.327957
13    16.548642
14    16.960241
15    16.121948
16    17.803851
17    15.593633
18    19.674252
19    16.899756
20    16.255442
21    25.535479
Name: order_number, dtype: float64

### 6. Performing multiple aggregations including mean, min, max number of orders per department id

# Several statistics of order_number per department in a single agg() call
order_number_stats = ['mean', 'min', 'max']
df.groupby('department_id').agg({'order_number': order_number_stats})

### 7. Using transform() and .loc[] to identify loyal customers on the full dataframe

In [13]:
# Highest order_number per user, broadcast back onto every row belonging to that user.
# The string alias 'max' is used instead of the np.max callable: pandas resolves it to its
# built-in groupby max, whereas passing the NumPy function raises a FutureWarning in pandas >= 2.1.
ords_prods_merge['max_order'] = ords_prods_merge.groupby('user_id')['order_number'].transform('max')

In [14]:
# Display the first 500 rows to spot-check the new max_order column
ords_prods_merge.head(500)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_time_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Regularly busy,Most orders,32
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Regularly busy,Most orders,32
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest day,Busiest days,Average orders,5
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Regularly busy,Slowest days,Most orders,3
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Slowest days,Most orders,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,Chocolate Sandwich Cookies,61,19,5.8,3015767,54753,3,3,11,2.0,False,3,0,both,Mid-range product,Regularly busy,Slowest days,Most orders,3
496,1,Chocolate Sandwich Cookies,61,19,5.8,2212721,54868,2,4,12,5.0,False,9,0,both,Mid-range product,Least busy,Slowest days,Most orders,4
497,1,Chocolate Sandwich Cookies,61,19,5.8,3164945,55408,3,1,14,30.0,False,1,0,both,Mid-range product,Regularly busy,Busiest days,Most orders,3
498,1,Chocolate Sandwich Cookies,61,19,5.8,1135249,55640,3,1,13,30.0,False,1,0,both,Mid-range product,Regularly busy,Busiest days,Most orders,5


In [15]:
# Remove the display cap so pandas prints every row of a result instead of truncating it
pd.set_option('display.max_rows', None)

### 8. Creating loyalty labels by user id based on their maximum order number ('Loyal customer', 'Regular customer', 'New customer')

In [16]:
# Users whose highest order number exceeds 40 are labelled as loyal
is_loyal = ords_prods_merge['max_order'] > 40
ords_prods_merge.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'

In [17]:
# Users whose highest order number is above 10 and at most 40 are labelled as regular
max_order = ords_prods_merge['max_order']
is_regular = (max_order > 10) & (max_order <= 40)
ords_prods_merge.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [18]:
# Users whose highest order number is 10 or lower are labelled as new
is_new = ords_prods_merge['max_order'] <= 10
ords_prods_merge.loc[is_new, 'loyalty_flag'] = 'New customer'

In [19]:
# Count rows per loyalty label; dropna=False keeps any unlabelled (NaN) rows in the tally
loyalty_labels = ords_prods_merge['loyalty_flag']
loyalty_labels.value_counts(dropna=False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [None]:
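# The largest share of rows (about 49%) belongs to 'Regular customer' users, i.e. users whose highest order number is between 11 and 40. Note that these counts are rows (ordered products), not unique customers.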

In [20]:
# Spot-check the first 60 rows: each row's loyalty_flag shown next to its user_id and order_number
check_columns = ['user_id', 'loyalty_flag', 'order_number']
ords_prods_merge[check_columns].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,138,Regular customer,28
1,138,Regular customer,30
2,709,New customer,2
3,764,New customer,1
4,764,New customer,3
5,777,Regular customer,16
6,825,New customer,3
7,910,Regular customer,12
8,1052,Regular customer,10
9,1052,Regular customer,15


### 9. Export as 'orders_products_merged.pkl'

In [21]:
# Save the dataframe, now including max_order and loyalty_flag, to the prepared-data folder.
# Note: this is the same file that was imported at the start of the script, so it is overwritten.
export_path = os.path.join(path, '02_Data', 'Prepared_Data', 'orders_products_merged.pkl')
ords_prods_merge.to_pickle(export_path)