# 4.8 Exercise - Grouping Data and Aggregating Variables

### Script contents:

#### Importing libraries and data
#### Grouping data with pandas
#### Aggregating data with agg()
#### Aggregating data with transform()
#### Deriving new columns with loc()
#### Export data

## Import libraries

In [49]:
# Import libraries

import pandas as pd
import numpy as np
import os

## Import data


In [50]:
# Creating a string of the path to main project folder

path = '/Users/jarrettpugh/Library/CloudStorage/OneDrive-Personal/Data Analytics/Career Foundry - DA Bootcamp/A4 Python Fundamentals for Data Analysts/Instacart Basket Analysis'

In [51]:
# Import orders_products_merged.pkl as dataframe ords_prods_merge

ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

## 4.8 Exercise

In [52]:
df = ords_prods_merge[:1000000]

In [53]:
# Check shape

df.shape

(1000000, 19)

In [54]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average Orders,10,New customer
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer


### Grouping Data with Pandas

In [55]:
# Groupby function will group a given dataframe by a given column

df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12f901610>

#### Groupby() function series of steps:

1. Split the data into groups based on some criteria.
2. Apply a function to each group separately.
3. Combine the results into a dataframe or alternative data structure or create a new column in the current dataframe.

### Aggregating Data with agg()

In [56]:
# Split the data into groups based on “department_id.”
# Apply the agg() function to each group to obtain the mean values for the “order_number” column.

df.groupby('department_id').agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [57]:
# the command above could be replaced with a command that uses the mean() function to achieve the same results

df.groupby('department_id')['order_number'].mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

In [58]:
# or even like this

df.groupby('department_id').order_number.mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

In [59]:
# Producing mean, min, and max of order number, grouped by department id

df.groupby('department_id').agg({'order_number': ['mean', 'min', 'max']})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


## Aggregating Data with transform()

#### Create loyalty flag using transform and loc

In [60]:
# Split the data into groups based on the “user_id” column.
# Apply the transform() function on the “order_number” column to generate the maximum orders for each user.
# Create a new column, “max_order,” into which you’ll place the results of your aggregation.

ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)

In [61]:
ords_prods_merge.head(15)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average Orders,10,New customer
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer
5,3367565,1,6,2,7,19.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Average Orders,10,New customer
6,550135,1,7,1,9,20.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Very busy,Average Orders,10,New customer
7,3108588,1,8,1,14,14.0,False,196,2,1,Soda,77,7,9.0,Mid-range product,Very busy,Most Orders,10,New customer
8,2295261,1,9,1,16,0.0,False,196,4,1,Soda,77,7,9.0,Mid-range product,Very busy,Average Orders,10,New customer
9,2550362,1,10,4,8,30.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer


In [None]:
ords_prods_merge.head(100)

In [63]:
# This command tells pandas not to assign any options regarding the maximum number of rows to display

pd.options.display.max_rows = None

In [72]:
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average Orders,10,New customer
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer


## Deriving Columns with loc()

Flag criteria:
1. If the maximum orders the user has made is over 40, then the customer will be labeled a “Loyal customer.”
2. If the maximum orders the user has made is over 10 but less than or equal to 40, then the customer will be labeled a “Regular customer.”
3. If the maximum orders the user has made is less than or equal to 10, then the customer will be labeled a “New customer.”

In [65]:
ords_prods_merge.loc[ords_prods_merge['max_order'] > 40,
                     'loyalty_flag'] = 'Loyal customer'

In [66]:
ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10),
                     'loyalty_flag'] = 'Regular customer'

In [67]:
ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10,
                     'loyalty_flag'] = 'New customer'

In [68]:
ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

loyalty_flag
Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: count, dtype: int64

In [69]:
# Check flags

ords_prods_merge[['user_id', 'loyalty_flag', 'order_number', 'max_order']].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number,max_order
0,1,New customer,1,10
1,1,New customer,2,10
2,1,New customer,3,10
3,1,New customer,4,10
4,1,New customer,5,10
5,1,New customer,6,10
6,1,New customer,7,10
7,1,New customer,8,10
8,1,New customer,9,10
9,1,New customer,10,10


In [70]:
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average Orders,10,New customer
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Average Orders,10,New customer
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,Mid-range product,Not busy,Most Orders,10,New customer


In [71]:
ords_prods_merge = ords_prods_merge.drop('_merge', axis=1)

KeyError: "['_merge'] not found in axis"

In [None]:
ords_prods_merge.head()

## Export data

In [None]:
# Export ords_prods_merge as pickle file 'orders_products_merged'

ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged.pkl'))