# Contents
### Aggregating Data with agg()
### Aggregating Data with transform()
### Deriving Columns with loc()


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [7]:
# Project root folder that contains the '02 Data' directory.
# The location can be overridden with the INSTACART_PROJECT_PATH environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_PATH', r'/Users/maitran/Documents/Instacart Basket Analysis')

In [9]:
# Load the merged orders/products dataframe from the prepared-data folder.
# NOTE: read_pickle should only be used on trusted files such as this project export.
merged_pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_2.pkl')
ords_prods_merge = pd.read_pickle(merged_pickle_file)

In [10]:
# Work on the first million rows only, to keep the aggregation examples fast
subset_rows = 1000000
df = ords_prods_merge.iloc[:subset_rows]

In [11]:
# Confirm the size of the subset as (rows, columns)
df.shape

(1000000, 20)

In [12]:
# Preview the first five rows of the subset
df.head()

Unnamed: 0.1,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,Unnamed: 0,product_name,aisle_id,department_id,prices,_merge,busiest_day,busiest_days,busiest_period_of_day,price_range_loc
0,2539329,1,prior,1,2,8,,196,1,0,195,Soda,77,7,9.0,both,Regularly busy,Regularly days,Average orders,Mid-range product
1,2398795,1,prior,2,3,7,15.0,196,1,1,195,Soda,77,7,9.0,both,Regularly busy,Slowest days,Average orders,Mid-range product
2,473747,1,prior,3,3,12,21.0,196,1,1,195,Soda,77,7,9.0,both,Regularly busy,Slowest days,Most orders,Mid-range product
3,2254736,1,prior,4,4,7,29.0,196,1,1,195,Soda,77,7,9.0,both,Least busy,Slowest days,Average orders,Mid-range product
4,431534,1,prior,5,4,15,28.0,196,1,1,195,Soda,77,7,9.0,both,Least busy,Slowest days,Most orders,Mid-range product


## Aggregating Data with agg() 

### Performing a Single Aggregation

In [14]:
# Group the subset's rows by "department_id" ...
rows_by_department = df.groupby('department_id')
# ... then use agg() to compute the mean of "order_number" within each group.
rows_by_department.agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


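#### Observation: We can easily see, for instance, that the department with an ID of 4 (produce) has a mean "order_number" of around 19, whereas the department with an ID of 17 (household) only has a mean of around 11. Because "order_number" is the position of an order in a customer's purchase history rather than a sales count, the insight you can glean from this is that produce items tend to be bought in later orders (i.e. by customers with longer order histories) than household goods; it does not by itself show that produce sells more. Note that these figures come from the first 1,000,000 rows only, so they cover just a subset of departments and customers.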


In [16]:
# Same per-department average as above, this time calling mean() directly
# on the "order_number" values grouped by "department_id".
df['order_number'].groupby(df['department_id']).mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

### Performing Multiple Aggregations

In [19]:
# Summarise "order_number" per department with several statistics at once
order_number_stats = ['mean', 'min', 'max']
df.groupby('department_id').agg({'order_number': order_number_stats})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


## Aggregating Data with transform()

In [20]:
# Add a "max_order" column holding each customer's highest order number.
# transform() returns one value per original row, so every row of a given
# user_id receives that user's maximum "order_number".
# The string alias 'max' is used instead of np.max: it selects pandas' built-in
# groupby max, and newer pandas versions recommend the string form over passing
# the NumPy callable.
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform('max')

In [22]:
# Inspect the first rows to compare "order_number" with the new "max_order" column
ords_prods_merge.head(15)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,product_name,aisle_id,department_id,prices,_merge,busiest_day,busiest_days,busiest_period_of_day,price_range_loc,max_order
0,2539329,1,prior,1,2,8,,196,1,0,...,Soda,77,7,9.0,both,Regularly busy,Regularly days,Average orders,Mid-range product,10
1,2398795,1,prior,2,3,7,15.0,196,1,1,...,Soda,77,7,9.0,both,Regularly busy,Slowest days,Average orders,Mid-range product,10
2,473747,1,prior,3,3,12,21.0,196,1,1,...,Soda,77,7,9.0,both,Regularly busy,Slowest days,Most orders,Mid-range product,10
3,2254736,1,prior,4,4,7,29.0,196,1,1,...,Soda,77,7,9.0,both,Least busy,Slowest days,Average orders,Mid-range product,10
4,431534,1,prior,5,4,15,28.0,196,1,1,...,Soda,77,7,9.0,both,Least busy,Slowest days,Most orders,Mid-range product,10
5,3367565,1,prior,6,2,7,19.0,196,1,1,...,Soda,77,7,9.0,both,Regularly busy,Regularly days,Average orders,Mid-range product,10
6,550135,1,prior,7,1,9,20.0,196,1,1,...,Soda,77,7,9.0,both,Regularly busy,Busiest days,Most orders,Mid-range product,10
7,3108588,1,prior,8,1,14,14.0,196,2,1,...,Soda,77,7,9.0,both,Regularly busy,Busiest days,Most orders,Mid-range product,10
8,2295261,1,prior,9,1,16,0.0,196,4,1,...,Soda,77,7,9.0,both,Regularly busy,Busiest days,Most orders,Mid-range product,10
9,2550362,1,prior,10,4,8,30.0,196,1,1,...,Soda,77,7,9.0,both,Least busy,Slowest days,Average orders,Mid-range product,10


### Observation: For the customer shown (user_id 1), the values in the "order_number" column increase up to 10. This matches the value in the "max_order" column on every one of that customer's rows, so we know transform() assigned the per-customer maximum correctly.

## Deriving Columns with loc()

In [23]:
# Customers whose highest order number is above 40 are flagged as loyal
is_loyal = ords_prods_merge['max_order'] > 40
ords_prods_merge.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'

In [24]:
# Customers whose highest order number is above 10 and at most 40 are regulars
max_order_values = ords_prods_merge['max_order']
is_regular = (max_order_values <= 40) & (max_order_values > 10)
ords_prods_merge.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [25]:
# Customers whose highest order number is 10 or less are flagged as new
is_new = ords_prods_merge['max_order'] <= 10
ords_prods_merge.loc[is_new, 'loyalty_flag'] = 'New customer'

In [27]:
# Count rows per loyalty flag; dropna = False also lists rows that received no flag
ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [29]:
# Spot-check the flag against the order numbers for the first 60 rows
columns_to_check = ['user_id', 'loyalty_flag', 'order_number']
ords_prods_merge[columns_to_check].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


In [1]:
# Price statistics (mean, min, max, sum) per loyalty flag.
# NOTE(review): this cell needs ords_prods_merge and its "loyalty_flag" column
# from the cells above; the recorded NameError and the In [1] execution count
# indicate it was run on a fresh kernel. Re-run the notebook top to bottom
# (Restart & Run All) before executing this cell.
ords_prods_merge.groupby('loyalty_flag').agg({'prices': ['mean', 'min', 'max', 'sum']})

NameError: name 'ords_prods_merge' is not defined