# 01. Preparing Notebook
# 02. Grouping Data with pandas
# 03. Aggregating Data with agg()
# 04. Aggregating Data with transform()
# 05. Deriving Columns with loc()

## 01. Preparing Notebook

In [3]:
# importing libraries
import pandas as pd
import numpy as np
import os

In [4]:
# project root folder that contains the '02 Data' directory.
# The INSTACART_PATH environment variable overrides the location, so the
# notebook can run on another machine without editing this cell; when the
# variable is not set, the original local folder is used unchanged.
path = os.environ.get(
    'INSTACART_PATH',
    r'C:\Users\lifti\OneDrive\CareerFoundry\Data Immersion\Achievement4\Instacart Basket Analysis',
)

In [5]:
# load the prepared orders/products dataframe from the project's data folder
# NOTE(review): unpickling can execute arbitrary code - only load files you trust
pickle_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_new.pkl')
ords_prods_merge = pd.read_pickle(pickle_file)

In [6]:
# preview the imported dataframe (the notebook renders a truncated view of the rows)
ords_prods_merge

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range products,Regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range products,Regularly busy,Least busy,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range products,Regularly busy,Least busy,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range products,Least busy,Least busy,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range products,Least busy,Least busy,Most orders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range products,Regularly busy,Regularly busy,Most orders
32404855,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range products,Regularly busy,Regularly busy,Most orders
32404856,758936,203436,1,2,7,,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,Mid-range products,Regularly busy,Regularly busy,Average orders
32404857,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,Mid-range products,Regularly busy,Least busy,Least orders


In [5]:
# take the first 1,000,000 rows of ords_prods_merge as a smaller working subset
df = ords_prods_merge.head(1000000)

In [6]:
# check the dimensions of the subset as (rows, columns)
df.shape

(1000000, 17)

## 02. Grouping Data with pandas

In [7]:
# grouping on its own only returns a DataFrameGroupBy object;
# a function still has to be applied to the groups to get a result
df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002930001B400>

#### Process for grouping data
#### 1. Split the data into groups based on some criteria (df.groupby('...'))
#### 2. Apply a function to each group separately
#### 3. Combine the results into a df or alternative data structure or create a new column in the current df.

## 03. Aggregating Data with agg()

### Performing a Single Aggregation

In [10]:
# group by 'department_id', aggregate 'order_number_history' and calculate the mean
# = the mean 'order_number_history' value of the rows in each department_id
# (passing the function inside a list, ['mean'], produces two-level column headers)
df.groupby('department_id').agg({'order_number_history': ['mean']})

Unnamed: 0_level_0,order_number_history
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [13]:
# the same mean of 'order_number_history' per department_id, selecting the column
# with square brackets; the result is a Series instead of a dataframe
df.groupby('department_id')['order_number_history'].mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number_history, dtype: float64

In [14]:
# the same mean of 'order_number_history' per department_id, selecting the column
# with attribute (dot) access; the result is again a Series
df.groupby('department_id').order_number_history.mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number_history, dtype: float64

### Performing Multiple Aggregations

In [15]:
# the mean, min and max of 'order_number_history' per department_id,
# computed in a single agg() call by listing several functions for the column
df.groupby('department_id').agg({'order_number_history': ['mean','min','max']})

Unnamed: 0_level_0,order_number_history,order_number_history,order_number_history
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


## 04. Aggregating Data with transform()

In [7]:
# create a new column 'max_order': group by 'user_id' and use transform() to
# write each user's highest 'order_number_history' onto every row of that user.
# The aggregation is given as the string 'max' instead of np.max: pandas maps
# both to its own groupby max, but passing the NumPy callable is deprecated
# and raises a FutureWarning in recent pandas versions.
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number_history'].transform('max')

In [8]:
# preview the dataframe to check the new 'max_order' column
ords_prods_merge

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range products,Regularly busy,Regularly busy,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range products,Regularly busy,Least busy,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range products,Regularly busy,Least busy,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range products,Least busy,Least busy,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range products,Least busy,Least busy,Most orders,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range products,Regularly busy,Regularly busy,Most orders,31
32404855,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range products,Regularly busy,Regularly busy,Most orders,31
32404856,758936,203436,1,2,7,,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,Mid-range products,Regularly busy,Regularly busy,Average orders,3
32404857,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,Mid-range products,Regularly busy,Least busy,Least orders,3


## 05. Deriving Columns with loc()

In [9]:
# using .loc to set a flag based on max_order:
# max_order > 40         -> 'Loyal customer'
# 10 < max_order <= 40   -> 'Regular customer'
# max_order <= 10        -> 'New customer'
# this assignment only fills 'loyalty_flag' for the rows where max_order > 40
ords_prods_merge.loc[ords_prods_merge['max_order']>40, 'loyalty_flag'] = 'Loyal customer'

In [10]:
# 10 < max_order <= 40 -> 'Regular customer' (both conditions must hold, hence the &)
ords_prods_merge.loc[(ords_prods_merge['max_order']>10) & (ords_prods_merge['max_order']<=40), 'loyalty_flag'] = 'Regular customer'

In [11]:
# max_order <= 10 -> 'New customer'
ords_prods_merge.loc[ords_prods_merge['max_order']<=10, 'loyalty_flag'] = 'New customer'

In [12]:
# count the number of rows per 'loyalty_flag' value
ords_prods_merge['loyalty_flag'].value_counts()

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [13]:
# preview the dataframe to check the new 'loyalty_flag' column
ords_prods_merge

Unnamed: 0,order_id,user_id,order_number_history,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range products,Regularly busy,Regularly busy,Average orders,10,New customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range products,Regularly busy,Least busy,Average orders,10,New customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range products,Regularly busy,Least busy,Most orders,10,New customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range products,Least busy,Least busy,Average orders,10,New customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range products,Least busy,Least busy,Most orders,10,New customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range products,Regularly busy,Regularly busy,Most orders,31,Regular customer
32404855,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range products,Regularly busy,Regularly busy,Most orders,31,Regular customer
32404856,758936,203436,1,2,7,,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,Mid-range products,Regularly busy,Regularly busy,Average orders,3,New customer
32404857,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,Mid-range products,Regularly busy,Least busy,Least orders,3,New customer


In [16]:
# save ords_prods_merge as a pickle file in the project's prepared-data folder
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products.pkl')
ords_prods_merge.to_pickle(export_file)