## This script contains:

##### Import the pandas, NumPy, and os libraries
##### Import 'df_merged' as pickle
##### Aggregated mean of the 'order_number' variable grouped by 'department_id' (subset)
##### Aggregated mean of the 'order_number' variable grouped by 'department_id' (entire df)
##### Create a loyalty flag for existing customers using the transform() and loc() functions
##### Determine whether the prices of products purchased by loyal customers differ from those purchased by regular or new customers
##### Create a spending flag for each user based on the average price across all their orders
##### Create an order frequency flag that marks the regularity of a user's ordering behavior according to the median in the 'days_since_prior_order' column
##### Export your dataframe as a pickle file and store it correctly in your 'Prepared Data' folder

### Import the pandas, NumPy, and os libraries

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os

### Import 'df_merged' as pickle

In [3]:
# Project root folder, stored as a raw string so the backslashes stay literal
# NOTE(review): this is an absolute, machine-specific path - it must be edited
# before the notebook can run on another computer

path = r'C:\Users\loren\Desktop\Career Foundry\2. Data Immersion Course\A4 Python Fundamentals for Data Analysts\07-2023 Instacart Basket Analysis'

# Read the merged orders/products dataframe from the 'Prepared Data' folder
merged_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_new_variables.pkl')
df = pd.read_pickle(merged_pickle)

### Aggregated mean of the 'order_number' variable grouped by 'department_id' (subset)

In [4]:
# Subset: the first one million rows of the dataframe (by position)

df_1_mi = df.iloc[:1_000_000]

In [5]:
# Mean 'order_number' per 'department_id', computed on the subset only

subset_by_department = df_1_mi.groupby('department_id')
subset_by_department['order_number'].mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

### Aggregated mean of the 'order_number' variable grouped by 'department_id' (entire df)

In [6]:
# Mean 'order_number' per 'department_id', computed on the entire dataframe

full_by_department = df.groupby('department_id')
full_by_department['order_number'].mean()

department_id
1     15.457838
2     17.277920
3     17.170395
4     17.811403
5     15.215751
6     16.439806
7     17.225802
8     15.340650
9     15.895474
10    20.197148
11    16.170638
12    15.887671
13    16.583536
14    16.773669
15    16.165037
16    17.665606
17    15.694469
18    19.310397
19    17.177343
20    16.473447
21    22.902379
Name: order_number, dtype: float64

The subset contains only 8 of the 21 departments. Except for department ID 17, every department present in the subset has a higher average order number in the subset than in the entire dataframe.

### Create a loyalty flag for existing customers using the transform() and loc() functions

In [7]:
# Creating a flag using transform(): each user's highest 'order_number',
# repeated on every row that belongs to that user.
# The string alias 'max' is used instead of np.max: it gives the same result
# and avoids the warning newer pandas versions raise for NumPy callables.

df['max_order'] = df.groupby(['user_id'])['order_number'].transform('max')

In [8]:
# Preview the first five rows to confirm the 'max_order' column was added

df.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Most orders,10


In [9]:
# Creating a flag using loc: users with more than 40 orders are loyal

is_loyal = df['max_order'].gt(40)
df.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'

In [10]:
# Users with more than 10 and at most 40 orders are regular customers
is_regular = df['max_order'].gt(10) & df['max_order'].le(40)
df.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [11]:
# Users with 10 orders or fewer are new customers
is_new = df['max_order'].le(10)
df.loc[is_new, 'loyalty_flag'] = 'New customer'

In [12]:
# Frequency of each 'loyalty_flag' value; dropna=False also counts any rows
# that were left without a flag

loyalty_counts = df['loyalty_flag'].value_counts(dropna=False)
loyalty_counts

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [13]:
# Preview the first five rows to confirm the 'loyalty_flag' column was added

df.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Average orders,10,New customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Most orders,10,New customer


### Determine whether the prices of products purchased by loyal customers differ from those purchased by regular or new customers

In [14]:
# Mean, minimum and maximum of 'prices' for each 'loyalty_flag' group
# NOTE(review): the output shows a max of 99999.0 in every group, which looks
# like placeholder/outlier prices that inflate the means - confirm whether
# they should be cleaned before comparing groups

price_stats = {'prices': ['mean', 'min', 'max']}
df.groupby('loyalty_flag').agg(price_stats)

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,10.386336,1.0,99999.0
New customer,13.29467,1.0,99999.0
Regular customer,12.495717,1.0,99999.0


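The average price of products purchased by new customers is higher than that of products purchased by regular customers, which in turn is higher than that of loyal customers. Note that every group has a maximum price of 99999.0, so these averages are likely inflated by extreme values and should be interpreted with caution.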

### Create a spending flag for each user based on the average price across all their orders

In [15]:
# Creating 'avg_price' variable using transform(): each user's mean product
# price across all their orders, repeated on every row of that user.
# The string alias 'mean' is used instead of np.mean: it gives the same result
# and avoids the warning newer pandas versions raise for NumPy callables.

df['avg_price'] = df.groupby(['user_id'])['prices'].transform('mean')

In [16]:
# Creating a flag using loc, based on each user's average price.
# The comparison must use the per-user 'avg_price' column, not the per-row
# 'prices' column: comparing 'prices' would label individual products, so the
# same user could end up with both labels on different rows.

# Users whose average product price is below 10 are low spenders
df.loc[df['avg_price'] < 10, 'spending_flag'] = 'Low spender'

# Users whose average product price is 10 or more are high spenders
df.loc[df['avg_price'] >= 10, 'spending_flag'] = 'High spender'

In [18]:
# Preview the first five rows to confirm 'avg_price' and 'spending_flag' were added

df.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,9.0,both,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,9.0,both,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,9.0,both,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,9.0,both,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender


### Create an order frequency flag that marks the regularity of a user's ordering behavior according to the median in the 'days_since_prior_order' column

In [19]:
# Creating 'regularity' variable using transform(): each user's median
# 'days_since_prior_order', repeated on every row of that user.
# The median (not the mean) is used, as this section's heading requires.

df['regularity'] = df.groupby(['user_id'])['days_since_prior_order'].transform('median')

# The value is deliberately left unrounded: rounding before the threshold
# comparisons could move a user into the wrong group (e.g. 20.5 would round
# to 20.0 and no longer satisfy '> 20'). Round only when displaying the value.

In [20]:
# Creating a flag using loc: a 'regularity' above 20 marks a non-frequent customer

is_non_frequent = df['regularity'].gt(20)
df.loc[is_non_frequent, 'frequency_flag'] = 'Non-frequent customer'

In [21]:
# Creating a flag using loc: a 'regularity' above 10 and at most 20 marks a regular customer

is_regular_frequency = df['regularity'].gt(10) & df['regularity'].le(20)
df.loc[is_regular_frequency, 'frequency_flag'] = 'Regular customer'

In [22]:
# Creating a flag using loc: a 'regularity' of 10 or less marks a frequent customer

is_frequent = df['regularity'].le(10)
df.loc[is_frequent, 'frequency_flag'] = 'Frequent customer'

In [23]:
# Preview the first five rows to confirm 'regularity' and 'frequency_flag' were added

df.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,regularity,frequency_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,19.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,19.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Least busy,Most orders,10,New customer,6.367797,Low spender,19.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,19.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,19.0,Regular customer


### Export your dataframe as a pickle file and store it correctly in your 'Prepared Data' folder

In [24]:
# Save the dataframe, including the new flag columns, into the 'Prepared Data' folder
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_grouped.pkl')
df.to_pickle(export_file)