# This script contains the following:
- Import libraries
- Load datasets
- Create a subset: 1st million rows
- Grouping data
- Aggregation
- Multiple aggregations
- Create loyalty flags (based on each customer's maximum order number)

# Import libraries

In [4]:
import pandas as pd
import numpy as np
import os

# Load dataframes

In [5]:
# Build the location of the prepared file 'orders_products_merged_v2'
path = r'/Users/macbook/Dropbox/Mac/Documents/Pro/Data Analyst/Course_Career foundry/A4_Python/2023.08_Instacart basket analysis/02_data'
prepared_dir = os.path.join(path, 'prepared data')
orders_file_path = os.path.join(prepared_dir, 'orders_products_merged_v2')

# Read the pickled dataframe into memory
df_ords = pd.read_pickle(orders_file_path)

In [12]:
# Preview the first five rows to confirm the import worked
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,merge_indicator,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196.0,1.0,0.0,both,Soda,77,7,9.0,both,mid-range product,regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196.0,1.0,1.0,both,Soda,77,7,9.0,both,mid-range product,regularly busy,Least busy days,Average orders
2,473747,1,3,3,12,21.0,196.0,1.0,1.0,both,Soda,77,7,9.0,both,mid-range product,regularly busy,Least busy days,Most orders
3,2254736,1,4,4,7,29.0,196.0,1.0,1.0,both,Soda,77,7,9.0,both,mid-range product,least busy,Least busy days,Average orders
4,431534,1,5,4,15,28.0,196.0,1.0,1.0,both,Soda,77,7,9.0,both,mid-range product,least busy,Least busy days,Most orders


In [14]:
# Remove the '_merge' and 'merge_indicator' columns from 'df_ords'
df_ords = df_ords.drop(columns=['_merge', 'merge_indicator'])

In [16]:
# Check the remaining columns: '_merge' and 'merge_indicator' should no longer be listed
df_ords.columns

Index(['order_id', 'user_id', 'number_order_client', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day'],
      dtype='object')

# Create a subset: 1st million rows

In [17]:
# Keep only the first 1,000,000 rows (by position) as a smaller working dataframe
df = df_ords.iloc[:1_000_000]

In [18]:
# Check the shape of the subset: expect 1,000,000 rows
df.shape

(1000000, 17)

In [19]:
# Preview the first five rows of the subset
df.head(n=5)

Unnamed: 0,order_id,user_id,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196.0,1.0,0.0,Soda,77,7,9.0,mid-range product,regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,regularly busy,Least busy days,Average orders
2,473747,1,3,3,12,21.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,regularly busy,Least busy days,Most orders
3,2254736,1,4,4,7,29.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,least busy,Least busy days,Average orders
4,431534,1,5,4,15,28.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,least busy,Least busy days,Most orders


# Grouping data

In [28]:
# Group the subset by product name; on its own this only returns a DataFrameGroupBy object
df.groupby(by='product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x157201f10>

# Aggregation

In [20]:
# Mean of 'number_order_client' per 'department_id', rounded to 0 decimals
df.groupby('department_id').agg({'number_order_client': ['mean']}).round(0)

Unnamed: 0_level_0,number_order_client
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,19.0
7,17.0
13,18.0
14,19.0
16,19.0
17,11.0
19,19.0
20,18.0


In [21]:
# Alternative: select the column on the grouped object and call mean() directly, then round
df.groupby('department_id')['number_order_client'].mean().round()

department_id
4     19.0
7     17.0
13    18.0
14    19.0
16    19.0
17    11.0
19    19.0
20    18.0
Name: number_order_client, dtype: float64

# Multiple aggregation

In [22]:
# Average, minimum & maximum of 'number_order_client' by department id
order_stats = {'number_order_client': ['mean', 'min', 'max']}
df.groupby('department_id').agg(order_stats)

Unnamed: 0_level_0,number_order_client,number_order_client,number_order_client
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


# Create loyalty flags (based on each customer's maximum order number)

In [23]:
# Add a 'max_order' column to 'df_ords': for every row, the highest
# 'number_order_client' value found among the rows sharing its 'user_id'.
# The aggregation is passed by name ('max') instead of as np.max: newer
# pandas versions emit a FutureWarning when the NumPy callable is passed,
# while the string form gives the same result in all versions.
df_ords['max_order'] = df_ords.groupby(['user_id'])['number_order_client'].transform('max')

In [24]:
# Preview the first 100 rows to check the new 'max_order' column
df_ords.head(n=100)

Unnamed: 0,order_id,user_id,number_order_client,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,196.0,1.0,0.0,Soda,77,7,9.0,mid-range product,regularly busy,Regularly busy,Average orders,10
1,2398795,1,2,3,7,15.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,regularly busy,Least busy days,Average orders,10
2,473747,1,3,3,12,21.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,regularly busy,Least busy days,Most orders,10
3,2254736,1,4,4,7,29.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,least busy,Least busy days,Average orders,10
4,431534,1,5,4,15,28.0,196.0,1.0,1.0,Soda,77,7,9.0,mid-range product,least busy,Least busy days,Most orders,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3226575,360,1,5,12,,196.0,1.0,0.0,Soda,77,7,9.0,mid-range product,regularly busy,Regularly busy,Most orders,3
96,1469869,377,3,5,17,3.0,196.0,9.0,0.0,Soda,77,7,9.0,mid-range product,regularly busy,Regularly busy,Average orders,3
97,1927023,387,2,4,10,22.0,196.0,3.0,0.0,Soda,77,7,9.0,mid-range product,least busy,Least busy days,Most orders,8
98,858092,420,4,1,19,30.0,196.0,2.0,0.0,Soda,77,7,9.0,mid-range product,regularly busy,Busiest days,Average orders,22


In [26]:
# Remove the limit on how many rows pandas prints (None = no limit)
pd.set_option('display.max_rows', None)

## Create loyalty flags with loc()

In [28]:
# Rows whose 'max_order' is above 40 get the 'loyal customer' flag
is_loyal = df_ords['max_order'] > 40
df_ords.loc[is_loyal, 'loyalty_flag'] = 'loyal customer'

In [29]:
# Rows whose 'max_order' is above 10 and at most 40 get the 'regular customer' flag
is_regular = (df_ords['max_order'] > 10) & (df_ords['max_order'] <= 40)
df_ords.loc[is_regular, 'loyalty_flag'] = 'regular customer'

In [30]:
# Rows whose 'max_order' is 10 or lower get the 'new customer' flag
is_new = df_ords['max_order'] <= 10
df_ords.loc[is_new, 'loyalty_flag'] = 'new customer'

In [31]:
# Count rows per loyalty flag; dropna=False also counts rows left without a flag
flag_counts = df_ords['loyalty_flag'].value_counts(dropna=False)
flag_counts

regular customer    15876776
loyal customer      10284093
new customer         6243990
Name: loyalty_flag, dtype: int64

In [32]:
# Spot-check the flag next to each user's order numbers (first 60 rows)
preview_cols = ['user_id', 'loyalty_flag', 'number_order_client']
df_ords[preview_cols].head(60)

Unnamed: 0,user_id,loyalty_flag,number_order_client
0,1,new customer,1
1,1,new customer,2
2,1,new customer,3
3,1,new customer,4
4,1,new customer,5
5,1,new customer,6
6,1,new customer,7
7,1,new customer,8
8,1,new customer,9
9,1,new customer,10
