## Importing important libraries¶

In [3]:
import pandas as pd  # For DataFrames
import numpy as np  # For numeric calculations
import os  # For file management

## Import Pickle file into Pandas

In [7]:
# Data set path

path = r"/Users/martin/anaconda_projects/11-02-2025 Instacart Basket Analysis"

In [13]:
# Import of the "ords_prods_merge" data set 

ords_prods_merge = pd.read_pickle(os.path.join(path, '02 data' , 'Prepared Data' , 'ords_prods_merge_2.pkl')) 

In [15]:
# Check the result

ords_prods_merge.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_po,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,Busiest day,Busiest days,busiest_period_of_day
0,0,2539329,1,prior,1,2,8,,196,1,0,both,Soda,77,7,9.0,Regularly busy,Regularly busy,Average orders
1,0,2539329,1,prior,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Regularly busy,Regularly busy,Average orders
2,0,2539329,1,prior,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,Regularly busy,Regularly busy,Average orders
3,0,2539329,1,prior,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,Regularly busy,Regularly busy,Average orders
4,0,2539329,1,prior,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Regularly busy,Regularly busy,Average orders


In [19]:
# Creating a subset by executing the following code:

df = ords_prods_merge[:1000000] # Why? To avoid any potential processing issues!

In [21]:
# Shape check "df_ords_prods_merge" data

df.shape

(1000000, 19)

## 1) Group data: groupby()

In [30]:
# First step of the workflow: Split the data into groups based on some criteria.

df.groupby('product_name') #  Let’s try it on your “product_name” column in your df dataframe

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x117993560>

## 2) Aggregating Data with agg()

### Performing a Single Aggregation

In [46]:
# Groups the data by department_id and calculates the average order_number for each department.

df.groupby('department_id').agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,14.794722
2,17.091743
3,17.930716
4,17.892234
5,15.211405
6,15.382228
7,17.699986
8,16.485269
9,15.965921
10,20.091818


In [50]:
# Or instead (achieves the same result)

df.groupby('department_id')['order_number'].mean()

department_id
1     14.794722
2     17.091743
3     17.930716
4     17.892234
5     15.211405
6     15.382228
7     17.699986
8     16.485269
9     15.965921
10    20.091818
11    16.484907
12    15.615845
13    16.485279
14    17.499513
15    15.690354
16    18.005083
17    16.155822
18    19.606536
19    17.630640
20    17.138204
21    21.996844
Name: order_number, dtype: float64

### Performing Multiple Aggregations

In [53]:
# This time producing not only the mean but also the min and max:

df.groupby('department_id').agg({'order_number': ['mean', 'min', 'max']})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,14.794722,1,99
2,17.091743,1,98
3,17.930716,1,99
4,17.892234,1,99
5,15.211405,1,99
6,15.382228,1,99
7,17.699986,1,99
8,16.485269,1,91
9,15.965921,1,99
10,20.091818,1,99


## 3) Aggregating Data with transform()

In [59]:
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)

# First, a new column called “max_order” is created, which will be what stores the maximum order number for each user (step 3).
# Then, the ords_prods_merge dataframe is grouped by the “user_id” column (step 1).
# And finally, the transform() function is applied on the “order_number” column with the np.max argument (step 2).

  ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)


In [87]:
# Checking the results

ords_prods_merge.head(10)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_po,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,Busiest day,Busiest days,busiest_period_of_day,max_order
0,0,2539329,1,prior,1,2,8,,196,1,0,both,Soda,77,7,9.0,Regularly busy,Regularly busy,Average orders,10
1,0,2539329,1,prior,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Regularly busy,Regularly busy,Average orders,10
2,0,2539329,1,prior,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,Regularly busy,Regularly busy,Average orders,10
3,0,2539329,1,prior,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,Regularly busy,Regularly busy,Average orders,10
4,0,2539329,1,prior,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Regularly busy,Regularly busy,Average orders,10
5,1,2398795,1,prior,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Regularly busy,Slowest days,Average orders,10
6,1,2398795,1,prior,2,3,7,15.0,10258,2,0,both,Pistachios,117,19,3.0,Regularly busy,Slowest days,Average orders,10
7,1,2398795,1,prior,2,3,7,15.0,12427,3,1,both,Original Beef Jerky,23,19,4.4,Regularly busy,Slowest days,Average orders,10
8,1,2398795,1,prior,2,3,7,15.0,13176,4,0,both,Bag of Organic Bananas,24,4,10.3,Regularly busy,Slowest days,Average orders,10
9,1,2398795,1,prior,2,3,7,15.0,26088,5,1,both,Aged White Cheddar Popcorn,23,19,4.7,Regularly busy,Slowest days,Average orders,10


In [77]:
# It’s still only showing you a limited number of rows—not the 100 you asked for. 
# This is because Jupyter’s display option is currently set to prevent you from printing too many rows via the head() function.

In [None]:
# This command tells pandas not to assign any options regarding the maximum number of rows to display

pd.options.display.max_rows = None

In [83]:
# Now the results from the head function is showing all 100 rows.

## 4) Deriving Columns with loc()

In [95]:
# create a flag that assigns a “loyalty” label to a user ID based on its corresponding max order value.

ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'

ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'

ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [97]:
# Checking the result

ords_prods_merge['loyalty_flag'].value_counts(dropna=False)

loyalty_flag
Regular customer    15891077
Loyal customer      10293737
New customer         6249398
Name: count, dtype: int64

In [108]:
# Checking the result

ords_prods_merge[['user_id', 'loyalty_flag', 'order_number']].head(100)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,1
2,1,New customer,1
3,1,New customer,1
4,1,New customer,1
5,1,New customer,2
6,1,New customer,2
7,1,New customer,2
8,1,New customer,2
9,1,New customer,2
