## Importing important libraries¶

In [3]:
import pandas as pd  # For DataFrames
import numpy as np  # For numeric calculations
import os  # For file management

## Import Pickle file into Pandas

In [8]:
# Data set path

path = r"/Users/martin/anaconda_projects/11-02-2025 Instacart Basket Analysis"

In [10]:
# Import of the "ords_prods_merge" data set 

ords_prods_merge = pd.read_pickle(os.path.join(path, '02 data' , 'Prepared Data' , 'ords_prods_merge_2.pkl')) 

# Task 2) Aggregating Order Numbers for the Full Dataset

## 1) Group data: groupby()

In [32]:
# This function groups the data in the DataFrame ords_prods_merge according to the product name (product_name).

ords_prods_merge.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x124ceaa50>

In [34]:
# Problem: groupby() alone shows no visible results!

## 2) Aggregate data: agg()

In [36]:
# Calculate average number of orders per department

ords_prods_merge.groupby('department_id').agg({'order_number': 'mean'})

Unnamed: 0_level_0,order_number
department_id,Unnamed: 1_level_1
1,15.457687
2,17.27792
3,17.179756
4,17.811403
5,15.213779
6,16.439806
7,17.225773
8,15.34052
9,15.895474
10,20.197148


In [40]:
# Insight: Customers order most frequently in the department with the id-number = 1!

## 4) Create new column: transform()

In [43]:
# How often has each customer ordered?

ords_prods_merge['max_order'] = ords_prods_merge.groupby('user_id')['order_number'].transform(np.max)

  ords_prods_merge['max_order'] = ords_prods_merge.groupby('user_id')['order_number'].transform(np.max)


In [53]:
ords_prods_merge[['user_id', 'order_number', 'max_order']].head(100)

Unnamed: 0,user_id,order_number,max_order
0,1,1,10
1,1,1,10
2,1,1,10
3,1,1,10
4,1,1,10
...,...,...,...
95,2,4,14
96,2,5,14
97,2,5,14
98,2,5,14


In [55]:
# Insight: Customer 1 has ordered a total of 10 times, customer 2 14 times.

## Task 3) Comparing Aggregated Order Numbers: Full Dataset vs. Subset


### Comparison of the aggregation results  

### After calculating the 'order_number' mean values for the subset and the entire DataFrame, the following findings emerge:

### - The values for the entire DataFrame differ little to not at all from those of the subset.
### - Some departments have higher average values than in the subset. This indicates that the subset does not accurately reflect all order patterns.
### - The difference could be greater for the highly frequented departments (e.g. “produce” or “dairy”) in particular, as they are more strongly represented in the complete set.
### - Overall, it can be seen that a subset with 1 million lines provides a good approximation, but is not always representative of the entire dataset.

## Task 4) Categorizing Customers Based on Order History

## Categorize customers: loc()

In [58]:
# Who is a “loyal customer”?

ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'

ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'

ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [60]:
# Checking the result

ords_prods_merge['loyalty_flag'].value_counts(dropna=False)

loyalty_flag
Regular customer    15891077
Loyal customer      10293737
New customer         6249398
Name: count, dtype: int64

In [62]:
# Result: Most customers are regualar customer with a total of 15891077. 
# Followed by loyal customers with a total of 10293737.
# The least amond of customers are new customers with a total of 6249398. 

In [66]:
# Since the result above doesn't say much, let's check it again with the command:

ords_prods_merge[['user_id', 'loyalty_flag', 'order_number']].head(100)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,1
2,1,New customer,1
3,1,New customer,1
4,1,New customer,1
...,...,...,...
95,2,Regular customer,4
96,2,Regular customer,5
97,2,Regular customer,5
98,2,Regular customer,5


In [68]:
# Result: Customer 1 is a New Customer (few orders). Customer 2 is a Regular Customer (average number of orders).

## Task 5) Analyzing Spending Habits by Customer Loyalty

In [84]:
# Groups the data according to customer loyalty (loyalty_flag)  
# and calculates the average (mean), minimum (min)  
# and maximum (max) price for each group.

ords_prods_merge.groupby('loyalty_flag').agg({'prices' : ['mean', 'min', 'max']})

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,10.388747,1.0,99999.0
New customer,13.29437,1.0,99999.0
Regular customer,12.496203,1.0,99999.0


In [86]:
# Insight from the data:
# - New customers have the highest average spend per product (13.29), while loyal customers spend the least (10.39).  
# - The minimum price is the same for all groups (1.0), as is the maximum price (99.999.0), which could indicate incorrect or implausible price data.  
# - Possible interpretation: New customers may be trying out more expensive products, while loyal customers already know which cheaper products they regularly buy.  

## Task 6)

In [97]:
# Aggregating Data with transform() Function

ords_prods_merge['avg_product_price'] = ords_prods_merge.groupby(['user_id'])['prices'].transform(np.mean)


  ords_prods_merge['avg_product_price'] = ords_prods_merge.groupby(['user_id'])['prices'].transform(np.mean)


In [107]:
# Checking Relevant Columns 

ords_prods_merge[['user_id', 'avg_product_price']].head(1000)

Unnamed: 0,user_id,avg_product_price
0,1,6.367797
1,1,6.367797
2,1,6.367797
3,1,6.367797
4,1,6.367797
...,...,...
995,12,8.116216
996,12,8.116216
997,12,8.116216
998,12,8.116216


In [109]:
# Creating a 'Spending' Label Flag using loc Function 

ords_prods_merge.loc[ords_prods_merge["avg_product_price"] >= 10, "spending_flag"] = "High spender"

ords_prods_merge.loc[ords_prods_merge["avg_product_price"] < 10, "spending_flag"] = "Low spender"

In [111]:
# Examining Value Count on 'spending_flag' Column

ords_prods_merge['spending_flag'].value_counts(dropna = False)

spending_flag
Low spender     31798751
High spender      635461
Name: count, dtype: int64

In [115]:
# Checking Releavent Columns

ords_prods_merge[['user_id', 'avg_product_price', 'spending_flag']].head(1000)

Unnamed: 0,user_id,avg_product_price,spending_flag
0,1,6.367797,Low spender
1,1,6.367797,Low spender
2,1,6.367797,Low spender
3,1,6.367797,Low spender
4,1,6.367797,Low spender
...,...,...,...
995,12,8.116216,Low spender
996,12,8.116216,Low spender
997,12,8.116216,Low spender
998,12,8.116216,Low spender


## Task 7) Identifying Frequent vs. Non-Frequent Customers