## Grouping data and aggregating variables

In [16]:
import pandas as pd

# Import the main file
ords_prods_merge = pd.read_pickle("/Users/ging/Documents/Data Analytics Course/04_2024_Instacart Basket Analysis/02 Data/Prepared Data/updated_ords_prods_merge.pkl")

# Print all the column names
print(df.columns.tolist())


['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id', 'eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'product_name', 'aisle_id', 'department_id', 'prices', 'busiest_day', 'updated_busiest_day', 'busiest_period_of_day']


In [21]:
# Subset of the first 1 million observations
subset_df = ords_prods_merge.head(1000000)

# Aggregated mean of "order_number" column grouped by "department_id" for the subset
aggregated_mean_subset = subset_df.groupby('department_id')['order_number'].mean()

print("Aggregated mean of 'order_number' column grouped by 'department_id' for the subset of the first 1 million observations:")
print(aggregated_mean_subset)


Aggregated mean of 'order_number' column grouped by 'department_id' for the subset of the first 1 million observations:
department_id
1     15.629758
2     18.470641
3     17.407619
4     17.985948
5     14.407687
6     16.515856
7     17.413529
8     16.050344
9     16.084773
10    21.290293
11    16.501834
12    16.090756
13    16.788996
14    16.888037
15    16.442748
16    17.809716
17    16.081831
18    19.198872
19    17.404819
20    16.670273
21    23.252093
Name: order_number, dtype: float64


In [22]:
# Aggregated mean of "order_number" column grouped by "department_id" for all observations
aggregated_mean_all = ords_prods_merge.groupby('department_id')['order_number'].mean()

print("Aggregated mean of 'order_number' column grouped by 'department_id' for all observations:")
print(aggregated_mean_all)


Aggregated mean of 'order_number' column grouped by 'department_id' for all observations:
department_id
1     15.457687
2     17.277920
3     17.179756
4     17.811403
5     15.213779
6     16.439806
7     17.225773
8     15.340520
9     15.895474
10    20.197148
11    16.170828
12    15.887622
13    16.583304
14    16.757377
15    16.165037
16    17.663250
17    15.694469
18    19.310397
19    17.177343
20    16.473447
21    22.902379
Name: order_number, dtype: float64


## The two results, representing the aggregated mean of the order_number column grouped by department_id, demonstrate both similarities and differences:

Similarities:

1) Both results show department 10 as having the highest aggregated mean of order_number, indicating that it consistently has a higher average order number across both the subset and the full dataset.
2) Both results exhibit a general trend where certain departments have higher aggregated means compared to others, suggesting consistency in ordering patterns across the dataset.

Differences:

1) The specific values of the aggregated means differ slightly between the subset and the full dataset. While the overall trends remain similar, the exact averages may vary due to the differences in the data sizes.
2) Some departments may show more noticeable differences in their aggregated means between the subset and the full dataset, indicating potential variations in ordering behavior across different segments of the data.
3) The department with the lowest aggregated mean may differ between the subset and the full dataset, although the differences may not be substantial. This suggests that certain departments may perform consistently across different subsets of the data, while others may exhibit more variability.

In [24]:
# Step 1: Split the data into groups based on the "user_id" column
grouped_data = ords_prods_merge.groupby('user_id')

# Step 2: Apply the transform() function to generate the maximum orders for each user
ords_prods_merge['max_order'] = grouped_data['order_number'].transform('max')

# Step 3: Create a new column, "loyalty_flag," based on the criteria
ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'
ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'
ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

# Step 4: Print the frequency of the "loyalty_flag" column
loyalty_flag_freq = ords_prods_merge['loyalty_flag'].value_counts()
print(loyalty_flag_freq)


loyalty_flag
Regular customer    15891077
Loyal customer      10293737
New customer         6249398
Name: count, dtype: int64


In [25]:
ords_prods_merge[['user_id', 'loyalty_flag', 'order_number']].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,202279,New customer,3
1,202279,New customer,3
2,202279,New customer,3
3,202279,New customer,3
4,202279,New customer,3
5,202279,New customer,3
6,202279,New customer,3
7,202279,New customer,3
8,202279,New customer,3
9,205970,Regular customer,16


In [27]:
# Import necessary libraries
import pandas as pd


# Group the data by loyalty_flag and calculate basic statistics of product prices for each loyalty category
loyalty_price_stats = ords_prods_merge.groupby('loyalty_flag')['prices'].describe()

# Print the statistics for each loyalty category
print("Basic Statistics of Product Prices by Loyalty Category:")
print(loyalty_price_stats)



Basic Statistics of Product Prices by Loyalty Category:
                       count       mean         std  min  25%  50%   75%  \
loyalty_flag                                                               
Loyal customer    10293737.0  10.388747  327.864108  1.0  4.2  7.4  11.2   
New customer       6249398.0  13.294370  597.301692  1.0  4.2  7.4  11.3   
Regular customer  15891077.0  12.496203  539.478009  1.0  4.2  7.4  11.3   

                      max  
loyalty_flag               
Loyal customer    99999.0  
New customer      99999.0  
Regular customer  99999.0  


In [29]:
# Group the data by user_id and calculate the mean of prices for each user
user_mean_prices = ords_prods_merge.groupby('user_id')['prices'].mean()

# Create a new column 'spending_flag' and assign the default value
ords_prods_merge['spending_flag'] = 'Low spender'

# Update the spending flag based on the mean price
ords_prods_merge.loc[user_mean_prices[user_mean_prices >= 10].index, 'spending_flag'] = 'High spender'

# Print the frequency of the spending flag column
print("Frequency of Spending Flags:")
print(ords_prods_merge['spending_flag'].value_counts())


Frequency of Spending Flags:
spending_flag
Low spender     32428837
High spender        5375
Name: count, dtype: int64


In [31]:
# Calculate the median of days_since_prior_order for each user
user_median_days = ords_prods_merge.groupby('user_id')['days_since_prior_order'].median()

# Create a new column 'order_frequency_flag' and assign the default value
ords_prods_merge['order_frequency_flag'] = 'Frequent customer'

# Update the order frequency flag based on the median days since prior order
ords_prods_merge.loc[user_median_days[user_median_days > 20].index, 'order_frequency_flag'] = 'Non-frequent customer'
ords_prods_merge.loc[user_median_days[(user_median_days <= 20) & (user_median_days > 10)].index, 'order_frequency_flag'] = 'Regular customer'

# Print the frequency of the order frequency flag column
print("Frequency of Order Frequency Flags:")
print(ords_prods_merge['order_frequency_flag'].value_counts())


Frequency of Order Frequency Flags:
order_frequency_flag
Frequent customer        32314590
Regular customer            60004
Non-frequent customer       59618
Name: count, dtype: int64


In [32]:
# Define the file path
file_path = "/Users/ging/Documents/Data Analytics Course/04_2024_Instacart Basket Analysis/02 Data/Prepared Data/4_8_ords_prods_merge.pkl"

# Export the dataframe as a pickle file
ords_prods_merge.to_pickle(file_path)
