# Import libraries & dataframes

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Root of the project's data folder. Defaults to the original author's location;
# set the INSTACART_DATA_PATH environment variable to run the notebook elsewhere.
path = os.environ.get(
    'INSTACART_DATA_PATH',
    r'C:\Users\muril\Data Analysis Projects\07-2023 Instacart Basket Analysis\02 Data',
)

# Load the merged orders/products table; the first CSV column is used as the index.
ords_prods_merged = pd.read_csv(
    os.path.join(path, 'Prepared Data', 'orders_products_merged.csv'),
    index_col=0,
)

In [35]:
# load the cleaned customer table; the first CSV column is used as the index
cust = pd.read_csv(
    os.path.join(path, 'Prepared Data', 'customers_cleaned.csv'),
    index_col=0,
)

# 5. Assign labels

## 5.1 price_label flag

In [3]:
# flag each row as a high-, mid- or low-range product based on its price
item_prices = ords_prods_merged['prices']
is_high = item_prices > 15
is_mid = (item_prices > 5) & (item_prices <= 15)
is_low = item_prices <= 5

# rows whose price matches none of the masks (e.g. a missing price) stay unlabelled
ords_prods_merged.loc[is_high, 'price_range_loc'] = 'High-range product'
ords_prods_merged.loc[is_mid, 'price_range_loc'] = 'Mid-range product'
ords_prods_merged.loc[is_low, 'price_range_loc'] = 'Low-range product'

In [4]:
# Tally the rows per price-range label; dropna=False also reports unlabelled rows
price_range_counts = ords_prods_merged['price_range_loc'].value_counts(dropna=False)
price_range_counts

Mid-range product     21894136
Low-range product     10126339
High-range product      412555
Name: price_range_loc, dtype: int64

## 5.2 Busiest days flag

In [5]:
# tally the rows per day-of-week code (missing values are counted too)
day_of_week_counts = ords_prods_merged['order_day_of_week'].value_counts(dropna=False)
day_of_week_counts

0    6209410
1    5665604
6    4500101
2    4217610
5    4209334
3    3843929
4    3787042
Name: order_day_of_week, dtype: int64

Saturday (0) and Sunday (1) are the busiest days and Wednesday (4) and Tuesday (3) are the quietest days.

In [6]:
# add column to dataframe that tags the busiest day and the quietest day

# Day code 0 is tagged "Busiest day" and day code 4 "Least busy"; every other
# value (including a missing one) falls through to "Regularly busy".
# Series.map performs the lookup in one vectorised pass instead of a Python-level
# loop over every row; .tolist() keeps `result` a plain list with one label per
# row, in row order.
result = (
    ords_prods_merged["order_day_of_week"]
    .map({0: "Busiest day", 4: "Least busy"})
    .fillna("Regularly busy")
    .tolist()
)

In [7]:
# add this column to the dataframe: the labels in `result` become the new 'busiest_day' column
ords_prods_merged['busiest_day'] = result

In [8]:
# check how many rows received each 'busiest_day' label (missing values included)
ords_prods_merged['busiest_day'].value_counts(dropna=False)

Regularly busy    22436578
Busiest day        6209410
Least busy         3787042
Name: busiest_day, dtype: int64

In [9]:
# add column to dataframe that tags the busiest two days and the quietest two days

# Day codes 0 and 1 are tagged "Busiest days", day codes 3 and 4 "Slowest days";
# every other value (including a missing one) falls through to "Regularly busy".
# Series.map performs the lookup in one vectorised pass instead of a Python-level
# loop over every row; .tolist() keeps `result2` a plain list with one label per
# row, in row order.
result2 = (
    ords_prods_merged["order_day_of_week"]
    .map({
        0: "Busiest days",
        1: "Busiest days",
        3: "Slowest days",
        4: "Slowest days",
    })
    .fillna("Regularly busy")
    .tolist()
)

In [10]:
# add this column to the dataframe: the labels in `result2` become the new 'busiest_days' column
ords_prods_merged['busiest_days'] = result2

In [11]:
# check how many rows received each 'busiest_days' label (missing values included)
ords_prods_merged['busiest_days'].value_counts(dropna=False)

Regularly busy    12927045
Busiest days      11875014
Slowest days       7630971
Name: busiest_days, dtype: int64

The flag appears to have been assigned correctly: each label's total matches the sum of the corresponding day-of-week counts above, and the entries are labelled correctly for the observations above.

## 5.3 Busiest time of day flag

In [12]:
# tally the rows per hour of the day (missing values are counted too)
hour_of_day_counts = ords_prods_merged['order_hour_of_day'].value_counts(dropna=False)
hour_of_day_counts

10    2764288
11    2738483
14    2691448
15    2664420
13    2663169
12    2620719
16    2537358
9     2456591
17    2089385
8     1719888
18    1637858
19    1259335
20     976991
7      891900
21     796341
22     634715
23     402593
6      290763
0      218925
1      115780
5       88054
2       69429
4       53280
3       51317
Name: order_hour_of_day, dtype: int64

The time periods “Most orders”, “Average orders” and “Fewest orders” will cover 25%, 50% and 25% of the 24 hours of the day respectively: “Most orders” is the 6 busiest hours, “Fewest orders” is the 6 quietest hours, and “Average orders” is the remaining 12 hours.

Most Orders hours - 10, 11, 14, 15, 13, 12

Fewest Orders hours - 0, 1, 5, 2, 4, 3

In [13]:
# add column to dataframe that tags the busiest and quietest hours

# Hours 10-15 are tagged "Most orders" and hours 0-5 "Fewest orders"; every other
# value (including a missing one) falls through to "Average orders".
# Series.map performs the lookup in one vectorised pass instead of a Python-level
# loop over every row; .tolist() keeps `result3` a plain list with one label per
# row, in row order.
result3 = (
    ords_prods_merged["order_hour_of_day"]
    .map({
        **dict.fromkeys((10, 11, 14, 15, 13, 12), "Most orders"),
        **dict.fromkeys((0, 1, 5, 2, 4, 3), "Fewest orders"),
    })
    .fillna("Average orders")
    .tolist()
)

In [14]:
# add this column to the dataframe: the labels in `result3` become the new 'busiest_period_of_day' column
ords_prods_merged['busiest_period_of_day'] = result3

In [15]:
# check how many rows received each period-of-day label (missing values included)
period_of_day_counts = ords_prods_merged['busiest_period_of_day'].value_counts(dropna=False)
period_of_day_counts

Most orders       16142527
Average orders    15693718
Fewest orders       596785
Name: busiest_period_of_day, dtype: int64

In [16]:
# drop the three temporary label lists; their values now live in dataframe columns
del result, result2, result3

## 5.4 Loyalty flag

In [17]:
# count the distinct orders placed by each user
# (the count column keeps the name 'order_id' after reset_index)
orders_per_user = ords_prods_merged.groupby('user_id')['order_id'].nunique()
loyalty = orders_per_user.reset_index()
loyalty.head()

Unnamed: 0,user_id,order_id
0,1,10
1,2,14
2,3,12
3,4,5
4,5,4


In [18]:
# assign loyalty flag - more than 40 orders is 'loyal', fewer than 10 is 'new', anything else is 'regular'
def _loyalty_label(n_orders):
    """Return the loyalty tier for a user's number of distinct orders."""
    if n_orders > 40:
        return 'loyal'
    if n_orders < 10:
        return 'new'
    return 'regular'

loyalty['loyalty_flag'] = loyalty['order_id'].apply(_loyalty_label)

In [25]:
# number of users in each loyalty tier
loyalty_tier_counts = loyalty['loyalty_flag'].value_counts()
loyalty_tier_counts

new        104514
regular     84678
loyal       17017
Name: loyalty_flag, dtype: int64

In [24]:
# number of users left without a loyalty flag
loyalty['loyalty_flag'].isna().sum()

0

Flag assignment appears to have functioned correctly.

In [27]:
# attach each user's loyalty flag to the main dataframe;
# the left join keeps every existing row of ords_prods_merged
ords_prods_merged = pd.merge(
    ords_prods_merged,
    loyalty[['user_id', 'loyalty_flag']],
    how='left',
    on='user_id',
)

In [31]:
# drop the per-user loyalty table now that its flag has been merged into ords_prods_merged
del loyalty

In [37]:
# preview the first five rows to confirm the new flag columns are present
ords_prods_merged.head(5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,loyalty_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,regular
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Average orders,regular
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most orders,regular
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average orders,regular
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most orders,regular


# Export dataframe