## Imports

In [17]:
# Libraries
import pandas as pd
import numpy as np
import os

In [18]:
# Files
path = os.path.expanduser('~/Desktop/CareerFoundry/2.4/4.10.') # path to the root folder

### Order History
df_order_history = pd.read_pickle(os.path.join(path, '02 Data', 'Clean Data', 'order_history.pkl'))
### Order History No Customer Data
df_order_history_no_customer = pd.read_pickle(os.path.join(path, '02 Data', 'Clean Data', 'order_history_no_customer.pkl')) 

In [19]:
df_order_history.head()

Unnamed: 0.1,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,merge_flag_orders_prod_prior,...,merge_flag_orders_products_prior_products,Unnamed: 0,gender,state,date_joined,dependants,family_status,age_group,income_bracket,merge_flag_customer


### Price Range

In [15]:
# Create a copy to prevent SettingWithCopyWarning
df = df_order_history[:1000000]
df.head()

Unnamed: 0.1,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,merge_flag_orders_prod_prior,...,merge_flag_orders_products_prior_products,Unnamed: 0,gender,state,date_joined,dependants,family_status,age_group,income_bracket,merge_flag_customer


In [13]:
# Define function to label price ranges
def price_label(row):
    if row['prices'] <= 5:
        return 'Low'
    elif row['prices'] <= 15:
        return 'Mid'
    elif row['prices'] > 15:
        return 'High'
    else:
        return 'Err'

In [14]:
df = df.copy()  # ensures it's not a view
df['price_range'] = df.apply(price_label, axis=1)

df.head() # checking if it worked #1

Unnamed: 0.1,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,merge_flag_orders_prod_prior,...,Unnamed: 0,gender,state,date_joined,dependants,family_status,age_group,income_bracket,merge_flag_customer,price_range


In [9]:
# Quick validation
print(df['price_range'].value_counts(dropna=False))

Series([], Name: count, dtype: int64)


In [7]:
# same functionality, but using .loc instead of the function
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High'
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid' 
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low'
df['price_range'].value_counts(dropna=False)

ValueError: cannot set a frame with no defined index and a scalar

## Busiest Day

In [None]:
result = []
# using a for loop to determine the busiest days of the week and labeling them 
for value in df["order_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [None]:
df['busiest_day'] = result # adding a new column to our dataframe with busiest days of the week

In [None]:
df.head()

In [None]:
# change of requirements now days can 5 levels of business: The busiest day, Second busiest day, 
#The least busy day, Second to least busy day, Normal day
result.clear()
for value in df["order_hour_of_day"]:
    if value == 0:
        result.append("The busiest day")
    elif value == 2:
        result.append("Second busiest day")
    elif value == 6:
        result.append("The least busy day")
    elif value == 5:
        result.append("Second to least busy day")
    else:
        result.append("Normal day")

result

## Busiest Period of the Day

In [None]:
# counted how many orders happened each hour, and broke down by quantiles as low-selling hours and high-selling hours.
hourly_orders = df['order_hour_of_day'].value_counts().sort_index()
low_threshold = hourly_orders.quantile(0.33)
high_threshold = hourly_orders.quantile(0.66) 
hour_labels = {}

for hour, count in hourly_orders.items():
    if count <= low_threshold:
        hour_labels[hour] = 'Fewest orders'
    elif count <= high_threshold:
        hour_labels[hour] = 'Average orders'
    else:
        hour_labels[hour] = 'Most orders'
df['busiest_period_of_day'] = df['order_hour_of_day'].map(hour_labels)
df['busiest_period_of_day'].value_counts()



In [None]:
df.head()

## Loyalty Flag

In [None]:
# creating a smaller df with just the relevant columns
user_summary = df[['user_id', 'order_number', 'prices']].copy()

In [None]:
# max_order per user
user_summary['max_order'] = user_summary.groupby('user_id')['order_number'].transform('max')


In [None]:
# creating loyalty_flag based on max_order
user_summary['loyalty_flag'] = 'Other'
user_summary.loc[user_summary['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'
user_summary.loc[user_summary['max_order'].between(11, 40), 'loyalty_flag'] = 'Regular customer'
user_summary.loc[user_summary['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

user_summary['loyalty_flag'] = np.select(
    condlist=conditions,
    choicelist=choices,
    default='Other'  # explicitly set a string default
)

In [None]:
# calculating average price per user (mean_prices)
user_summary['mean_prices'] = user_summary.groupby('user_id')['prices'].transform('mean')


In [None]:
# creating spending_flag
user_summary['spending_flag'] = np.where(user_summary['mean_prices'] < 10, 'Low spender', 'High spender')


In [None]:
# droping duplicates to keep only one row per user
user_flags = user_summary[['user_id', 'max_order', 'loyalty_flag', 'mean_prices', 'spending_flag']].drop_duplicates()


In [None]:
# merge back to main DataFrame
df = df.merge(user_flags, on='user_id', how='left')

In [None]:
df['spending_flag'].value_counts() # quick check

In [None]:
df.to_pickle(os.path.join(path, '02 Data','Exports', 'order_history_derived_top_1000000.pkl'))