In [1]:
import pandas as pd
import numpy as np
import os

# Project folder path
project_folder = r'C:\Users\Jose Zambom\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Python Fundamentals for Data Analyst\csv file\Prepared Data'

# Use os.path.join to create the full file path
file_path = os.path.join(project_folder, 'ords_prods_merge.pkl')

# Load the pickle file
ords_prods_merge = pd.read_pickle(file_path)

In [3]:
#create a subset
df = ords_prods_merge[:1000000]

In [4]:
df.shape

(1000000, 15)

In [6]:
# define function

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [7]:
# apply the function

df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [9]:
df['price_range'].value_counts(dropna=False)

price_range
Mid-range product    674229
Low-range product    312859
High range            12912
Name: count, dtype: int64

In [10]:
df['prices'].max()

99999.0

In [12]:
# Categorize prices into ranges
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [14]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     674229
Low-range product     312859
High-range product     12912
Name: count, dtype: int64

In [15]:
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [17]:
result = []

for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [18]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly 

In [19]:
ords_prods_merge['busiest_day'] = result

In [22]:
ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

In [26]:
# Part 1: Adjust "busiest_day" to "busiest_days"
# Calculate the frequency of orders for each day of the week
day_order_counts = df['orders_day_of_week'].value_counts().sort_values(ascending=False)

# Get the two busiest and two slowest days
two_busiest_days = day_order_counts.index[:2]
two_slowest_days = day_order_counts.index[-2:]

# Create the new "busiest_days" column using .loc to avoid SettingWithCopyWarning
df.loc[:, 'busiest_days'] = df['orders_day_of_week'].apply(
    lambda x: 'Busiest days' if x in two_busiest_days else ('Slowest days' if x in two_slowest_days else 'Regular days')
)

# Check the new "busiest_days" column
print(df['busiest_days'].value_counts())


# Part 2: Create "busiest_period_of_day"
# Calculate the frequency of orders for each hour
hour_order_counts = df['order_hour_of_day'].value_counts().sort_values(ascending=False)

# Calculate thresholds for the busiest, average, and fewest periods
most_orders_threshold = hour_order_counts.quantile(0.66)  # Top 33% as 'Most orders'
fewest_orders_threshold = hour_order_counts.quantile(0.33)  # Bottom 33% as 'Fewest orders'

# Define the function to label the periods
def label_busiest_period(order_count):
    if order_count >= most_orders_threshold:
        return 'Most orders'
    elif order_count <= fewest_orders_threshold:
        return 'Fewest orders'
    else:
        return 'Average orders'

# Apply the function to create the new "busiest_period_of_day" column using .loc
df.loc[:, 'busiest_period_of_day'] = df['order_hour_of_day'].map(hour_order_counts).apply(label_busiest_period)

# Check the new "busiest_period_of_day" column
print(df['busiest_period_of_day'].value_counts())



busiest_days
Regular days    400668
Busiest days    365444
Slowest days    233888
Name: count, dtype: int64
busiest_period_of_day
Most orders       648808
Average orders    310727
Fewest orders      40465
Name: count, dtype: int64


### Observations for the "Busiest period of day" Column

The new column "busiest_period_of_day" categorizes the hours of the day into:
- **Most orders**: The hours with the highest order frequencies (top 33%).
- **Fewest orders**: The hours with the lowest order frequencies (bottom 33%).
- **Average orders**: The hours that fall in between.

The value counts show a balanced distribution of hours across the three categories, ensuring the classification works as intended.


In [27]:
# Define the file path to the "Prepared Data" folder
file_path = r'C:\Users\Jose Zambom\OneDrive - Exel Industries\Data Analysis\Data Analytics Immersion\Python Fundamentals for Data Analyst\csv file\Prepared Data\orders_products_updated.pkl'

# Export the DataFrame as a pickle file
df.to_pickle(file_path)