In [3]:
import pandas as pd
import os

# Path to the project folder
project_path = r'C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis'

# File paths
orders_products_combined_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
products_checked_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'products_checked_clean.csv')

# Step 1: Load the combined dataset
df_combined = pd.read_pickle(orders_products_combined_path)

# Check the shape and head of the dataframe
print("Combined DataFrame shape:", df_combined.shape)
print(df_combined.head())
print(df_combined.columns)

Combined DataFrame shape: (32434489, 9)
   order_id  user_id  order_number  order_day_of_week  order_hour_of_day  \
0   2539329        1             1                  2                  8   
1   2539329        1             1                  2                  8   
2   2539329        1             1                  2                  8   
3   2539329        1             1                  2                  8   
4   2539329        1             1                  2                  8   

   days_since_prior_order  product_id  add_to_cart_order  reordered  
0               11.114836         196                  1          0  
1               11.114836       14084                  2          0  
2               11.114836       12427                  3          0  
3               11.114836       26088                  4          0  
4               11.114836       26405                  5          0  
Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_ho

In [5]:
# Step 2: Load the products data
df_products = pd.read_csv(products_checked_path)

# Check the shape and columns of the products dataframe
print("Products DataFrame shape:", df_products.shape)
print(df_products.head())
print(df_products.columns)

# Merge the combined orders and products data to include prices
df_combined = df_combined.merge(df_products[['product_id', 'prices']], on='product_id', how='left')

# Check the shape and columns after merge
print("Shape after merging with products:", df_combined.shape)
print(df_combined.head())
print(df_combined.columns)

Products DataFrame shape: (49688, 5)
   product_id                                       product_name  aisle_id  \
0           1                         Chocolate Sandwich Cookies        61   
1           2                                   All-Seasons Salt       104   
2           3               Robust Golden Unsweetened Oolong Tea        94   
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...        38   
4           5                          Green Chile Anytime Sauce         5   

   department_id  prices  
0             19     5.8  
1             13     9.3  
2              7     4.5  
3              1    10.5  
4             13     4.3  
Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices'], dtype='object')
Shape after merging with products: (32435059, 10)
   order_id  user_id  order_number  order_day_of_week  order_hour_of_day  \
0   2539329        1             1                  2                  8   
1   2539329        1             1   

In [9]:
# Step 3: Creating the price_label column using loc()
df_combined.loc[df_combined['prices'] <= 5, 'price_label'] = 'Low-range product'
df_combined.loc[(df_combined['prices'] > 5) & (df_combined['prices'] <= 15), 'price_label'] = 'Mid-range product'
df_combined.loc[df_combined['prices'] > 15, 'price_label'] = 'High-range product'

In [11]:
# Check the distribution of the new column
print("Price label distribution:")
print(df_combined['price_label'].value_counts())

Price label distribution:
price_label
Mid-range product     21889009
Low-range product     10126339
High-range product      417682
Name: count, dtype: int64


In [13]:
# Step 4: Creating the busiest_day column
df_combined['busiest_day'] = df_combined['order_day_of_week'].apply(
    lambda x: 'Busiest days' if x in [0, 1] else 'Least busy days' if x in [4, 5] else 'Regularly busy'
)

# Check the distribution of the new column
print("Busiest day distribution:")
print(df_combined['busiest_day'].value_counts())


Busiest day distribution:
busiest_day
Regularly busy     12562434
Busiest days       11875759
Least busy days     7996866
Name: count, dtype: int64


In [15]:
# Step 5: Creating the busiest_period_of_day column
df_combined['busiest_period_of_day'] = df_combined['order_hour_of_day'].apply(
    lambda x: 'Most orders' if 10 <= x <= 16 else 'Fewest orders' if 0 <= x <= 6 else 'Average orders'
)

# Check the distribution of the new column
print("Busiest period of day distribution:")
print(df_combined['busiest_period_of_day'].value_counts())

Busiest period of day distribution:
busiest_period_of_day
Most orders       18681054
Average orders    12866367
Fewest orders       887638
Name: count, dtype: int64


In [17]:
# Final Step: Export the dataframe as a pickle file
export_path = os.path.join(project_path, '02 Data', 'Prepared Data', 'orders_products_combined_with_labels.pkl')
df_combined.to_pickle(export_path)
print(f"DataFrame exported to {export_path}")

DataFrame exported to C:\Users\sudee\OneDrive\Documents\Python Scripts\Instacart Basket Analysis\02 Data\Prepared Data\orders_products_combined_with_labels.pkl



### Explanation:

1. **Introduction**: 
   - Provides an overview of what will be done in the task.
   
2. **Step 1: Load and Inspect the Data**: 
   - Loads the combined dataset and checks its shape and columns.

3. **Step 2: Load and Merge Products Data**: 
   - Loads the products data and merges it with the combined dataframe to include the `prices` column.

4. **Step 3: Create `price_label` Column**: 
   - Uses `loc()` to categorize products into `Low-range`, `Mid-range`, and `High-range` based on their prices.
   - Checks the distribution of the new column.

5. **Step 4: Update `busiest_day` Column**: 
   - Updates the `busiest_day` column to label the two busiest days and two least busy days.
   - Checks the distribution of the new column.

6. **Step 5: Create `busiest_period_of_day` Column**: 
   - Uses `apply()` with a lambda function to categorize hours into periods with the most, average, and fewest orders.
   - Checks the distribution of the new column.

7. **Step 6: Export the Final Dataframe**: 
   - Exports the updated dataframe with new columns as a pickle file.

