<a href="https://colab.research.google.com/github/J-Lehrer/aai_540_group_2_final_project/blob/main/Maria_Leal_Reduced_of_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [1]:
import zipfile
import os
import pandas as pd

# Decompressing the "Archive" files.

In [3]:
# Defining the path to the "Archives" folder.
archive_path = "/content/drive/MyDrive/MLOps/archive.zip"
extract_path ="/content/drive/MyDrive/MLOps/Instant_Cart"

# Extract the archive
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print(f"Files extracted to {extract_path}")

Files extracted to /content/drive/MyDrive/MLOps/Instant_Cart


# Loading the extracted CSV files

In [2]:
extract_path ="/content/drive/MyDrive/MLOps/Instant_Cart"

In [3]:
# Defining the paths to the files
orders_path = os.path.join(extract_path, 'orders.csv')
order_products_prior_path = os.path.join(extract_path, 'order_products__prior.csv')
order_products_train_path = os.path.join(extract_path, 'order_products__train.csv')
aisles_path = os.path.join(extract_path, 'aisles.csv')
departments_path = os.path.join(extract_path, 'departments.csv')
products_path = os.path.join(extract_path, 'products.csv')

# Loading the CSV files into DataFrames
orders = pd.read_csv(orders_path)
order_products_prior = pd.read_csv(order_products_prior_path)
order_products_train = pd.read_csv(order_products_train_path)
aisles = pd.read_csv(aisles_path)
departments = pd.read_csv(departments_path)
products = pd.read_csv(products_path)

print("Files successfully load!")

Files successfully load!


## Checking Dataframes

In [3]:
# Checking DataFrame shape and info
print("Orders:")
print(orders.shape)
print(orders.info())
print("")
print("Order Products Prior:")
print(order_products_prior.shape)
print(order_products_prior.info())
print("")
print("Order Products Train:")
print(order_products_train.shape)
print(order_products_train.info())
print("")
print("Aisles:")
print(aisles.shape)
print(aisles.info())
print("")
print("Departments:")
print(departments.shape)
print(departments.info())
print("")
print("Products:")
print(products.shape)
print(products.info())


Orders:
(3421083, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB
None

Order Products Prior:
(32434489, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB
None

Order Products Train:
(1384617, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138

# Aggregate product orders

In this step, I am combining the dataset: order_products_prior.csv and order_products_train files to compute the total frequency of each product.

In [21]:
# Combining prior and train datasets
all_order_products = pd.concat([order_products_prior, order_products_train])

# Calculating product frequency
product_frequency = all_order_products.groupby('product_id').size().reset_index(name='order_count')

# Getting the top 10,000 most ordered products
top_10k_products = product_frequency.nlargest(10000, 'order_count')
print("Top 10,000 products identified!")

Top 10,000 products identified!


# Filtering orders by Top 10,000 products

In this step, I am filtering the orders to include only those that contain one or more of the top 10,000 products

In [22]:
# Filtering orders with top 10k products
filtered_orders = all_order_products[all_order_products['product_id'].isin(top_10k_products['product_id'])]

# Getting the list of relevant order IDs
filtered_order_ids = filtered_orders['order_id'].unique()

# Filtering the orders DataFrame
filtered_orders_df = orders[orders['order_id'].isin(filtered_order_ids)]
print(f"Filtered orders to include only top 10,000 products. Remaining orders: {len(filtered_orders_df)}")


Filtered orders to include only top 10,000 products. Remaining orders: 3321331


In [23]:
print(filtered_orders_df.columns)

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')


# Further limiting orders by Size

In the previous step, the total number of orders were reduced to: 3,321,331. Next, I am further limiting the size by retaining orders with a minimum of (5) items to reduce the dataset size further.

In [24]:
# Counting the number of items in each order
order_item_count = filtered_orders.groupby('order_id').size().reset_index(name='item_count')

# Setting the minimum item threshold (e.g., X = 5)
X = 5

# Filtering orders with at least X items
large_orders = order_item_count[order_item_count['item_count'] >= X]

# Getting the list of valid order IDs
valid_order_ids = large_orders['order_id']

# Filtering the original dataset to keep only the valid order IDs
filtered_orders_df = filtered_orders[filtered_orders['order_id'].isin(valid_order_ids)]

# Counting unique orders in the final filtered dataset
unique_orders_count = filtered_orders_df['order_id'].nunique()

print(f"Filtered down to orders with at least {X} items.")
print(f"Final number of unique orders: {unique_orders_count}")
print(f"Final number of rows (products): {len(filtered_orders_df)}")



Filtered down to orders with at least 5 items.
Final number of unique orders: 2400986
Final number of rows (products): 28368235


# Further limiting order to "Active Users"

Previously, I limited the orders to include only those that contain only 5 products from the top 10k products. We ended with a total size of: 2,400,986. For further reducing the dataset, I am going to focus on users with consistent purchasing behavior. For example, users with more than "Y" total orders. For accomplishing this step, I am going to count the total number of orders per user and filter users with at least 10 orders.

In [25]:
print(filtered_orders_df.columns)


Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')


In [26]:
# Merging user_id into filtered_orders_df
filtered_orders_df = pd.merge(
    filtered_orders_df,
    orders[['order_id', 'user_id']],
    on='order_id',
    how='left'
)

# Confirming the user_id column is now included
print(filtered_orders_df.columns)


Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id'], dtype='object')


In [27]:
# Counting the total number of orders per user
user_order_counts = filtered_orders_df.groupby('user_id').size().reset_index(name='order_count')

# Setting the threshold for active users (e.g., Y = 10 orders)
Y = 10
active_users = user_order_counts[user_order_counts['order_count'] >= Y]
active_user_ids = active_users['user_id']

print(f"Number of active users with at least {Y} orders: {len(active_user_ids)}")


Number of active users with at least 10 orders: 184340


In [28]:
filtered_orders_active_users = filtered_orders_df[filtered_orders_df['user_id'].isin(active_user_ids)]

print(f"Remaining orders: {len(filtered_orders_active_users)}")
print(f"Unique users: {filtered_orders_active_users['user_id'].nunique()}")


Remaining orders: 28313728
Unique users: 184340


In [29]:
print(f"Unique orders remaining: {filtered_orders_active_users['order_id'].nunique()}")

Unique orders remaining: 2391917


In [30]:
print(filtered_orders_active_users.columns)

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id'], dtype='object')


In [31]:
# Saving filtered_orders_active_users DataFram into csv
filtered_orders_active_users.to_csv('/content/drive/MyDrive/MLOps/Instant_Cart/filtered_orders_active_users.csv', index=False)

# Further filtering by top 10 aisles or departments.

On the previous process, we obtained a total of 2,391,917 orders. This time, I am focusing on the most frequently ordered items within the top 10 departments. I will be identifying the top 10 departments with the highest number of orders for later retaining only the orders and products that belong to these top aisles or departments.

In [4]:
filtered_orders_active_users = pd.read_csv('/content/drive/MyDrive/MLOps/Instant_Cart/filtered_orders_active_users.csv')

In the next step, I am merging aisles.csv and departments.csv with products.csv to enrich the product information with aisle and department details.

In [6]:
# Merging products with aisles and departments
products_enriched = pd.merge(
    products,
    aisles,
    on='aisle_id',
    how='left'
)
products_enriched = pd.merge(
    products_enriched,
    departments,
    on='department_id',
    how='left'
)

# Verifying the columns in the enriched product dataset
print(products_enriched.columns)


Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'aisle',
       'department'],
      dtype='object')


Below, I joing the enriched products information (products_enriched) with the (filtered_orders_active_users) DataFrame.

In [7]:
# Merging product details into the filtered orders dataset
filtered_orders_with_details = pd.merge(
    filtered_orders_active_users,
    products_enriched,
    on='product_id',
    how='left'
)

# Verifying the merged dataset
print(filtered_orders_with_details.columns)


Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'product_name', 'aisle_id', 'department_id', 'aisle', 'department'],
      dtype='object')


Next, I group the data by aisle and department (separate) and calculate the total number of orders. Then, sorting the results in descending order and retain the top 10.

In [8]:
# Counting orders by aisle
aisle_order_counts = filtered_orders_with_details.groupby('aisle').size().reset_index(name='order_count')

# Getting the top 10 aisles
top_aisles = aisle_order_counts.nlargest(10, 'order_count')
top_aisle_names = top_aisles['aisle']

print(f"Top 10 aisles: {list(top_aisle_names)}")


Top 10 aisles: ['fresh fruits', 'fresh vegetables', 'packaged vegetables fruits', 'yogurt', 'packaged cheese', 'milk', 'water seltzer sparkling water', 'chips pretzels', 'soy lactosefree', 'bread']


In [9]:
# Counting orders by department
department_order_counts = filtered_orders_with_details.groupby('department').size().reset_index(name='order_count')

# Getting the top 10 departments
top_departments = department_order_counts.nlargest(10, 'order_count')
top_department_names = top_departments['department']

print(f"Top 10 departments: {list(top_department_names)}")


Top 10 departments: ['produce', 'dairy eggs', 'snacks', 'beverages', 'frozen', 'pantry', 'bakery', 'deli', 'canned goods', 'dry goods pasta']


In [10]:
# Filtering orders for top aisles
filtered_by_aisles = filtered_orders_with_details[filtered_orders_with_details['aisle'].isin(top_aisle_names)]

print(f"Remaining orders after filtering by top aisles: {len(filtered_by_aisles)}")


Remaining orders after filtering by top aisles: 14096649


In [11]:
# Filtering orders for top departments
filtered_by_departments = filtered_orders_with_details[filtered_orders_with_details['department'].isin(top_department_names)]

print(f"Remaining orders after filtering by top departments: {len(filtered_by_departments)}")


Remaining orders after filtering by top departments: 25702383


In [12]:
# Verifying aisles or departments in the filtered dataset
print(filtered_by_aisles['aisle'].value_counts())
print(filtered_by_departments['department'].value_counts())


aisle
fresh fruits                     3485989
fresh vegetables                 3378122
packaged vegetables fruits       1686448
yogurt                           1364214
packaged cheese                   905963
milk                              806036
water seltzer sparkling water     720034
chips pretzels                    635021
soy lactosefree                   587908
bread                             526914
Name: count, dtype: int64
department
produce            9133122
dairy eggs         4988600
snacks             2366457
beverages          2168493
frozen             1888991
pantry             1502823
bakery             1042682
deli                940087
canned goods        928885
dry goods pasta     742243
Name: count, dtype: int64


I decided to filter the data by "Top 10 Departments" for the following reasons:

- It aligns with the project's goal of creatinga resuable ML pipeline, as department-level insights generalize better.
- Provides a broader perspective, covering diverse products and trends.
- Reduces the dataset size efficiently while retaining valuable data for high-level analysis.

Next, I filter the dataset by the top 10 departments: produce, dairy eggs, snacks, etc. Then, I save the filtered dataset into .csv format.

In [13]:
import pandas as pd

# Paths to the files
output_path = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_by_top_departments.csv"  # Desired output path

# Top 10 departments
top_departments = [
    "produce", "dairy eggs", "snacks", "beverages", "frozen",
    "pantry", "bakery", "deli", "canned goods", "dry goods pasta"
]

# Filtering the dataset to include only top departments
filtered_by_departments = filtered_orders_with_details[
    filtered_orders_with_details['department'].isin(top_departments)
]

# Saving the filtered dataset
filtered_by_departments.to_csv(output_path, index=False)

# Summary of the filtered dataset
filtered_summary = {
    "Remaining Rows": len(filtered_by_departments),
    "Unique Orders": filtered_by_departments['order_id'].nunique(),
    "Unique Users": filtered_by_departments['user_id'].nunique(),
    "Unique Departments": filtered_by_departments['department'].nunique()
}

print("Filtered dataset saved successfully.")
print(filtered_summary)


Filtered dataset saved successfully.
{'Remaining Rows': 25702383, 'Unique Orders': 2389985, 'Unique Users': 184304, 'Unique Departments': 10}


# Further filtering rarely reordered products





From the previous filtering process, we obtained a total number of unique orders of: 2,389,985. Now, I will proceed to calculate the reorder rate for each product by grouping the data by product_id and taking the mean of the "reordered" column.

In [1]:
import pandas as pd

# Loading the filtered dataset
filtered_by_top_departments_path = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_by_top_departments.csv"
filtered_by_top_departments = pd.read_csv(filtered_by_top_departments_path)

# Calculating reorder rate for each product
product_reorder_rate = filtered_by_top_departments.groupby('product_id')['reordered'].mean().reset_index()
product_reorder_rate.rename(columns={'reordered': 'reorder_rate'}, inplace=True)

# Setting a threshold for rarely reordered products (e.g., 0.2 or 20%)
threshold = 0.2
frequently_reordered_products = product_reorder_rate[product_reorder_rate['reorder_rate'] > threshold]

print(f"Products with a reorder rate above {threshold}: {len(frequently_reordered_products)}")


Products with a reorder rate above 0.2: 8002


In [2]:
# Filtering the dataset to include only frequently reordered products
filtered_frequent_reorders = filtered_by_top_departments[
    filtered_by_top_departments['product_id'].isin(frequently_reordered_products['product_id'])
]

print(f"Filtered dataset to include frequently reordered products.")
print(f"Remaining Rows: {len(filtered_frequent_reorders)}")
print(f"Unique Orders: {filtered_frequent_reorders['order_id'].nunique()}")
print(f"Unique Products: {filtered_frequent_reorders['product_id'].nunique()}")


Filtered dataset to include frequently reordered products.
Remaining Rows: 25454061
Unique Orders: 2389931
Unique Products: 8002


In [3]:
# Saving the filtered dataset
output_path_frequent_reorders = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_reorders.csv"
filtered_frequent_reorders.to_csv(output_path_frequent_reorders, index=False)

print(f"Filtered dataset saved at {output_path_frequent_reorders}.")


Filtered dataset saved at /content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_reorders.csv.


# Filtering for frequent buyer

From the previous result, the dataset was only reduced to: 2,389,931 orders. Next, I am filtering by identifying frequent buyey. For example: retain users who order top products at least N times.

## N = 450.
This first dataset will be balanced between most frequent buyers and non active users.

**Note to consider:**

As we increase "N", the focus is only on the most active users, which may bias the model toward frequent buyes and their behavior.

The model might become less generalize to users with lower purchasing activity.

If the dataset is heavily skewed toward a small group of active users, some product-specific might be lost. However, keeping more frequent buyers might still capture sufficient trends highly reordered products.

A piperline trained on a subset of frequent buyers may perform for similr groups but may not generalize to less frequent users or broader audience.

In [51]:
# Defining the threshold for frequent buyers (e.g., at least N = 10 orders)
N = 450

# Counting the number of orders per user
user_order_counts = filtered_frequent_reorders.groupby('user_id').size().reset_index(name='order_count')

# Filtering users with at least N orders
frequent_buyers = user_order_counts[user_order_counts['order_count'] >= N]
frequent_buyer_ids = frequent_buyers['user_id']

print(f"Number of frequent buyers with at least {N} orders: {len(frequent_buyer_ids)}")


Number of frequent buyers with at least 450 orders: 11033


In [52]:
# Filtering orders for frequent buyers
filtered_frequent_buyers = filtered_frequent_reorders[
    filtered_frequent_reorders['user_id'].isin(frequent_buyer_ids)
]

print(f"Filtered dataset to include orders from frequent buyers.")
print(f"Remaining Rows: {len(filtered_frequent_buyers)}")
print(f"Unique Orders: {filtered_frequent_buyers['order_id'].nunique()}")
print(f"Unique Users: {filtered_frequent_buyers['user_id'].nunique()}")


Filtered dataset to include orders from frequent buyers.
Remaining Rows: 7588145
Unique Orders: 562019
Unique Users: 11033


In [53]:
# Saving the filtered dataset
output_path_frequent_buyers = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v1.csv"
filtered_frequent_buyers.to_csv(output_path_frequent_buyers, index=False)

print(f"Filtered dataset saved at {output_path_frequent_buyers}.")


Filtered dataset saved at /content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v1.csv.


## N = 850

This dataset will be for the less generalize ML model.

In [56]:
N = 850

# Counting the number of orders per user
user_order_counts = filtered_frequent_reorders.groupby('user_id').size().reset_index(name='order_count')

# Filtering users with at least N orders
frequent_buyers = user_order_counts[user_order_counts['order_count'] >= N]
frequent_buyer_ids = frequent_buyers['user_id']

print(f"Number of frequent buyers with at least {N} orders: {len(frequent_buyer_ids)}")

Number of frequent buyers with at least 850 orders: 2081


In [57]:
# Filtering orders for frequent buyers
filtered_frequent_buyers = filtered_frequent_reorders[
    filtered_frequent_reorders['user_id'].isin(frequent_buyer_ids)
]

print(f"Filtered dataset to include orders from frequent buyers.")
print(f"Remaining Rows: {len(filtered_frequent_buyers)}")
print(f"Unique Orders: {filtered_frequent_buyers['order_id'].nunique()}")
print(f"Unique Users: {filtered_frequent_buyers['user_id'].nunique()}")


Filtered dataset to include orders from frequent buyers.
Remaining Rows: 2253205
Unique Orders: 137382
Unique Users: 2081


In [58]:
# Saving the filtered dataset
output_path_frequent_buyers = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v2.csv"
filtered_frequent_buyers.to_csv(output_path_frequent_buyers, index=False)

print(f"Filtered dataset saved at {output_path_frequent_buyers}.")


Filtered dataset saved at /content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v2.csv.
