# Setup

## Importing necessary libraries

In [None]:
import zipfile
import os
import pandas as pd

## Decompressing the "Archive" files.

In [None]:
# Source archive on Google Drive and the folder its contents are unpacked into.
archive_path = "/content/drive/MyDrive/MLOps/archive.zip"
extract_path = "/content/drive/MyDrive/MLOps/Instant_Cart"

# Unpack every member of the archive; the context manager closes the file afterwards.
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=extract_path)
print(f"Files extracted to {extract_path}")

Files extracted to /content/drive/MyDrive/MLOps/Instant_Cart


## Loading the extracted CSV files

In [None]:
# Folder holding the extracted CSV files (presumably re-declared so this section
# can be run without repeating the extraction step).
extract_path = "/content/drive/MyDrive/MLOps/Instant_Cart"

In [None]:
# Every raw table sits directly under the extraction folder as "<table>.csv".
csv_paths = {
    table: os.path.join(extract_path, f"{table}.csv")
    for table in (
        "orders",
        "order_products__prior",
        "order_products__train",
        "aisles",
        "departments",
        "products",
    )
}
orders_path = csv_paths["orders"]
order_products_prior_path = csv_paths["order_products__prior"]
order_products_train_path = csv_paths["order_products__train"]
aisles_path = csv_paths["aisles"]
departments_path = csv_paths["departments"]
products_path = csv_paths["products"]

# Read each table into its own DataFrame.
orders = pd.read_csv(orders_path)
order_products_prior = pd.read_csv(order_products_prior_path)
order_products_train = pd.read_csv(order_products_train_path)
aisles = pd.read_csv(aisles_path)
departments = pd.read_csv(departments_path)
products = pd.read_csv(products_path)

print("Files successfully load!")

Files successfully load!


## Checking Dataframes

In [None]:
# Report the shape and schema of every loaded table.
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" after each report.
frames_to_check = [
    ("Orders:", orders),
    ("Order Products Prior:", order_products_prior),
    ("Order Products Train:", order_products_train),
    ("Aisles:", aisles),
    ("Departments:", departments),
    ("Products:", products),
]

for position, (label, frame) in enumerate(frames_to_check):
    if position:
        print("")  # blank separator between consecutive reports
    print(label)
    print(frame.shape)
    frame.info()

Orders:
(3421083, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB
None

Order Products Prior:
(32434489, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB
None

Order Products Train:
(1384617, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138

# Data Prep

## Aggregate product orders

In this step, I combine the order_products__prior.csv and order_products__train.csv datasets to compute the total order frequency of each product.

In [None]:
# Stack the prior and train line items into a single frame.
all_order_products = pd.concat([order_products_prior, order_products_train])

# One row per product with the number of line items it appears in.
product_frequency = (
    all_order_products
    .groupby('product_id')
    .size()
    .rename('order_count')
    .reset_index()
)

# The 10,000 products with the highest counts.
top_10k_products = product_frequency.nlargest(n=10000, columns='order_count')
print("Top 10,000 products identified!")

Top 10,000 products identified!


## Filtering orders by Top 10,000 products

In this step, I am filtering the orders to include only those that contain one or more of the top 10,000 products

In [None]:
# Line items whose product is one of the top 10,000.
is_top_product = all_order_products['product_id'].isin(top_10k_products['product_id'])
filtered_orders = all_order_products.loc[is_top_product]

# Distinct orders that still have at least one such line item.
filtered_order_ids = filtered_orders['order_id'].unique()

# Restrict the order-level table to those orders.
is_relevant_order = orders['order_id'].isin(filtered_order_ids)
filtered_orders_df = orders.loc[is_relevant_order]
print(f"Filtered orders to include only top 10,000 products. Remaining orders: {len(filtered_orders_df)}")

Filtered orders to include only top 10,000 products. Remaining orders: 3321331


In [None]:
# Show which columns the filtered order-level table carries.
order_level_columns = filtered_orders_df.columns
print(order_level_columns)

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')


## Further limiting orders by Size

In the previous step, the total number of orders was reduced to 3,321,331. Next, I further reduce the dataset by retaining only orders with a minimum of five (5) items.

In [None]:
# Minimum number of (top-10k) line items an order must contain to be kept.
X = 5

# Line items per order.
order_item_count = (
    filtered_orders
    .groupby('order_id')
    .size()
    .rename('item_count')
    .reset_index()
)

# Orders that reach the threshold, and just their IDs.
large_orders = order_item_count.loc[order_item_count['item_count'] >= X]
valid_order_ids = large_orders['order_id']

# Keep the line items of those orders.
# NOTE: this rebinds filtered_orders_df from the order-level table to line items.
belongs_to_large_order = filtered_orders['order_id'].isin(valid_order_ids)
filtered_orders_df = filtered_orders.loc[belongs_to_large_order]

# Distinct orders left after the filter.
unique_orders_count = filtered_orders_df['order_id'].nunique()

print(f"Filtered down to orders with at least {X} items.")
print(f"Final number of unique orders: {unique_orders_count}")
print(f"Final number of rows (products): {len(filtered_orders_df)}")

Filtered down to orders with at least 5 items.
Final number of unique orders: 2400986
Final number of rows (products): 28368235


## Further limiting order to "Active Users"

Previously, I limited the orders to those that contain at least 5 products from the top 10k products, which left a total of 2,400,986 orders. To reduce the dataset further, I am going to focus on users with consistent purchasing behavior, i.e., users with at least "Y" total orders. To accomplish this step, I count the total number of orders per user and keep the users with at least 10 orders.

In [None]:
# Show the columns of the line-item table before user_id is attached.
line_item_columns = filtered_orders_df.columns
print(line_item_columns)


Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')


In [None]:
# Attach each order's user_id to its line items.
# Guarded so the cell is safe to re-run: merging a second time would produce
# user_id_x / user_id_y columns and break the later groupby('user_id') steps.
if 'user_id' not in filtered_orders_df.columns:
    filtered_orders_df = pd.merge(
        filtered_orders_df,
        orders[['order_id', 'user_id']],
        on='order_id',
        how='left',
        # Fail loudly if `orders` ever holds duplicate order_id rows, which
        # would silently multiply line items in a left join.
        validate='many_to_one',
    )

# Confirming the user_id column is now included
print(filtered_orders_df.columns)

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id'], dtype='object')


In [None]:
# Minimum number of distinct orders a user needs to count as "active".
Y = 10

# Distinct orders per user. filtered_orders_df holds one row per product line
# item, so groupby(...).size() would count purchased items, not orders, and let
# a user with only a couple of large orders pass the threshold.
user_order_counts = (
    filtered_orders_df
    .groupby('user_id')['order_id']
    .nunique()
    .reset_index(name='order_count')
)

active_users = user_order_counts[user_order_counts['order_count'] >= Y]
active_user_ids = active_users['user_id']

print(f"Number of active users with at least {Y} orders: {len(active_user_ids)}")

Number of active users with at least 10 orders: 184340


In [None]:
# Keep only the line items that belong to active users.
filtered_orders_active_users = filtered_orders_df[filtered_orders_df['user_id'].isin(active_user_ids)]

# len() counts product line items, not orders, so the label says "rows".
print(f"Remaining rows (products): {len(filtered_orders_active_users)}")
print(f"Unique users: {filtered_orders_active_users['user_id'].nunique()}")

Remaining orders: 28313728
Unique users: 184340


In [None]:
# Distinct orders among the active users' line items.
remaining_order_total = filtered_orders_active_users['order_id'].nunique()
print(f"Unique orders remaining: {remaining_order_total}")

Unique orders remaining: 2391917


In [None]:
# Show the columns that will be written to the checkpoint file.
active_user_columns = filtered_orders_active_users.columns
print(active_user_columns)

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id'], dtype='object')


In [None]:
# Persist the active-user line items as a CSV checkpoint (row index omitted).
active_users_csv = '/content/drive/MyDrive/MLOps/Instant_Cart/filtered_orders_active_users.csv'
filtered_orders_active_users.to_csv(active_users_csv, index=False)

## Further filtering by top 10 aisles or departments.

In the previous step, we obtained a total of 2,391,917 orders. This time, I am focusing on the most frequently ordered items within the top 10 aisles and departments. I will identify the top 10 aisles and the top 10 departments with the highest number of orders, and later retain only the orders and products that belong to these top aisles or departments.

In [None]:
# Reload the active-user line items from their CSV checkpoint.
active_users_csv = '/content/drive/MyDrive/MLOps/Instant_Cart/filtered_orders_active_users.csv'
filtered_orders_active_users = pd.read_csv(active_users_csv)

In the next step, I am merging aisles.csv and departments.csv with products.csv to enrich the product information with aisle and department details.

In [None]:
# Add the aisle and department rows to every product via two chained left joins.
products_enriched = (
    products
    .merge(aisles, on='aisle_id', how='left')
    .merge(departments, on='department_id', how='left')
)

# Show the resulting column set.
print(products_enriched.columns)

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'aisle',
       'department'],
      dtype='object')


Below, I join the enriched product information (products_enriched) with the filtered_orders_active_users DataFrame.

In [None]:
# Left-join the product, aisle, and department details onto every line item.
filtered_orders_with_details = filtered_orders_active_users.merge(
    products_enriched,
    on='product_id',
    how='left',
)

# Show the resulting column set.
print(filtered_orders_with_details.columns)

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'product_name', 'aisle_id', 'department_id', 'aisle', 'department'],
      dtype='object')


Next, I group the data by aisle and by department (separately) and calculate the total number of orders for each. Then I sort the results in descending order and retain the top 10.

In [None]:
# Line items per aisle.
aisle_order_counts = (
    filtered_orders_with_details
    .groupby('aisle')
    .size()
    .rename('order_count')
    .reset_index()
)

# The ten aisles with the highest counts, and just their names.
top_aisles = aisle_order_counts.nlargest(n=10, columns='order_count')
top_aisle_names = top_aisles['aisle']

print(f"Top 10 aisles: {top_aisle_names.tolist()}")

Top 10 aisles: ['fresh fruits', 'fresh vegetables', 'packaged vegetables fruits', 'yogurt', 'packaged cheese', 'milk', 'water seltzer sparkling water', 'chips pretzels', 'soy lactosefree', 'bread']


In [None]:
# Line items per department.
department_order_counts = (
    filtered_orders_with_details
    .groupby('department')
    .size()
    .rename('order_count')
    .reset_index()
)

# The ten departments with the highest counts, and just their names.
top_departments = department_order_counts.nlargest(n=10, columns='order_count')
top_department_names = top_departments['department']

print(f"Top 10 departments: {top_department_names.tolist()}")

Top 10 departments: ['produce', 'dairy eggs', 'snacks', 'beverages', 'frozen', 'pantry', 'bakery', 'deli', 'canned goods', 'dry goods pasta']


In [None]:
# Keep only the line items whose aisle is one of the top 10.
filtered_by_aisles = filtered_orders_with_details[filtered_orders_with_details['aisle'].isin(top_aisle_names)]

# len() counts product line items, not orders, so the label says "rows".
print(f"Remaining rows after filtering by top aisles: {len(filtered_by_aisles)}")

Remaining orders after filtering by top aisles: 14096649


In [None]:
# Keep only the line items whose department is one of the top 10.
filtered_by_departments = filtered_orders_with_details[filtered_orders_with_details['department'].isin(top_department_names)]

# len() counts product line items, not orders, so the label says "rows".
print(f"Remaining rows after filtering by top departments: {len(filtered_by_departments)}")

Remaining orders after filtering by top departments: 25702383


In [None]:
# Print the per-value row counts of each filtered frame to confirm that only
# the selected aisles / departments remain.
for frame, column in ((filtered_by_aisles, 'aisle'), (filtered_by_departments, 'department')):
    print(frame[column].value_counts())

aisle
fresh fruits                     3485989
fresh vegetables                 3378122
packaged vegetables fruits       1686448
yogurt                           1364214
packaged cheese                   905963
milk                              806036
water seltzer sparkling water     720034
chips pretzels                    635021
soy lactosefree                   587908
bread                             526914
Name: count, dtype: int64
department
produce            9133122
dairy eggs         4988600
snacks             2366457
beverages          2168493
frozen             1888991
pantry             1502823
bakery             1042682
deli                940087
canned goods        928885
dry goods pasta     742243
Name: count, dtype: int64


I decided to filter the data by "Top 10 Departments" for the following reasons:

- It aligns with the project's goal of creating a reusable ML pipeline, as department-level insights generalize better.
- Provides a broader perspective, covering diverse products and trends.
- Reduces the dataset size efficiently while retaining valuable data for high-level analysis.

Next, I filter the dataset by the top 10 departments: produce, dairy eggs, snacks, etc. Then, I save the filtered dataset into .csv format.

In [None]:
# Destination of the department-filtered checkpoint.
output_path = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_by_top_departments.csv"  # Desired output path

# Reuse the department names ranked earlier instead of a hand-copied list, so
# the filter cannot drift from the data if an upstream threshold changes.
# (top_departments is rebound from the ranking DataFrame to a plain list here.)
top_departments = list(top_department_names)

# Filtering the dataset to include only top departments
filtered_by_departments = filtered_orders_with_details[
    filtered_orders_with_details['department'].isin(top_departments)
]

# Saving the filtered dataset
filtered_by_departments.to_csv(output_path, index=False)

# Summary of the filtered dataset
filtered_summary = {
    "Remaining Rows": len(filtered_by_departments),
    "Unique Orders": filtered_by_departments['order_id'].nunique(),
    "Unique Users": filtered_by_departments['user_id'].nunique(),
    "Unique Departments": filtered_by_departments['department'].nunique()
}

print("Filtered dataset saved successfully.")
print(filtered_summary)

Filtered dataset saved successfully.
{'Remaining Rows': 25702383, 'Unique Orders': 2389985, 'Unique Users': 184304, 'Unique Departments': 10}


## Further filtering rarely reordered products





From the previous filtering process, we obtained a total number of unique orders of: 2,389,985. Now, I will proceed to calculate the reorder rate for each product by grouping the data by product_id and taking the mean of the "reordered" column.

In [None]:
# Reload the department-filtered checkpoint.
filtered_by_top_departments_path = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_by_top_departments.csv"
filtered_by_top_departments = pd.read_csv(filtered_by_top_departments_path)

# Mean of the `reordered` column per product, exposed as `reorder_rate`
# (a share of reordered line items, assuming `reordered` is a 0/1 flag).
product_reorder_rate = (
    filtered_by_top_departments
    .groupby('product_id')['reordered']
    .mean()
    .reset_index()
    .rename(columns={'reordered': 'reorder_rate'})
)

# Products at or below this rate are treated as rarely reordered (0.2 = 20%).
threshold = 0.2
is_frequently_reordered = product_reorder_rate['reorder_rate'] > threshold
frequently_reordered_products = product_reorder_rate.loc[is_frequently_reordered]

print(f"Products with a reorder rate above {threshold}: {len(frequently_reordered_products)}")

Products with a reorder rate above 0.2: 8002


In [None]:
# Keep only the line items of products that passed the reorder-rate threshold.
is_kept_product = filtered_by_top_departments['product_id'].isin(
    frequently_reordered_products['product_id']
)
filtered_frequent_reorders = filtered_by_top_departments.loc[is_kept_product]

print("Filtered dataset to include frequently reordered products.")
print(f"Remaining Rows: {len(filtered_frequent_reorders)}")
print(f"Unique Orders: {filtered_frequent_reorders['order_id'].nunique()}")
print(f"Unique Products: {filtered_frequent_reorders['product_id'].nunique()}")

Filtered dataset to include frequently reordered products.
Remaining Rows: 25454061
Unique Orders: 2389931
Unique Products: 8002


In [None]:
# Write the frequently-reordered subset to Drive (row index omitted).
output_path_frequent_reorders = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_reorders.csv"
filtered_frequent_reorders.to_csv(path_or_buf=output_path_frequent_reorders, index=False)

print(f"Filtered dataset saved at {output_path_frequent_reorders}.")

Filtered dataset saved at /content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_reorders.csv.


## Filtering for frequent buyer

From the previous result, the dataset was only reduced to 2,389,931 orders. Next, I filter by identifying frequent buyers, i.e., I retain users who order top products at least N times.

### N = 450.
This first dataset will be balanced between most frequent buyers and non active users.

**Note to consider:**

As we increase "N", the focus shifts to only the most active users, which may bias the model toward frequent buyers and their behavior.

The model might generalize less well to users with lower purchasing activity.

If the dataset is heavily skewed toward a small group of active users, some product-specific patterns might be lost. However, keeping more frequent buyers might still capture sufficient trends for highly reordered products.

A pipeline trained on a subset of frequent buyers may perform well for similar groups but may not generalize to less frequent users or a broader audience.

In [None]:
# Threshold for frequent buyers.
# NOTE: the frame holds one row per product line item, so size() counts the
# items a user purchased rather than distinct orders, even though the column
# name and the message below say "orders".
N = 450

# Line items per user.
user_order_counts = (
    filtered_frequent_reorders
    .groupby('user_id')
    .size()
    .rename('order_count')
    .reset_index()
)

# Users that reach the threshold, and just their IDs.
frequent_buyers = user_order_counts.loc[user_order_counts['order_count'] >= N]
frequent_buyer_ids = frequent_buyers['user_id']

print(f"Number of frequent buyers with at least {N} orders: {len(frequent_buyer_ids)}")

Number of frequent buyers with at least 450 orders: 11033


In [None]:
# Keep only the line items placed by the selected frequent buyers.
is_frequent_buyer = filtered_frequent_reorders['user_id'].isin(frequent_buyer_ids)
filtered_frequent_buyers = filtered_frequent_reorders.loc[is_frequent_buyer]

print("Filtered dataset to include orders from frequent buyers.")
print(f"Remaining Rows: {len(filtered_frequent_buyers)}")
print(f"Unique Orders: {filtered_frequent_buyers['order_id'].nunique()}")
print(f"Unique Users: {filtered_frequent_buyers['user_id'].nunique()}")

Filtered dataset to include orders from frequent buyers.
Remaining Rows: 7588145
Unique Orders: 562019
Unique Users: 11033


In [None]:
# Write the N = 450 frequent-buyer subset to Drive as version 1 (row index omitted).
output_path_frequent_buyers = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v1.csv"
filtered_frequent_buyers.to_csv(path_or_buf=output_path_frequent_buyers, index=False)

print(f"Filtered dataset saved at {output_path_frequent_buyers}.")

Filtered dataset saved at /content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v1.csv.


### N = 850

This dataset will be used for the less generalizable ML model.

In [None]:
# Stricter frequent-buyer threshold for the second dataset.
# NOTE: size() counts product line items per user, not distinct orders, even
# though the column name and the message below say "orders".
N = 850

# Line items per user.
user_order_counts = (
    filtered_frequent_reorders
    .groupby('user_id')
    .size()
    .rename('order_count')
    .reset_index()
)

# Users that reach the threshold, and just their IDs.
frequent_buyers = user_order_counts.loc[user_order_counts['order_count'] >= N]
frequent_buyer_ids = frequent_buyers['user_id']

print(f"Number of frequent buyers with at least {N} orders: {len(frequent_buyer_ids)}")

Number of frequent buyers with at least 850 orders: 2081


In [None]:
# Keep only the line items placed by the users selected with the stricter threshold.
is_frequent_buyer = filtered_frequent_reorders['user_id'].isin(frequent_buyer_ids)
filtered_frequent_buyers = filtered_frequent_reorders.loc[is_frequent_buyer]

print("Filtered dataset to include orders from frequent buyers.")
print(f"Remaining Rows: {len(filtered_frequent_buyers)}")
print(f"Unique Orders: {filtered_frequent_buyers['order_id'].nunique()}")
print(f"Unique Users: {filtered_frequent_buyers['user_id'].nunique()}")

Filtered dataset to include orders from frequent buyers.
Remaining Rows: 2253205
Unique Orders: 137382
Unique Users: 2081


In [None]:
# Write the N = 850 frequent-buyer subset to Drive as version 2 (row index omitted).
output_path_frequent_buyers = "/content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v2.csv"
filtered_frequent_buyers.to_csv(path_or_buf=output_path_frequent_buyers, index=False)

print(f"Filtered dataset saved at {output_path_frequent_buyers}.")

Filtered dataset saved at /content/drive/MyDrive/MLOps/Instant_Cart/filtered_frequent_buyers_v2.csv.


# EDA

## Setup

### Installing necessary libraries

In [None]:
! pip install pyathena

In [None]:
!pip install awswrangler

In [None]:
!pip install seaborn

In [None]:
from pyathena import connect

In [None]:
import awswrangler as wr
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

### Converting Instacart CSV dataset into Parquet

In [None]:
# Load the frequent-buyer dataset (v1) from S3.
csv_path = "s3://sagemaker-us-east-1-209611057751/data-lake/project/filtered_frequent_buyers_v1.csv"
df = pd.read_csv(csv_path)

# Three chained random row-level splits with a fixed seed, giving roughly
# 40% train / 40% production / 10% test / 10% validation of the rows.
split_seed = 42

# 40% of all rows go to training; the other 60% are split further.
train_df, remaining_df = train_test_split(df, train_size=0.4, random_state=split_seed)

# About two thirds of the remainder become the production set.
production_df, temp_df = train_test_split(remaining_df, train_size=0.666666, random_state=split_seed)

# What is left is halved into test and validation.
test_df, validation_df = train_test_split(temp_df, train_size=0.5, random_state=split_seed)

In [None]:
# Root S3 prefix under which each split gets its own sub-folder.
parquet_output_path = "s3://sagemaker-us-east-1-209611057751/data-lake/project/partitioned/"

# Sub-folder -> split. One loop replaces four copy-pasted calls so the write
# options cannot drift apart between splits.
splits_to_write = {
    "train/": train_df,
    "production/": production_df,
    "test/": test_df,
    "validation/": validation_df,
}

# Write every split as a snappy-compressed Parquet dataset partitioned by
# 'department', overwriting whatever is already stored under its prefix.
for subfolder, split_df in splits_to_write.items():
    wr.s3.to_parquet(
        df=split_df,
        path=parquet_output_path + subfolder,
        dataset=True,
        mode="overwrite",
        partition_cols=["department"],
        compression="snappy",
    )

In [None]:
# Create the database in AWS Glue if it doesn't exist.
database_name = "instacart_db"

# exist_ok=True turns the expected "already exists" case into a no-op, so no
# blanket `except Exception` is needed; genuine failures (permissions,
# networking) now surface here instead of being printed and ignored.
wr.catalog.create_database(name=database_name, exist_ok=True)
print(f"Database '{database_name}' created successfully!")

In [None]:
# Register the Parquet train split as a table in AWS Glue.
table_name = "instacart_orders"

# `department` is declared only in partitions_types. Listing it in
# columns_types as well defines the column twice in the Glue table, which
# Athena rejects as invalid metadata (duplicate columns). The Parquet files
# written with partition_cols=["department"] carry that value in the S3 path.
wr.catalog.create_parquet_table(
    database=database_name,
    table=table_name,
    path=parquet_output_path + "train/",
    columns_types={
        "order_id": "bigint",
        "product_id": "bigint",
        "add_to_cart_order": "int",
        "reordered": "int",
        "user_id": "bigint",
        "product_name": "string",
        "aisle_id": "int",
        "department_id": "int",
        "aisle": "string"
    },
    partitions_types={"department": "string"},
    description="Partitioned Instacart orders dataset for optimized Athena queries."
)

print("Partitioned Parquet table registered in AWS Glue successfully.")

### Setting up Database for Instacart

In [None]:
from pyathena import connect

# Defining AWS Resources
bucket_name = "sagemaker-us-east-1-209611057751"
region = "us-east-1"
database_name = "instacart_db"
table_name = "instacart_orders"
s3_data_location = f"s3://{bucket_name}/data-lake/project/partitioned/train/"  # Using partitioned dataset

# Defining Athena Staging Directory
s3_staging_dir = f"s3://{bucket_name}/athena/instacart_staging/"

# Creating Athena Connection
# A failure is reported and then re-raised: every statement below needs
# `cursor`, so carrying on would only end in a confusing NameError.
try:
    conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)
    cursor = conn.cursor()
except Exception as e:
    print("Error connecting to Athena:", e)
    raise
print("Connected to Athena successfully.")

# Creating Database (no-op when it already exists)
create_db_query = f"CREATE DATABASE IF NOT EXISTS {database_name}"
cursor.execute(create_db_query)
print(f"Database '{database_name}' created successfully!")

# Verifying Database Creation
cursor.execute("SHOW DATABASES")
databases = [row[0] for row in cursor.fetchall()]
if database_name in databases:
    print(f"Database '{database_name}' exists!")

### Creating Athena table

In [None]:
# DDL for the external table over the partitioned Parquet train split.
# LOCATION is taken from s3_data_location rather than a second hard-coded copy
# of the bucket path, so the table always points at the configured dataset.
# IF NOT EXISTS leaves an already-registered table of the same name untouched.
create_table_query = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name} (
    order_id BIGINT,
    product_id BIGINT,
    add_to_cart_order INT,
    reordered INT,
    user_id BIGINT,
    product_name STRING,
    aisle_id INT,
    department_id INT,
    aisle STRING
)
PARTITIONED BY (department STRING)  -- Partitioned by 'department'
STORED AS PARQUET
LOCATION '{s3_data_location}'
TBLPROPERTIES ('parquet.compression'='SNAPPY');
"""

# Execute Table Creation Query
cursor.execute(create_table_query)
print(f"Table '{table_name}' created successfully in database '{database_name}'.")

In [None]:
# Ask Athena to register the partition folders found under the table's location.
repair_query = f"MSCK REPAIR TABLE {database_name}.{table_name}"
cursor.execute(repair_query)
print("Partitions updated successfully.")

In [None]:
# List the databases visible to Athena and report whether ours is among them.
cursor.execute("SHOW DATABASES")
databases = [record[0] for record in cursor.fetchall()]
if database_name not in databases:
    print(f"Database '{database_name}' does not exist.")
else:
    print(f"Database '{database_name}' exists in Athena!")

In [None]:
# Count the table's rows as a quick check that the data is queryable.
test_query = f"SELECT count(*) FROM {database_name}.{table_name} ;"
cursor.execute(test_query)
rows = cursor.fetchall()

print("Sample Query Results:")
for result_row in rows:
    print(result_row)

In [None]:
# Fetch the table's registered partitions and report whether any were found.
cursor.execute(f"SHOW PARTITIONS {database_name}.{table_name}")
partitions = cursor.fetchall()
if not partitions:
    print(f"No partitions found in table '{table_name}'.")
else:
    print(f"Partitions found in table '{table_name}': {partitions}")

## Checking the Total Orders and Unique Users

In [None]:
# Distinct orders and distinct users in the table.
query = f"""
SELECT
    COUNT(DISTINCT order_id) AS total_orders,
    COUNT(DISTINCT user_id) AS unique_users
FROM {database_name}.{table_name};
"""

# Run the query and collect every result row.
cursor.execute(query)
rows = cursor.fetchall()

# Print one result row per line under a header.
print("Total Orders & Unique Users:")
for result_row in rows:
    print(result_row)

## Top 10 Most Ordered Products

In [None]:
# Ten product names with the most rows in the table.
query = f"""
SELECT product_name, COUNT(*) AS total_orders
FROM {database_name}.{table_name}
GROUP BY product_name
ORDER BY total_orders DESC
LIMIT 10;
"""

# Run the query and print one result row per line under a header.
cursor.execute(query)
rows = cursor.fetchall()
print("Top 10 Most Ordered Products:")
for result_row in rows:
    print(result_row)

In [None]:
# ===========================
#  Bar Chart: Top 10 Most Ordered Products
# ===========================

# Hard-coded product names and counts for the chart. The list is named
# `top_product_names` rather than `products` so it does not overwrite the
# `products` DataFrame loaded from products.csv earlier in the notebook.
top_product_names = ["Banana", "Bag of Organic Bananas", "Organic Strawberries", "Organic Hass Avocado",
                     "Organic Baby Spinach", "Organic Raspberries", "Organic Avocado", "Organic Whole Milk",
                     "Limes", "Large Lemon"]
total_orders = [52073, 46964, 39311, 31555, 29459, 21479, 19582, 19211, 17136, 16719]

# Creating DataFrame
df_products = pd.DataFrame({"Product Name": top_product_names, "Total Orders": total_orders})

# Plotting Bar Chart
plt.figure(figsize=(12, 6))
sns.barplot(x="Total Orders", y="Product Name", data=df_products, palette="viridis")
plt.xlabel("Total Orders")
plt.ylabel("Product Name")
plt.title("Top 10 Most Ordered Products")
plt.show()

### Insights

Overall it can be seen that perishable products are the most ordered products and generally follow a trend of the quicker the product spoils the more often it is ordered. This makes sense as the customers most likely are making smaller orders of those products so as to not let them spoil by having them sit on their counters or refrigerators for an extended period of time. A condensed list of some notable findings can be seen in the list below:

*   Bananas are the most ordered item with 52073 orders.
*   Organic items are extremely popular, making up 7 of the 10 most ordered items.
*   Fruits and vegetables make up 9 of the top 10 most ordered items.
*   All of the top 10 items spoil within a matter of weeks after opening.

## Reorder Rate per Product

In [None]:
# Per product: row count, sum of the `reordered` column, and that sum as a
# percentage of the rows; only the ten highest percentages are returned.
query = f"""
SELECT
    product_name,
    COUNT(*) AS total_orders,
    SUM(reordered) AS total_reorders,
    ROUND(100.0 * SUM(reordered) / COUNT(*), 2) AS reorder_rate
FROM {database_name}.{table_name}
GROUP BY product_name
ORDER BY reorder_rate DESC
LIMIT 10;
"""

# Run the query and print one result row per line under a header.
cursor.execute(query)
rows = cursor.fetchall()
print("Top 10 Products with Highest Reorder Rate:")
for result_row in rows:
    print(result_row)

### Insights for Top 10 Products with Highest Reorder Rate:

It is notable that all of the most reordered items have a 100% reorder rate, indicating that customers who order those items have a strong preference for them specifically. Additional insights as they relate to each product can be seen below:
*   **Thirst Quencher Caffeine-Free Naturally Flavored Citrus Soda** and **Smoked Whitefish Salad** have the highest reorder frequency, highlighting that these items are consistently chosen by customers who keep coming back for more.
*   **100% Lactose-Free Milk** has a **100% reorder rate**, indicating it is a highly demanded product among lactose-intolerant customers who consistently reorder it.
*   **Seltzer Water**, **Sparkling Water, Bottles**, and **Premium Lots of Pulp Orange Juice** are all drink products, meaning that customers who order these products may have a strong preference when it comes to the types of drinks and the brands that they consume.

## Orders by Department

In [None]:
# Row count per department, largest first.
query = f"""
SELECT department, COUNT(*) AS total_orders
FROM {database_name}.{table_name}
GROUP BY department
ORDER BY total_orders DESC;
"""

# Run the query and print one result row per line under a header.
cursor.execute(query)
rows = cursor.fetchall()
print("Orders by Department:")
for result_row in rows:
    print(result_row)

In [None]:
# ===========================
# Bar Chart: Orders by Department
# ===========================

# Data from previous query (Department, Total Orders).
# The list is named `department_names` rather than `departments` so it does not
# overwrite the `departments` DataFrame loaded from departments.csv earlier.
department_names = ["produce", "dairy eggs", "snacks", "beverages", "frozen",
                    "pantry", "bakery", "deli", "canned goods", "dry goods pasta"]
total_orders = [1116850, 619999, 294213, 234613, 201359, 138924, 127884, 114349, 102416, 84651]

# Creating DataFrame
df_orders_by_department = pd.DataFrame({"Department": department_names, "Total Orders": total_orders})

# Plotting Bar Chart
plt.figure(figsize=(12, 6))
sns.barplot(x="Total Orders", y="Department", data=df_orders_by_department, palette="magma")
plt.xlabel("Total Orders")
plt.ylabel("Department")
plt.title("Total Orders by Department")
plt.show()

### Insights

Unsurprisingly, the departments with the most orders generally follow a trend of items that spoil the fastest while adding in a factor of items that are consumed at the highest frequency. Produce and dairy/eggs are atop the list by a sizable margin, followed by items that are consumed on a mostly daily basis, snacks and beverages. Next, frozen items are consumed regularly, but have a longer shelf life so they do not need to be ordered as often. The rest of the top ten departments are almost all non/less perishable items. The exception to this is deli, which falls into the previously discussed category of items that are consumed regularly and have a shorter shelf life so they are most likely ordered in smaller batches. Additional insights can be seen below in a listed format:
*   Produce is the most ordered department with ~1.12 million orders. This aligns with the earlier Top Ordered Products (bananas, avocados, berries).
*   Dairy & Eggs ranks second with 619K orders.
*   Snacks are the third most popular category (294K orders).
*   Beverages (~234K orders) likely include popular items like bottled water, juices, and coffee some of which were in the most reordered top 10.
*   Frozen Foods (~201K orders) suggest customers stock up on frozen essentials.

## Most Popular Aisles

In [None]:
# Rank aisles by how many rows of the orders table they appear in and keep the
# ten largest.
query = f"""
SELECT aisle, COUNT(*) AS total_orders
FROM {database_name}.{table_name}
GROUP BY aisle
ORDER BY total_orders DESC
LIMIT 10;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Top 10 Aisles:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

In [None]:
# ===========================
# Bar Chart: Most Popular Aisles
# ===========================

# Values copied from the output of the previous query (aisle, total orders).
aisles = [
    "fresh fruits",
    "fresh vegetables",
    "packaged vegetables fruits",
    "yogurt",
    "packaged cheese",
    "milk",
    "water seltzer sparkling water",
    "chips pretzels",
    "soy lactosefree",
    "bread",
]
aisle_orders = [450026, 404252, 205492, 184903, 112690, 103953, 78019, 74536, 68786, 65469]

# One row per aisle, kept in the order the query returned them.
df_aisles = pd.DataFrame(list(zip(aisles, aisle_orders)), columns=["Aisle", "Total Orders"])

# Horizontal bars so the long aisle names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_aisles, x="Total Orders", y="Aisle", palette="coolwarm", ax=ax)
ax.set_xlabel("Total Orders")
ax.set_ylabel("Aisle")
ax.set_title("Top 10 Most Popular Aisles")
plt.show()

### Insights

The most popular aisles by total orders reflect much of the same findings that have been shown thus far in the data analysis: the items that are the most perishable are ordered the most often. Aisles that have fresh fruits and vegetables rank the highest by a noticeable amount, followed by packaged fruits and vegetables. Next are the dairy products and a few items that fall into the category of snack foods. Other insights can be seen below:

*   Fresh Produce Dominance: Fresh Fruits (450K orders) and Fresh Vegetables (404K orders) are the top two aisles.
*   Combined, these two alone account for over 950K orders, which reinforces that Produce is the top department.
*   Dairy is Highly Popular: Yogurt (184K orders) and Packaged Cheese (112K orders) show strong demand, and Milk (103K orders) further confirms that dairy products are household essentials that are ordered regularly.
* Beverages have high order numbers: Water, Seltzer, and Sparkling Water (78K orders) ranks #7, showing strong demand for bottled drinks.
*   Snacks & Bread are Key Pantry Items: Chips & Pretzels (74K orders) are among the most frequently purchased snacks, while Bread (65K orders) is typically a staple food.
*   Plant-Based Alternatives: Soy & Lactose-Free Products (68K orders) indicate consumers with dietary restrictions order items that comply with their diet often.

## Reorder Ratio by Department

In [None]:
# Per department: row count, number of reordered rows, and the reordered share
# as a percentage (the 100.0 factor forces decimal arithmetic).
query = f"""
SELECT
    department,
    COUNT(*) AS total_orders,
    SUM(reordered) AS total_reorders,
    ROUND(100.0 * SUM(reordered) / COUNT(*), 2) AS reorder_ratio
FROM {database_name}.{table_name}
GROUP BY department
ORDER BY reorder_ratio DESC;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Reorder Ratio by Department:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

### Insights

Much of the information from the reorder rate by department reflects the trends in the other categories explored so far. The small exception is that beverages and dairy/eggs rank higher on the list than produce. This may be caused by specific preferences in the type of dairy products and beverages as opposed to produce, which may have a higher variance order to order from the same customer. The other departments show bread, deli and snack products are reordered often which aligns with the information that has been gathered with regards to the trend of items that are consumed frequently. Finally, items that have a longer shelf life are reordered less often, possibly indicating that there may be more bulk ordering of these items and there could be orders where these items are not ordered or skipped. Other specifics about the reorder ratio by department can be seen below:

*   Dairy & Eggs Have the Highest Reorder Rate (82.73%)
*   Beverages Rank #2 in Reorders (80.85%)
*   Produce Has a High Reorder Rate (80.71%)
*   Bakery (80.43%) & Deli (78.53%) Show Strong Reorder Loyalty
*   Snacks & Frozen Foods Have Moderate Reorder Rates (~70%)
*   Canned Goods & Dry Goods Have Lower Reorder Rates (~65%)
*   Pantry Has the Lowest Reorder Rate (57.26%)

In [None]:
# ===========================
# Bar Chart: Reorder Ratio by Department
# ===========================
# Each reorder ratio is an independent percentage for its own department, not a
# slice of a common whole. A pie chart rescales the values so they sum to 100%,
# so its labels would show each ratio's share of the sum of ratios (about 11%
# for dairy eggs) instead of the real value (82.73%). A bar chart on a 0-100
# axis shows the real values.

# Data from previous query (Department, Reorder Ratio)
departments = ["dairy eggs", "beverages", "produce", "bakery", "deli",
               "snacks", "frozen", "canned goods", "dry goods pasta", "pantry"]
reorder_ratio = [82.73, 80.85, 80.71, 80.43, 78.53, 74.01, 71.87, 65.51, 65.33, 57.26]

# Create DataFrame
df_departments = pd.DataFrame({"Department": departments, "Reorder Ratio": reorder_ratio})

# Plot horizontal bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_departments["Department"], df_departments["Reorder Ratio"],
        color=sns.color_palette("viridis", len(departments)))
ax.invert_yaxis()  # barh draws the first row at the bottom; put the highest ratio on top

# Write the exact percentage next to each bar.
for position, ratio in enumerate(df_departments["Reorder Ratio"]):
    ax.text(ratio + 1, position, f"{ratio:.2f}%", va="center")

ax.set_xlim(0, 100)
ax.set_xlabel("Reorder Ratio (%)")
ax.set_ylabel("Department")
ax.set_title("Reorder Ratio by Department")
plt.show()

In [None]:
# ===========================
# Stacked Bar Chart: Reordered vs Non-Reordered Orders by Department
# ===========================

# Values copied from the previous query output; the three lists are aligned by
# position (same department order in each).
departments = ["dairy eggs", "beverages", "produce", "bakery", "deli",
               "snacks", "frozen", "canned goods", "dry goods pasta", "pantry"]
total_orders = [619999, 234613, 1116850, 127884, 114349, 294213, 201359, 102416, 84651, 138924]
reordered = [512956, 189692, 901358, 102853, 89802, 217758, 144719, 67094, 55299, 79541]

# Rows that were not reorders = total rows minus reordered rows.
non_reordered = [all_rows - repeat_rows for all_rows, repeat_rows in zip(total_orders, reordered)]

# One row per department with both counts.
df_reorders = pd.DataFrame(
    {
        "Department": departments,
        "Reordered": reordered,
        "Non-Reordered": non_reordered,
    }
)

# Stack the two counts so each bar's full height is the department total.
stacked_counts = df_reorders.set_index("Department")[["Reordered", "Non-Reordered"]]
stacked_counts.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="viridis")
plt.xlabel("Department")
plt.ylabel("Number of Orders")
plt.title("Reordered vs Non-Reordered Orders by Department")
plt.legend(["Reordered", "Non-Reordered"])
plt.xticks(rotation=45)
plt.show()

### Analysis for reordered vs non-reordered orders by department:

Looking at the items reordered or not by department shows the same trends that have been seen throughout the entire exploratory data analysis. Produce and dairy/eggs have the highest number of reorders, with the snacks, beverages and frozen goods departments all ranking similarly to each other. Additionally, pantry, canned goods and dry goods/pasta reinforce the trend that non-perishable goods are reordered less often.

## Correlation Analysis

In [None]:
# Department-level figures used for the correlation analysis. The lists are
# aligned by position with `departments`, which is not itself part of the matrix.
departments = ["dairy eggs", "beverages", "produce", "bakery", "deli",
               "snacks", "frozen", "canned goods", "dry goods pasta", "pantry"]
total_orders = [619999, 234613, 1116850, 127884, 114349, 294213, 201359, 102416, 84651, 138924]
reordered = [512956, 189692, 901358, 102853, 89802, 217758, 144719, 67094, 55299, 79541]
reorder_ratio = [82.73, 80.85, 80.71, 80.43, 78.53, 74.01, 71.87, 65.51, 65.33, 57.26]

# One numeric column per feature, one row per department.
feature_columns = {
    "Total Orders": total_orders,
    "Reordered Orders": reordered,
    "Reorder Ratio (%)": reorder_ratio,
}
df_correlation = pd.DataFrame(feature_columns)

# Pairwise correlation between the three columns (pandas default method).
correlation_matrix = df_correlation.corr()

# Annotated heatmap of the 3x3 matrix.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix of Order Features")
plt.show()

### The correlation matrix needs to be redone with departments added

## EDA summary

1. Reorder Ratio: strongly correlated with reorders.
2. Total Orders: high correlation with reorder likelihood.
3. User Reorder percentage: Helps predict if a user is likely to reorder.
4. Product Popularity: popular items have higher reorders.
5. Department and Aisle reorder ratios: certain categories drive higher reorders

# Feature Engineering

To create the ML features, the following fields will be used:
*   User ID: This tracks individual purchase behavior.
*   Product ID: Helps identify frequently reordered products.
*   Department ID: Some departments have higher reorder rates.
*   Aisle ID: Aisle-level trends impact reorder likelihood.
*   Total Orders: Highly correlated with reorder behavior.
*   Reorder Ratio: Strong predictor of repeat purchases.
*   Total items in Orders: Determines if larger orders influence reorders.
*   User Order Frequency: Identifies frequent vs. occasional buyers.
*   User Reorder Percentage: Determines likelihood of repeat purchases.
*   Product popularity: Captures demand for the product.
*   Department reorder ratio: Some departments have stronger reorder trends.
*   Aisle Reorder ratio: Aisle-specific reorder behavior.
*   Product Reorder trend: Helps detect seasonal or trending products.

# Model Training

## Installing necessary libraries

In [None]:
! pip install pyathena

In [None]:
!pip install awswrangler

In [None]:
!pip install seaborn

In [None]:
!pip install --upgrade s3fs aiobotocore botocore

In [None]:
import s3fs
import botocore

# Report the versions that were actually imported after the upgrade above.
for label, module in (("s3fs", s3fs), ("botocore", botocore)):
    print(f"{label} version: {module.__version__}")

## Importing necessary libraries

In [None]:
from pyathena import connect

In [None]:
import awswrangler as wr
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

## Converting InstantCart CSV dataset into Parquet

In [None]:
# Load the filtered frequent-buyers CSV from S3.
csv_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/filtered_frequent_buyers_v1.csv"
df = pd.read_csv(csv_path)

# Stage 1: 40% of all rows go to training; the other 60% are split further below.
train_df, remaining_df = train_test_split(
    df,
    train_size=0.4,
    random_state=42,
)

# Stage 2: about two thirds of the remainder (~40% of all rows) become the
# production set; the last third (~20% of all rows) is held back.
production_df, temp_df = train_test_split(
    remaining_df,
    train_size=0.666666,
    random_state=42,
)

# Stage 3: the held-back rows are halved into test and validation
# (~10% of all rows each).
test_df, validation_df = train_test_split(
    temp_df,
    train_size=0.5,
    random_state=42,
)

In [None]:
# Defining S3 paths
parquet_output_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/partitioned_split/"

# Each split is written to its own sub-prefix with identical settings, so the
# call is made once per split in a loop rather than copy-pasted four times.
split_frames = {
    "train": train_df,
    "production": production_df,
    "test": test_df,
    "validation": validation_df,
}

# Save each split dataset to Parquet with partitioning by 'department'.
# mode="overwrite" replaces whatever is already stored under the split's prefix.
for split_name, split_df in split_frames.items():
    wr.s3.to_parquet(
        df=split_df,
        path=f"{parquet_output_path}{split_name}/",
        dataset=True,
        mode="overwrite",
        partition_cols=["department"],
        compression="snappy"
    )
    print(f"Wrote '{split_name}' split to {parquet_output_path}{split_name}/")

In [None]:
# Create the database if it doesn't exist
database_name = "instacart_db_split"

# exist_ok=True makes the call a no-op when the database is already in the Glue
# catalog. Any other failure (permissions, networking, ...) is raised here
# instead of being printed and ignored, so later cells do not run against a
# database that was never created.
wr.catalog.create_database(name=database_name, exist_ok=True)
print(f"Database '{database_name}' is available in the AWS Glue catalog.")

In [None]:
# Register the Parquet tables in AWS Glue
table_name = "instacart_orders"

# `department` is the partition key, so it is declared only in
# `partitions_types`. Listing it in `columns_types` as well would define the
# same column twice in the table metadata (once as data, once as partition).
wr.catalog.create_parquet_table(
    database=database_name,
    table=table_name,
    path=parquet_output_path + "train/",
    columns_types={
        "order_id": "bigint",
        "product_id": "bigint",
        "add_to_cart_order": "int",
        "reordered": "int",
        "user_id": "bigint",
        "product_name": "string",
        "aisle_id": "int",
        "department_id": "int",
        "aisle": "string"
    },
    partitions_types={"department": "string"},
    description="Partitioned Instacart orders dataset for optimized Athena queries."
)

print("Partitioned Parquet table registered in AWS Glue successfully.")

## Setting up Database for InstantCart

In [None]:
from pyathena import connect

# Defining AWS Resources
bucket_name = "sagemaker-us-east-1-921916832724"
region = "us-east-1"
database_name = "instacart_db_split"
table_name = "instacart_orders"
s3_data_location = f"s3://{bucket_name}/data-lake/Project/partitioned_split/train/"  # Using partitioned dataset

# Defining Athena Staging Directory
s3_staging_dir = f"s3://{bucket_name}/athena/instacart_staging_split/"

# Creating Athena Connection
try:
    conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)
    cursor = conn.cursor()
except Exception as e:
    # Report the problem, then re-raise: everything below needs `cursor`, so
    # continuing would only fail later with a confusing NameError.
    print("Error connecting to Athena:", e)
    raise
print("Connected to Athena successfully.")

# Creating Database (a no-op when it already exists)
create_db_query = f"CREATE DATABASE IF NOT EXISTS {database_name}"
cursor.execute(create_db_query)
print(f"Database '{database_name}' is ready.")

# Verifying Database Creation
cursor.execute("SHOW DATABASES")
databases = [row[0] for row in cursor.fetchall()]
if database_name in databases:
    print(f"Database '{database_name}' exists!")
else:
    print(f"Database '{database_name}' was not found in SHOW DATABASES.")

## Creating the Athena table

In [None]:
# Define the SQL query to create the table.
# The database, table and S3 location come from the configuration variables
# defined above, so this DDL cannot drift from the rest of the notebook.
# Because of IF NOT EXISTS, the statement does nothing when a table with this
# name is already registered in the catalog.
create_table_query = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name} (
    order_id BIGINT,
    product_id BIGINT,
    add_to_cart_order INT,
    reordered INT,
    user_id BIGINT,
    product_name STRING,
    aisle_id INT,
    department_id INT,
    aisle STRING
)
PARTITIONED BY (department STRING)  -- Partitioned by department
STORED AS PARQUET
LOCATION '{s3_data_location}'
TBLPROPERTIES ('parquet.compression'='SNAPPY');
"""

# Execute the SQL query in Athena
cursor.execute(create_table_query)
print(f"Table '{table_name}' created successfully in database '{database_name}'.")

In [None]:
# MSCK REPAIR TABLE registers the partition folders found under the table's
# location.
repair_statement = f"MSCK REPAIR TABLE {database_name}.{table_name}"
cursor.execute(repair_statement)
print("Partitions updated successfully.")

In [None]:
# List every database visible to Athena and report whether ours is among them.
cursor.execute("SHOW DATABASES")
databases = [record[0] for record in cursor.fetchall()]

status = "exists in Athena!" if database_name in databases else "does not exist."
print(f"Database '{database_name}' {status}")

In [None]:
import boto3

s3 = boto3.client('s3')

# A single list_objects_v2 call returns at most 1000 keys. The paginator
# follows the continuation tokens so every object under the prefix is listed.
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(
    Bucket='sagemaker-us-east-1-921916832724',
    Prefix='data-lake/Project/partitioned_split/train/',
)
for page in pages:
    # 'Contents' is absent from a page that holds no objects.
    for obj in page.get('Contents', []):
        print(obj['Key'])

In [None]:
# Sanity check: count the rows Athena can read from the table.
test_query = f"SELECT count(*) FROM {database_name}.{table_name} ;"
cursor.execute(test_query)
rows = cursor.fetchall()

print("Sample Query Results:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

In [None]:
# Ask Athena which partitions are registered for the table.
partition_statement = f"SHOW PARTITIONS {database_name}.{table_name}"
cursor.execute(partition_statement)
partitions = cursor.fetchall()

# Report the partitions, or say that none were found.
if not partitions:
    print(f"No partitions found in table '{table_name}'.")
else:
    print(f"Partitions found in table '{table_name}': {partitions}")

# Exploratory Data Analysis in the training set

## Checking the Total Orders and Unique Users

In [None]:
# First query: number of distinct orders and distinct users in the table.
query = f"""
SELECT
    COUNT(DISTINCT order_id) AS total_orders,
    COUNT(DISTINCT user_id) AS unique_users
FROM {database_name}.{table_name};
"""

# Run the query and collect the result rows.
cursor.execute(query)
rows = cursor.fetchall()

# Show the heading, then one result tuple per line.
print("Total Orders & Unique Users:")
if rows:
    print(*rows, sep="\n")

## Top 10 Most Ordered Products

In [None]:
# Rank products by how many rows of the orders table they appear in and keep
# the ten largest.
query = f"""
SELECT product_name, COUNT(*) AS total_orders
FROM {database_name}.{table_name}
GROUP BY product_name
ORDER BY total_orders DESC
LIMIT 10;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Top 10 Most Ordered Products:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

In [None]:
# ===========================
#  Bar Chart: Top 10 Most Ordered Products
# ===========================

# Product names and counts, aligned by position.
products = [
    "Banana",
    "Bag of Organic Bananas",
    "Organic Strawberries",
    "Organic Hass Avocado",
    "Organic Baby Spinach",
    "Organic Raspberries",
    "Organic Avocado",
    "Organic Whole Milk",
    "Limes",
    "Large Lemon",
]
total_orders = [52073, 46964, 39311, 31555, 29459, 21479, 19582, 19211, 17136, 16719]

# One row per product, in the order listed above.
df_products = pd.DataFrame(list(zip(products, total_orders)), columns=["Product Name", "Total Orders"])

# Horizontal bars so the product names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_products, x="Total Orders", y="Product Name", palette="viridis", ax=ax)
ax.set_xlabel("Total Orders")
ax.set_ylabel("Product Name")
ax.set_title("Top 10 Most Ordered Products")
plt.show()

### Insights

- Bananas are the most ordered item with 52073 orders.
- Organic products are extremely popular: 7 of the top 10 items are organic.
- Dairy products like Whole Milk rank in the Top 10.
- Citrus fruits like Limes and Lemons are in high demand.

## Reorder Rate per Product

In [None]:
# Per product: row count, number of reordered rows, and the reordered share as
# a percentage; only the ten highest rates are kept.
# NOTE(review): there is no minimum on total_orders, so products that appear
# only a handful of times can reach 100% and fill the top 10 - consider a
# HAVING COUNT(*) >= <threshold> filter; confirm against the query output.
query = f"""
SELECT
    product_name,
    COUNT(*) AS total_orders,
    SUM(reordered) AS total_reorders,
    ROUND(100.0 * SUM(reordered) / COUNT(*), 2) AS reorder_rate
FROM {database_name}.{table_name}
GROUP BY product_name
ORDER BY reorder_rate DESC
LIMIT 10;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Top 10 Products with Highest Reorder Rate:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

### Insights for Top 10 Products with Highest Reorder Rate:

- **100% Lactose-Free Milk** has a **100% reorder rate**, indicating it is a highly demanded product among lactose-intolerant customers who consistently reorder it.
- **Salted Sweet Cream Butter Quarters**, **Sparkling Water Bottles**, and **Premium Lots of Pulp Orange Juice** also show a **100% reorder rate**, signifying that these are popular products with strong customer loyalty and recurring demand.
- **Classic Baby Creamers Potatoes** and **Peru Sweet Onions** are products with **100% reorder rate**, pointing to their continued preference and high re-purchase frequency among customers.
- **Thirst Quencher Caffeine-Free Naturally Flavored Citrus Soda** and **Smoked Whitefish Salad** reflect **100% reorder rates**, highlighting that these items are consistently chosen by customers who keep coming back for more.
- **Seltzer Water** and **Green Bananas** demonstrate high reorder demand, each maintaining **100% reorder rate**, suggesting these are staple items that customers rely on regularly.

## Orders by Department

In [None]:
# Count the rows of the orders table per department, largest first.
query = f"""
SELECT department, COUNT(*) AS total_orders
FROM {database_name}.{table_name}
GROUP BY department
ORDER BY total_orders DESC;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Orders by Department:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

In [None]:
# ===========================
# Bar Chart: Orders by Department
# ===========================

# Values copied from the output of the previous query (department, total orders).
departments = [
    "produce",
    "dairy eggs",
    "snacks",
    "beverages",
    "frozen",
    "pantry",
    "bakery",
    "deli",
    "canned goods",
    "dry goods pasta",
]
total_orders = [1116850, 619999, 294213, 234613, 201359, 138924, 127884, 114349, 102416, 84651]

# One row per department, kept in the order the query returned them.
df_orders_by_department = pd.DataFrame(
    list(zip(departments, total_orders)),
    columns=["Department", "Total Orders"],
)

# Horizontal bars, one per department.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_orders_by_department, x="Total Orders", y="Department", palette="magma", ax=ax)
ax.set_xlabel("Total Orders")
ax.set_ylabel("Department")
ax.set_title("Total Orders by Department")
plt.show()

### Insights

- Produce is the most ordered department with ~1.12 million orders. This aligns with the earlier Top Ordered Products (bananas, avocados, berries). In addition, fresh produce is frequently bought and likely reordered often.
- Dairy & Eggs ranks second with 619K orders.
- Snacks are the third most popular category (294K orders).
    - Expect chips, granola bars, and nuts to dominate.
- Beverages & Frozen Foods also have strong demand.
    - Beverages (~234K orders) likely include popular items like bottled water, juices, and coffee.
    - Frozen Foods (~201K orders) suggest customers stock up on frozen essentials.
- Pantry Staples, Bakery, and Deli also contribute significantly. It could likely contain bread, canned goods, and dry pasta.

## Most Popular Aisles

In [None]:
# Rank aisles by how many rows of the orders table they appear in and keep the
# ten largest.
query = f"""
SELECT aisle, COUNT(*) AS total_orders
FROM {database_name}.{table_name}
GROUP BY aisle
ORDER BY total_orders DESC
LIMIT 10;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Top 10 Aisles:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

In [None]:
# ===========================
# Bar Chart: Most Popular Aisles
# ===========================

# Values copied from the output of the previous query (aisle, total orders).
aisles = [
    "fresh fruits",
    "fresh vegetables",
    "packaged vegetables fruits",
    "yogurt",
    "packaged cheese",
    "milk",
    "water seltzer sparkling water",
    "chips pretzels",
    "soy lactosefree",
    "bread",
]
aisle_orders = [450026, 404252, 205492, 184903, 112690, 103953, 78019, 74536, 68786, 65469]

# One row per aisle, kept in the order the query returned them.
df_aisles = pd.DataFrame(list(zip(aisles, aisle_orders)), columns=["Aisle", "Total Orders"])

# Horizontal bars so the long aisle names stay readable.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_aisles, x="Total Orders", y="Aisle", palette="coolwarm", ax=ax)
ax.set_xlabel("Total Orders")
ax.set_ylabel("Aisle")
ax.set_title("Top 10 Most Popular Aisles")
plt.show()

### Insights
- Fresh Produce Dominance:
    - Fresh Fruits (450K orders) and Fresh Vegetables (404K orders) are the top two aisles.
    - Combined, these two alone account for over 950K orders, which reinforces why Produce is the top department.

- Dairy is Highly Popular:
    - Yogurt (184K orders) and Packaged Cheese (112K orders) show strong demand.
    - Milk (103K orders) further confirms that dairy products are household essentials.

- Beverages are a Major Category:
    - Water, Seltzer, and Sparkling Water (78K orders) ranks #7, showing strong demand for bottled drinks.

- Snacks & Bread are Key Pantry Items
    - Chips & Pretzels (74K orders) are among the most frequently purchased snacks.
    - Bread (65K orders) confirms why Bakery is among the top departments.

- Plant-Based Alternatives are Growing
    - Soy & Lactose-Free Products (68K orders) indicate increased demand for dairy-free alternatives.

## Reorder Ratio by Department

In [None]:
# Per department: row count, number of reordered rows, and the reordered share
# as a percentage (the 100.0 factor forces decimal arithmetic).
query = f"""
SELECT
    department,
    COUNT(*) AS total_orders,
    SUM(reordered) AS total_reorders,
    ROUND(100.0 * SUM(reordered) / COUNT(*), 2) AS reorder_ratio
FROM {database_name}.{table_name}
GROUP BY department
ORDER BY reorder_ratio DESC;
"""
cursor.execute(query)
rows = cursor.fetchall()

print("Reorder Ratio by Department:")
if rows:
    # One result tuple per line.
    print(*rows, sep="\n")

### Insights

- Dairy & Eggs Have the Highest Reorder Rate (82.73%)
    - Most frequently reordered category.
    - Milk, Yogurt, and Cheese are household staples → high repurchase behavior.

- Beverages Rank #2 in Reorders (80.85%)
    - Bottled Water, Sparkling Water, and Coffee/Tea are commonly repurchased.
    - These items are frequently consumed & replaced regularly.

- Produce Has a High Reorder Rate (80.71%)
    - Fruits and vegetables have a high purchase frequency.
    - Bananas, Avocados, and Berries from previous queries reinforce this trend.

- Bakery (80.43%) & Deli (78.53%) Show Strong Reorder Loyalty
    - Bread, Bagels, and Pre-packaged Deli Items are regularly bought items.
    - Customers often stick to the same brands.

- Snacks & Frozen Foods Have Moderate Reorder Rates (~70%)
    - Chips, Pretzels, and Frozen Meals are repurchased but less frequently than fresh foods.

- Canned Goods & Dry Goods Have Lower Reorder Rates (~65%)
    - Longer shelf life → not purchased as frequently.

- Pasta, sauces, and canned vegetables last longer → lower immediate repurchase need.

- Pantry Has the Lowest Reorder Rate (57.26%)
    - Less frequent purchases of pantry staples like flour, condiments, and spices.

In [None]:
# ===========================
# Bar Chart: Reorder Ratio by Department
# ===========================
# Each reorder ratio is an independent percentage for its own department, not a
# slice of a common whole. A pie chart rescales the values so they sum to 100%,
# so its labels would show each ratio's share of the sum of ratios (about 11%
# for dairy eggs) instead of the real value (82.73%). A bar chart on a 0-100
# axis shows the real values.

# Data from previous query (Department, Reorder Ratio)
departments = ["dairy eggs", "beverages", "produce", "bakery", "deli",
               "snacks", "frozen", "canned goods", "dry goods pasta", "pantry"]
reorder_ratio = [82.73, 80.85, 80.71, 80.43, 78.53, 74.01, 71.87, 65.51, 65.33, 57.26]

# Create DataFrame
df_departments = pd.DataFrame({"Department": departments, "Reorder Ratio": reorder_ratio})

# Plot horizontal bar chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(df_departments["Department"], df_departments["Reorder Ratio"],
        color=sns.color_palette("viridis", len(departments)))
ax.invert_yaxis()  # barh draws the first row at the bottom; put the highest ratio on top

# Write the exact percentage next to each bar.
for position, ratio in enumerate(df_departments["Reorder Ratio"]):
    ax.text(ratio + 1, position, f"{ratio:.2f}%", va="center")

ax.set_xlim(0, 100)
ax.set_xlabel("Reorder Ratio (%)")
ax.set_ylabel("Department")
ax.set_title("Reorder Ratio by Department")
plt.show()

In [None]:
# ===========================
# Stacked Bar Chart: Reordered vs Non-Reordered Orders by Department
# ===========================

# Values copied from the previous query output; the three lists are aligned by
# position (same department order in each).
departments = ["dairy eggs", "beverages", "produce", "bakery", "deli",
               "snacks", "frozen", "canned goods", "dry goods pasta", "pantry"]
total_orders = [619999, 234613, 1116850, 127884, 114349, 294213, 201359, 102416, 84651, 138924]
reordered = [512956, 189692, 901358, 102853, 89802, 217758, 144719, 67094, 55299, 79541]

# Rows that were not reorders = total rows minus reordered rows.
non_reordered = [all_rows - repeat_rows for all_rows, repeat_rows in zip(total_orders, reordered)]

# One row per department with both counts.
df_reorders = pd.DataFrame(
    {
        "Department": departments,
        "Reordered": reordered,
        "Non-Reordered": non_reordered,
    }
)

# Stack the two counts so each bar's full height is the department total.
stacked_counts = df_reorders.set_index("Department")[["Reordered", "Non-Reordered"]]
stacked_counts.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="viridis")
plt.xlabel("Department")
plt.ylabel("Number of Orders")
plt.title("Reordered vs Non-Reordered Orders by Department")
plt.legend(["Reordered", "Non-Reordered"])
plt.xticks(rotation=45)
plt.show()

**Analysis for reordered vs non-reordered orders by department**:
    
1. Produce has the highest number of orders overall

    - Most of these orders are reorders, confirming that fruits and vegetables are frequently repurchased.

2. Dairy & Eggs have the highest reorder percentage
    - Consistently repurchased products like milk, cheese, and yogurt drive these numbers.

3. Snacks and Frozen Foods have moderate reorder levels
    - Customers repurchase snacks and frozen goods but at a slightly lower frequency than fresh items.

4. Pantry and Canned Goods have the lowest reorder rates
    - These products have a longer shelf life, reducing the need for frequent repurchasing.

## Correlation Analysis

In [None]:
# Department-level figures used for the correlation analysis. The lists are
# aligned by position with `departments`, which is not itself part of the matrix.
departments = ["dairy eggs", "beverages", "produce", "bakery", "deli",
               "snacks", "frozen", "canned goods", "dry goods pasta", "pantry"]
total_orders = [619999, 234613, 1116850, 127884, 114349, 294213, 201359, 102416, 84651, 138924]
reordered = [512956, 189692, 901358, 102853, 89802, 217758, 144719, 67094, 55299, 79541]
reorder_ratio = [82.73, 80.85, 80.71, 80.43, 78.53, 74.01, 71.87, 65.51, 65.33, 57.26]

# One numeric column per feature, one row per department.
# TODO: fix this correlation analysis so that the departments show up.
feature_columns = {
    "Total Orders": total_orders,
    "Reordered Orders": reordered,
    "Reorder Ratio (%)": reorder_ratio,
}
df_correlation = pd.DataFrame(feature_columns)

# Pairwise correlation between the three columns (pandas default method).
correlation_matrix = df_correlation.corr()

# Annotated heatmap of the 3x3 matrix.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix of Order Features")
plt.show()

- Total Orders & Reordered Orders (Correlation = 1.00)
    - This confirms that departments with high total orders also have high reorders.
    - Produce, Dairy, and Beverages are likely contributing to this trend.

- Total Orders & Reorder Ratio (Correlation = 0.49)
    - Moderate positive correlation → Higher orders somewhat influence the reorder rate, but not always.
    - Some departments may have high first-time purchases but lower reorder rates (e.g., snacks, pantry items).

- Reordered Orders & Reorder Ratio (Correlation = 0.52)
    - A moderate correlation suggests that higher reorder volumes influence reorder ratio but not perfectly.
    - Some products (like dairy & produce) are reordered very frequently, while others (like pantry goods) less frequently.

## Analysis of the training EDA

To create the ML features, I am going to use the following fields:

- User ID: 	This tracks individual purchase behavior.
- Product ID: Helps identify frequently reordered products.
- Department ID: Some departments have higher reorder rates.
- Aisle ID: Aisle-level trends impact reorder likelihood.
- Total Orders: Highly correlated with reorder behavior.
- Reorder Ratio: Strong predictor of repeat purchases.
- Total items in Orders: 	Determines if larger orders influence reorders.
- User Order Frequency: Identifies frequent vs. occasional buyers.
- User Reorder Percentage: Determines likelihood of repeat purchases.
- Product popularity: Captures demand for the product.
- Department reorder ratio: Some departments have stronger reorder trends.
- Aisle Reorder ratio: Aisle-specific reorder behavior.
- Product Reorder trend:  Helps detect seasonal or trending products.

Strongest predictions for EDA:

1. Reorder Ratio: strongly correlated with reorders.
2. Total Orders: high correlation with reorder likelihood.
3. User Reorder percentage: Helps predict if a user is likely to reorder.
4. Product Popularity: popular items have higher reorders.
5. Department and Aisle reorder ratios: certain categories drive higher reorders.

# Feature Store ( Aggregated Features)

Plan for Feature store:

- We will define the features to store for:
    - Product-level
    - user-level
    - department-level
- Then, we will be creating the feature store in SageMaker
    - Using boto3 and sagemaker.feature_store.feature_group
- Next, we will ingest the engineered features
    - We will be saving both offline (in S3 parquet format) and online ( real-time queryable)
- Lastly, we will query the features from the Feature Store.
    - We will retrieve features for training and inference.

The Feature Store will be built from the following computed and aggregated features:

- User-Based Features

    - user_total_orders: total number of orders a user has placed
    - user_reorder_ratio: percentage of the user’s past orders that contained reorders
    - user_avg_items_per_order: average number of items per order for the user
    
- Product-Based Features

    - product_total_orders: number of times the product was ordered overall
    - product_reorder_ratio: how often the product gets reordered
    - product_unique_users: number of unique users who ordered the product
    
- Department-Based Features

    - department_reorder_ratio: average reorder ratio for all products in the department
    - aisle_reorder_ratio: average reorder ratio for all products in the aisle

In the next code, I am going to aggregate user, product, and department-level features from the training dataset stored in Athena.

In [None]:
# Athena Database & Table
database_name = "instacart_db_split"
table_name = "instacart_orders"

# NOTE on the ratio columns below: SUM(reordered) and COUNT(*) are both integer
# (bigint) values in Athena, and integer / integer is integer division there.
# Without a cast every ratio would be truncated to 0 (or 1 when all rows are
# reorders), and the average basket size would lose its fractional part.
# Casting the numerator to DOUBLE forces floating-point division.

# ==========================
# 🔹 1. Compute User-Level Features
# ==========================
# One row per user: distinct orders, share of rows that are reorders, and the
# mean number of rows (items) per order.
user_query = f"""
SELECT
    user_id,
    COUNT(DISTINCT order_id) AS user_total_orders,
    CAST(SUM(reordered) AS DOUBLE) / COUNT(*) AS user_reorder_ratio,
    CAST(COUNT(*) AS DOUBLE) / COUNT(DISTINCT order_id) AS user_avg_items_per_order
FROM {database_name}.{table_name}
GROUP BY user_id;
"""
user_features = wr.athena.read_sql_query(user_query, database=database_name)

# ==========================
# 🔹 2. Compute Product-Level Features
# ==========================
# One row per product: number of rows, share that are reorders, distinct buyers.
product_query = f"""
SELECT
    product_id,
    COUNT(*) AS product_total_orders,
    CAST(SUM(reordered) AS DOUBLE) / COUNT(*) AS product_reorder_ratio,
    COUNT(DISTINCT user_id) AS product_unique_users
FROM {database_name}.{table_name}
GROUP BY product_id;
"""
product_features = wr.athena.read_sql_query(product_query, database=database_name)

# ==========================
# 🔹 3. Compute Department & Aisle Features
# ==========================
department_query = f"""
SELECT
    department,
    CAST(SUM(reordered) AS DOUBLE) / COUNT(*) AS department_reorder_ratio
FROM {database_name}.{table_name}
GROUP BY department;
"""
department_features = wr.athena.read_sql_query(department_query, database=database_name)

aisle_query = f"""
SELECT
    aisle,
    CAST(SUM(reordered) AS DOUBLE) / COUNT(*) AS aisle_reorder_ratio
FROM {database_name}.{table_name}
GROUP BY aisle;
"""
aisle_features = wr.athena.read_sql_query(aisle_query, database=database_name)

# ==========================
# 🔹 4. Load Training Data to Get User-Product Mapping
# ==========================
# One row per row of the source table (no DISTINCT), so a user-product pair
# appears once for every order line it occurs in.
mapping_query = f"""
SELECT user_id, product_id, department, aisle
FROM {database_name}.{table_name}
"""
df_mapping = wr.athena.read_sql_query(mapping_query, database=database_name)

# ==========================
# 🔹 5. Merge Features into a Single Dataset
# ==========================
# Left joins keep every mapping row and attach the aggregates for its keys.
df_features = (
    df_mapping
    .merge(user_features, on="user_id", how="left")  # Merge user features
    .merge(product_features, on="product_id", how="left")  # Merge product features
    .merge(department_features, on="department", how="left")  # Merge department features
    .merge(aisle_features, on="aisle", how="left")  # Merge aisle features
)

print("Feature Engineering Completed Successfully!")
print(df_features.head())

# ==========================
# 🔹 6. Save Engineered Features to S3
# ==========================
# NOTE: this rebinds `parquet_output_path`, which earlier cells use for the
# partitioned_split prefix; re-run the cell that defines it before re-running them.
parquet_output_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/features/"
wr.s3.to_parquet(
    df=df_features,
    path=parquet_output_path,
    dataset=True,
    mode="overwrite",
    partition_cols=["department"],
    compression="snappy"
)

print("Feature dataset saved to S3!")

# Optional: run the following cells if the VM stops and you lose the connection to the Athena database. They are only meant for checking that the feature data created in the previous step still exists.

In [None]:
# Note: make sure to install and import the necessary libraries first.

In [None]:
from pyathena import connect
import awswrangler as wr

# AWS settings reused by the verification cells that follow
bucket_name = "sagemaker-us-east-1-921916832724"
region = "us-east-1"
database_name, table_name = "instacart_db_split", "instacart_orders"

# S3 prefix handed to the Athena connection as its staging directory
s3_staging_dir = "s3://" + bucket_name + "/athena/instacart_staging_split/"

# Open a new Athena connection and take a cursor from it
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)
cursor = conn.cursor()

print("✅ Reconnected to Athena.")

In [None]:
## Step 2: Verify Partitions Exist
# Ask Athena for the partitions registered on the table and print one per line.


cursor.execute(f"SHOW PARTITIONS {database_name}.{table_name}")
partitions = cursor.fetchall()

if not partitions:
    print(f"❌ No partitions found in table '{table_name}'.")
else:
    print(f"✅ Partitions found in table '{table_name}':")
    for partition_row in partitions:
        print(partition_row)

In [None]:
# Step 3: Query Features for a Specific Department
# Fetch up to 10 rows of the table for a single department name.

selected_department = "bakery"  # Change this to any department

query = f"""
SELECT *
FROM {database_name}.{table_name}
WHERE department = '{selected_department}'
LIMIT 10;
"""
cursor.execute(query)
rows = cursor.fetchall()

print(f"✅ Sample Data from '{selected_department}' Department:")
for sample_row in rows:
    print(sample_row)

In [None]:
# Step 4: Verify Feature Files Exist in S3
# List every object stored under the feature-store prefix.

import boto3

s3 = boto3.client('s3')
prefix = "data-lake/Project/features/"

# A single list_objects_v2 call returns at most 1,000 keys, so walk all pages
# with a paginator instead of trusting one (possibly truncated) response.
paginator = s3.get_paginator("list_objects_v2")
feature_keys = [
    obj["Key"]
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for obj in page.get("Contents", [])  # "Contents" is absent on empty pages
]

if feature_keys:
    print("✅ Feature files found in S3:")
    for key in feature_keys:
        print(key)
else:
    print("❌ No feature files found in S3!")

In [None]:
# Step 5: Load Features from S3
# Reload the parquet files under the feature-store prefix with awswrangler.

feature_path = "s3://" + bucket_name + "/data-lake/Project/features/"

df_features = wr.s3.read_parquet(path=feature_path)

print("✅ Feature dataset loaded successfully!")
print(df_features.head())

# Merging New Obtained Features into one single dataset

In [None]:
# Note: Since "department" is a partition column, it was not explicitly included in df_features. I am going to load every department from the S3 feature store and combine them into a single DataFrame.

In [None]:
import awswrangler as wr
import pandas as pd

# S3 base path of the department-partitioned feature store
s3_base_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/features/"

# Read the whole partitioned dataset in one call. With dataset=True awswrangler
# discovers every `department=<name>/` prefix under the base path and restores
# the partition value as a `department` column. This replaces a hand-maintained
# list of department names, which silently skipped any partition not listed
# and aborted the load when a listed prefix did not exist.
df_all_departments = wr.s3.read_parquet(path=s3_base_path, dataset=True)

# Partition columns may come back as categoricals; keep plain strings so the
# column behaves like the manually assigned one did.
df_all_departments["department"] = df_all_departments["department"].astype(str)

# Departments actually present in the feature store (kept under the same name
# the previous hard-coded list used).
departments = sorted(df_all_departments["department"].unique())

# Displaying result
print("✅ Successfully loaded all department features!")
print(df_all_departments.head())

In [None]:
# Persist the combined department features back to S3 as their own dataset.

# Destination prefix for the merged dataset
merged_output_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/merged_features/"

# Same layout as the feature store: snappy parquet, overwrite mode,
# partitioned by department.
merged_write_options = {
    "dataset": True,
    "mode": "overwrite",
    "partition_cols": ["department"],
    "compression": "snappy",
}
wr.s3.to_parquet(df=df_all_departments, path=merged_output_path, **merged_write_options)

print(" Merged feature dataset saved to S3!")

# (EDA) on the Merged Dataset (Aggregated Feature)

In [None]:
# Row counts per department, most frequent department first
department_order = df_all_departments["department"].value_counts().index

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y=df_all_departments["department"], order=department_order, ax=ax)
ax.set_xlabel("Total Orders")
ax.set_ylabel("Department")
ax.set_title("Total Orders by Department")
plt.show()

# Preparing the Feature Store (Aggregated)  for Model Training

In [None]:
# We now have product-level, user-level, and department-level features. Next, We will structure the dataset for model training.

In [None]:
# Aggregate columns kept for modelling, grouped by the level they describe
user_level = ["user_total_orders", "user_reorder_ratio", "user_avg_items_per_order"]
product_level = ["product_total_orders", "product_reorder_ratio", "product_unique_users"]
category_level = ["department_reorder_ratio", "aisle_reorder_ratio"]
features = user_level + product_level + category_level

# NOTE(review): this selection holds predictors only; no target column is included.
df_training = df_all_departments.loc[:, features]

# Destination prefix for the training-ready dataset
training_output_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/training_data/"

# Unpartitioned snappy parquet dataset, written in overwrite mode
training_write_options = {"dataset": True, "mode": "overwrite", "compression": "snappy"}
wr.s3.to_parquet(df=df_training, path=training_output_path, **training_write_options)

print("✅ Training-ready dataset saved to S3!")

# Training Phase

## Logistic Regression Model using the original Dataset

### Loading required libraries

In [None]:
# Installing required libraries
!pip install scikit-learn boto3 awswrangler pandas numpy

# Importing libraries
import boto3
import awswrangler as wr
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Loading the Raw Training and  Validation Dataset

In [None]:
# S3 prefixes of the raw train / validation splits
raw_split_root = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/partitioned_split/"
raw_train_s3_path = raw_split_root + "train/"
raw_validation_s3_path = raw_split_root + "validation/"

# Read both splits into memory
df_train_raw = wr.s3.read_parquet(raw_train_s3_path)
df_validation_raw = wr.s3.read_parquet(raw_validation_s3_path)

# Confirm each split loaded by showing its first rows
for split_name, split_frame in (("Training", df_train_raw), ("Validation", df_validation_raw)):
    print(f" Raw {split_name} Dataset Loaded Successfully!")
    print(split_frame.head())

### Training Logistic Regression on Raw Data

In [None]:
import awswrangler as wr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Raw training split
raw_train_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/partitioned_split/train/"
df_raw = wr.s3.read_parquet(raw_train_path)

# Predictors are the four identifier columns; the target is `reordered`
raw_feature_columns = ["user_id", "product_id", "aisle_id", "department_id"]
X_raw = df_raw[raw_feature_columns]
y_raw = df_raw["reordered"]

# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training portion only
scaler = StandardScaler().fit(X_train_raw)
X_train_raw_scaled = scaler.transform(X_train_raw)
X_val_raw_scaled = scaler.transform(X_val_raw)

# Unweighted logistic regression baseline
log_reg_raw = LogisticRegression(max_iter=1000, random_state=42)
log_reg_raw.fit(X_train_raw_scaled, y_train_raw)

# Score the held-out rows
y_pred_raw = log_reg_raw.predict(X_val_raw_scaled)
raw_accuracy = accuracy_score(y_val_raw, y_pred_raw)

print(" Logistic Regression Performance on Raw Data:")
print("Accuracy:", raw_accuracy)
print(classification_report(y_val_raw, y_pred_raw))

- **Observations from Logistic Regression on RAW Data**

    - Accuracy: 77.75%
    - F1-Score (Weighted Avg): 68%
    - Precision for Class 0 is 0.00  (Model is not predicting non-reorders at all)
    - Recall for Class 1 is 1.00, which means the model predicts almost everything as reordered (1).

- **Issues with the Model**
    - Severe Class Imbalance:
        - The dataset has 471,999 reordered (1) vs. 135,053 not reordered (0).
        - The model is likely biased toward predicting everything as reordered.
    - Zero Precision for Class 0:
        - The model never predicts non-reorders, meaning it completely ignores that class.
    - High Recall but Low Precision:
        - It catches most reorders but makes many false positives.
        - Not useful if we want accurate reorder predictions.

###  Updated Logistic Regression (With Class Weights)

In [None]:
# Same logistic regression, but with class weights set to "balanced"
weighted_params = {"max_iter": 1000, "random_state": 42, "class_weight": "balanced"}
log_reg_weighted = LogisticRegression(**weighted_params)
log_reg_weighted.fit(X_train_raw_scaled, y_train_raw)

# Score the held-out rows
y_pred_weighted = log_reg_weighted.predict(X_val_raw_scaled)
weighted_accuracy = accuracy_score(y_val_raw, y_pred_weighted)

print("🔹 Logistic Regression (Balanced Classes) Performance on Raw Data:")
print("Accuracy:", weighted_accuracy)
print(classification_report(y_val_raw, y_pred_weighted))

 **Observations from Logistic Regression (Balanced Classes) on RAW Data**


- Accuracy: 54.9%  (Dropped from 77.75%)
- Precision (Class 0): 24% (Now at least predicting some non-reorders)
- Recall (Class 0): 48% (Improved from 0%)
- Precision (Class 1): 79%  (Dropped, but still decent)
- Recall (Class 1): 57%  (Worse than before, but balanced)
- Weighted F1-Score: 59%  (More balanced than before)

**Key Takeaways**

- Improvement:

    - The model now predicts some non-reorders (0), which is a step forward.
    - The recall for non-reorders (48%) is better than before (0%).

- Trade-offs:

    - Accuracy dropped (because the model is now making more mistakes overall).
    - Class 1 (Reorders) recall fell from 100% → 57%.

Better balance overall, but not ideal. The model still struggles with predicting non-reorders accurately.

## Logistic Regression Model using the Feature Store Data (Aggregated)

### Loading Feature-Engineered Dataset

In [None]:
# Adding the "reordered" column back to the merged feature-engineered dataset. This step is needed because I forgot to add the target column to the aggregated feature-engineering dataset.

In [None]:
import awswrangler as wr

# Loading the Original Training Dataset (Partitioned by Department)
original_train_s3_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/partitioned_split/train/"
df_train = wr.s3.read_parquet(original_train_s3_path, columns=["user_id", "product_id", "reordered", "department", "department_id"])

print(" Original Training Dataset Loaded Successfully!")
print(df_train.head())

# Loading the Feature Store Dataset (Without `reordered`)
feature_store_s3_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/merged_features/"
df_features = wr.s3.read_parquet(feature_store_s3_path)

print(" Feature Store Dataset Loaded Successfully!")
print(df_features.head())

# Verifying Column Names
print("Columns in df_train:", df_train.columns)
print("Columns in df_features:", df_features.columns)

# Merging `reordered` and `department` Column Back
rows_before_merge = len(df_features)
df_updated_features = df_features.merge(df_train, on=["user_id", "product_id"], how="left")

# A left merge only grows when the right-hand keys repeat. Report that case so
# duplicated feature rows do not slip silently into the training data.
rows_added = len(df_updated_features) - rows_before_merge
if rows_added:
    print(
        f"⚠️ Merge added {rows_added} rows: (user_id, product_id) is not unique "
        "in df_train, so matching feature rows were duplicated."
    )

# Rows without a match in df_train end up with a missing target; report them too.
if "reordered" in df_updated_features.columns:
    unlabeled_rows = int(df_updated_features["reordered"].isna().sum())
    if unlabeled_rows:
        print(f"⚠️ {unlabeled_rows} rows have no `reordered` label after the merge.")

# If department is missing, check for `department_id`
if "department" not in df_updated_features.columns and "department_id" in df_updated_features.columns:
    # Mapping department_id to department name (if needed)
    department_mapping = {
        1: "frozen", 2: "other", 3: "bakery", 4: "produce", 5: "alcohol",
        6: "international", 7: "beverages", 8: "pets", 9: "dry goods pasta",
        10: "bulk", 11: "personal care", 12: "meat seafood", 13: "pantry",
        14: "breakfast", 15: "canned goods", 16: "dairy eggs", 17: "household",
        18: "babies", 19: "snacks", 20: "deli", 21: "missing"
    }
    df_updated_features["department"] = df_updated_features["department_id"].map(department_mapping)

print(" Successfully Merged `reordered` and `department` Columns!")
print(df_updated_features.head())

# Verifying "department" Column Exists Before Saving
if "department" not in df_updated_features.columns:
    raise ValueError("❌ ERROR: The `department` column is STILL missing from the merged dataset!")

# Saving the Updated Dataset with `reordered` and `department`
updated_feature_store_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/features_with_reorder/"
wr.s3.to_parquet(
    df=df_updated_features,
    path=updated_feature_store_path,
    dataset=True,
    mode="overwrite",
    partition_cols=["department"],  # Ensuring correct partitioning
    compression="snappy"
)

print(" Feature Store Dataset with `reordered` and `department` Column Saved Successfully!")

## Training Logistic Regression on Feature-Engineered Data

In [None]:
import awswrangler as wr
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Feature-engineered training data (features plus the `reordered` target)
feature_store_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/features_with_reorder/"
df_features = wr.s3.read_parquet(feature_store_path)

# Target is `reordered`; the two identifier columns are excluded from the predictors
identifier_and_target = ["user_id", "product_id", "reordered"]
y_features = df_features["reordered"]
X_features = df_features.drop(columns=identifier_and_target)

# Replace the `aisle` column with integer codes
label_encoder = LabelEncoder()
X_features["aisle"] = label_encoder.fit_transform(X_features["aisle"])

# Hold out 20% of the rows for validation (fixed seed)
split_parts = train_test_split(X_features, y_features, test_size=0.2, random_state=42)
X_train_features, X_val_features, y_train_features, y_val_features = split_parts

# Standardize using statistics learned from the training portion only
scaler = StandardScaler().fit(X_train_features)
X_train_features_scaled = scaler.transform(X_train_features)
X_val_features_scaled = scaler.transform(X_val_features)

# Class-weighted logistic regression on the engineered features
log_reg_features = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")
log_reg_features.fit(X_train_features_scaled, y_train_features)

# Score the held-out rows
y_pred_features = log_reg_features.predict(X_val_features_scaled)
features_accuracy = accuracy_score(y_val_features, y_pred_features)

print("🔹 Logistic Regression Performance on Feature Store Data:")
print("Accuracy:", features_accuracy)
print(classification_report(y_val_features, y_pred_features))

**Key Observations**


- Higher Precision on Feature Store Model:

    - The model trained on feature-engineered data has much better precision (0.94) for predicting reordered products compared to raw data (0.79).
    - This means fewer false positives—when the model predicts a reorder, it's more likely to be correct.

- Lower Recall on Feature Store Model:

    - Recall dropped from 0.57 (raw) to 0.50 (features).
    - This suggests the model is missing some reorders, likely because of the feature transformations.

- Similar F1-Score:

    - Despite recall decreasing, the F1-score remains the same (0.66), meaning overall predictive performance is stable.

- Accuracy Decreased:

    - Feature Store Model: 51.7%
    - Raw Data Model: 54.9%

The drop in accuracy suggests that feature transformations changed the class balance, but it’s expected since we engineered new feature sets.

Next, I am going to try XGBoost, which is efficient and handles large datasets well. It is powerful for imbalanced datasets and works well with both raw and engineered features.

I am going to apply XGBoost in the RAW dataset first to later apply it in the Feature Store dataset.

# Training with XGBoost on Raw Dataset

In [None]:
!pip install xgboost

In [None]:
import awswrangler as wr
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load Training & Validation Data
raw_train_s3_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/partitioned_split/train/"
raw_validation_s3_path = "s3://sagemaker-us-east-1-921916832724/data-lake/Project/partitioned_split/validation/"

df_train_raw = wr.s3.read_parquet(raw_train_s3_path)
df_validation_raw = wr.s3.read_parquet(raw_validation_s3_path)

print("✅ Raw Training & Validation Data Loaded!")

# Selecting Features & Target
# Take explicit copies: the encoded columns are assigned back below, and
# writing into a column slice of the source frames triggers pandas'
# SettingWithCopyWarning.
raw_feature_columns = ["user_id", "product_id", "aisle_id", "department_id"]
X_train_raw = df_train_raw[raw_feature_columns].copy()
y_train_raw = df_train_raw["reordered"]
X_val_raw = df_validation_raw[raw_feature_columns].copy()
y_val_raw = df_validation_raw["reordered"]

# 🔹 **Fix Label Encoding Issue**
label_encoder_aisle = LabelEncoder()
label_encoder_department = LabelEncoder()

# Fit label encoders on combined data (train + validation) to prevent unseen labels issue
all_aisle_ids = pd.concat([X_train_raw["aisle_id"], X_val_raw["aisle_id"]])
all_department_ids = pd.concat([X_train_raw["department_id"], X_val_raw["department_id"]])

label_encoder_aisle.fit(all_aisle_ids)
label_encoder_department.fit(all_department_ids)

# Transform the train and validation sets using the fitted encoders
X_train_raw["aisle_id"] = label_encoder_aisle.transform(X_train_raw["aisle_id"])
X_train_raw["department_id"] = label_encoder_department.transform(X_train_raw["department_id"])
X_val_raw["aisle_id"] = label_encoder_aisle.transform(X_val_raw["aisle_id"])
X_val_raw["department_id"] = label_encoder_department.transform(X_val_raw["department_id"])

# Normalize Features (statistics come from the training split only)
scaler = StandardScaler()
X_train_raw_scaled = scaler.fit_transform(X_train_raw)
X_val_raw_scaled = scaler.transform(X_val_raw)

# Train XGBoost Model
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_raw_scaled, y_train_raw)

# Predictions
y_pred_xgb = xgb_model.predict(X_val_raw_scaled)

# Evaluate Model
print("✅ XGBoost Performance on Raw Data:")
print("Accuracy:", accuracy_score(y_val_raw, y_pred_xgb))
print(classification_report(y_val_raw, y_pred_xgb))

# XGBoost Model Results Analysis
 - The XGBoost model on raw data achieved 77.8% accuracy, but we need to analyze the imbalance further:

    - Precision (0.56) for Class 0: Indicates that many of the predicted non-reorders (0) were false positives.
    - Recall (0.02) for Class 0: Very low, meaning the model struggles to detect non-reorders (0).
    - Recall (1.00) for Class 1: The model classifies most orders as reorders (1), likely because of class imbalance.

## Adjusting Scale_pos_weight, since reorder=1 is much more frequent, we should adjust the balance

The imbalance ratio is computed as:

- scale_pos_weight = count of class 0 (non-reorders) / count of class 1 (reorders)

In [None]:
import xgboost as xgb
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report

# Step 1: weight for the positive class = (# class 0) / (# class 1) in the training target
class_counts = Counter(y_train_raw)
negative_count, positive_count = class_counts[0], class_counts[1]
scale_pos_weight = negative_count / positive_count
print(f"Computed scale_pos_weight: {scale_pos_weight:.4f}")

# Step 2: same hyper-parameters as the unweighted model, plus the class weight
weighted_xgb_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "scale_pos_weight": scale_pos_weight,
    "random_state": 42,
}
xgb_model_weighted = xgb.XGBClassifier(**weighted_xgb_params)
xgb_model_weighted.fit(X_train_raw_scaled, y_train_raw)

# Step 3: predict on the validation features
y_pred_weighted = xgb_model_weighted.predict(X_val_raw_scaled)

# Step 4: report accuracy and the per-class metrics
weighted_xgb_accuracy = accuracy_score(y_val_raw, y_pred_weighted)
print("\n🔹 XGBoost Performance with scale_pos_weight:")
print("Accuracy:", weighted_xgb_accuracy)
print(classification_report(y_val_raw, y_pred_weighted))

**Key Observations**:

- Recall for Class 0 (Non-reorders) jumped from 2% ➝ 65%
- Recall for Class 1 (Reorders) dropped from 100% ➝ 56% (expected tradeoff)
- Overall Accuracy: 57.76% (down from 77.8% for the unweighted model, but recall is better balanced between the classes)

## Fine Tuning XGBoost on Raw Training Dataset

In [None]:
# Hand-picked hyper-parameters for the tuned model
tuned_xgb_params = {
    "n_estimators": 200,      # more trees than the earlier 100
    "max_depth": 8,           # deeper splits than the earlier 6
    "learning_rate": 0.05,    # smaller step size than the earlier 0.1
    "scale_pos_weight": 0.4,  # fixed class-balancing weight
    "eval_metric": "auc",
    "random_state": 42,
}
xgb_model_tuned = xgb.XGBClassifier(**tuned_xgb_params)

xgb_model_tuned.fit(X_train_raw_scaled, y_train_raw)
y_pred_tuned = xgb_model_tuned.predict(X_val_raw_scaled)

# Report accuracy and the per-class metrics on the validation split
tuned_accuracy = accuracy_score(y_val_raw, y_pred_tuned)
print("\n🔹 Fine-Tuned XGBoost Performance:")
print("Accuracy:", tuned_accuracy)
print(classification_report(y_val_raw, y_pred_tuned))

- **Key Takeaways from Fine-Tuned XGBoost**:
    - Accuracy: 73.57% (Improved from 57.76%)
    - Recall for Class 0 (Non-reorders): 30% (Better than 2% before)
    - Recall for Class 1 (Reorders): 86% (Still strong)
    - Precision & F1-score improved for both classes

# Splice Point of Two Notebooks
Can also be used as a checkpoint.

**Note**: this notebook is the continuation of the previous notebook: "InstantCart_Project_GXboost_Model_Group_Training-Batch-Time-Inference".
In this notebook, I am fixing the batch inference job to use a new approach:

- input_filter: set __input_filter__ to "$[1:]", which excludes column 0 (the 'ID'; in the transformed test file this first column is `reordered`) before running inference and keeps everything from column 1 to the last column (all the features or predictors).
- join_source: set __join_source__ to "Input", which joins the input data with the inference results.
- output_filter: leave __output_filter__ at its default ('$'), so that the joined input and inference results are saved as output.

In [None]:
import ray

# Start Ray with a 2 GB object store. ignore_reinit_error makes the cell safe
# to re-run: without it a second ray.init() in the same kernel raises.
ray.init(object_store_memory=2 * 1024 * 1024 * 1024, ignore_reinit_error=True)  # 2GB memory

In [None]:
!pip install awswrangler

## Importing required libraries

In [None]:
import awswrangler as wr
import pandas as pd
import numpy as np
import xgboost as xgb
import sagemaker
import boto3
import joblib
import tarfile
from sagemaker import image_uris, get_execution_role, session
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## Defining AWS Resources

In [None]:
# Bucket and S3 prefix of the partitioned train/validation splits
s3_bucket = "sagemaker-us-east-1-921916832724"
parquet_output_path = "s3://" + s3_bucket + "/data-lake/Project/partitioned_split/"

# Region, SageMaker session and execution role taken from the current environment
region = boto3.Session().region_name
sagemaker_session = sagemaker.Session()
role = get_execution_role()

## Full Processing Pipeline for ALL Datasets

#### Reloading All Datasets

In [None]:
import awswrangler as wr
import pandas as pd

# S3 locations of the four full CSV datasets
full_csv_root = "s3://sagemaker-us-east-1-921916832724/Project/"
train_csv_full_path = full_csv_root + "train_data_full.csv"
validation_csv_full_path = full_csv_root + "validation_data_full.csv"
test_csv_full_path = full_csv_root + "test_data_full.csv"
production_csv_full_path = full_csv_root + "production_data_full.csv"

# Read them in the order train, validation, test, production
df_train, df_validation, df_test, df_production = (
    wr.s3.read_csv(csv_path)
    for csv_path in (
        train_csv_full_path,
        validation_csv_full_path,
        test_csv_full_path,
        production_csv_full_path,
    )
)

print(" Train, Validation, Test & Production datasets reloaded successfully!")

#### Applying Label Encoding and Scaling

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 1: fresh encoders for the two categorical id columns
label_encoder_aisle = LabelEncoder()
label_encoder_department = LabelEncoder()

# Step 2: fit the encoders on the ids seen in train + validation
# NOTE(review): test/production ids absent from train + validation would make
# the transform in step 3 fail — confirm all four files share the same ids.
all_aisle_ids = pd.concat([df_train["aisle_id"], df_validation["aisle_id"]])
all_department_ids = pd.concat([df_train["department_id"], df_validation["department_id"]])

label_encoder_aisle.fit(all_aisle_ids)
label_encoder_department.fit(all_department_ids)

print(" Label Encoders trained on training + validation data!")

# Step 3: encode both id columns in place on every dataset
all_datasets = [df_train, df_validation, df_test, df_production]
for dataset_frame in all_datasets:
    dataset_frame["aisle_id"] = label_encoder_aisle.transform(dataset_frame["aisle_id"])
    dataset_frame["department_id"] = label_encoder_department.transform(dataset_frame["department_id"])

print(" Label Encoding applied to all datasets!")

# Step 4: standardize the model columns with statistics from the training set only
scaled_columns = ["user_id", "product_id", "aisle_id", "department_id"]
scaler = StandardScaler().fit(df_train[scaled_columns])

# The same fitted scaler is applied to every dataset, in place
for dataset_frame in all_datasets:
    dataset_frame[scaled_columns] = scaler.transform(dataset_frame[scaled_columns])

print(" Standardization applied to all datasets!")

#### Ensuring the correct order of columns

In [None]:
# Fix the column order and drop the headers by converting to NumPy arrays.
# Labelled sets put `reordered` first; the production set has the predictors only.
model_columns = ["user_id", "product_id", "aisle_id", "department_id"]
labelled_columns = ["reordered"] + model_columns

train_transformed = df_train[labelled_columns].to_numpy()
val_transformed = df_validation[labelled_columns].to_numpy()
test_transformed = df_test[labelled_columns].to_numpy()
prod_transformed = df_production[model_columns].to_numpy()

print(" Column order corrected for all datasets!")

#### Converting to Numpy ( Removing headers) and saving back to S3

In [None]:
# Write each transformed array back to S3 as a CSV without index or header row
transformed_by_split = {
    "train": train_transformed,
    "validation": val_transformed,
    "test": test_transformed,
    "production": prod_transformed,
}

for split_name, split_array in transformed_by_split.items():
    wr.s3.to_csv(
        pd.DataFrame(split_array),
        path=f"s3://sagemaker-us-east-1-921916832724/Project/{split_name}_data_transformed.csv",
        index=False,
        header=False,
    )

print(" All datasets transformed and saved successfully!")

#### Final checking of the datasets

In [None]:
# The four transformed arrays, in reporting order
transformed_sets = (
    ("Train", train_transformed),
    ("Validation", val_transformed),
    ("Test", test_transformed),
    ("Production", prod_transformed),
)

# Shapes first...
print("Transformed Dataset Shapes:")
for set_name, set_array in transformed_sets:
    print(f"{set_name} Transformed: {set_array.shape}")

# ...then the contents of each array
for set_name, set_array in transformed_sets:
    print(f"\nSample Data from Transformed {set_name} Set:")
    print(set_array)

## Batch inference Job

In [None]:
import sagemaker
from sagemaker.transformer import Transformer

# Input file and output prefix for the batch transform job
project_root = "s3://sagemaker-us-east-1-921916832724/Project/"
test_data_s3_path = project_root + "test_data_transformed.csv"
batch_output_s3_path = project_root + "batch_predictions/"

# Transformer bound to the already-created model; output is assembled line by
# line and returned as CSV.
transformer = Transformer(
    model_name="sagemaker-xgboost-2025-02-15-06-32-39-868",  # Ensure this matches your registered model name
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=batch_output_s3_path,
    assemble_with="Line",
    accept="text/csv",
)

# Column 0 is filtered out of the model input, and each prediction is joined
# back onto its input record.
transform_options = {
    "data": test_data_s3_path,
    "split_type": "Line",
    "content_type": "text/csv",
    "input_filter": "$[1:]",  #  JSONPath syntax
    "join_source": "Input",   # Associate input data with predictions
}
transformer.transform(**transform_options)

# Block until the job has finished
transformer.wait()
print(f" Batch predictions completed! Output saved at: {batch_output_s3_path}")

## Evaluations results on XGBoost (Batch Inference on Test Data)

In [None]:
import boto3
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# S3 client plus the bucket/key of the batch transform output
s3_client = boto3.client("s3")
bucket_name = "sagemaker-us-east-1-921916832724"
s3_predictions_path = "Project/batch_predictions/test_data_transformed.csv.out"

# Download the predictions file to local disk
batch_predictions_local_path = "/tmp/test_data_transformed_predictions.csv"
s3_client.download_file(bucket_name, s3_predictions_path, batch_predictions_local_path)
print(f" Batch predictions downloaded to: {batch_predictions_local_path}")

# The prediction is read from the last column of the headerless output file
predictions_df = pd.read_csv(batch_predictions_local_path, header=None)
predictions = predictions_df.iloc[:, -1].to_numpy()

# Download the transformed test set; its first column is used as the actual label
s3_test_data_path = "Project/test_data_transformed.csv"
test_data_local_path = "/tmp/test_data_transformed.csv"
s3_client.download_file(bucket_name, s3_test_data_path, test_data_local_path)

test_df = pd.read_csv(test_data_local_path, header=None)
actual_labels = test_df.iloc[:, 0].to_numpy()

# Predictions and labels are compared row by row, so the lengths must agree
if len(predictions) != len(actual_labels):
    raise ValueError(f"Shape mismatch: Predictions ({len(predictions)}) vs Labels ({len(actual_labels)})")

# Threshold the scores at 0.5 to obtain hard 0/1 predictions
binary_predictions = (predictions >= 0.5).astype(int)

# Thresholded metrics use the hard predictions; AUC uses the raw scores
accuracy = accuracy_score(actual_labels, binary_predictions)
precision = precision_score(actual_labels, binary_predictions)
recall = recall_score(actual_labels, binary_predictions)
f1 = f1_score(actual_labels, binary_predictions)
auc_roc = roc_auc_score(actual_labels, predictions)

print("\n **Evaluation Results:**")
metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", auc_roc),
)
for metric_name, metric_value in metric_report:
    print(f" {metric_name}: {metric_value:.4f}")

# ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(actual_labels, predictions)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="blue", lw=2, label=f"AUC = {auc_roc:.4f}")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("AUC-ROC Curve")
ax.legend(loc="lower right")
ax.grid()
plt.show()

#### Conclusion on the first Batch Inference Job with the Fine-Tuned XGBoost

- Accuracy (73.56%)

    - The model is correctly predicting about 73.56% of the instances.

- Precision (82.50%)

    - When the model predicts positive (1), it is correct 82.50% of the time.
    - High precision suggests fewer false positives.

- Recall (83.76%)

    - The model captures 83.76% of the actual positive cases.
    - This indicates that it is detecting most of the positive instances.

- F1 Score (83.12%)

    - This is the harmonic mean of precision and recall.
    - A high F1-score means a good balance between false positives and false negatives.

- AUC-ROC Score (0.6831)

    - This measures the model’s ability to distinguish between positive and negative classes.
    - 0.6831 is low, meaning there’s room for improvement in separating the two classes.

## Enabling Model Bias Monitor


In [None]:
import ray

# Start Ray with a 2 GB object store. Ray is also started in an earlier cell
# of this notebook, so ignore_reinit_error keeps this call from raising when
# the kernel already has a running Ray instance.
ray.init(object_store_memory=2 * 1024 * 1024 * 1024, ignore_reinit_error=True)  # 2GB memory

In [None]:
!pip install awswrangler

In [None]:
import awswrangler as wr
import pandas as pd
import numpy as np
import xgboost as xgb
import sagemaker
import boto3
import joblib
import tarfile
from sagemaker import image_uris, get_execution_role, session
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import pandas as pd
import awswrangler as wr
from sklearn.preprocessing import LabelEncoder, StandardScaler

# S3 location of the four raw dataset splits
s3_bucket = "sagemaker-us-east-1-921916832724"
s3_prefix = "Project"
s3_root = f"s3://{s3_bucket}/{s3_prefix}"

train_path = f"{s3_root}/train_data_full.csv"
validation_path = f"{s3_root}/validation_data_full.csv"
test_path = f"{s3_root}/test_data_full.csv"
production_path = f"{s3_root}/production_data_full.csv"

# Read the splits in the same order as the paths above
df_train, df_validation, df_test, df_production = (
    wr.s3.read_csv(split_path)
    for split_path in (train_path, validation_path, test_path, production_path)
)

print(" Loaded Original Datasets Successfully!")

# Collect every distinct (department_id, aisle) pair seen in any split.
# The "aisle" column is exposed under the name "department_category" from here on.
mapping_columns = ["department_id", "aisle"]
df_departments = (
    pd.concat([split[mapping_columns] for split in (df_train, df_validation, df_test, df_production)])
    .drop_duplicates()
    .rename(columns={"aisle": "department_category"})
)

# Second de-duplication pass, kept from the original flow
df_departments = df_departments.drop_duplicates()

# Persist the extracted mapping to S3 for reference
department_mapping_path = f"{s3_root}/department_mapping_extracted.csv"
wr.s3.to_csv(df_departments, department_mapping_path, index=False)
print(f" Extracted Department Mapping Saved at: {department_mapping_path}")

# **Step 1: Apply Label Encoding for Aisle & Department**
# The encoders learn their classes from the combined mapping, so every value
# present in any split is known to them.
all_aisle_ids = df_departments["department_category"]
all_department_ids = df_departments["department_id"]

label_encoder_aisle = LabelEncoder().fit(all_aisle_ids)
label_encoder_department = LabelEncoder().fit(all_department_ids)

# **Step 2: Define Standard Scaler**
# One shared scaler: its mean/variance are learned from the training split and
# then reused unchanged for validation, test and production data.
scaler = StandardScaler()

# **Step 3: Processing Function for Each Dataset**
def preprocess_dataset(df, dataset_name, fit_scaler=True):
    """Label-encode, standard-scale and save one dataset split.

    Parameters
    ----------
    df : pandas.DataFrame
        Split to transform. The columns "aisle", "department_id", "user_id",
        "product_id" and "reordered" are read from it.
    dataset_name : str
        Prefix of the output object: the result is written to
        ``s3://{s3_bucket}/{s3_prefix}/{dataset_name}_transformed_mapped.csv``.
    fit_scaler : bool, default True
        When True the shared ``scaler`` is (re)fitted on this split before it is
        applied. Pass False to reuse the statistics of a previous fit, which is
        what every non-training split should do so that all splits share one
        feature scale.

    Returns
    -------
    pandas.DataFrame
        Columns: "reordered" (first), the four scaled features, and
        "department_category" (the original aisle value, kept for bias monitoring).
    """
    df = df.copy()

    # **Apply Label Encoding**
    df["aisle_id"] = label_encoder_aisle.transform(df["aisle"])
    df["department_id_encoded"] = label_encoder_department.transform(df["department_id"])

    # **Keep the original department category**
    df["department_category"] = df["aisle"]  # Ensure category is retained for Bias Monitor

    # **Apply Standard Scaling (ONLY to numerical features)**
    numeric_features = ["user_id", "product_id", "aisle_id", "department_id_encoded"]
    if fit_scaler:
        scaled_values = scaler.fit_transform(df[numeric_features])
    else:
        # Reuse the already-fitted statistics; refitting here would put this
        # split on a different scale from the data the model is trained on.
        scaled_values = scaler.transform(df[numeric_features])

    # Convert to DataFrame with proper column names (fresh 0..n-1 index)
    df_scaled = pd.DataFrame(scaled_values, columns=numeric_features)

    # **Restore "reordered" column** as the first column
    df_scaled.insert(0, "reordered", df["reordered"].values)

    # **Add department_category for Bias Monitoring**
    # Assigned by position (.values) so a non-default index on `df` cannot misalign rows.
    df_scaled["department_category"] = df["department_category"].values

    # **Save the Transformed Dataset**
    transformed_path = f"s3://{s3_bucket}/{s3_prefix}/{dataset_name}_transformed_mapped.csv"
    wr.s3.to_csv(df_scaled, transformed_path, index=False)

    print(f" Transformed {dataset_name} Data Saved at: {transformed_path}")
    return df_scaled

# **Step 4: Process All Datasets**
# The training split must be processed first: it is the only one that fits the scaler.
df_train_scaled = preprocess_dataset(df_train, "train_data", fit_scaler=True)
df_validation_scaled = preprocess_dataset(df_validation, "validation_data", fit_scaler=False)
df_test_scaled = preprocess_dataset(df_test, "test_data", fit_scaler=False)
df_production_scaled = preprocess_dataset(df_production, "production_data", fit_scaler=False)

print("\n **Preprocessing Pipeline Completed Successfully! Data is Fully Ready for Model Training & Bias Monitoring.**")

In [None]:
import awswrangler as wr
import pandas as pd

# S3 location of the transformed splits
s3_bucket = "sagemaker-us-east-1-921916832724"
s3_prefix = "Project"

train_transformed_path = f"s3://{s3_bucket}/{s3_prefix}/train_data_transformed_mapped.csv"
validation_transformed_path = f"s3://{s3_bucket}/{s3_prefix}/validation_data_transformed_mapped.csv"
test_transformed_path = f"s3://{s3_bucket}/{s3_prefix}/test_data_transformed_mapped.csv"
production_transformed_path = f"s3://{s3_bucket}/{s3_prefix}/production_data_transformed_mapped.csv"

# Read each transformed split back from S3
df_train_transformed = wr.s3.read_csv(train_transformed_path)
df_validation_transformed = wr.s3.read_csv(validation_transformed_path)
df_test_transformed = wr.s3.read_csv(test_transformed_path)
df_production_transformed = wr.s3.read_csv(production_transformed_path)

print(" Successfully loaded all transformed datasets!")

# Display names and frames, in matching order, shared by the three checks below
dataset_names = ["Train", "Validation", "Test", "Production"]
transformed_frames = [
    df_train_transformed,
    df_validation_transformed,
    df_test_transformed,
    df_production_transformed,
]

# Check 1: total number of missing cells per split
for name, df in zip(dataset_names, transformed_frames):
    missing_values = df.isna().sum().sum()
    print(f" {name} Data - Missing Values: {missing_values}")

# Check 2: which department categories occur in each split
for name, df in zip(dataset_names, transformed_frames):
    unique_departments = df["department_category"].unique()
    print(f" {name} Data - Unique Department Categories: {unique_departments}")

# Check 3: every expected column is present
expected_columns = ["reordered", "user_id", "product_id", "aisle_id", "department_id_encoded", "department_category"]
for name, df in zip(dataset_names, transformed_frames):
    if set(expected_columns).issubset(df.columns):
        print(f" {name} Data - All expected columns are present.")
    else:
        print(f" {name} Data - Missing expected columns!")

print("\n **Verification Completed!** If all checks pass, the dataset is ready for training.")

In [None]:
import sagemaker
import boto3
import pandas as pd
import json
import awswrangler as wr
import io
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from time import gmtime, strftime

# Initializing SageMaker session
sagemaker_session = sagemaker.Session()

# Define S3 bucket and prefix
bucket = "sagemaker-us-east-1-921916832724"
prefix = "Project"

# Transformed inputs and the location for model artifacts
train_s3_path = f"s3://{bucket}/{prefix}/train_data_transformed_mapped.csv"
val_s3_path = f"s3://{bucket}/{prefix}/validation_data_transformed_mapped.csv"
output_s3_path = f"s3://{bucket}/{prefix}/models/"

# Load data from S3 to verify structure
df_train = wr.s3.read_csv(train_s3_path)
df_validation = wr.s3.read_csv(val_s3_path)

# Ensure "department_category" exists
if "department_category" not in df_train.columns:
    raise ValueError(" 'department_category' column is missing from training data!")

# Remove department_category from training data: it is kept only for bias
# analysis and is not a model feature. Column order is preserved, so
# "reordered" (the label) stays in the first position.
train_columns = [col for col in df_train.columns if col != "department_category"]
df_train_filtered = df_train[train_columns]
df_validation_filtered = df_validation[train_columns]

# Upload the cleaned datasets
train_filtered_s3_path = f"s3://{bucket}/{prefix}/train_data_final.csv"
val_filtered_s3_path = f"s3://{bucket}/{prefix}/validation_data_final.csv"

# header=False: the built-in SageMaker XGBoost container expects CSV input with
# no header record and the label in the first column. Writing the header row
# would hand the algorithm a line of column names as if it were data.
wr.s3.to_csv(df_train_filtered, train_filtered_s3_path, index=False, header=False)
wr.s3.to_csv(df_validation_filtered, val_filtered_s3_path, index=False, header=False)

# Save column names separately for later reference
# (the list still includes "department_category", as in the transformed files)
column_names = df_train.columns.tolist()
column_names_path = f"s3://{bucket}/{prefix}/column_names.json"

# Convert JSON data to bytes
json_bytes = json.dumps(column_names).encode("utf-8")

# Upload JSON file to S3 using awswrangler
wr.s3.upload(
    local_file=io.BytesIO(json_bytes),  # Use BytesIO to handle binary data
    path=column_names_path
)

print(f" Cleaned train & validation datasets saved at: {train_filtered_s3_path}, {val_filtered_s3_path}")
print(f" Column names JSON saved to: {column_names_path}")

# Get SageMaker execution role
role = sagemaker.get_execution_role()

# Define unique job name (UTC timestamp suffix)
job_name = "xgb-reorder-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Retrieve XGBoost container image
image_uri = sagemaker.image_uris.retrieve(
    "xgboost", boto3.Session().region_name, version="1.5-1"
)

# Define XGBoost Estimator
xgb_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=50,
    output_path=output_s3_path,
    sagemaker_session=sagemaker_session,
)

# Set XGBoost hyperparameters
xgb_estimator.set_hyperparameters(
    objective="binary:logistic",  # Required for classification
    num_round=200,                # Equivalent to n_estimators
    learning_rate=0.05,
    max_depth=8,
    scale_pos_weight=0.4
)

# Define Training & Validation Data Inputs
train_data = TrainingInput(train_filtered_s3_path, content_type="text/csv")
validation_data = TrainingInput(val_filtered_s3_path, content_type="text/csv")

# Start Training
xgb_estimator.fit({"train": train_data, "validation": validation_data}, job_name=job_name)

print(f" Training Completed: {job_name}")

In [None]:
import sagemaker
import boto3
from sagemaker.model import Model
from sagemaker import get_execution_role
from time import gmtime, strftime

# Session, role and container image used for registration
sagemaker_session = sagemaker.Session()

# Location of the trained model artifact
bucket = "sagemaker-us-east-1-921916832724"
prefix = "Project"
model_artifact_path = f"s3://{bucket}/{prefix}/models/xgb-reorder-2025-02-19-03-14-27/output/model.tar.gz"

# Model Registry group that collects the versions of this model
model_package_group_name = "XGBoost-Reorder-Predictions"

role = get_execution_role()
image_uri = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, version="1.5-1")

# Make sure the Model Package Group exists before registering into it
sagemaker_client = boto3.client("sagemaker")

try:
    # describe_* raises a ClientError when the group cannot be described
    sagemaker_client.describe_model_package_group(ModelPackageGroupName=model_package_group_name)
    print(f" Model Package Group '{model_package_group_name}' already exists.")
except sagemaker_client.exceptions.ClientError:
    # Any ClientError is treated as "group missing" and triggers creation
    sagemaker_client.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription="Model package group for XGBoost reorder predictions",
    )
    print(f" Created Model Package Group: {model_package_group_name}")

# SDK Model object wrapping the artifact and the container image
xgb_model = Model(
    model_data=model_artifact_path,
    role=role,
    image_uri=image_uri,
    sagemaker_session=sagemaker_session,
)

# How the registered package is served: one container, CSV in / CSV out
inference_specification = {
    "Containers": [
        {
            "Image": image_uri,
            "ModelDataUrl": model_artifact_path,
            "Environment": {},
        }
    ],
    "SupportedContentTypes": ["text/csv"],
    "SupportedResponseMIMETypes": ["text/csv"],
}

# Register a new version in the group; it starts out awaiting manual approval
model_package_response = sagemaker_session.sagemaker_client.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription="XGBoost model for reorder prediction",
    InferenceSpecification=inference_specification,
    ModelApprovalStatus="PendingManualApproval",  # Change to "Approved" if needed
)

print(" Model Registered Successfully in SageMaker Model Registry!")
print(f" Model Package ARN: {model_package_response['ModelPackageArn']}")


In [None]:
import boto3
import os
import tarfile
import xgboost as xgb

# Where the trained artifact lives in S3 and where to unpack it locally
bucket = "sagemaker-us-east-1-921916832724"  # Your S3 bucket
model_key = "Project/models/xgb-reorder-2025-02-19-03-14-27/output/model.tar.gz"
local_model_path = "model.tar.gz"
extract_folder = "xgb_model"

# Fetch the archive
print(f" Downloading model from: s3://{bucket}/{model_key}")
s3 = boto3.client("s3")
s3.download_file(Bucket=bucket, Key=model_key, Filename=local_model_path)

# Unpack it
print(f" Extracting model to: {extract_folder}")
with tarfile.open(local_model_path, mode="r:gz") as archive:
    archive.extractall(path=extract_folder)

# The booster is expected under the name "xgboost-model" inside the archive
model_file_path = os.path.join(extract_folder, "xgboost-model")  # Correct path

if not os.path.exists(model_file_path):
    raise FileNotFoundError(f" Model file not found at: {model_file_path}")

# Load the booster for local inference
print(f" Loading model from: {model_file_path}")
xgb_model = xgb.Booster()
xgb_model.load_model(model_file_path)

print(" Model successfully loaded and ready for inference!")



In [None]:
import boto3

#  Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker")

#  Define Model Package Group Name
model_package_group_name = "XGBoost-Reorder-Predictions"

# List model packages in the registry, newest first
response = sagemaker_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    MaxResults=5,  #  Fetch last 5 model versions
    SortBy="CreationTime",
    SortOrder="Descending"
)

#  Display Model Package Versions
# Test for a non-empty list rather than for the key: a group with no versions
# yields an empty list, and checking key presence alone would still print
# "Found" and never reach the "no model package" message.
model_packages = response.get("ModelPackageSummaryList", [])
if model_packages:
    print(f"\n Found Model Package Group: {model_package_group_name}\n")
    for model in model_packages:
        print(f" Model Package Version: {model['ModelPackageVersion']}")
        print(f"   Model Package ARN: {model['ModelPackageArn']}")
        print(f"   Approval Status: {model['ModelApprovalStatus']}")
        print(f"   Created On: {model['CreationTime']}\n")
else:
    print(" No model package found in SageMaker Model Registry!")

In [None]:
import boto3

#  Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker")

#  Define Model Package Group Name
model_package_group_name = "XGBoost-Reorder-Predictions"

# List model packages in the registry, newest first
response = sagemaker_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    MaxResults=5,  #  Fetch last 5 model versions
    SortBy="CreationTime",
    SortOrder="Descending"
)

#  Display Model Package Versions
# Test for a non-empty list rather than for the key: a group with no versions
# yields an empty list, and checking key presence alone would still print
# "Found" and never reach the "no model package" message.
model_packages = response.get("ModelPackageSummaryList", [])
if model_packages:
    print(f"\n Found Model Package Group: {model_package_group_name}\n")
    for model in model_packages:
        print(f" Model Package Version: {model['ModelPackageVersion']}")
        print(f"   Model Package ARN: {model['ModelPackageArn']}")
        print(f"   Approval Status: {model['ModelApprovalStatus']}")
        print(f"   Created On: {model['CreationTime']}\n")
else:
    print(" No model package found in SageMaker Model Registry!")

In [None]:
# Flip version 1 of the package group to "Approved".
# Uses the `sagemaker_client` created in an earlier cell.
approval_request = {
    "ModelPackageArn": "arn:aws:sagemaker:us-east-1:921916832724:model-package/XGBoost-Reorder-Predictions/1",
    "ModelApprovalStatus": "Approved",
}
sagemaker_client.update_model_package(**approval_request)

print(" Model Approved!")

In [None]:
import sagemaker
from sagemaker.model import Model

#  Model Package ARN of the approved version (kept for reference; the model
#  below is built directly from the container image and the S3 artifact)
model_package_arn = "arn:aws:sagemaker:us-east-1:921916832724:model-package/XGBoost-Reorder-Predictions/1"

#  SageMaker Session
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

#  Create a SageMaker Model from the container image and the trained artifact.
#  It is bound to its own name on purpose: `xgb_model` already holds the local
#  xgboost Booster loaded earlier, and the evaluation cell calls
#  `xgb_model.predict(...)` on it. Rebinding `xgb_model` here would break that call.
xgb_sagemaker_model = Model(
    image_uri=sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, version="1.5-1"),
    model_data="s3://sagemaker-us-east-1-921916832724/Project/models/xgb-reorder-2025-02-19-03-14-27/output/model.tar.gz",
    role=role,
    sagemaker_session=sagemaker_session,
    name="XGBoost-Reorder-Predictions"
)

#  Register the model with SageMaker (no endpoint is created); the name above
#  is what the batch transform and Clarify cells refer to.
xgb_sagemaker_model.create()
print(" Model Created Successfully in SageMaker!")

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve

# Read the transformed test split
test_data_path = "s3://sagemaker-us-east-1-921916832724/Project/test_data_transformed_mapped.csv"  # Update if needed
df_test = pd.read_csv(test_data_path)

# Keep an untouched copy (with department_category) for per-category analysis
df_test_original = df_test.copy()
department_category = df_test_original["department_category"]

# Label and feature matrix; the label and the category column are not features
non_feature_columns = ["reordered", "department_category"]
y_test = df_test["reordered"]
X_test = df_test.drop(columns=non_feature_columns)

# XGBoost's native API consumes a DMatrix
dtest = xgb.DMatrix(X_test)

# Predicted probabilities, then hard labels at a 0.5 threshold
y_pred_probs = xgb_model.predict(dtest)
y_pred = (y_pred_probs > 0.5).astype(int)

# Attach the predictions to the untouched copy
df_test_original["predicted_label"] = y_pred
df_test_original["predicted_probability"] = y_pred_probs

# Threshold-based metrics use the hard labels; AUC-ROC uses the probabilities
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_probs)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report
print(" Model Evaluation Results:")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
    ("AUC-ROC", auc_roc),
):
    print(f" {metric_name}: {metric_value:.4f}")
print(" Confusion Matrix:")
print(conf_matrix)

# ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC-ROC = {auc_roc:.4f}")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()

# Confusion matrix as a heatmap
class_names = ["Not Reordered", "Reordered"]
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
import pandas as pd
import boto3

# Source object and the local name of the header-free copy
bucket_name = "sagemaker-us-east-1-921916832724"
s3_key = "Project/test_data_transformed_mapped.csv"
local_file_path = "test_data_transformed_mapped_no_headers.csv"

# Read the CSV straight from S3 (pandas delegates to s3fs; signed requests)
source_uri = f"s3://{bucket_name}/{s3_key}"
df = pd.read_csv(source_uri, storage_options={"anon": False})

# Write every column back out, minus the header row
df.to_csv(local_file_path, index=False, header=False)

# Push the header-free copy to S3 under the Project/ prefix
s3_client = boto3.client("s3")
s3_client.upload_file(
    Filename=local_file_path,
    Bucket=bucket_name,
    Key="Project/test_data_transformed_mapped_no_headers.csv",
)

print("Updated file uploaded successfully to S3!")

In [None]:
import pandas as pd

#  Define correct column names
columns = ["reordered", "user_id", "product_id", "aisle_id", "department_id_encoded", "department_category"]

#  Load CSV with proper columns
file_path = "s3://sagemaker-us-east-1-921916832724/Project/test_data_transformed_mapped.csv"
# header=0: the file was written with a header row, so that row has to be
# consumed and replaced by `columns`. With header=None the header line would be
# read as the first data record and end up in the saved file.
df = pd.read_csv(file_path, names=columns, header=0)

#  Drop only the "department_category" column
df = df.drop(columns=["department_category"], errors="ignore")

#  Save without headers (target column included)
cleaned_file_path = "test_data_transformed_mapped_with_target.csv"
df.to_csv(cleaned_file_path, index=False, header=False)

print(" Dataset cleaned and saved with target column:", cleaned_file_path)

In [None]:
# Preview the first rows of `df` as left by the previous cell
print(df.head())

In [None]:
import pandas as pd

#  Load the CSV, consuming its header row
file_path = "s3://sagemaker-us-east-1-921916832724/Project/test_data_transformed_mapped.csv"
# header=0: the source file has a header line. Reading it with header=None would
# turn that line into the first data record, and the "headerless" file written
# below would then start with a row of column names - which is what the batch
# transform job would receive as its first input record.
df = pd.read_csv(file_path, header=0)

#  Drop only the "department_category" column (last column)
df = df.iloc[:, :-1]  # Drop the last column

#  Save the cleaned dataset without headers
cleaned_file_path = "test_data_transformed_mapped_no_headers.csv"
df.to_csv(cleaned_file_path, index=False, header=False)

print(" Dataset cleaned and saved without headers:", cleaned_file_path)


In [None]:
import boto3

# Destination of the header-free test file
s3_bucket = "sagemaker-us-east-1-921916832724"
s3_key = "Project/test_data_transformed_mapped_no_headers.csv"

# `cleaned_file_path` is the local file written by the preceding cleaning cell
s3_client = boto3.client("s3")
s3_client.upload_file(Filename=cleaned_file_path, Bucket=s3_bucket, Key=s3_key)

print(" Updated dataset uploaded to S3:", f"s3://{s3_bucket}/{s3_key}")

In [None]:
from sagemaker.transformer import Transformer

#  Path to the updated dataset in S3 (header-free; label in column 0, then the features)
test_data_s3_path = "s3://sagemaker-us-east-1-921916832724/Project/test_data_transformed_mapped_no_headers.csv"

#  Define Transformer
#  Refers, by name, to the SageMaker model created in an earlier cell.
transformer = Transformer(
    model_name="XGBoost-Reorder-Predictions",
    instance_count=1,
    instance_type="ml.m5.large",
    strategy="MultiRecord",
    assemble_with="Line",
    output_path="s3://sagemaker-us-east-1-921916832724/Project/BatchInferenceResults/",
    accept="text/csv"
)

#  Execute Batch Transform Job
transformer.transform(
    data=test_data_s3_path,
    split_type="Line",
    content_type="text/csv",
    input_filter="$[1:]",  # send only columns 1.. to the model, i.e. leave out column 0 (the label)
    join_source="Input"  # each output line is the input record with the prediction joined to it
)

#  Wait for job completion
transformer.wait()

print(" Batch inference job completed successfully!")

## SageMaker Clarify Explainability Model

In [None]:
import boto3

# Bucket and key prefix under which the batch transform wrote its results
s3_bucket = "sagemaker-us-east-1-921916832724"
s3_prefix = "Project/BatchInferenceResults/"

s3_client = boto3.client("s3")
response = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_prefix)

# "Contents" is missing from the response when nothing matches the prefix
print(" Files in Batch Inference Output:")
for key in (entry["Key"] for entry in response.get("Contents", [])):
    print(key)

In [None]:
import pandas as pd

# Batch transform output object and the local file it is copied to
predictions_key = "Project/BatchInferenceResults/test_data_transformed_mapped_no_headers.csv.out"
output_file = "test_data_predictions.csv"

# `s3_client` and `s3_bucket` come from the listing cell above
s3_client.download_file(s3_bucket, predictions_key, output_file)

# The output has no header row
df_predictions = pd.read_csv(output_file, header=None)

print(" First 5 predictions:")
print(df_predictions.head())

In [None]:
import pandas as pd
import boto3

#  Column names for the batch output: the input columns followed by the prediction
headers = ["reordered", "user_id", "product_id", "aisle_id", "department_id_encoded", "probability"]

#  S3 object to read and S3 object to write
s3_bucket = "sagemaker-us-east-1-921916832724"
s3_input_path = "Project/BatchInferenceResults/test_data_transformed_mapped_no_headers.csv.out"
s3_output_path = "Project/BatchInferenceResults/test_data_transformed_mapped_with_headers.csv"

#  Local working files
local_file = "test_data_transformed_mapped_no_headers.csv.out"
updated_file = "test_data_transformed_mapped_with_headers.csv"

#  Fetch the raw batch output
s3_client = boto3.client("s3")
s3_client.download_file(s3_bucket, s3_input_path, local_file)

#  Read it header-free, then label the columns
#  (raises if the column count differs from len(headers))
df = pd.read_csv(local_file, header=None).set_axis(headers, axis=1)

#  Write the labelled copy and send it back to S3
df.to_csv(updated_file, index=False, header=True)
s3_client.upload_file(updated_file, s3_bucket, s3_output_path)

print(f" Updated File Uploaded: s3://{s3_bucket}/{s3_output_path}")

In [None]:
import pandas as pd
# Re-read the local, header-labelled batch output and list the distinct
# (scaled) department_id_encoded values it contains
df = pd.read_csv("test_data_transformed_mapped_with_headers.csv")
print(df["department_id_encoded"].unique())

In [None]:
import pandas as pd

# Load dataset (Replace with actual S3 path)
# NOTE(review): no visible cell writes "test_data_transformed_mapped_fixed.csv";
# the cells above only produce "..._with_headers.csv". Confirm where this object
# comes from before relying on Restart & Run All.
df = pd.read_csv("s3://sagemaker-us-east-1-921916832724/Project/BatchInferenceResults/test_data_transformed_mapped_fixed.csv")

# Display first few rows
print(df.head())

In [None]:
# Import Required Libraries
import boto3
import sagemaker
from sagemaker import Session
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.clarify import DataConfig, ModelConfig, SHAPConfig, SageMakerClarifyProcessor

# Initialize SageMaker Session
session = Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name

# Define S3 Bucket and Prefix for the Project
bucket = "sagemaker-us-east-1-921916832724"
prefix = "Project"

# Define S3 Paths for Explainability
# NOTE(review): the "_fixed.csv" input is not written by any visible cell - confirm its origin.
s3_data_path = f"s3://{bucket}/{prefix}/BatchInferenceResults/test_data_transformed_mapped_fixed.csv"
s3_output_path = f"s3://{bucket}/{prefix}/explainability_output"

# Define Data Config for Clarify
# `headers` names the label plus the four feature columns, in file order.
data_config = DataConfig(
    s3_data_input_path=s3_data_path,  # Use the correct dataset without headers
    s3_output_path=s3_output_path,
    label="reordered",  # Target column
    headers=["reordered", "user_id", "product_id", "aisle_id", "department_id_encoded"],
    dataset_type="text/csv"
)

print(" Data Config Set!")

# Define Model Config
# Refers, by name, to the SageMaker model created earlier in the notebook.
model_config = ModelConfig(
    model_name="XGBoost-Reorder-Predictions",  # Trained XGBoost Model in SageMaker
    instance_type="ml.m5.xlarge",
    instance_count=1,
    content_type="text/csv",
    accept_type="text/csv"
)

print(" Model Config Set!")

# Define SHAP (Explainability) Config
# The baseline row has one entry per feature (four features, label excluded).
shap_config = SHAPConfig(
    baseline=[[-1, -1, -1, -1]],  # Dummy baseline (adjust as needed)
    num_samples=50,  # Number of samples to use for SHAP
    agg_method="mean_abs",
    save_local_shap_values=True  # Save detailed SHAP values
)

print(" SHAP Config Set!")

# Initialize SageMaker Clarify Processor
clarify_processor = SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=session
)

print(" Clarify Processor Initialized!")

# Run Explainability Job
clarify_processor.run_explainability(
    data_config=data_config,
    model_config=model_config,
    explainability_config=shap_config
)

# Printed once run_explainability() has returned
print(f" Explainability Job Started! Results will be saved to: {s3_output_path}")

In [None]:
!pip install shap

In [None]:
import boto3
import pandas as pd
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Where Clarify wrote the per-row SHAP values, and where to keep a local copy
bucket_name = "sagemaker-us-east-1-921916832724"
s3_prefix = "Project/explainability_output"
s3_file_name = "explanations_shap/out.csv"  # SHAP output file
local_shap_file = "/tmp/shap_out.csv"

# Fetch the SHAP output
s3_client = boto3.client("s3")
shap_object_key = f"{s3_prefix}/{s3_file_name}"
s3_client.download_file(bucket_name, shap_object_key, local_shap_file)
print(f" SHAP output file downloaded: {local_shap_file}")

# One row per explained record, one column per SHAP output column
shap_df = pd.read_csv(local_shap_file)

print(" First few rows of SHAP explanations:")
print(shap_df.head())

# -------------------------
#  GLOBAL FEATURE IMPORTANCE
# -------------------------
# Rank the columns by mean |SHAP|, largest first, and draw them as horizontal bars
shap_values_mean = shap_df.abs().mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=shap_values_mean.values, y=shap_values_mean.index)
plt.xlabel("Mean |SHAP Value|", fontsize=12)
plt.ylabel("Feature", fontsize=12)
plt.title("Global Feature Importance", fontsize=14)
plt.show()

# -------------------------
#  SHAP SUMMARY PLOT (Without Explainer)
# -------------------------
# Plot the precomputed values directly; no feature values are supplied
shap_values_np = shap_df.to_numpy()

shap.summary_plot(shap_values_np, feature_names=shap_df.columns)


- **Global Feature Importance (First Image)**

    - The product_id has the highest importance, meaning it has the most influence on whether a product is reordered.
    - Department ID and Aisle ID also play significant roles.
    - User ID has the least importance in the prediction.

- **SHAP Summary Plot (Second Image)**

    - Each dot represents a SHAP value for an instance in the dataset.
    - A positive SHAP value means the feature increases the prediction probability.
    - A negative SHAP value decreases the probability of a product being reordered.
    - The spread of dots indicates how much variability a feature has in affecting the model's prediction.

##  Zoom in on a specific feature (product_id)

In [None]:
import boto3
import pandas as pd
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
import shap
import matplotlib.pyplot as plt
import pandas as pd

# Load SHAP values (the Clarify output read directly from S3)
shap_values = pd.read_csv("s3://sagemaker-us-east-1-921916832724/Project/explainability_output/explanations_shap/out.csv")  # Update with your actual path
# NOTE(review): this slices off the LAST SHAP column; confirm that column really is
# a label. `features` is not referenced in the cells immediately following.
features = shap_values.columns[:-1]  # Excluding label

# Load the original dataset (if needed) - the transformed test split, with header row
data = pd.read_csv("s3://sagemaker-us-east-1-921916832724/Project/test_data_transformed_mapped.csv")  # Update with actual path

In [None]:
# Column names of the transformed test split
print(data.columns)

In [None]:
# Column names of the SHAP output as read from S3 (before any renaming)
print(shap_values.columns)

In [None]:
# Remove the literal "_label0" suffix so the SHAP column names line up with
# the feature column names in `data`
shap_values.columns = shap_values.columns.str.replace("_label0", "", regex=False)

In [None]:
# Column names of the SHAP frame after the "_label0" suffix was stripped
print(shap_values.columns)

In [None]:
# Compare row counts: both frames are expected to describe the same test records
print("Shape of shap_values:", shap_values.shape)  # expected (rows, 4): one column per feature - TODO confirm
print("Shape of data:", data.shape)  # (rows, 6): "reordered", the 4 features and "department_category"

In [None]:
# Drop the label column 'reordered' from data; "department_category" is kept
# because later cells group and colour by it
data_filtered = data.drop(columns=["reordered"])

# Print shapes to confirm the row counts match
print("Shape of shap_values:", shap_values.shape)  # Should be (758816, 4)
print("Shape of data_filtered:", data_filtered.shape)  # Should be (758816, 5): 4 features + department_category

# Ensure column names match (data_filtered has the extra "department_category")
print("SHAP Columns:", shap_values.columns)
print("Data Columns:", data_filtered.columns)

In [None]:
import pandas as pd
import numpy as np

# Compute mean absolute SHAP values per feature (one value per SHAP column)
shap_importance = np.abs(shap_values).mean(axis=0)

# Find the row whose product_id feature carries the largest |SHAP| contribution.
# (Taking argmax of `shap_importance` would return the position of a feature
# column - a number between 0 and 3 - which is not a row of the dataset.)
# Requires the "_label0" suffix to have been stripped so the column is named "product_id".
product_shap_magnitude = np.abs(shap_values["product_id"].to_numpy())
most_influential_product_index = int(product_shap_magnitude.argmax())

# Look up that row's product_id. Assumes shap_values and data_filtered list the
# same records in the same order - TODO confirm. The value printed is the
# standardized product_id stored in the transformed dataset.
most_influential_product_id = data_filtered["product_id"].iloc[most_influential_product_index]
print("Most influential product ID:", most_influential_product_id)

In [None]:
import shap
import matplotlib.pyplot as plt

# Convert department_category to categorical values for color mapping
# (this modifies data_filtered in place; later cells see the categorical dtype)
data_filtered['department_category'] = data_filtered['department_category'].astype('category')

# Create color mapping for categories: category label -> integer position
category_colors = dict(zip(data_filtered['department_category'].cat.categories, range(len(data_filtered['department_category'].cat.categories))))

# Map the department_category column to numerical values for coloring
color_values = data_filtered['department_category'].map(category_colors)

# Generate the SHAP summary plot
# Feature values are taken from the data_filtered columns that share the SHAP column names.
# NOTE(review): verify that summary_plot's `color` argument accepts a per-row
# Series and actually colours the dots by category when feature values are supplied.
shap.summary_plot(shap_values.values, data_filtered[shap_values.columns], plot_type="dot", color=color_values)

# Show the plot
plt.show()

In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# Pair every row's SHAP values with its department_category
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.columns)
shap_df['department_category'] = data_filtered['department_category']

# Mean absolute SHAP value per category, summed over the features.
# abs() is applied BEFORE averaging: averaging the signed values first lets
# positive and negative contributions cancel, which understates a category's
# influence and is not a "mean absolute" value.
mean_shap_per_category = (
    shap_df[shap_values.columns]
    .abs()
    .groupby(shap_df['department_category'])
    .mean()
    .sum(axis=1)
)

# Identify the most influential department_category
most_influential_category = mean_shap_per_category.idxmax()
print(f"Most Influential Department Category: {most_influential_category}")

# Keep only the rows belonging to that category (same mask for data and SHAP values)
category_mask = data_filtered['department_category'] == most_influential_category
filtered_data = data_filtered[category_mask]
filtered_shap_values = shap_values.values[category_mask]

# Generate SHAP summary plot for the most influential category
shap.summary_plot(filtered_shap_values, filtered_data[shap_values.columns])

# Show plot
plt.show()

In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# Pair every row's SHAP values with its department_category
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.columns)
shap_df['department_category'] = data_filtered['department_category']

# Mean absolute SHAP value per category, summed over the features.
# abs() is applied BEFORE averaging so that positive and negative contributions
# do not cancel; this matches the "Mean Absolute SHAP Value" axis label below.
mean_shap_per_category = (
    shap_df[shap_values.columns]
    .abs()
    .groupby(shap_df['department_category'])
    .mean()
    .sum(axis=1)
)

# Get the top 10 most influential categories
top_10_categories = mean_shap_per_category.sort_values(ascending=False).head(10)

# Print the top 10 influential categories
print("Top 10 Most Influential Department Categories:\n", top_10_categories)

# Plot the top 10 categories
plt.figure(figsize=(10, 6))
top_10_categories.plot(kind='barh', color='steelblue')
plt.xlabel("Mean Absolute SHAP Value")
plt.ylabel("Department Category")
plt.title("Top 10 Most Influential Department Categories (SHAP Analysis)")
plt.gca().invert_yaxis()  # Invert y-axis so the largest bar is at the top
plt.show()

In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# Mean absolute SHAP value per department_category.
# |SHAP| is taken per row *before* averaging; taking .abs() of the per-category mean
# would let opposite-signed contributions cancel and understate a category's influence.
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.columns).abs()

# Positional assignment: shap_df carries a fresh 0..n-1 index, so aligning on
# data_filtered's index labels could silently produce NaN categories.
shap_df['department_category'] = data_filtered['department_category'].to_numpy()

# Per category: mean |SHAP| of each feature, summed over all features
mean_shap_per_category = shap_df.groupby('department_category').mean().sum(axis=1)

# Get the 10 least influential categories
bottom_10_categories = mean_shap_per_category.sort_values(ascending=True).head(10)

# Print the 10 least influential categories
print("10 Least Influential Department Categories:\n", bottom_10_categories)

# Plot the bottom 10 categories
plt.figure(figsize=(10, 6))
bottom_10_categories.plot(kind='barh', color='lightcoral')
plt.xlabel("Mean Absolute SHAP Value")
plt.ylabel("Department Category")
plt.title("10 Least Influential Department Categories (SHAP Analysis)")
plt.gca().invert_yaxis()  # Smallest bar on top
plt.show()

In [None]:
# Load the transformed/mapped test split straight from S3; the next cell copies its
# 'reordered' label onto data_filtered.
# NOTE(review): the bucket name is hard-coded (it embeds an account id) and reading an
# s3:// URL with pandas presumably relies on s3fs being installed — confirm both.
data = pd.read_csv("s3://sagemaker-us-east-1-921916832724/Project/test_data_transformed_mapped.csv")

In [None]:
# Copy the 'reordered' label from the freshly loaded frame onto data_filtered.
# The assignment is index-aligned, so rows are matched by their index labels.
if 'reordered' not in data.columns:
    print("Error: 'reordered' column not found in data!")
else:
    data_filtered['reordered'] = data['reordered']
    print("Successfully added 'reordered' column to data_filtered!")

In [None]:
# Mean absolute SHAP value per department_category, restricted to reordered rows.
# |SHAP| is taken per row before averaging so opposite-signed contributions cannot cancel.
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.columns).abs()

# Attach the helper columns by position: shap_df has a fresh 0..n-1 index, so a
# label-aligned assignment could silently insert NaN for rows whose index differs.
shap_df['department_category'] = data_filtered['department_category'].to_numpy()
shap_df['reordered'] = data_filtered['reordered'].to_numpy()  # Add reordered column

# Compute mean |SHAP| for each category where reordered = 1 (reorders)
mean_shap_per_category_reordered = (
    shap_df[shap_df['reordered'] == 1]  # Filter only reordered entries
    # The label is not a SHAP value: left in, its mean (exactly 1.0 here) would be
    # added to every category's total.
    .drop(columns='reordered')
    .groupby('department_category')
    .mean()
    .sum(axis=1)  # Sum mean |SHAP| over all features
)

# Get the top 10 most influential categories leading to reorders
top_10_reordered_categories = mean_shap_per_category_reordered.sort_values(ascending=False).head(10)

# Print the 10 most influential department categories for reorders
print("Top 10 Department Categories Influencing Reorders:\n", top_10_reordered_categories)

# Plot the top 10 department categories
plt.figure(figsize=(10, 6))
top_10_reordered_categories.plot(kind='barh', color='royalblue')
plt.xlabel("Mean Absolute SHAP Value")
plt.ylabel("Department Category")
plt.title("Top 10 Department Categories Leading to Reorders (SHAP Analysis)")
plt.gca().invert_yaxis()  # Largest bar on top
plt.show()

In [None]:
# Rows whose label says the product WAS reordered. The mask is a plain boolean array so
# it indexes both the DataFrame and the SHAP matrix by position.
reordered_mask = (data_filtered['reordered'] == 1).to_numpy()
reordered_data = data_filtered[reordered_mask]
shap_reordered = shap_values.values[reordered_mask]

# |SHAP| per row and feature; taking the absolute value before averaging keeps
# opposite-signed contributions from cancelling inside a product.
shap_reordered_df = pd.DataFrame(shap_reordered, columns=shap_values.columns).abs()

# Group by the product ids as an external key instead of writing them into a
# 'product_id' column: when the explained model has a product_id feature, that
# assignment would overwrite the feature's SHAP values and drop them from the total.
shap_product_reordered = (
    shap_reordered_df
    .groupby(reordered_data['product_id'].to_numpy())
    .mean()
    .sum(axis=1)
    .rename_axis('product_id')  # keep the index name downstream cells merge on
)

# Get the **10 most influential products** for reordering
top_10_most_influential_products = shap_product_reordered.nlargest(10)

# Display results
print("Top 10 Most Influential Products for Reordering:")
print(top_10_most_influential_products)

In [None]:
import matplotlib.pyplot as plt

# Look up the department category of each product in the most influential list.
# De-duplicate the lookup table *before* merging so the join does not first expand to
# one row per order line of every product.
product_categories = data_filtered[['product_id', 'department_category']].drop_duplicates()
most_influential_products_df = top_10_most_influential_products.reset_index().merge(
    product_categories,
    on='product_id',
    how='left'
)

# One bar per product. Using the bare category as the y value would draw every product
# that shares a category on the same row, hiding all but one of them.
bar_labels = (
    most_influential_products_df['product_id'].astype(str)
    + " ("
    + most_influential_products_df['department_category'].astype(str)
    + ")"
)

# Plot the results (column 1 holds the aggregated SHAP value after reset_index)
plt.figure(figsize=(12, 6))
plt.barh(bar_labels, most_influential_products_df.iloc[:, 1], color='royalblue')

plt.xlabel("Mean Absolute SHAP Value")
plt.ylabel("Product ID (Department Category)")
plt.title("Top 10 Most Influential Products for Reordering")
plt.gca().invert_yaxis()  # Largest bar on top

plt.show()


In [None]:
# Rows whose label says the product was NOT reordered. The mask is a plain boolean
# array so it indexes both the DataFrame and the SHAP matrix by position.
not_reordered_mask = (data_filtered['reordered'] == 0).to_numpy()
not_reordered_data = data_filtered[not_reordered_mask]
shap_not_reordered = shap_values.values[not_reordered_mask]

# |SHAP| per row and feature; taking the absolute value before averaging keeps
# opposite-signed contributions from cancelling inside a product.
shap_not_reordered_df = pd.DataFrame(shap_not_reordered, columns=shap_values.columns).abs()

# Group by the product ids as an external key instead of writing them into a
# 'product_id' column: when the explained model has a product_id feature, that
# assignment would overwrite the feature's SHAP values and drop them from the total.
shap_product_not_reordered = (
    shap_not_reordered_df
    .groupby(not_reordered_data['product_id'].to_numpy())
    .mean()
    .sum(axis=1)
    .rename_axis('product_id')  # keep the index name downstream cells merge on
)

# Get the 10 products with the **lowest** SHAP impact (least influential for reordering)
least_influential_products = shap_product_not_reordered.nsmallest(10)

# Display the result
print("Top 10 Products Least Likely to Be Not Reordered:")
print(least_influential_products)

In [None]:
import matplotlib.pyplot as plt

# Look up the department category of each product in the least influential list.
# De-duplicate the lookup table *before* merging so the join does not first expand to
# one row per order line of every product.
product_categories = data_filtered[['product_id', 'department_category']].drop_duplicates()
least_influential_products_df = least_influential_products.reset_index().merge(
    product_categories,
    on='product_id',
    how='left'
)

# One bar per product. Using the bare category as the y value would draw every product
# that shares a category on the same row, hiding all but one of them.
bar_labels = (
    least_influential_products_df['product_id'].astype(str)
    + " ("
    + least_influential_products_df['department_category'].astype(str)
    + ")"
)

# Plot the results (column 1 holds the aggregated SHAP value after reset_index)
plt.figure(figsize=(12, 6))
plt.barh(bar_labels, least_influential_products_df.iloc[:, 1], color='darkorange')

plt.xlabel("Mean Absolute SHAP Value")
plt.ylabel("Product ID (Department Category)")
plt.title("Top 10 Least Influential Products for Not Being Reordered")
plt.gca().invert_yaxis()  # Smallest bar on top

plt.show()

In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# Fail early with a clear message: the lookup a few lines below would raise a bare
# KeyError anyway if the label column were missing.
if 'reordered' not in data_filtered.columns:
    raise KeyError("'reordered' column not found in data_filtered; run the cell that adds it first.")

# Mean absolute SHAP value per department_category, restricted to non-reordered rows.
# |SHAP| is taken per row before averaging so opposite-signed contributions cannot cancel.
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.columns).abs()

# Attach the helper columns by position: shap_df has a fresh 0..n-1 index, so a
# label-aligned assignment could silently insert NaN for rows whose index differs.
shap_df['department_category'] = data_filtered['department_category'].to_numpy()
shap_df['reordered'] = data_filtered['reordered'].to_numpy()  # Add reordered column

# Compute mean |SHAP| for each category where reordered = 0 (not reorders)
mean_shap_per_category_not_reordered = (
    shap_df[shap_df['reordered'] == 0]  # Filter only non-reordered entries
    .drop(columns='reordered')  # the label is not a SHAP value; keep it out of the total
    .groupby('department_category')
    .mean()
    .sum(axis=1)  # Sum mean |SHAP| over all features
)

# Get the top 10 most influential categories leading to not reorders
top_10_not_reordered_categories = mean_shap_per_category_not_reordered.sort_values(ascending=False).head(10)

# Print the 10 most influential department categories for not reorders
print("Top 10 Department Categories Influencing Not Reorders:\n", top_10_not_reordered_categories)

# Plot the top 10 department categories
plt.figure(figsize=(10, 6))
top_10_not_reordered_categories.plot(kind='barh', color='crimson')
plt.xlabel("Mean Absolute SHAP Value")
plt.ylabel("Department Category")
plt.title("Top 10 Department Categories Leading to Not Reorders (SHAP Analysis)")
plt.gca().invert_yaxis()  # Largest bar on top
plt.show()

**Summary of SHAP Analysis on Reordering Behavior**:


1. General Feature Importance
    - Product ID has the highest impact on reorder predictions, followed by Department ID (Encoded), Aisle ID, and User ID.
    - The most influential feature suggests that the specific products and their respective department associations play a significant role in predicting reordering behavior.

2. Most Influential Department Categories
    - The top department categories driving reordering behavior include:
        - Spices & Seasonings
        - Baking Supplies & Decor
        - Baking Ingredients
        - Condiments
        - Ice Cream Toppings
        - Preserved Dips & Spreads
        - Salad Dressing Toppings
        - Doughs, Gelatins & Bake Mixes
        - Milk
        - Pickled Goods & Olives

These categories typically consist of frequently used ingredients and perishables, which explains their likelihood of being reordered.

3. Least Influential Department Categories
    - The department categories with the least impact on reorder likelihood include:
        - Fruit & Vegetable Snacks
        - Other Creams & Cheeses
        - Frozen Juice
        - Frozen Breakfast
        - Packaged Cheese
        - Soft Drinks
        - Refrigerated Pudding & Desserts
        - Frozen Meals
        - Frozen Appetizers & Sides
        - Juice Nectars

These categories may be reordered less frequently because of their longer shelf life or sporadic purchase habits.

4. Most Influential Products for Reordering
    - The top individual products contributing to reorder likelihood are mapped to the following department categories:

        - Spices & Seasonings
        - Baking Ingredients
        - Baking Supplies & Decor

This reinforces the trend that frequently used ingredients are strong predictors of repeat purchases.

5. Least Influential Products for Reordering
    - The least impactful products (not likely to be reordered) belong to:
        - Tortillas & Flat Bread
        - Crackers
        - Coffee
        - Fresh Vegetables
        - Lunch Meat
        - Cookies & Cakes

These products might be purchased in bulk, have longer consumption cycles, or be purchased only occasionally.

**Conclusion**

The analysis successfully identified which product categories and specific products most strongly drive reordering behavior.
The most frequently reordered items tend to be core ingredients and consumables, while packaged, frozen, and impulse-purchase items are less likely to be reordered.
The insights from this analysis can help businesses optimize inventory management, personalized recommendations, and customer retention strategies.

# Continuous Integration and Continuous Deployment

## A SageMaker Pipeline

In [None]:
!pip install -U sagemaker


In [None]:
import sys

import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Regular SDK session: source of the region and of the default S3 bucket used below.
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# IAM role the processing / training jobs and the pipeline itself run under.
role = sagemaker.get_execution_role()

# Session handed to every processor, estimator and model that becomes a pipeline step.
pipeline_session = PipelineSession()

# Model Registry group that the register step adds model packages to.
model_package_group_name = "InstaCartModelPackageGroupName"

In [None]:
# Region-specific bucket name for the SageMaker example files.
# NOTE(review): no later cell in this section reads `bucket_name` (uploads go to
# `default_bucket`) — it looks like a leftover; confirm before removing.
bucket_name = f"sagemaker-example-files-prod-{region}"

In [None]:
# Local staging folder for the CSVs uploaded to S3 in the next two cells.
# NOTE(review): nothing visible here downloads the CSVs into data/ — presumably they
# are placed there manually; confirm.
!mkdir -p data

In [None]:
# Stage the training CSV under s3://<default_bucket>/instacart; the returned URI
# becomes the default of the InputData pipeline parameter.
s3 = boto3.resource("s3")

base_uri = f"s3://{default_bucket}/instacart"
local_path = "data/02_19_2025_train_data_transformed_mapped.csv"

input_data_uri = sagemaker.s3.S3Uploader.upload(local_path=local_path, desired_s3_uri=base_uri)
print(input_data_uri)

In [None]:
# Stage the production CSV under s3://<default_bucket>/instacart; the returned URI
# becomes the default of the BatchData pipeline parameter (batch-transform input).
s3 = boto3.resource("s3")

base_uri = f"s3://{default_bucket}/instacart"
local_path = "data/02_19_2025_production_data_transformed_mapped.csv"

batch_data_uri = sagemaker.s3.S3Uploader.upload(local_path=local_path, desired_s3_uri=base_uri)
print(batch_data_uri)

### Define Parameters to Parametrize Pipeline Execution

In [None]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Pipeline parameters. Each default can be overridden per execution through
# pipeline.start(parameters={...}) using the `name` given here.

# Number of instances for the preprocessing job
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
# Instance type of the training job (consumed by the XGBoost Estimator)
instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
# Approval status the model package is registered with
model_approval_status = ParameterString(
    name="ModelApprovalStatus", default_value="PendingManualApproval"
)
# S3 URI of the training CSV (input of the processing step)
input_data = ParameterString(
    name="InputData",
    default_value=input_data_uri,
)
# S3 URI of the CSV scored by the batch-transform step
batch_data = ParameterString(
    name="BatchData",
    default_value=batch_data_uri,
)
# Upper bound on the test-set MSE; above it the condition step routes to the fail step.
# NOTE(review): the label `reordered` is filtered as 0/1 elsewhere in this notebook, so an
# MSE above 6.0 looks practically unreachable — confirm this threshold is meaningful.
mse_threshold = ParameterFloat(name="MseThreshold", default_value=6.0)

### Define a Processing Step for Feature Engineering

In [None]:
# Folder that the %%writefile cells below write preprocessing.py and evaluation.py into.
!mkdir -p code

In [None]:
%%writefile code/preprocessing.py
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

COLUMN_ORDER = [
    "reordered",  # Label (first column)
    "user_id",
    "product_id",
    "aisle_id",
    "department_id_encoded",
    "department_category"
]

DTYPES = {
    "reordered": np.float64,
    "user_id": np.float64,
    "product_id": np.float64,
    "aisle_id": np.float64,
    "department_id_encoded": np.float64,
    "department_category": str
}

if __name__ == "__main__":
    base_dir = "/opt/ml/processing"

    # Load data with correct column order and dtypes
    df = pd.read_csv(
        f"{base_dir}/input/02_19_2025_train_data_transformed_mapped.csv",
        header=0,
        usecols=COLUMN_ORDER,
        dtype=DTYPES
    )

    # Separate label and features
    y = df.pop("reordered").values.reshape(-1, 1)  # Shape: (n_samples, 1)
    X = df

    # Define transformers
    numeric_features = [col for col in X.columns if col != "department_category"]
    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_features = ["department_category"]
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # Force dense output
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    # Preprocess features (output is dense)
    X_pre = preprocessor.fit_transform(X)  # Shape: (n_samples, n_features)

    # Combine and shuffle
    data = np.hstack((y, X_pre))  # Works because both are 2D
    np.random.shuffle(data)

    # Split into train, validation, test
    train, val, test = np.split(data, [int(0.7*len(data)), int(0.85*len(data))])

    # Save datasets
    pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    pd.DataFrame(val).to_csv(f"{base_dir}/validation/validation.csv", header=False, index=False)
    pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False)

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor


# scikit-learn container version used to run code/preprocessing.py
framework_version = "1.2-1"

# Processor for the feature-engineering step. The instance count comes from the
# ProcessingInstanceCount pipeline parameter; the instance type is fixed here.
sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type="ml.m5.xlarge",
    instance_count=processing_instance_count,
    base_job_name="sklearn-instacart-process",
    role=role,
    sagemaker_session=pipeline_session,
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Arguments for the processing job: the InputData parameter is mounted at
# /opt/ml/processing/input, and the three folders preprocessing.py writes to are
# exposed as the named outputs "train", "validation" and "test".
processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
    ],
    code="code/preprocessing.py",
)

# Pipeline step wrapping the processing job; later steps read its outputs by name.
step_process = ProcessingStep(name="InstaCartProcess", step_args=processor_args)

### Define a Training Step to Train a Model

In [None]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# S3 prefix the training job writes model.tar.gz to
model_path = f"s3://{default_bucket}/InstaCartTrain"

# Built-in XGBoost container; the same image is reused for evaluation and for the model
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.0-1",
    py_version="py3",
    instance_type="ml.m5.xlarge",
)

# Estimator for the training step; the instance type is the TrainingInstanceType parameter
xgb_train = Estimator(
    image_uri=image_uri,
    instance_type=instance_type,
    instance_count=1,
    output_path=model_path,
    role=role,
    sagemaker_session=pipeline_session,
)
xgb_train.set_hyperparameters(
    # `reordered` is a binary 0/1 label, so train a classifier that outputs a
    # probability in [0, 1] instead of an unbounded squared-error regressor.
    # The evaluation script's MSE on these probabilities is the Brier score.
    objective="binary:logistic",
    num_round=50,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
)

# Train/validation channels are wired to the matching outputs of the processing step
train_args = xgb_train.fit(
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        ),
        "validation": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "validation"
            ].S3Output.S3Uri,
            content_type="text/csv",
        ),
    }
)

In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep


# Pipeline step wrapping the XGBoost training job defined in the previous cell.
step_train = TrainingStep(
    name="InstaCartTrain",
    step_args=train_args,
)

### Define a Model Evaluation Step to Evaluate the Trained Model

In [None]:
%%writefile code/evaluation.py
import json
import pathlib
import pickle
import tarfile

import joblib
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import mean_squared_error


if __name__ == "__main__":
    model_path = f"/opt/ml/processing/model/model.tar.gz"
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")

    model = pickle.load(open("xgboost-model", "rb"))

    test_path = "/opt/ml/processing/test/test.csv"
    df = pd.read_csv(test_path, header=None)

    y_test = df.iloc[:, 0].to_numpy()
    df.drop(df.columns[0], axis=1, inplace=True)

    X_test = xgboost.DMatrix(df.values)

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    std = np.std(y_test - predictions)
    report_dict = {
        "regression_metrics": {
            "mse": {"value": mse, "standard_deviation": std},
        },
    }

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f"{output_dir}/evaluation.json"
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

In [None]:
from sagemaker.processing import ScriptProcessor


# Processor for the evaluation step. It runs on the same XGBoost image as training
# (image_uri) — presumably so evaluation.py can import xgboost and unpickle the model.
script_eval = ScriptProcessor(
    image_uri=image_uri,
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="script-instacart-eval",
    role=role,
    sagemaker_session=pipeline_session,
)

# Inputs: the trained model artifact and the "test" output of the processing step,
# mounted at the paths evaluation.py reads. Output: the folder evaluation.json is written to.
eval_args = script_eval.run(
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[
        ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
    ],
    code="code/evaluation.py",
)

In [None]:
from sagemaker.workflow.properties import PropertyFile


# Property file pointing at evaluation.json inside the "evaluation" output, so the
# condition step can read the MSE from it with JsonGet.
evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)
# Pipeline step wrapping the evaluation job and exposing the property file.
step_eval = ProcessingStep(
    name="InstaCartEval",
    step_args=eval_args,
    property_files=[evaluation_report],
)

### Define a Create Model Step to Create a Model

In [None]:
from sagemaker.model import Model

# SageMaker Model built from the training step's artifact and the XGBoost image.
# The same object is used for both the create-model and the register steps.
model = Model(
    image_uri=image_uri,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session=pipeline_session,
    role=role,
)

In [None]:
from sagemaker.inputs import CreateModelInput
from sagemaker.workflow.model_step import ModelStep

# Pipeline step that creates the SageMaker model used by the batch-transform step.
# NOTE(review): accelerator_type requests an Elastic Inference accelerator; it is
# unclear whether that is intended or useful for an XGBoost model — confirm.
step_create_model = ModelStep(
    name="InstaCartCreateModel",
    step_args=model.create(instance_type="ml.m5.large", accelerator_type="ml.eia1.medium"),
)

### Define a Transform Step to Perform Batch Transformation

In [None]:
from sagemaker.transformer import Transformer


# Batch transformer bound to the model created by step_create_model; predictions are
# written under s3://<default_bucket>/InstaCartTransform.
transformer = Transformer(
    model_name=step_create_model.properties.ModelName,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    output_path=f"s3://{default_bucket}/InstaCartTransform",
)

In [None]:
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep


# Pipeline step that scores the BatchData CSV with the transformer above.
# NOTE(review): BatchData defaults to the uploaded production CSV as-is, while the model
# is trained on the output of preprocessing.py (scaled numerics + one-hot columns, no
# header) — confirm the batch file already has that layout.
step_transform = TransformStep(
    name="InstaCartTransform", transformer=transformer, inputs=TransformInput(data=batch_data)
)

### Define a Register Model Step to Create a Model Package

In [None]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics

# Attach the evaluation report to the model package. Outputs[0] is the "evaluation"
# output, the only output declared for the evaluation job.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
        ),
        content_type="application/json",
    )
)

# Register the model in the package group with the approval status taken from the
# ModelApprovalStatus pipeline parameter.
register_args = model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)
step_register = ModelStep(name="InstaCartRegisterModel", step_args=register_args)

### Define a Fail Step to Terminate the Pipeline Execution and Mark it as Failed

In [None]:
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.functions import Join

# Terminal step for the else-branch of the condition step: its error message is
# built from a fixed prefix joined with the MseThreshold parameter.
step_fail = FailStep(
    name="InstaCartMSEFail",
    error_message=Join(on=" ", values=["Execution failed due to MSE >", mse_threshold]),
)

### Define a Condition Step to Check the MSE and Conditionally Create a Model, Run a Batch Transformation, and Register a Model in the Model Registry, or Terminate the Execution in a Failed State

In [None]:
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet


# Condition: the MSE read from evaluation.json (key regression_metrics.mse.value)
# must be less than or equal to the MseThreshold parameter.
cond_lte = ConditionLessThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="regression_metrics.mse.value",
    ),
    right=mse_threshold,
)

# If the condition holds: register the model, create it and run the batch transform;
# otherwise run the fail step.
step_cond = ConditionStep(
    name="InstaCartMSECond",
    conditions=[cond_lte],
    if_steps=[step_register, step_create_model, step_transform],
    else_steps=[step_fail],
)

### Define a Pipeline of Parameters, Steps, and Conditions

In [None]:
from sagemaker.workflow.pipeline import Pipeline


# Assemble the pipeline. Only the top-level steps are listed; the register,
# create-model, transform and fail steps are reached through step_cond.
pipeline_name = "InstaCartPipeline"

pipeline_parameters = [
    processing_instance_count,
    instance_type,
    model_approval_status,
    input_data,
    batch_data,
    mse_threshold,
]
pipeline_steps = [step_process, step_train, step_eval, step_cond]

pipeline = Pipeline(name=pipeline_name, parameters=pipeline_parameters, steps=pipeline_steps)

### (Optional) Examining the pipeline definition

In [None]:
import json


# Parse the pipeline's JSON definition into a dict and display it for inspection.
definition = json.loads(pipeline.definition())
definition

### Submit the pipeline to SageMaker and start execution

In [None]:
# Submit the pipeline definition to SageMaker under the execution role.
pipeline.upsert(role_arn=role)

In [None]:
# Start an execution with every pipeline parameter at its default value.
execution = pipeline.start()

### Pipeline Operations: Examining and Waiting for Pipeline Execution

In [None]:
# Show the description of the execution started above.
execution.describe()

In [None]:
# Wait for the execution before inspecting its steps and outputs.
execution.wait()

In [None]:
# List the steps of the execution.
execution.list_steps()

#### Examining the Evaluation

In [None]:
from pprint import pprint


# Read evaluation.json from the S3 location of the evaluation step's only output
# (Outputs[0] is the "evaluation" output) and pretty-print the parsed report.
evaluation_json = sagemaker.s3.S3Downloader.read_file(
    "{}/evaluation.json".format(
        step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
    )
)
pprint(json.loads(evaluation_json))

#### Lineage

In [None]:
import time
from sagemaker.lineage.visualizer import LineageTableVisualizer


# Display a lineage table for every step of the execution, iterating over
# list_steps() in reverse. Note that a new Session is created here instead of
# reusing `sagemaker_session`.
viz = LineageTableVisualizer(sagemaker.session.Session())
for execution_step in reversed(execution.list_steps()):
    print(execution_step)
    display(viz.show(pipeline_execution_step=execution_step))
    time.sleep(5)  # pause between lookups (presumably to avoid API throttling)

#### Parametrized Executions

In [None]:
# Start another execution, overriding only the ModelApprovalStatus parameter so the
# model package is registered as "Approved" instead of the default.
execution = pipeline.start(
    parameters=dict(
        ModelApprovalStatus="Approved",
    )
)

In [None]:
# Wait for the "Approved" execution before listing its steps.
execution.wait()

In [None]:
# List the steps of the "Approved" execution.
execution.list_steps()

In [None]:
# Start an execution with a stricter MseThreshold (3.0 instead of the default 6.0).
# NOTE(review): with a 0/1 label the MSE presumably stays far below 3.0, so this run
# may not reach the fail step as the following cell seems to expect — confirm.
execution = pipeline.start(parameters=dict(MseThreshold=3.0))

In [None]:
# Wait for the stricter-threshold execution; any exception raised while waiting is
# printed instead of propagated so the notebook keeps running.
try:
    execution.wait()
except Exception as error:
    print(error)

In [None]:
# List the steps of the stricter-threshold execution to see which branch ran.
execution.list_steps()

We thank you for taking the time to review our project.