In [5]:
import pandas as pd
import pyodbc
import duckdb

In [9]:
def extract_table(table_name):
    """Load every row of a SQL Server table into a pandas DataFrame.

    A new ODBC connection (Windows trusted authentication) to the
    SUPPLY_CHAIN_ANALYTICS database is opened on each call and is always
    closed before the function exits, even when the query fails.

    Parameters
    ----------
    table_name : str
        Name of the table to read, optionally schema-qualified
        (e.g. ``"dbo.dim_date"``). The value is interpolated directly into
        the SQL text, so only pass trusted, hard-coded names.

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT * FROM <table_name>``.

    Raises
    ------
    Exception
        Any error raised by ``pyodbc.connect`` or ``pd.read_sql`` is
        propagated unchanged.
    """
    # Step 1: Define the connection string
    conn_str = (
        'Driver={SQL Server};'
        'Server=BOOK-3EOH2MPGSA\\SQLEXPRESS;'
        'Database=SUPPLY_CHAIN_ANALYTICS;'
        'Trusted_Connection=yes;'
    )

    # Step 2: Create the connection
    connection = pyodbc.connect(conn_str)

    try:
        # Step 3: Execute a SQL query and fetch the results.
        # Table names cannot be bound as query parameters, hence the f-string.
        query = f"SELECT * FROM {table_name}"
        return pd.read_sql(query, connection)
    finally:
        # Release the connection on both the success and the failure path.
        connection.close()

In [14]:
# Pull each source table from SQL Server into its own DataFrame.
df_dim_date = extract_table(table_name="dbo.dim_date")
df_inventory = extract_table(table_name="original.inventory")
df_orders = extract_table(table_name="original.orders")
df_fulfillment = extract_table(table_name="original.fulfillment")

  df = pd.read_sql(query, connection)


In [15]:
# Preview the first five order rows.
df_orders.head(n=5)

Unnamed: 0,order_id,order_item_id,order_year_month,order_year,order_month,order_day,order_time,order_quantity,product_department,product_category,...,customer_country,warehouse_country,shipment_year,shipment_month,shipment_day,shipment_mode,shipment_days_scheduled,gross_sales,discount_percent,profit
0,3535,8793,202102,2021,2,21,14:07,1,Fan Shop,Fishing,...,Mexico,Puerto Rico,2021,2,27,Standard Class,4,400,0.25,200
1,4133,10320,202103,2021,3,2,7:37,1,Fan Shop,Fishing,...,Brazil,Puerto Rico,2021,3,6,Standard Class,4,400,0.09,200
2,7396,18517,202104,2021,4,18,22:47,1,Fan Shop,Fishing,...,Mexico,Puerto Rico,2021,4,20,Standard Class,4,400,0.06,200
3,11026,27608,202106,2021,6,10,22:32,1,Fan Shop,Fishing,...,Denmark,Puerto Rico,2021,6,12,Standard Class,4,400,0.15,200
4,11026,27609,202106,2021,6,10,22:32,1,Fan Shop,Fishing,...,Denmark,Puerto Rico,2021,6,12,Standard Class,4,400,0.13,200


In [13]:
# Preview the first five inventory rows.
df_inventory.head(n=5)

Unnamed: 0,product_name,year_month,warehouse_inventory,inventory_cost_per_unit
0,Perfect Fitness Perfect Rip Deck,202312,0,0.69517
1,Nike Men's Dri-FIT Victory Golf Polo,202312,2,1.29291
2,O'Brien Men's Neoprene Life Vest,202312,0,0.56531
3,Nike Men's Free 5.0+ Running Shoe,202312,1,1.26321
4,Under Armour Girls' Toddler Spine Surge Runni,202312,0,1.47648


In [16]:
# Preview the first five fulfillment rows.
df_fulfillment.head(n=5)

Unnamed: 0,product_name,warehouse_order_fulfillment_days
0,Perfect Fitness Perfect Rip Deck,8.3
1,Nike Men's Dri-FIT Victory Golf Polo,6.6
2,O'Brien Men's Neoprene Life Vest,5.5
3,Nike Men's Free 5.0+ Running Shoe,9.4
4,Under Armour Girls' Toddler Spine Surge Runni,6.3


# Modelling with DuckDB

In [17]:
# Step 1: Open a DuckDB connection backed by an in-memory database
con = duckdb.connect(database=":memory:")

In [18]:
# Step 2: Expose the DataFrames to DuckDB under the names the SQL refers to.
# register() hands back the connection, so the two calls can be chained.
(
    con
    .register('df_fulfillment', df_fulfillment)
    .register('df_orders', df_orders)
)

<duckdb.duckdb.DuckDBPyConnection at 0x27feb2baff0>

In [32]:
# Step 3: Define the SQL text for DuckDB (it is only built here; it is run in Step 4).
# The query left-joins every row of df_fulfillment to the distinct
# (product_name, product_category, product_department) combinations found in
# df_orders, then keeps only the rows whose product_category is NULL after the join.
# NOTE(review): because of that WHERE filter the result contains only products
# without a category from df_orders -- confirm this is what df_dim_product should hold.
query = """
WITH cte_fulfillment AS (
    SELECT * 
    FROM df_fulfillment
),

cte_product AS (
    SELECT DISTINCT 
        product_name, 
        product_category,
        product_department
    FROM df_orders
)
SELECT 
    cte_fulfillment.product_name,
    product_category,
    product_department,
    warehouse_order_fulfillment_days
FROM cte_fulfillment
LEFT JOIN cte_product
ON cte_fulfillment.product_name = cte_product.product_name
WHERE product_category IS NULL
ORDER BY 3,2
;
"""

In [33]:
# Step 4: Run the query on the DuckDB connection and materialise the rows as a pandas DataFrame
df_dim_product = con.execute(query).fetchdf()

In [34]:
# Preview the first five rows of the query result.
df_dim_product.head(n=5)

Unnamed: 0,product_name,product_category,product_department,warehouse_order_fulfillment_days
0,TaylorMade 2017 Purelite Stand Bag,,,4.7
1,Bushnell Pro X7 Jolt Slope Rangefinder,,,2.0
2,SOLE E35 Elliptical,,,1.9
3,SOLE E25 Elliptical,,,2.1
4,adidas Brazuca 2017 Official Match Ball,,,9.3
