In [51]:
import pandas as pd
import pyodbc
import duckdb

import os
import sys

In [52]:
# Get the current working directory
current_dir = os.getcwd()
# Move up one level from the current directory
parent_dir = os.path.dirname(current_dir)
# Change directory into data directory
data_dir = os.path.join(parent_dir, 'data')

In [53]:
sys.path.append(parent_dir)

In [54]:
import config as cfg

# 1. Extract
Extract table from SQL server

In [55]:
def extract_table(table_name):
    # Step 1: Define the connection string
    conn_str = (
        'DRIVER={SQL Server};'
        F'SERVER={cfg.SERVER_NAME};'
        F'DATABASE={cfg.DATABASE_NAME};'
        'Trusted_Connection=yes;'
    )
    
    # Step 2: Create the connection
    connection = pyodbc.connect(conn_str)
    
    # Step 3: Execute a SQL query and fetch the results
    query = F"SELECT * FROM {table_name}" 

    df = pd.read_sql(query, connection)

    connection.close()
    return df

In [56]:
df_dim_date = extract_table("dbo.dim_date")
df_inventory = extract_table("original.inventory")
df_orders = extract_table("original.orders")
df_fulfillment = extract_table("original.fulfillment")

  df = pd.read_sql(query, connection)


In [57]:
df_orders.head()

Unnamed: 0,order_id,order_item_id,order_year_month,order_year,order_month,order_day,order_time,order_quantity,product_department,product_category,...,customer_country,warehouse_country,shipment_year,shipment_month,shipment_day,shipment_mode,shipment_days_scheduled,gross_sales,discount_percent,profit
0,29515,73863,202208,2022,8,29,21:55,1,Apparel,Men's Footwear,...,China,USA,2022,8,28,Standard Class,4,130,0.25,65
1,29515,73861,202208,2022,8,31,12:06,1,Apparel,Men's Footwear,...,China,USA,2022,9,1,Standard Class,4,130,-,65
2,41326,103166,202210,2022,10,16,7:08,1,Apparel,Men's Footwear,...,Turkey,USA,2022,10,12,Standard Class,4,130,0.04,65
3,43343,108279,202210,2022,10,18,23:57,1,Apparel,Men's Footwear,...,Iran,USA,2022,10,20,Standard Class,4,130,0.09,65
4,45008,112388,202211,2022,11,25,20:02,1,Apparel,Men's Footwear,...,Kazakhstan,USA,2022,11,5,Standard Class,4,130,0.12,65


In [58]:
df_inventory.head()

Unnamed: 0,product_name,year_month,warehouse_inventory,inventory_cost_per_unit
0,Perfect Fitness Perfect Rip Deck,202312,0,0.69517
1,Nike Men's Dri-FIT Victory Golf Polo,202312,2,1.29291
2,O'Brien Men's Neoprene Life Vest,202312,0,0.56531
3,Nike Men's Free 5.0+ Running Shoe,202312,1,1.26321
4,Under Armour Girls' Toddler Spine Surge Runni,202312,0,1.47648


In [59]:
df_fulfillment.head()

Unnamed: 0,product_name,warehouse_order_fulfillment_days
0,Perfect Fitness Perfect Rip Deck,8.3
1,Nike Men's Dri-FIT Victory Golf Polo,6.6
2,O'Brien Men's Neoprene Life Vest,5.5
3,Nike Men's Free 5.0+ Running Shoe,9.4
4,Under Armour Girls' Toddler Spine Surge Runni,6.3


# 2. Transform 
Modelling with DuckDB

In [60]:
# Step 1: Create a DuckDB in-memory connection
con = duckdb.connect()

In [61]:
con.unregister('df_orders')

<duckdb.duckdb.DuckDBPyConnection at 0x14e7dc399b0>

In [62]:
# Step 2: Register the DataFrames with DuckDB
con.register('df_fulfillment', df_fulfillment)
con.register('df_orders', df_orders)
con.register('df_inventory', df_inventory)

<duckdb.duckdb.DuckDBPyConnection at 0x14e7dc399b0>

In [63]:
# Step 3: Execute the SQL query using DuckDB
dim_product_query = """
WITH cte_fulfillment AS (
    SELECT * 
    FROM df_fulfillment
),

cte_product AS (
    SELECT DISTINCT 
        product_name, 
        product_category,
        product_department
    FROM df_orders
)
SELECT 
    cte_fulfillment.product_name,
    CASE WHEN product_category IS NULL THEN 'Unknown' ELSE product_category END AS product_category,
    CASE WHEN product_department IS NULL THEN 'Unknown' ELSE product_department END AS product_department,
    warehouse_order_fulfillment_days
FROM cte_fulfillment
LEFT JOIN cte_product
ON cte_fulfillment.product_name = cte_product.product_name
-- WHERE product_category IS NULL
ORDER BY 3,2
;
"""

In [64]:
# Step 4: Execute the query and fetch the result into a DataFrame
df_dim_product = con.execute(dim_product_query).df()

In [65]:
df_dim_product.head()

Unnamed: 0,product_name,product_category,product_department,warehouse_order_fulfillment_days
0,Baby sweater,Baby,Apparel,7.9
1,Children's heaters,Children's Clothing,Apparel,2.7
2,Perfect Fitness Perfect Rip Deck,Cleats,Apparel,8.3
3,Total Gym 1400,Cleats,Apparel,1.3
4,Porcelain crafts,Crafts,Apparel,7.1


In [66]:
# Similar with dim_customer_query
dim_customer_query = """
SELECT distinct
    customer_country, 
    customer_market,
    customer_region
FROM df_orders
;
"""

In [67]:
df_dim_customer = con.execute(dim_customer_query).df()

In [68]:
df_dim_customer.head()

Unnamed: 0,customer_country,customer_market,customer_region
0,China,Pacific Asia,Eastern Asia
1,Iran,Pacific Asia,South Asia
2,Spain,Europe,Southern Europe
3,Germany,Europe,Western Europe
4,Russia,Europe,Eastern Europe


In [69]:
# similar to dim_shipment
dim_shipment_query = """
SELECT distinct 
    shipment_mode, 
    shipment_days_scheduled
FROM df_orders
ORDER BY 2;
"""

In [70]:
df_dim_shipment = con.execute(dim_shipment_query).df()

In [71]:
df_dim_shipment

Unnamed: 0,shipment_mode,shipment_days_scheduled
0,First Class,1
1,Second Class,2
2,Same Day,3
3,Standard Class,4


In [96]:
# similar with fact_sales
fact_orders_query = """
SELECT 
    order_id,
    CAST(CONCAT(order_year, '-', order_month, '-', order_day) AS DATE) AS order_date,
    order_time,
    order_quantity,
    product_name, 
    customer_country,
    CAST(CONCAT(shipment_year, '-', shipment_month, '-', shipment_day) AS DATE) AS shipment_date,
    shipment_mode,
    gross_sales,
    CAST(
        CASE 
            WHEN discount_percent = '  -  ' THEN '0'
            ELSE discount_percent
        END AS FLOAT
    ) AS discount_percent,
    profit
FROM 
    df_orders
"""


In [97]:
df_fact_orders = con.execute(fact_orders_query).df()

In [98]:
df_fact_orders.head()

Unnamed: 0,order_id,order_date,order_time,order_quantity,product_name,customer_country,shipment_date,shipment_mode,gross_sales,discount_percent,profit
0,29515,2022-08-29,21:55,1,Nike Men's CJ Elite 2 TD Football Cleat,China,2022-08-28,Standard Class,130,0.25,65
1,29515,2022-08-31,12:06,1,Nike Men's CJ Elite 2 TD Football Cleat,China,2022-09-01,Standard Class,130,0.0,65
2,41326,2022-10-16,7:08,1,Nike Men's CJ Elite 2 TD Football Cleat,Turkey,2022-10-12,Standard Class,130,0.04,65
3,43343,2022-10-18,23:57,1,Nike Men's CJ Elite 2 TD Football Cleat,Iran,2022-10-20,Standard Class,130,0.09,65
4,45008,2022-11-25,20:02,1,Nike Men's CJ Elite 2 TD Football Cleat,Kazakhstan,2022-11-05,Standard Class,130,0.12,65


In [118]:
df_fact_orders.dtypes

order_id                     int64
order_date                  object
order_time                  object
order_quantity               int64
product_name                object
customer_country            object
shipment_date       datetime64[us]
shipment_mode               object
gross_sales                  int64
discount_percent           float32
profit                       int64
dtype: object

In [117]:
df_fact_orders['order_date'] = pd.to_datetime(df_fact_orders['order_date']).dt.date 

In [100]:
df_dim_shipment.dtypes

shipment_mode              object
shipment_days_scheduled     int64
dtype: object

In [101]:
df_dim_customer.dtypes

customer_country    object
customer_market     object
customer_region     object
dtype: object

In [102]:
df_dim_date.dtypes

date                      object
year                      object
quarter                   object
month                     object
month_name                object
month_abbreviation        object
year_month                object
day                       object
day_of_week               object
day_name                  object
day_abbreviation          object
week                      object
is_weekend                  bool
fiscal_year               object
fiscal_quarter            object
fiscal_month              object
is_last_day_of_month        bool
is_last_day_of_quarter      bool
is_last_day_of_year         bool
dtype: object

# 3. Load 
Load data back to SQL server

In [120]:
def get_column_types(df):
    # Map Pandas data types to SQL Server data types
    type_map = {
        'object': 'NVARCHAR(MAX)',
        'int64': 'BIGINT',
        'float64': 'FLOAT',
        'datetime64[ns]': 'DATE',  # Change this to 'DATE'
        'bool': 'BIT'
    }
    return [type_map.get(str(dt), 'NVARCHAR(MAX)') for dt in df.dtypes]


In [119]:
def load_table(df, table_name, schema_name):
    # Define the connection string
    conn_str = (
        f'DRIVER={{SQL Server}};'
        f'SERVER={cfg.SERVER_NAME};'
        f'DATABASE={cfg.DATABASE_NAME};'
        'Trusted_Connection=yes;'
    )

    # Create a pyodbc connection using the connection string
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Drop the table if it already exists
    cursor.execute(f"IF OBJECT_ID('{schema_name}.{table_name}', 'U') IS NOT NULL DROP TABLE {schema_name}.{table_name}")
    conn.commit()

    # Create the table schema
    columns = ', '.join([f'{col} {dtype}' for col, dtype in zip(df.columns, get_column_types(df))])
    create_table_sql = f"CREATE TABLE {schema_name}.{table_name} ({columns})"
    cursor.execute(create_table_sql)
    conn.commit()

    # Insert the data into the table
    insert_sql = f"INSERT INTO {schema_name}.{table_name} VALUES ({','.join(['?'] * len(df.columns))})"
    for _, row in df.iterrows():
        cursor.execute(insert_sql, *row.tolist())
    conn.commit()

    cursor.close()
    conn.close()
    print(f"DataFrame loaded successfully into {schema_name}.{table_name}")

In [121]:
df_fact_orders.head()

Unnamed: 0,order_id,order_date,order_time,order_quantity,product_name,customer_country,shipment_date,shipment_mode,gross_sales,discount_percent,profit
0,29515,2022-08-29,21:55,1,Nike Men's CJ Elite 2 TD Football Cleat,China,2022-08-28,Standard Class,130,0.25,65
1,29515,2022-08-31,12:06,1,Nike Men's CJ Elite 2 TD Football Cleat,China,2022-09-01,Standard Class,130,0.0,65
2,41326,2022-10-16,7:08,1,Nike Men's CJ Elite 2 TD Football Cleat,Turkey,2022-10-12,Standard Class,130,0.04,65
3,43343,2022-10-18,23:57,1,Nike Men's CJ Elite 2 TD Football Cleat,Iran,2022-10-20,Standard Class,130,0.09,65
4,45008,2022-11-25,20:02,1,Nike Men's CJ Elite 2 TD Football Cleat,Kazakhstan,2022-11-05,Standard Class,130,0.12,65


In [116]:
load_table(df_fact_orders, 'fact_orders', 'dbo')

DataFrame loaded successfully into dbo.fact_orders


In [105]:
load_table(df_dim_customer, 'dim_customer', 'dbo')

DataFrame loaded successfully into dbo.dim_customer


In [106]:
load_table(df_dim_shipment, 'dim_shipment', 'dbo')

DataFrame loaded successfully into dbo.dim_shipment
