In [2]:
# Necessary imports
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Generate Synthetic data

To illustrate how the STAR schema works, weâ€™ll generate a simulated dataset representing customer orders in an online store. This data will populate our fact and dimension tables.

In [6]:
def generate_customer_data(n_customers=1000):
    """
    Generates a simulated dataset representing customer DIMENSION table.
    """
    np.random.seed(42)
    customer_ids = np.arange(1, n_customers + 1)
    first_names = np.random.choice(['Thato', 'Jane', 'Alice', 'Bob'], size=n_customers)
    last_names = np.random.choice(['Smith', 'Mkhize', 'Brown', 'Johnson'], size=n_customers)
    locations = np.random.choice(['South Africa', 'Canada', 'UK', 'Germany'], size=n_customers)
    membership_levels = np.random.choice(['Standard', 'Premium'], size=n_customers)
    customers = pd.DataFrame({
        'customer_id': customer_ids,
        'first_name': first_names,
        'last_name': last_names,
        'location': locations,
        'membership_level': membership_levels
    })
    return customers

# call the function
customers_df = generate_customer_data()
# view the data
customers_df.head()

Unnamed: 0,customer_id,first_name,last_name,location,membership_level
0,1,Alice,Mkhize,UK,Premium
1,2,Bob,Brown,Germany,Premium
2,3,Thato,Smith,Canada,Premium
3,4,Alice,Smith,Germany,Premium
4,5,Alice,Smith,Germany,Standard


In [3]:
def generate_product_data(n_products=500):
    """
    Generates a simulated dataset representing products DIMENSION table.
    """
    product_ids = np.arange(1, n_products + 1)
    product_names = np.random.choice(['Laptop', 'Phone', 'Tablet', 'Headphones'], size=n_products)
    categories = np.random.choice(['Electronics', 'Accessories'], size=n_products)
    prices = np.random.uniform(50, 1000, size=n_products)
    products = pd.DataFrame({
        'product_id': product_ids,
        'product_name': product_names,
        'category': categories,
        'price': prices
    })
    return products

products_df = generate_product_data()
products_df.head()

Unnamed: 0,product_id,product_name,category,price
0,1,Headphones,Electronics,836.142976
1,2,Tablet,Electronics,776.301405
2,3,Tablet,Electronics,594.852504
3,4,Tablet,Electronics,958.244786
4,5,Headphones,Accessories,240.45079


In [4]:
def generate_dates_data(start_date='2023-01-01', end_date='2024-02-21'):
    """
    Generates a simulated dataset representing dates DIMENSION table.
    """
    # Create a date range
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    
    # Create a DataFrame with date parts
    dates_df = pd.DataFrame({
        'order_date': date_range,
        'year': date_range.year,
        'month': date_range.month,
        'day': date_range.day,
        'week': date_range.isocalendar().week,
        'quarter': date_range.quarter
    })
    
    return dates_df

# Generate the Dates dimension table
dates_df = generate_dates_data()
dates_df.head()

Unnamed: 0,order_date,year,month,day,week,quarter
2023-01-01,2023-01-01,2023,1,1,52,1
2023-01-02,2023-01-02,2023,1,2,1,1
2023-01-03,2023-01-03,2023,1,3,1,1
2023-01-04,2023-01-04,2023,1,4,1,1
2023-01-05,2023-01-05,2023,1,5,1,1


In [8]:
def generate_order_data(n_orders=10000):
    """
    Generates a simulated dataset representing orders FACT table.
    """
    order_ids = np.arange(1, n_orders + 1)
    customer_ids = np.random.randint(1, 1000, size=n_orders)
    product_ids = np.random.randint(1, 500, size=n_orders)
    order_dates = pd.date_range('2023-01-01', periods=n_orders, freq='H')
    quantities = np.random.randint(1, 5, size=n_orders)
    total_prices = quantities * np.random.uniform(50, 1000, size=n_orders)
    orders = pd.DataFrame({
        'order_id': order_ids,
        'customer_id': customer_ids,
        'product_id': product_ids,
        'order_date': order_dates,
        'quantity': quantities,
        'total_price': total_prices
    })
    return orders

orders_df = generate_order_data()
orders_df.head()

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,total_price
0,1,220,362,2023-01-01 00:00:00,4,2235.039376
1,2,903,381,2023-01-01 01:00:00,4,1387.584631
2,3,163,92,2023-01-01 02:00:00,2,528.248399
3,4,951,279,2023-01-01 03:00:00,4,2122.080597
4,5,220,447,2023-01-01 04:00:00,1,625.784874


### Example: Querying the STAR Schema

Now that our schema is in place assume these 4 tables (orders, customers, products, dates) have been created and stored in a SQL database with the same schema as the above dataframes generated for each respective table. With this setup, we can run SQL queries to gain valuable business insights from the data.

##### Example 1: Total Sales by Product Category

In [None]:
SELECT
  p.category,
  SUM(o.total_price) AS total_sales
FROM
  orders o
JOIN
  products p
ON
  o.product_id = p.product_id
GROUP BY
  p.category
ORDER BY
  total_sales DESC;

##### Example 2: Average Order Value by Customer Membership Level

In [None]:
SELECT
  c.membership_level,
  AVG(o.total_price) AS avg_order_value
FROM
  orders o
JOIN
  customers c
ON
  o.customer_id = c.customer_id
GROUP BY
  c.membership_level
ORDER BY
  avg_order_value DESC;