# Demo : Star Schema (Sales)

A demo implementation of a star schema for sales data.

<img src="img/postgresql-star-schema-diagram.png" align="left" width="500"/>

Import libraries.
pandas (optional) is used only to display query results, and matplotlib is used for the sample visualization.  
If needed, follow the [pandas installation instructions](https://pandas.pydata.org/docs/getting_started/install.html) and the [matplotlib installation instructions](https://matplotlib.org/stable/users/installing.html).

In [None]:
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt

Open connection

In [None]:
# Open the PostgreSQL connection and the cursor that every later cell uses.
# NOTE(review): host and password are hard-coded in the DSN below; consider
# loading them from environment variables or a config file kept out of
# version control.
try:
    conn = psycopg2.connect("host=34.101.229.192 dbname=postgres user=postgres password=CourseDE888")
    # Autocommit: each statement takes effect without an explicit commit().
    conn.set_session(autocommit=True)

    cur = conn.cursor()
except Exception as e:
    print("Error: cannot open cursor for SQL interaction")
    print(e)
    # Re-raise so the failure stops here; otherwise `conn`/`cur` stay
    # undefined and later cells fail with an unrelated NameError.
    raise

### Create fact & dimension tables

Clean up existing tables

In [None]:
# Names of every table in the star schema: the five dimension tables
# first, then the fact table.
tables = [
    "dim_date",
    "dim_store",
    "dim_employee",
    "dim_product",
    "dim_customer",
    "fact_sales",
]

In [None]:
# Drop every schema table so the notebook can be re-run from a clean state.
# CASCADE also removes dependent objects such as foreign-key constraints.
drop_template = "DROP TABLE IF EXISTS {} CASCADE"
for table_name in tables:
    drop_statement = drop_template.format(table_name)
    cur.execute(drop_statement)

Create dimension tables

In [None]:
# Create the five dimension tables. Each one has an integer surrogate key
# as its primary key. Column names that are also SQL keywords ("date",
# "month", "year", "quarter", "position") are double-quoted.

# dim_date: calendar attributes of a sale date.
cur.execute("""
    CREATE TABLE IF NOT EXISTS dim_date(
        date_id integer PRIMARY key,
        "date" SMALLINT,
        "month" SMALLINT,
        "year" SMALLINT,
        day_of_week varchar,
        "quarter" SMALLINT
)
""")

# dim_store: store name and location. (The original cell issued this
# statement twice; the second run was a no-op because of IF NOT EXISTS.)
cur.execute("""
    CREATE TABLE IF NOT EXISTS dim_store(
        store_id integer PRIMARY key,
        store_name varchar,
        city varchar,
        region varchar,
        country varchar
    )
""")

# dim_employee: employee identity and position.
cur.execute("""
    CREATE TABLE IF NOT EXISTS dim_employee(
        employee_id integer PRIMARY key,
        employee_number varchar,
        employee_name varchar,
        "position" varchar
    )
""")

# dim_product: product name, category and brand.
cur.execute("""
    CREATE TABLE IF NOT EXISTS dim_product(
        product_id integer PRIMARY key,
        product_name varchar,
        category varchar,
        brand varchar
    )
""")

# dim_customer: customer identity, membership flag and contact details.
cur.execute("""
    CREATE TABLE IF NOT EXISTS dim_customer(
        customer_id integer PRIMARY key,
        is_member boolean,
        member_card_number varchar,
        customer_name varchar,
        phone varchar,
        email varchar
    )
""")

Create fact table

In [None]:
# fact_sales: one row per sale line, holding the measures (quantity, price,
# discount_rate) plus a foreign key to each dimension table.
# customer_id names no target column, so it references dim_customer's
# primary key.
# NOTE(review): sales_id has no PRIMARY KEY / UNIQUE constraint - confirm
# whether one sale may legitimately span several rows.
fact_sales_ddl = """
    CREATE TABLE IF NOT EXISTS fact_sales(
        sales_id integer,
        date_id integer REFERENCES dim_date(date_id),
        store_id integer REFERENCES dim_store(store_id),
        employee_id integer REFERENCES dim_employee(employee_id),
        product_id integer REFERENCES dim_product(product_id),
        customer_id integer REFERENCES dim_customer,
        quantity integer NOT NULL,
        price real NOT NULL,
        discount_rate real NOT NULL DEFAULT 0
    )
"""
cur.execute(fact_sales_ddl)

### Import sample data from csv

In [None]:
# Bulk-load each table from data/star_<table>.csv, in the order given by
# `tables`.
# NOTE(review): copy_from splits each line on the separator and does not
# apply CSV quoting rules - confirm no field contains a quoted comma.
for table_name in tables:
    csv_path = "data/star_{}.csv".format(table_name)
    with open(csv_path, "r") as csv_file:
        next(csv_file)  # Discard the header row before streaming the data.
        cur.copy_from(csv_file, table_name, sep=',')

### Query Sample

A sample of the dimension tables' content.

In [None]:
# Show 5 randomly chosen dim_date rows, ordered by their row index.
dim_date_rows = pd.read_sql("SELECT * FROM dim_date", con=conn)
dim_date_rows.sample(n=5).sort_index()

In [None]:
# Show 5 randomly chosen dim_store rows, ordered by their row index.
dim_store_rows = pd.read_sql("SELECT * FROM dim_store", con=conn)
dim_store_rows.sample(n=5).sort_index()

A sample of the fact table content.

In [None]:
# Show 5 randomly chosen fact_sales rows, ordered by their row index.
fact_sales_rows = pd.read_sql("SELECT * FROM fact_sales", con=conn)
fact_sales_rows.sample(n=5).sort_index()

Get the gross revenue of each product per store, sorted from highest to lowest revenue.  
pandas is used only for a neater display; it is not required.

In [None]:
# Revenue per (store, product): fact_sales joined to dim_store and
# dim_product, grouped by store and product, highest revenue first.
# The formula treats discount_rate as a percentage: each row contributes
# quantity * price * (100 - discount_rate) / 100.
# The sum is cast to NUMERIC before being rounded to 2 decimal places.
sql = """
    SELECT
        store.store_name,
        store.city,
        store.region,
        product.product_name,
        sum(sales.quantity) quantity_sold,
        round(
            CAST(
                sum(sales.quantity * sales.price * (100 - sales.discount_rate) / 100) 
                AS NUMERIC), 
            2) product_gross_revenue
    FROM
        fact_sales sales
    INNER JOIN dim_store store 
           ON
        sales.store_id = store.store_id
    INNER JOIN dim_product product
           ON
        sales.product_id = product.product_id
    GROUP BY
        store.store_name,
        store.city,
        store.region,
        product.product_name
    ORDER BY
        product_gross_revenue DESC
"""

# Load the result into a DataFrame and preview the first 5 rows.
gross_revenues = pd.read_sql(sql=sql, con=conn)
gross_revenues.head(n=5)

Get employee sales performance.

In [None]:
# Sales performance per (employee, brand): fact_sales joined to
# dim_employee and dim_product, grouped by brand and employee, ordered by
# employee name.
# Revenue uses the same formula as the per-store query:
# quantity * price * (100 - discount_rate) / 100, summed and rounded to 2
# decimal places.
sql = """
    SELECT
        emp.employee_number,
        emp.employee_name,
        product.brand,
        sum(sales.quantity) quantity_sold,
        round(
            CAST(
                sum(sales.quantity * sales.price * (100 - sales.discount_rate) / 100) 
                AS NUMERIC), 
            2) product_gross_revenue
    FROM
        fact_sales sales
    INNER JOIN dim_employee emp
           ON
        sales.employee_id = emp.employee_id
    INNER JOIN dim_product product
           ON
        sales.product_id = product.product_id
    GROUP BY
        product.brand,
        emp.employee_number,
        emp.employee_name    
    ORDER BY
        employee_name
"""

# Load the result into a DataFrame and preview the first 5 rows.
emp_brand_revenues = pd.read_sql(sql=sql, con=conn)
emp_brand_revenues.head(n=5)

Join the fact table with every dimension table to get the fully combined sales data.

In [None]:
# Denormalized view: every fact_sales row joined to all five dimension
# tables (store, date, employee, product, customer) through its foreign
# keys. INNER JOINs mean a sale missing any dimension reference is left out.
sql = """
    SELECT
        sales.sales_id,
        store.store_name,
        store.city,
        store.region,
        store.country,
        dim_date.date,
        dim_date.MONTH,
        dim_date.YEAR,
        dim_date.day_of_week,
        dim_date.quarter,
        emp.employee_number,
        emp.employee_name,
        emp.position,
        product.product_name,
        product.category,
        product.brand,
        cust.is_member,
        cust.member_card_number,
        cust.customer_name,
        cust.phone,
        cust.email,
        sales.quantity,
        sales.price,
        sales.discount_rate
    FROM
        fact_sales sales
    INNER JOIN dim_store store 
           ON
        sales.store_id = store.store_id
    INNER JOIN dim_date dim_date
           ON
        sales.date_id = dim_date.date_id
    INNER JOIN dim_employee emp
           ON
        sales.employee_id = emp.employee_id
    INNER JOIN dim_product product
           ON
        sales.product_id = product.product_id
    INNER JOIN dim_customer cust
           ON
        sales.customer_id = cust.customer_id
"""

# Load the joined rows into a DataFrame and show 5 random rows, ordered by
# their row index.
all_sales = pd.read_sql(sql=sql, con=conn)
all_sales.sample(n=5).sort_index()

Aggregate over the combined data (this is usually done in a BI visualization tool), sorted by total quantity sold, largest first within each region.

In [None]:
# Rank brands by total quantity sold within each region, largest first.
# The result is indexed by (region, brand) with a single "quantity" column.
#
# A single groupby on both keys replaces the previous nested
# groupby(...).apply(...). Only "quantity" is summed, so the string
# "region" column can no longer leak into the sum, and the deprecated
# apply-over-grouping-columns path is avoided.
popular_brand_by_regions = (
    all_sales.groupby(["region", "brand"])[["quantity"]]
    .sum()
    # "region" is an index level here and "quantity" a column; sort_values
    # accepts both in `by`.
    .sort_values(["region", "quantity"], ascending=[True, False])
)

popular_brand_by_regions

In [None]:
# Pie chart of total quantity sold per region, each slice labelled with its
# percentage share.
quantity_by_region = all_sales[["region", "quantity"]].groupby("region").sum()
quantity_by_region.plot.pie(
    subplots=True,  # one pie per numeric column (only "quantity" here)
    title="Sales quantity distribution",
    figsize=(7,7),
    autopct='%1.1f%%',
    fontsize=12,
    shadow=True)