## Week 1 - Data with Danny

`DuckDB` / `SQL` only

In [None]:
#| echo: false

from pathlib import Path

import duckdb
from typing import Union, Optional

In [None]:
#| echo: false

from loguru import logger
from IPython.display import Markdown, display

### Database setup

In [None]:
#| echo: false

# Folders that sit next to the notebook's directory (one level above the working directory).
SQL_DIR = Path.cwd().parent.joinpath("sql")
DATA_DIR = Path.cwd().parent.joinpath("data")

# SQL script handed to create_database(), and the optional on-disk DuckDB file.
TABLES_SQL = SQL_DIR.joinpath("week1.sql")
DUCK_DB = DATA_DIR.joinpath("week1.ddb")

In [None]:
#| echo: false

def _load_sql_text(create_table_sql: Union[str, Path]) -> str:
    """Return the SQL to run: the file's contents if the argument names a file, else the argument itself."""
    try:
        is_sql_file = Path(create_table_sql).is_file()
    except (OSError, ValueError):
        # Inline SQL is usually not a valid path at all (e.g. longer than the
        # OS file-name limit), so a failed probe simply means "not a file".
        is_sql_file = False

    if is_sql_file:
        logger.info(f"Create table SQL file: {create_table_sql}")
        return Path(create_table_sql).read_text(encoding="utf-8")
    if isinstance(create_table_sql, Path):
        # A Path is never SQL text; fail clearly instead of passing it to DuckDB.
        raise FileNotFoundError(f"SQL file not found: {create_table_sql}")
    logger.info(f"Create table SQL: {create_table_sql}")
    return create_table_sql


def create_database(create_table_sql: Union[str, Path], duck_db: Optional[Path] = None) -> duckdb.DuckDBPyConnection:
    """
    Create a DuckDB connection and run the given set-up SQL against it.

    Connects to (or creates) a persistent database file when ``duck_db`` is given,
    otherwise opens an in-memory database. The set-up SQL is read from a file when
    ``create_table_sql`` names an existing file; any other string is run as SQL text.

    Args:
        create_table_sql (Union[str, Path]): SQL commands as a string, or the path to an SQL
            file, that initialise the database schema.
        duck_db (Optional[Path]): Path to the database file. If None, an in-memory database is created.

    Returns:
        duckdb.DuckDBPyConnection: An open connection on which the set-up SQL has been run.

    Raises:
        FileNotFoundError: If ``create_table_sql`` is a ``Path`` that does not point to a file.
    """
    if duck_db:
        db = duckdb.connect(str(duck_db))
        logger.info(f"DuckDB: persisted as {duck_db}")
    else:
        db = duckdb.connect()  # no path -> in-memory database
        logger.info("DuckDB: in-memory")

    try:
        db.query(_load_sql_text(create_table_sql))
    except Exception:
        # The caller never receives the connection on failure, so release it here.
        db.close()
        raise
    return db

In [None]:
# Build the week-1 tables in an in-memory database (nothing is written to disk).
db = create_database(TABLES_SQL)

# To persist the database to DUCK_DB instead, use:
#db = create_database(create_table_sql=TABLES_SQL, duck_db=DUCK_DB)

In [None]:
assert db   # sanity check that create_database() returned a connection object

In [None]:
# Show the DuckDB version information for this connection.
db.query("PRAGMA version;")

### Schema ERD

```mermaid
erDiagram
    SALES {
        varchar customer_id "Customer ID"
        date order_date "Order date"
        integer product_id "Product ID"
    }

    MENU {
        integer product_id PK "Unique product ID"
        varchar product_name "Name of the product"
        integer price "Price of the product"
    }

    MEMBERS {
        varchar customer_id PK "Customer ID"
        date join_date "Membership join date"
    }

    SALES }o--|| MENU : "contains"
    SALES }|--|| MEMBERS : "places"
```

### Table information

In [None]:
db.query("SHOW TABLES;").df()   # list the tables; .df() returns the result as a DataFrame

In [None]:
db.query("PRAGMA show_tables_expanded")   # display detailed table info (result left as a DuckDB relation, not a DataFrame)

### Simple example queries

In [None]:
# Every row of the members table, returned as a DataFrame.
db.query("SELECT * FROM members").df()

In [None]:
# The result is a 1x1 DataFrame; .iloc[0, 0] pulls out the single count value.
db.query("SELECT COUNT(*) FROM members").df().iloc[0, 0]


```
PRAGMA table_info('table_name');       -- column details for one table
CALL pragma_table_info('table_name');  -- table-function form of the same; similar details to show_tables_expanded, but for a single table
-- columns returned by table_info:
cid INTEGER,        -- cid of the column
name VARCHAR,       -- name of the column
type VARCHAR,       -- type of the column
notnull BOOLEAN,    -- if the column is marked as NOT NULL
dflt_value VARCHAR, -- default value of the column, or NULL if not specified
pk BOOLEAN          -- part of the primary key or not

PRAGMA database_size;   -- database size info
CALL pragma_database_size();

PRAGMA storage_info('table_name');  -- technical storage details for a table (size etc.)
CALL pragma_storage_info('table_name');

PRAGMA show_databases;  -- get database names

PRAGMA version;  -- DuckDB version info
CALL pragma_version();
PRAGMA user_agent;
```

#### Case Study Questions

Each of the following case study questions can be answered using a single SQL statement:


##### What is the total amount each customer spent at the restaurant?

In [None]:
# Total spend per customer: price of every sale summed by customer.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    "SELECT customer_id, SUM(price) FROM sales as s JOIN menu as m ON s.product_id = m.product_id GROUP BY customer_id"
).df()

##### How many days has each customer visited the restaurant?

In [None]:
# Number of distinct order dates per customer.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    "SELECT customer_id, COUNT(DISTINCT order_date) FROM sales GROUP BY customer_id"
).df()

##### What was the first item from the menu purchased by each customer?

In [None]:
# Earliest order date for each customer.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    "SELECT customer_id, MIN(order_date) FROM sales GROUP BY customer_id"
).df()

In [None]:
# Distinct products each customer bought on their own earliest order date.
# The first date is derived per customer rather than hard-coded, so the query
# stays correct if customers did not all start on the same day.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT DISTINCT s.customer_id, s.product_id
    FROM sales AS s
    JOIN (
        SELECT customer_id, MIN(order_date) AS first_order_date
        FROM sales
        GROUP BY customer_id
    ) AS f
      ON s.customer_id = f.customer_id AND s.order_date = f.first_order_date
    """
).df()

##### What is the most purchased item on the menu and how many times was it purchased by all customers?

In [None]:
# Purchase count per product, most purchased first (the top row answers the question).
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    "SELECT product_id, COUNT(product_id) FROM sales GROUP BY product_id ORDER BY COUNT(product_id) DESC"
).df()

##### Which item was the most popular for each customer?

In [None]:
# Purchase count per (customer, product) pair, highest counts first.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    "SELECT customer_id, product_id, COUNT(product_id) FROM sales GROUP BY customer_id, product_id ORDER BY COUNT(product_id) DESC"
).df()

##### Which item was purchased first by the customer after they became a member?

In [None]:
# Customer A: earliest sale on or after their membership join date.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT * FROM sales JOIN members on sales.customer_id = members.customer_id  WHERE sales.customer_id = 'A' AND order_date >= join_date ORDER BY order_date LIMIT 1
    """
).df()

In [None]:
# Customer B: earliest sale on or after their membership join date.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT * FROM sales JOIN members on sales.customer_id = members.customer_id  WHERE sales.customer_id = 'B' AND order_date >= join_date ORDER BY order_date LIMIT 1
    """
).df()

##### Which item was purchased just before the customer became a member?

In [None]:
# Customer A: the sale closest to (but before) their join date.
# Sorting DESC picks the latest pre-membership order; ascending order would
# return the customer's very first purchase instead.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT * FROM sales JOIN members on sales.customer_id = members.customer_id
    WHERE sales.customer_id = 'A' AND order_date < join_date ORDER BY order_date DESC LIMIT 1
    """
).df()

In [None]:
# Customer B: the sale closest to (but before) their join date.
# Sorting DESC picks the latest pre-membership order; ascending order would
# return the customer's very first purchase instead.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT * FROM sales JOIN members on sales.customer_id = members.customer_id
    WHERE sales.customer_id = 'B' AND order_date < join_date ORDER BY order_date DESC LIMIT 1
    """
).df()

##### What is the total items and amount spent for each member before they became a member?

In [None]:
# Customer A: number of sales made before their join date.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT count(*) FROM sales JOIN members on sales.customer_id = members.customer_id
    WHERE sales.customer_id = 'A' AND order_date < join_date
    """
).df()

In [None]:
# need to join menu table for this

In [None]:
# Customer A: total price of sales made before their join date (menu supplies the price).
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT SUM(price) FROM sales JOIN members on sales.customer_id = members.customer_id JOIN menu on menu.product_id = sales.product_id
    WHERE sales.customer_id = 'A' AND order_date < join_date
    """
).df()

In [None]:
# Customer B: number of sales made before their join date.
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT count(*) FROM sales JOIN members on sales.customer_id = members.customer_id
    WHERE sales.customer_id = 'B' AND order_date < join_date
    """
).df()

In [None]:
# Customer B: total price of sales made before their join date (menu supplies the price).
# DuckDB connections expose query(...).df(), not qdf(), for DataFrame results.
db.query(
    """
    SELECT SUM(price) FROM sales JOIN members on sales.customer_id = members.customer_id JOIN menu on menu.product_id = sales.product_id  WHERE sales.customer_id = 'B' AND order_date < join_date
    """
).df()

##### If each $1 spent equates to 10 points and sushi has a 2x points multiplier - how many points would each customer have?


##### How many points do customers A and B have at the end of January?

In the first week after a customer joins the program (including their join date) they earn 2x points on all items, not just sushi - how many points do customers A and B have at the end of January?