In [1]:
from pathlib import Path

from IPython.display import Markdown, display
from sql_8week_danny.sql_engine import DuckDBEngine

In [2]:
SQL_DIR = Path.cwd().parent / "sql"
DATA_DIR = Path.cwd().parent / "data"

TABLES_SQL = SQL_DIR / "week1.sql"
DUCK_DB = DATA_DIR / "week1_duckdb.db"

In [3]:
def create_database(persist=True):
    if persist and DUCK_DB:
        duck = DuckDBEngine(
            str(DUCK_DB), rm_db=True
        )  # rm the db if it exists to avoid table creation errors
    else:
        duck = DuckDBEngine()
    duck.execute_sql_file(TABLES_SQL)  # create the tables
    return duck

In [4]:
duck = create_database()

[32m2024-04-20 17:17:10.990[0m | [1mINFO    [0m | [36msql_8week_danny.sql_engine[0m:[36m__init__[0m:[36m14[0m - [1mRemoving existing /Users/mjboothaus/code/github/mjboothaus/sql-8week-danny/data/week1_duckdb.db
[0m
[32m2024-04-20 17:17:10.995[0m | [1mINFO    [0m | [36msql_8week_danny.sql_engine[0m:[36m__init__[0m:[36m16[0m - [1mPersisting /Users/mjboothaus/code/github/mjboothaus/sql-8week-danny/data/week1_duckdb.db
[0m


In [5]:
table_names, tables_df = duck.load_tables_to_df()

[32m2024-04-20 17:17:11.700[0m | [1mINFO    [0m | [36msql_8week_danny.sql_engine[0m:[36mload_tables_to_df[0m:[36m61[0m - [1mLoaded ['members', 'menu', 'sales'] to dataframes[0m


In [6]:
table_names

['members', 'menu', 'sales']

In [7]:
for table in table_names:
    display(Markdown(f"**{table}** table: {len(tables_df[table])} record(s)"))
    display(tables_df[table])

**members** table: 2 record(s)

Unnamed: 0,customer_id,join_date
0,A,2021-01-07
1,B,2021-01-09


**menu** table: 3 record(s)

Unnamed: 0,product_id,product_name,price
0,1,sushi,10
1,2,curry,15
2,3,ramen,12


**sales** table: 15 record(s)

Unnamed: 0,customer_id,order_date,product_id
0,A,2021-01-01,1
1,A,2021-01-01,2
2,A,2021-01-07,2
3,A,2021-01-10,3
4,A,2021-01-11,3
5,A,2021-01-11,3
6,B,2021-01-01,2
7,B,2021-01-02,2
8,B,2021-01-04,1
9,B,2021-01-11,1


In [8]:
count = duck.query("SELECT COUNT(*) FROM members", return_dataframe=False)
print(count)

2


In [9]:
# duck.close()  # required to remove the DB file lock (.wal)

#### Case Study Questions

Each of the following case study questions can be answered using a single SQL statement:


What is the total amount each customer spent at the restaurant?

In [13]:
duck.query("SELECT customer_id, SUM(price) FROM sales as s JOIN menu as m ON s.product_id = m.product_id GROUP BY customer_id")

Unnamed: 0,customer_id,sum(price)
0,A,76.0
1,B,74.0
2,C,36.0


How many days has each customer visited the restaurant?

In [17]:
duck.query("SELECT customer_id, COUNT(DISTINCT order_date) FROM sales GROUP BY customer_id")

Unnamed: 0,customer_id,count(DISTINCT order_date)
0,B,6
1,C,2
2,A,4


What was the first item from the menu purchased by each customer?

In [18]:
duck.query("SELECT customer_id, MIN(order_date) FROM sales GROUP BY customer_id")

Unnamed: 0,customer_id,min(order_date)
0,A,2021-01-01
1,B,2021-01-01
2,C,2021-01-01


In [25]:
duck.query("SELECT customer_id, product_id FROM sales WHERE order_date = '2021-01-01' GROUP BY customer_id, product_id")

Unnamed: 0,customer_id,product_id
0,A,1
1,A,2
2,B,2
3,C,3


What is the most purchased item on the menu and how many times was it purchased by all customers?

In [33]:
duck.query("SELECT product_id, COUNT(product_id) FROM sales GROUP BY product_id ORDER BY COUNT(product_id) DESC")

Unnamed: 0,product_id,count(product_id)
0,3,8
1,2,4
2,1,3


Which item was the most popular for each customer?

In [35]:
duck.query("SELECT customer_id, product_id, COUNT(product_id) FROM sales GROUP BY customer_id, product_id ORDER BY COUNT(product_id) DESC")

Unnamed: 0,customer_id,product_id,count(product_id)
0,A,3,3
1,C,3,3
2,A,2,2
3,B,1,2
4,B,2,2
5,B,3,2
6,A,1,1


Which item was purchased first by the customer after they became a member?
Which item was purchased just before the customer became a member?
What is the total items and amount spent for each member before they became a member?
If each $1 spent equates to 10 points and sushi has a 2x points multiplier - how many points would each customer have?
In the first week after a customer joins the program (including their join date) they earn 2x points on all items, not just sushi - how many points do customer A and B have at the end of January?