In [None]:
import pandas as pd
import duckdb

# Load the `sql` IPython extension, which provides the %sql / %%sql magics used in this notebook
%load_ext sql

# Open a DuckDB connection; connect() called with no arguments uses an in-memory database
conn = duckdb.connect()

# Hand the connection object to the SQL magic and register it under the alias "duckdb"
%sql conn --alias duckdb

In [None]:
# Fetch the Powerball results CSV from data.ny.gov and normalize the column
# headers to snake_case (lowercased, spaces replaced by underscores) so they
# are easy to reference from the SQL cells.
powerball_df = pd.read_csv(
    "https://data.ny.gov/api/views/d6yy-54nr/rows.csv"
).rename(columns=lambda name: name.lower().replace(" ", "_"))

In [None]:
%%sql
SELECT *
FROM powerball_df  -- the pandas DataFrame loaded above, queried by name
LIMIT 5

Write a query that splits the winning numbers into separate columns. Your query should return a result with columns: `draw_date`, `num1`, `num2`, `num3`, `num4`, `num5`, `num6`, and `multiplier`.

In [None]:
%%sql
SELECT
    draw_date,
    -- winning_numbers is a single space-separated string, and SPLIT_PART extracts
    -- the piece at each 1-based position. The pieces stay as text.
    SPLIT_PART(winning_numbers, ' ', 1) AS num1,
    SPLIT_PART(winning_numbers, ' ', 2) AS num2,
    SPLIT_PART(winning_numbers, ' ', 3) AS num3,
    SPLIT_PART(winning_numbers, ' ', 4) AS num4,
    SPLIT_PART(winning_numbers, ' ', 5) AS num5,
    SPLIT_PART(winning_numbers, ' ', 6) AS num6,
    -- Alias the cast explicitly so the output column is named multiplier,
    -- as the exercise requires, rather than after the cast expression.
    multiplier::INT AS multiplier
FROM powerball_df
LIMIT 5

Using the above as a base, write a new query that returns a table with one row per possible number and one column per draw position, where each column holds the count of occurrences in which that number was drawn in that position. Your response should look like this:

| range_str | num1_ct | num2_ct | num3_ct | num4_ct | num5_ct | num6_ct |
|----------:|--------:|--------:|--------:|--------:|--------:|--------:|
|        01 |     121 |       0 |       0 |       0 |       0 |      54 |
|        02 |     112 |       9 |       0 |       0 |       0 |      51 |
|        03 |     106 |      18 |       1 |       0 |       0 |      52 |
|        04 |      90 |      22 |       0 |       0 |       0 |      64 |
|        05 |      96 |      17 |       0 |       0 |       0 |      59 |

Hints:
- The numbers aren't actually numbers — they're left-padded strings.
- We can't be sure every number has been drawn to create the "index" (range_str)— it might be best to generate the index instead.

In [None]:
%%sql
WITH number_index AS (
    -- Generate every candidate label from 1 through 99 as a zero-padded
    -- two-character string so it compares directly with the text pieces
    -- split out of winning_numbers.
    SELECT
        r.range,
        LPAD(CAST(r.range AS TEXT), 2, '0') AS range_str
    FROM range(1, 100) AS r
),
drawn AS (
    -- One row per draw, with the space-separated numbers split into columns.
    SELECT
        draw_date,
        SPLIT_PART(winning_numbers, ' ', 1) AS num1,
        SPLIT_PART(winning_numbers, ' ', 2) AS num2,
        SPLIT_PART(winning_numbers, ' ', 3) AS num3,
        SPLIT_PART(winning_numbers, ' ', 4) AS num4,
        SPLIT_PART(winning_numbers, ' ', 5) AS num5,
        SPLIT_PART(winning_numbers, ' ', 6) AS num6,
        CAST(multiplier AS INT) AS multiplier
    FROM powerball_df
)
SELECT
    number_index.range_str,
    -- Per position, count the distinct draw dates on which this label
    -- appeared in that position. Labels never drawn there count as 0.
    COUNT(DISTINCT drawn.draw_date) FILTER (WHERE drawn.num1 = number_index.range_str) AS num1_ct,
    COUNT(DISTINCT drawn.draw_date) FILTER (WHERE drawn.num2 = number_index.range_str) AS num2_ct,
    COUNT(DISTINCT drawn.draw_date) FILTER (WHERE drawn.num3 = number_index.range_str) AS num3_ct,
    COUNT(DISTINCT drawn.draw_date) FILTER (WHERE drawn.num4 = number_index.range_str) AS num4_ct,
    COUNT(DISTINCT drawn.draw_date) FILTER (WHERE drawn.num5 = number_index.range_str) AS num5_ct,
    COUNT(DISTINCT drawn.draw_date) FILTER (WHERE drawn.num6 = number_index.range_str) AS num6_ct
FROM drawn
-- Pair every draw with every candidate label so that labels which were
-- never drawn still get a row.
CROSS JOIN number_index
GROUP BY number_index.range_str
ORDER BY number_index.range_str ASC

In [None]:
%%sql
WITH number_index AS (
    -- Candidate labels 01 through 99, zero-padded to match the text format
    -- of the drawn numbers. Named number_index so the CTE does not shadow
    -- the range() table function it calls.
    SELECT
        r.range,
        lpad(r.range::TEXT, 2, '0') AS range_str
    FROM range(1,100) r
), nums AS (
    -- One row per draw, with the space-separated numbers split into columns.
    SELECT
        draw_date,
        SPLIT_PART(winning_numbers, ' ', 1) AS num1,
        SPLIT_PART(winning_numbers, ' ', 2) AS num2,
        SPLIT_PART(winning_numbers, ' ', 3) AS num3,
        SPLIT_PART(winning_numbers, ' ', 4) AS num4,
        SPLIT_PART(winning_numbers, ' ', 5) AS num5,
        SPLIT_PART(winning_numbers, ' ', 6) AS num6,
        multiplier::INT AS multiplier
    FROM powerball_df
), num_counts AS (
    -- Per label, the number of distinct draw dates on which it appeared in
    -- each position. No ORDER BY here because ordering inside a CTE does not
    -- affect the final result.
    SELECT
        number_index.range_str,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num1 THEN draw_date END) AS num1_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num2 THEN draw_date END) AS num2_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num3 THEN draw_date END) AS num3_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num4 THEN draw_date END) AS num4_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num5 THEN draw_date END) AS num5_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num6 THEN draw_date END) AS num6_ct
    FROM nums
    CROSS JOIN number_index
    GROUP BY 1
)
SELECT
    DISTINCT
    -- Each window spans the whole table, so FIRST_VALUE yields the same top
    -- label on every row and DISTINCT collapses the output to a single row.
    -- Ties on the count are broken by the smaller label so the answer is
    -- repeatable between runs.
    FIRST_VALUE(range_str) OVER (ORDER BY num1_ct DESC, range_str ASC) AS most_popular_num1,
    FIRST_VALUE(range_str) OVER (ORDER BY num2_ct DESC, range_str ASC) AS most_popular_num2,
    FIRST_VALUE(range_str) OVER (ORDER BY num3_ct DESC, range_str ASC) AS most_popular_num3,
    FIRST_VALUE(range_str) OVER (ORDER BY num4_ct DESC, range_str ASC) AS most_popular_num4,
    FIRST_VALUE(range_str) OVER (ORDER BY num5_ct DESC, range_str ASC) AS most_popular_num5,
    FIRST_VALUE(range_str) OVER (ORDER BY num6_ct DESC, range_str ASC) AS most_popular_num6
FROM num_counts

In [None]:
%%sql
WITH number_index AS (
    -- Candidate labels 01 through 99, zero-padded to match the text format
    -- of the drawn numbers. Named number_index so the CTE does not shadow
    -- the range() table function it calls.
    SELECT
        r.range,
        lpad(r.range::TEXT, 2, '0') AS range_str
    FROM range(1,100) r
), nums AS (
    -- One row per draw, with the space-separated numbers split into columns.
    SELECT
        draw_date,
        SPLIT_PART(winning_numbers, ' ', 1) AS num1,
        SPLIT_PART(winning_numbers, ' ', 2) AS num2,
        SPLIT_PART(winning_numbers, ' ', 3) AS num3,
        SPLIT_PART(winning_numbers, ' ', 4) AS num4,
        SPLIT_PART(winning_numbers, ' ', 5) AS num5,
        SPLIT_PART(winning_numbers, ' ', 6) AS num6,
        multiplier::INT AS multiplier
    FROM powerball_df
), draw_totals AS (
    -- Single-row CTE holding the total number of rows in powerball_df.
    -- Its name differs from its column so references below are unambiguous.
    SELECT COUNT(*) AS num_draws FROM powerball_df
), num_counts AS (
    -- Per label, the count of distinct draw dates in each position (_ct)
    -- and that count divided by the total number of draws (_pct, a
    -- fraction rather than a value scaled to 100).
    SELECT
        number_index.range_str,
        draw_totals.num_draws,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num1 THEN draw_date END) AS num1_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num2 THEN draw_date END) AS num2_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num3 THEN draw_date END) AS num3_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num4 THEN draw_date END) AS num4_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num5 THEN draw_date END) AS num5_ct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num6 THEN draw_date END) AS num6_ct,

        COUNT(DISTINCT CASE WHEN number_index.range_str = num1 THEN draw_date END) / draw_totals.num_draws AS num1_pct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num2 THEN draw_date END) / draw_totals.num_draws AS num2_pct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num3 THEN draw_date END) / draw_totals.num_draws AS num3_pct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num4 THEN draw_date END) / draw_totals.num_draws AS num4_pct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num5 THEN draw_date END) / draw_totals.num_draws AS num5_pct,
        COUNT(DISTINCT CASE WHEN number_index.range_str = num6 THEN draw_date END) / draw_totals.num_draws AS num6_pct
    FROM nums
    CROSS JOIN number_index
    CROSS JOIN draw_totals
    -- num_draws is constant across rows and is grouped only so it can be
    -- selected and used as the divisor.
    GROUP BY 1,2
), agg AS (
    SELECT
        DISTINCT
        -- Each window spans the whole table, so FIRST_VALUE yields the same
        -- top label on every row. Ties on the count are broken by the
        -- smaller label so the pick is repeatable and identical in both
        -- groups of expressions below.
        FIRST_VALUE(range_str) OVER (ORDER BY num1_ct DESC, range_str ASC) AS most_popular_num1,
        FIRST_VALUE(range_str) OVER (ORDER BY num2_ct DESC, range_str ASC) AS most_popular_num2,
        FIRST_VALUE(range_str) OVER (ORDER BY num3_ct DESC, range_str ASC) AS most_popular_num3,
        FIRST_VALUE(range_str) OVER (ORDER BY num4_ct DESC, range_str ASC) AS most_popular_num4,
        FIRST_VALUE(range_str) OVER (ORDER BY num5_ct DESC, range_str ASC) AS most_popular_num5,
        FIRST_VALUE(range_str) OVER (ORDER BY num6_ct DESC, range_str ASC) AS most_popular_num6,

        -- Keep the share only on the row belonging to the winning label.
        -- Every other row yields NULL, which the MAX in the final SELECT ignores.
        CASE FIRST_VALUE(range_str) OVER (ORDER BY num1_ct DESC, range_str ASC) WHEN range_str THEN num1_pct END AS most_popular_num1_pct,
        CASE FIRST_VALUE(range_str) OVER (ORDER BY num2_ct DESC, range_str ASC) WHEN range_str THEN num2_pct END AS most_popular_num2_pct,
        CASE FIRST_VALUE(range_str) OVER (ORDER BY num3_ct DESC, range_str ASC) WHEN range_str THEN num3_pct END AS most_popular_num3_pct,
        CASE FIRST_VALUE(range_str) OVER (ORDER BY num4_ct DESC, range_str ASC) WHEN range_str THEN num4_pct END AS most_popular_num4_pct,
        CASE FIRST_VALUE(range_str) OVER (ORDER BY num5_ct DESC, range_str ASC) WHEN range_str THEN num5_pct END AS most_popular_num5_pct,
        CASE FIRST_VALUE(range_str) OVER (ORDER BY num6_ct DESC, range_str ASC) WHEN range_str THEN num6_pct END AS most_popular_num6_pct
    FROM num_counts
)
-- Collapse agg to one row. GROUP BY already makes the rows unique, so no
-- DISTINCT is needed here.
SELECT
    most_popular_num1,
    most_popular_num2,
    most_popular_num3,
    most_popular_num4,
    most_popular_num5,
    most_popular_num6,

    MAX(most_popular_num1_pct) AS num1_pct,
    MAX(most_popular_num2_pct) AS num2_pct,
    MAX(most_popular_num3_pct) AS num3_pct,
    MAX(most_popular_num4_pct) AS num4_pct,
    MAX(most_popular_num5_pct) AS num5_pct,
    MAX(most_popular_num6_pct) AS num6_pct,

    -- Product of the six per-position shares.
    MAX(most_popular_num1_pct) *
    MAX(most_popular_num2_pct) *
    MAX(most_popular_num3_pct) *
    MAX(most_popular_num4_pct) *
    MAX(most_popular_num5_pct) *
    MAX(most_popular_num6_pct) AS pct_product
FROM agg
GROUP BY 1,2,3,4,5,6
In [None]:
%%sql
SELECT
    *
FROM powerball_df
-- Exact-match lookup of one full combination string.
-- NOTE(review) this looks like the most popular number per position from the
-- previous query, to be confirmed against that output.
WHERE winning_numbers = '01 12 37 45 59 24'
-- :(  presumably no draw has this exact combination. The original marked this
-- remark with a hash sign, which is not SQL comment syntax.