# Athena SQL Example

In [62]:
import dataclasses

import numpy as np
import pandas as pd
import awswrangler as wr
from boto_session_manager import BotoSesManager
from s3pathlib import S3Path, context
from pyathena import connect
from pyathena.pandas.util import as_pandas

from rich import print as rprint
from rich.console import Console

## Helpers

In [63]:
console = Console()

In [14]:
@dataclasses.dataclass
class Config:
    aws_profile: str = dataclasses.field()
    bucket: str = dataclasses.field()
    prefix: str = dataclasses.field()
    glue_database: str = dataclasses.field()

    @property
    def s3dir_database(self) -> S3Path:
        return S3Path(f"s3://{self.bucket}/{self.prefix}").to_dir()

    def get_s3dir_table(self, table: str) -> S3Path:
        return self.s3dir_database.joinpath(table).to_dir()


config = Config(
    aws_profile = "bmt_app_dev_us_east_1",
    bucket = "bmt-app-dev-us-east-1-data",
    prefix = "poc/2023-12-01-athena-in-python",
    glue_database = "athena_sql_examples",
)
rprint(config)
print(config.s3dir_database.console_url)

https://console.aws.amazon.com/s3/buckets/bmt-app-dev-us-east-1-data?prefix=poc/2023-12-01-athena-in-python/


In [45]:
bsm = BotoSesManager(profile_name=config.aws_profile)
context.attach_boto_session(bsm.boto_ses)
s3dir_athena_result = S3Path(f"s3://{config.bucket}/athena/results/").to_dir()
conn = connect(
    s3_staging_dir=s3dir_athena_result.uri,
    profile_name=config.aws_profile,
    region_name=bsm.aws_region,
)
cursor = conn.cursor()

In [44]:
def create_database(
    bsm: "BotoSesManager",
    database: str,
):
    databases = wr.catalog.databases(boto3_session=bsm.boto_ses)
    if database not in databases.values:
        wr.catalog.create_database(database, boto3_session=bsm.boto_ses)


def create_table_with_dummy_data(
    bsm: "BotoSesManager",
    df: pd.DataFrame,
    s3dir_table: S3Path,
    database: str,
    table: str,
    mode: str = "overwrite",
    delete_old_table: bool = True,
    delete_old_data: bool = True,
    delete_new_data: bool = False,
):
    if delete_old_table:
        tables = wr.catalog.tables(database=database, boto3_session=bsm.boto_ses)
        if table in tables.values:
            bsm.glue_client.delete_table(
                DatabaseName=database,
                Name=table,
            )

    if delete_old_data:
        s3dir_table.delete(bsm=bsm)

    wr.s3.to_parquet(
        df=df,
        path=s3dir_table.uri,
        dataset=True,
        database=database,
        table=table,
        mode=mode,
        boto3_session=bsm.boto_ses,
    )
    
    if delete_new_data:
        s3dir_table.delete(bsm=bsm)


def create_table_v1(
    df: pd.DataFrame,
    table: str,
):
    return create_table_with_dummy_data(
        bsm=bsm,
        df=df,
        s3dir_table=config.get_s3dir_table(table),
        database=config.glue_database,
        table=table,
    )

create_database(bsm, config.glue_database)

In [69]:
def test_sql(
    table: str,
    df: pd.DataFrame,
    sql: str,
    recreate_table: bool = True,
) -> pd.DataFrame:
    console.rule(f"table: {table}")
    display(df)
    if recreate_table:
        create_table_v1(df, table)
    console.rule("sql")
    rprint(sql)
    cursor.execute(sql)
    df_result = as_pandas(cursor)
    console.rule("query result")
    display(df_result)
    return df_result

## Flattening Nested Arrays

In [67]:
table = "flattening_nested_arrays"

df = pd.DataFrame(
    [
        (1, "2000-01-01", ["tag1", "tag2", "tag3"])
    ],
    columns=["post_id", "create_time", "tags"]
)

sql = f"""
SELECT
    t.post_id,
    tag
FROM {config.glue_database}.{table} t
CROSS JOIN UNNEST(t.tags) as t(tag)
"""

df_result = test_sql(table, df, sql)

Unnamed: 0,post_id,create_time,tags
0,1,2000-01-01,"[tag1, tag2, tag3]"


Unnamed: 0,post_id,tag
0,1,tag1
1,1,tag2
2,1,tag3


In [75]:
table = "flattening_nested_arrays"

df = pd.DataFrame(
    [
        (
            "engineering", 
            [
                {"first": "Bob", "last": "Smith", "age": 40},
                {"first": "Jane", "last": "Doe", "age": 30},
                {"first": "Billy", "last": "Wilson", "age": 8},
            ]
        ),
    ],
    columns=["department", "people"]
)

sql = f"""
SELECT
    names.first AS first_name,
    names.last AS last_name,
    tt1.department
FROM {config.glue_database}.{table} tt1
CROSS JOIN UNNEST(tt1.people) as t(names)
"""

df_result = test_sql(table, df, sql)

Unnamed: 0,department,people
0,engineering,"[{'first': 'Bob', 'last': 'Smith', 'age': 40},..."


Unnamed: 0,first_name,last_name,department
0,Bob,Smith,engineering
1,Jane,Doe,engineering
2,Billy,Wilson,engineering


In [81]:
# WITH dataset AS (
#     SELECT 
#         ARRAY[
#             CAST(ROW('Sally', 'engineering', ARRAY[1,2,3,4]) AS ROW(name VARCHAR, department VARCHAR, scores ARRAY(INTEGER))),
#             CAST(ROW('John', 'finance', ARRAY[7,8,9]) AS ROW(name VARCHAR, department VARCHAR, scores ARRAY(INTEGER))),
#             CAST(ROW('Amy', 'devops', ARRAY[12,13,14,15]) AS ROW(name VARCHAR, department VARCHAR, scores ARRAY(INTEGER)))
#         ] AS users
# ),
# users AS (
#  SELECT person, score
#  FROM
#    dataset,
#    UNNEST(dataset.users) AS t(person),
#    UNNEST(person.scores) AS t(score)
# )
# SELECT person.name, person.department, SUM(score) AS total_score FROM users
# GROUP BY (person.name, person.department)
# ORDER BY (total_score) DESC
# LIMIT 1

table = "flattening_nested_arrays"

df = pd.DataFrame(
    [
        (
            "company ABC",
            [
                {"name": "Sally", "department": "engineering", "score": [1, 2, 3, 4]},
                {"name": "John", "department": "finance", "score": [7, 8, 9]},
                {"name": "Amy", "department": "hr", "score": [12, 13, 14, 15]},
            ],
        )
    ],
    columns=["company", "users"],
)

# sql = f"""
# SELECT
#     names.first AS first_name,
#     names.last AS last_name,
#     tt1.department
# FROM {config.glue_database}.{table} tt1
# CROSS JOIN UNNEST(tt1.people) as t(names)
# """

df_result = test_sql(table, df, sql)


Unnamed: 0,company,users
0,company ABC,"[{'name': 'Sally', 'department': 'engineering'..."


OperationalError: COLUMN_NOT_FOUND: line 6:19: Column 'tt1.people' cannot be resolved or requester is not authorized to access requested resources