### Register the CSV file with Athena

In [2]:
import boto3
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [3]:
ingest_create_athena_table_csv_passed = False

In [4]:
%store

Stored variables and their in-db values:
data_path                                         -> '/root/AAI-540/Module2/csv'
ingest_create_athena_db_mod2_passed               -> True
ingest_create_athena_table_csv_passed             -> True
s3_private_path_csv                               -> 's3://sagemaker-us-east-1-004608622582/module2_dat
setup_dependencies_mod2_passed                    -> True
setup_s3_bucket_passed                            -> True


In [5]:
%store -r ingest_create_athena_db_mod2_passed 

In [6]:
try:
    ingest_create_athena_db_mod2_passed
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

In [7]:
print(ingest_create_athena_db_mod2_passed)

True


In [8]:
if not ingest_create_athena_db_mod2_passed:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


In [9]:
%store -r s3_private_path_csv

In [10]:
try:
    s3_private_path_csv
except NameError:
    print("*****************************************************************************")
    print("[ERROR] PLEASE RE-RUN THE PREVIOUS COPY TSV TO S3 NOTEBOOK ******************")
    print("[ERROR] THIS NOTEBOOK WILL NOT RUN PROPERLY. ********************************")
    print("*****************************************************************************")

In [11]:
print(s3_private_path_csv)

s3://sagemaker-us-east-1-004608622582/module2_data/csv


### Import PyAthena

In [12]:
from pyathena import connect

In [13]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [14]:
# Set Athena parameters
database_name = "mod2_db"
table_name_csv = "music"

In [15]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [16]:
# Create Statement
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
            track_id STRING,
            artists STRING,
            popularity INT,
            duration_ms INT,
            explicit BOOLEAN,
            danceability FLOAT,
            energy FLOAT,
            key INT,
            loudness FLOAT,
            mode INT,
            speechiness FLOAT,
            acousticness FLOAT,
            instrumentalness FLOAT,
            liveness FLOAT,
            valence FLOAT,
            tempo FLOAT,
            time_signature INT,
            track_genre STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_private_path_csv
)

print(statement)


CREATE EXTERNAL TABLE IF NOT EXISTS mod2_db.music(
            track_id STRING,
            artists STRING,
            popularity INT,
            duration_ms INT,
            explicit BOOLEAN,
            danceability FLOAT,
            energy FLOAT,
            key INT,
            loudness FLOAT,
            mode INT,
            speechiness FLOAT,
            acousticness FLOAT,
            instrumentalness FLOAT,
            liveness FLOAT,
            valence FLOAT,
            tempo FLOAT,
            time_signature INT,
            track_genre STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-004608622582/module2_data/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [17]:
import pandas as pd

pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


### Verify that Table has been created successfully

In [18]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,music


In [19]:
if table_name_csv in df_show.values:
    ingest_create_athena_table_csv_passed = True
    
print(ingest_create_athena_table_csv_passed)

True


In [20]:
%store ingest_create_athena_table_csv_passed

Stored 'ingest_create_athena_table_csv_passed' (bool)


In [21]:
%store

Stored variables and their in-db values:
data_path                                         -> '/root/AAI-540/Module2/csv'
ingest_create_athena_db_mod2_passed               -> True
ingest_create_athena_table_csv_passed             -> True
s3_private_path_csv                               -> 's3://sagemaker-us-east-1-004608622582/module2_dat
setup_dependencies_mod2_passed                    -> True
setup_s3_bucket_passed                            -> True


### Run A Sample Query

In [22]:
artists = "Jason Mraz"

statement = """SELECT * FROM {}.{}
    WHERE artists = '{}' LIMIT 10""".format(
    database_name, table_name_csv, artists
)

print(statement)

SELECT * FROM mod2_db.music
    WHERE artists = 'Jason Mraz' LIMIT 10


In [23]:
df = pd.read_sql(statement, conn)
df.head(5)

  df = pd.read_sql(statement, conn)


Unnamed: 0,track_id,artists,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,80,242946,False,0.703,0.444,11,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
1,5ivF4eQBqJiVL5IAE9jRyl,Jason Mraz,69,240165,False,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,3,acoustic
2,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,75,242946,False,0.703,0.444,11,-9.331,1,0.0417,0.559,0.0,0.0973,0.712,150.96,4,acoustic
3,0BUuuEvNa5T4lMaewyiudB,Jason Mraz,0,216386,False,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,4,acoustic
4,3Hn3LfhrQOaKihdCibJsTs,Jason Mraz,0,231266,False,0.796,0.667,5,-4.831,0,0.0392,0.381,0.0,0.221,0.754,97.988,4,acoustic


In [24]:
if not df.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


### Review in GLUE Catalog

In [25]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={}#">AWS Glue Catalog</a></b>'.format(
            region
        )
    )
)

  from IPython.core.display import display, HTML


In [26]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>