# 1. Athena에서 Parquet로 변환

In [1]:
# Restore every variable previously saved with `%store` into this kernel.
# NOTE(review): later cells use data_bucket, database_name, table_name_tsv, region_name
# and s3_staging_dir without defining them, so they are presumably restored here - confirm.
%store -r

In [2]:
import sys
import boto3
import sagemaker

<p>앞으로 사용할 SageMaker Session과 boto3 Session, SageMaker 클라이언트를 설정합니다.</p>

In [3]:
# boto3 session and the low-level SageMaker API client created from it.
sess = boto3.Session()
sm = sess.client(service_name='sagemaker')

# High-level SageMaker SDK session.
sagemaker_session = sagemaker.Session()

## 1 ) PyAthena 수행

In [4]:
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

## 2 ) Athena에서 Parquet 파일 생성

In [5]:
# Athena table name and S3 prefix used for the Parquet copy of the reviews data.
table_name_parquet = 'amazon_reviews_parquet'
s3_path_parquet = 's3://{bucket}/amazon-reviews-pds/parquet'.format(bucket=data_bucket)

In [6]:
# Only reviews from these two product categories are copied into the Parquet table.
product_category1 = 'Digital_Software'
product_category2 = 'Digital_Video_Games'

# CREATE TABLE ... AS SELECT: writes the selected TSV rows as Parquet under
# `s3_path_parquet`, partitioned by product_category (kept as the last selected column).
statement = """CREATE TABLE IF NOT EXISTS {database}.{parquet_table}
WITH (format = 'PARQUET', external_location = '{location}', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE(review_date) AS review_date,
         product_category
FROM {database}.{tsv_table}
WHERE product_category in ('{first_category}', '{second_category}')""".format(
    database=database_name,
    parquet_table=table_name_parquet,
    location=s3_path_parquet,
    tsv_table=table_name_tsv,
    first_category=product_category1,
    second_category=product_category2,
)

print(statement)

CREATE TABLE IF NOT EXISTS awsdb.amazon_reviews_parquet
WITH (format = 'PARQUET', external_location = 's3://sagemaker-us-east-1-322537213286/amazon-reviews-pds/parquet', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE(review_date) AS review_date,
         product_category
FROM awsdb.amazon_reviews_tsv
WHERE product_category in ('Digital_Software', 'Digital_Video_Games')


#### connection cursor를 사용하여 위 SQL 문(statement)을 실행합니다.

이 작업은 몇 분 정도 걸릴 수 있습니다.

In [7]:
# Open an Athena connection and run the current `statement` through its cursor.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f0ab94d6438>

In [8]:
# Optional cleanup, left disabled: when uncommented, these commands recursively delete
# the objects under the Parquet output prefix and under s3://$job_bucket/athena.
# NOTE(review): job_bucket is not defined in the visible cells - confirm it is restored by `%store -r`.
# !aws s3 rm s3://$data_bucket/amazon-reviews-pds/parquet --recursive
# !aws s3 rm s3://$job_bucket/athena --recursive

## 2 ) `MSCK REPAIR TABLE` 수행하여 Partitions 로드

MSCK REPAIR TABLE 명령은 테이블을 생성한 후 파일 시스템에 추가되거나 파일 시스템에서 제거된 Hive 호환 파티션을 Amazon S3와 같은 파일 시스템에서 스캔합니다. 이 명령은 파티션 및 파티션과 연결된 데이터와 관련하여 카탈로그의 메타데이터를 업데이트합니다.
Parquet 파티션 로드를 위해 다음 SQL 명령을 실행합니다. (MSCK : Hive MetaStore Consistency Check)

In [9]:
# Statement that loads the Parquet table's partitions into the catalog metadata.
statement = 'MSCK REPAIR TABLE {database}.{table}'.format(
    database=database_name,
    table=table_name_parquet,
)

print(statement)

MSCK REPAIR TABLE awsdb.amazon_reviews_parquet


In [10]:
# Open a fresh Athena connection and run the current `statement` through its cursor.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f0ab8be8fd0>

#### Partitions를 확인해봅니다.

In [11]:
# Statement that lists the partitions registered for the Parquet table.
statement = 'SHOW PARTITIONS {database}.{table}'.format(
    database=database_name,
    table=table_name_parquet,
)

print(statement)

SHOW PARTITIONS awsdb.amazon_reviews_parquet


In [12]:
# Run the current `statement` on a new Athena cursor.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

# Convert the cursor's result set with as_pandas and preview the first rows.
df_partitions = as_pandas(cursor)
df_partitions.head(n=5)

Unnamed: 0,partition
0,product_category=Digital_Video_Games
1,product_category=Digital_Software


## 3 ) Sample Query 수행

In [13]:
# Sample query: at most 100 rows of the Parquet table for one product category.
product_category = 'Digital_Software'

statement = """SELECT * FROM {database}.{table}
    WHERE product_category = '{category}' LIMIT 100""".format(
    database=database_name,
    table=table_name_parquet,
    category=product_category,
)

print(statement)

SELECT * FROM awsdb.amazon_reviews_parquet
    WHERE product_category = 'Digital_Software' LIMIT 100


In [14]:
# Run the current `statement` on a new Athena cursor.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

# Convert the cursor's result set with as_pandas and preview the first rows.
df = as_pandas(cursor)
df.head(n=5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,year,review_date,product_category
0,US,41754720,R19OFJV91M7D8X,B000YMR61A,141393130,TurboTax Deluxe Federal + State 2007,2,12,13,N,N,"Easy to use, 1 comment 1 serious problem",I chose the deluxe version CD because of mortg...,2008,2008-02-11,Digital_Software
1,US,51669529,R1I6G894K5AGG5,B000YMR61A,141393130,TurboTax Deluxe Federal + State 2007,4,6,9,N,N,Schedule C IS for business- figures it would ...,"Schedule C IS for business, so figures it wou...",2008,2008-02-08,Digital_Software
2,US,24731012,R17OE43FFEP81I,B000YMR5X4,234295632,TurboTax Premier Federal + State 2007,2,9,16,N,N,Hassel to download,I wish that companies can test several scenari...,2008,2008-02-05,Digital_Software
3,US,16049580,R15MGDDK63B52Z,B000YMR61A,141393130,TurboTax Deluxe Federal + State 2007,3,14,14,N,N,beware of vista,i just installed turbotax deluxe 2007. If you ...,2008,2008-02-05,Digital_Software
4,US,46098046,R1GGJJA2R68033,B000YMNI2Q,847631772,TurboTax Basic 2007,1,54,60,N,N,don't waste your money,The description mentions that you can use this...,2008,2008-01-26,Digital_Software


In [15]:
# Persist the Parquet S3 path and table name with %store so another kernel can restore them via `%store -r`.
%store s3_path_parquet table_name_parquet

Stored 's3_path_parquet' (str)
Stored 'table_name_parquet' (str)
