# 1. Athena에서 Parquet로 변환

In [2]:
# Reload every variable previously saved with %store.
# NOTE(review): later cells use database_name, table_name_tsv, region_name,
# s3_staging_dir and data_bucket, none of which are assigned in this notebook --
# presumably they are restored here; confirm against the notebook that stores them.
%store -r

In [3]:
import sys
import boto3

<p>이후 단계에서 사용할 boto3 Session과 SageMaker 클라이언트를 생성합니다.</p>

In [4]:
# Default boto3 session (credentials and region are resolved by boto3 itself).
sess = boto3.Session()

# Low-level SageMaker client created from that session.
sm = sess.client(service_name='sagemaker')

## 1 ) PyAthena 수행

In [5]:
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

## 2 ) Athena에서 Parquet 파일 생성

In [6]:
# Display the S3 bucket name that the next cell uses to build the Parquet output path.
data_bucket

'sagemaker-us-east-1-322537213286'

In [5]:
# S3 prefix that will hold the Parquet copy of the reviews data.
s3_path_parquet = f's3://{data_bucket}/amazon-reviews-pds/parquet'

# Name of the Athena table that will be created over that prefix.
table_name_parquet = 'amazon_reviews_parquet'

In [6]:
# Product categories copied from the TSV table into the Parquet table.
product_category1 = 'Digital_Software'
product_category2 = 'Digital_Video_Games'

# CTAS statement: writes the selected rows as Parquet under s3_path_parquet,
# partitioned by product_category. That column is also the last one selected;
# Athena CTAS expects partition columns at the end of the SELECT list.
# Named placeholders keep each value next to the clause that consumes it.
statement = """CREATE TABLE IF NOT EXISTS {db}.{parquet_table}
WITH (format = 'PARQUET', external_location = '{parquet_location}', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE(review_date) AS review_date,
         product_category
FROM {db}.{tsv_table}
WHERE product_category in ('{category_a}', '{category_b}')""".format(
    db=database_name,
    parquet_table=table_name_parquet,
    parquet_location=s3_path_parquet,
    tsv_table=table_name_tsv,
    category_a=product_category1,
    category_b=product_category2,
)

# Echo the final SQL so it can be checked before it is executed.
print(statement)

CREATE TABLE IF NOT EXISTS awsdb_1124.amazon_reviews_parquet
WITH (format = 'PARQUET', external_location = 's3://sagemaker-us-east-1-322537213286/amazon-reviews-pds/parquet', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE(review_date) AS review_date,
         product_category
FROM awsdb_1124.amazon_reviews_tsv
WHERE product_category in ('Digital_Software', 'Digital_Video_Games')


#### connection cursor를 사용하여 SQL 문을 실행합니다.

이 작업은 몇 분 정도 걸릴 수 있습니다.

In [7]:
# Open a PyAthena connection (query results are staged under s3_staging_dir)
# and run the CTAS statement on a new cursor.
cursor = connect(
    region_name=region_name,
    s3_staging_dir=s3_staging_dir,
).cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f44fda66ad0>

## 2 ) `MSCK REPAIR TABLE` 수행하여 Partitions 로드

MSCK REPAIR TABLE 명령은 테이블을 생성한 후 파일 시스템에 추가되거나 파일 시스템에서 제거된 Hive 호환 파티션을 Amazon S3와 같은 파일 시스템에서 스캔합니다. 이 명령은 파티션 및 파티션과 연결된 데이터와 관련하여 카탈로그의 메타데이터를 업데이트합니다.
Parquet 파티션 로드를 위해 다음 SQL 명령을 실행합니다. (MSCK : Hive MetaStore Consistency Check)

In [8]:
# MSCK REPAIR TABLE statement targeting the Parquet table.
statement = f'MSCK REPAIR TABLE {database_name}.{table_name_parquet}'

# Echo the SQL before running it.
print(statement)

MSCK REPAIR TABLE awsdb_1124.amazon_reviews_parquet


In [9]:
# Run the MSCK REPAIR TABLE statement on a new PyAthena cursor.
cursor = connect(
    region_name=region_name,
    s3_staging_dir=s3_staging_dir,
).cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f44fd6c4310>

#### Partitions를 확인해봅니다.

In [10]:
# SHOW PARTITIONS statement for the Parquet table.
statement = f'SHOW PARTITIONS {database_name}.{table_name_parquet}'

# Echo the SQL before running it.
print(statement)

SHOW PARTITIONS awsdb_1124.amazon_reviews_parquet


In [11]:
# Run SHOW PARTITIONS on a new PyAthena cursor.
cursor = connect(
    region_name=region_name,
    s3_staging_dir=s3_staging_dir,
).cursor()
cursor.execute(statement)

# Load the result set into a DataFrame and preview the first five partitions.
df_partitions = as_pandas(cursor)
df_partitions.head(n=5)

Unnamed: 0,partition
0,product_category=Digital_Software
1,product_category=Digital_Video_Games


## 3 ) Sample Query 수행

In [12]:
# Category used to filter the sample query.
product_category = 'Digital_Software'

# Fetch at most 100 rows of that category from the Parquet table.
statement = (
    "SELECT * FROM {db}.{table}\n"
    "    WHERE product_category = '{category}' LIMIT 100"
).format(db=database_name, table=table_name_parquet, category=product_category)

# Echo the SQL before running it.
print(statement)

SELECT * FROM awsdb_1124.amazon_reviews_parquet
    WHERE product_category = 'Digital_Software' LIMIT 100


In [13]:
# Run the sample query on a new PyAthena cursor.
cursor = connect(
    region_name=region_name,
    s3_staging_dir=s3_staging_dir,
).cursor()
cursor.execute(statement)

# Load the result set into a DataFrame and preview the first five rows.
df = as_pandas(cursor)
df.head(n=5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,year,review_date,product_category
0,US,50864732,R1NSSTTFSB6H1L,B005GW7X7A,667850400,Quickstart: Calendar Studio Pro [Download],2,4,5,N,Y,calendar,Calendar maker was a little disappointing. I f...,2012,2012-01-27,Digital_Software
1,US,11632424,R3FEF66AYCILGZ,B00576JM6A,419258587,iBank 4,5,8,8,N,Y,This was an awesome olution to my Quicken data...,I got a new iMac after using a Powerbook for o...,2012,2012-01-27,Digital_Software
2,US,15526727,RECG1YZ46MM0V,B005S4Y13K,70285996,TurboTax Deluxe Federal + E-file + State 2011 ...,4,0,0,N,Y,No issues with download/install,Just downloaded and installed and it took 5 mi...,2012,2012-01-27,Digital_Software
3,US,50533445,R38LUPCPRE82D,B005S4YBVM,92728872,TurboTax Home & Business Federal + E-file + St...,5,17,17,N,N,Great program that works great with Lion,TurboTax 2011 Home & Office for the Mac once a...,2012,2012-01-27,Digital_Software
4,US,28463523,R1US6EQZYZVPEU,B0060C382Y,283542968,Norton Internet Security 2012 - 1 User 3PC,5,1,1,N,Y,so easy to use,"Love this product. Very easy to use, download...",2012,2012-01-26,Digital_Software


In [14]:
# Persist the Parquet S3 path and table name with %store so a later session can reload them with `%store -r`.
%store s3_path_parquet table_name_parquet

Stored 's3_path_parquet' (str)
Stored 'table_name_parquet' (str)
