# 1. Athena에서 Parquet로 변환

In [1]:
# Reload every variable previously persisted with the IPython %store magic.
# NOTE(review): names used below but never assigned in this notebook
# (data_bucket, database_name, table_name_tsv, region_name, s3_staging_dir)
# presumably come from this store -- confirm against the earlier notebooks.
%store -r

In [2]:
import sys
import boto3
import sagemaker

<p>이후 단계에서 사용할 SageMaker Session과 boto3 Session, SageMaker 클라이언트를 설정합니다.</p>

In [3]:
# boto3 session and the SageMaker API client derived from it.
sess = boto3.Session()
sm = sess.client(service_name='sagemaker')

# SageMaker SDK session object (created independently of `sess`).
sagemaker_session = sagemaker.Session()

## 1 ) PyAthena 수행

In [4]:
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

## 2 ) Athena에서 Parquet 파일 생성

In [5]:
# Name of the Parquet-backed table that the CTAS statement below creates.
table_name_parquet = 'amazon_reviews_parquet'

# S3 prefix, inside the data bucket, where the Parquet files are written.
s3_path_parquet = 's3://{}/amazon-reviews-pds/parquet'.format(
    data_bucket,
)

In [6]:
# The two product categories copied from the TSV table into the Parquet table.
product_category1 = 'Digital_Software'
product_category2 = 'Digital_Video_Games'

# CREATE TABLE AS SELECT template. Placeholders, in order:
#   target database, target table, S3 output location,
#   source database, source table, first category, second category.
# product_category is both a selected column and the partition column
# (see partitioned_by in the WITH clause).
ctas_template = """CREATE TABLE IF NOT EXISTS {}.{}
WITH (format = 'PARQUET', external_location = '{}', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE(review_date) AS review_date,
         product_category
FROM {}.{}
WHERE product_category in ('{}', '{}')"""

# Positional values, listed in the same order as the placeholders above.
ctas_values = (
    database_name,        # target database
    table_name_parquet,   # target table
    s3_path_parquet,      # external_location
    database_name,        # source database
    table_name_tsv,       # source table
    product_category1,
    product_category2,
)
statement = ctas_template.format(*ctas_values)

print(statement)

CREATE TABLE IF NOT EXISTS awsdb_0920.amazon_reviews_parquet
WITH (format = 'PARQUET', external_location = 's3://sagemaker-us-east-1-322537213286/amazon-reviews-pds/parquet', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE(review_date) AS review_date,
         product_category
FROM awsdb_0920.amazon_reviews_tsv
WHERE product_category in ('Digital_Software', 'Digital_Video_Games')


#### connection cursor를 사용하여 SQL 문을 실행합니다.

이 작업은 몇 분 정도 걸릴 수 있습니다.

In [7]:
# Open an Athena connection (query results are staged under s3_staging_dir)
# and run the CTAS statement built above.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
)
cursor = conn.cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f95dbc4b8d0>

## 2 ) `MSCK REPAIR TABLE` 수행하여 Partitions 로드

MSCK REPAIR TABLE 명령은 테이블을 생성한 후 파일 시스템에 추가되거나 파일 시스템에서 제거된 Hive 호환 파티션을 Amazon S3와 같은 파일 시스템에서 스캔합니다. 이 명령은 파티션 및 파티션과 연결된 데이터와 관련하여 카탈로그의 메타데이터를 업데이트합니다.
Parquet 파티션 로드를 위해 다음 SQL 명령을 실행합니다. (MSCK : Hive MetaStore Consistency Check)

In [9]:
# Statement that asks Athena to rescan the table location and register
# its partitions in the catalog.
msck_template = 'MSCK REPAIR TABLE {}.{}'
statement = msck_template.format(
    database_name,
    table_name_parquet,
)

print(statement)

MSCK REPAIR TABLE awsdb_0920.amazon_reviews_parquet


In [10]:
# Run the MSCK REPAIR TABLE statement on a fresh Athena connection.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
)
cursor = conn.cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f95db2aa438>

#### Partitions를 확인해봅니다.

In [11]:
# Statement that lists the partitions registered for the Parquet table.
show_partitions_template = 'SHOW PARTITIONS {}.{}'
statement = show_partitions_template.format(
    database_name,
    table_name_parquet,
)

print(statement)

SHOW PARTITIONS awsdb_0920.amazon_reviews_parquet


In [12]:
# Run SHOW PARTITIONS on a fresh Athena connection.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
)
cursor = conn.cursor()
cursor.execute(statement)

# Pull the result set into a DataFrame and display its first rows.
df_partitions = as_pandas(cursor)
df_partitions.head(5)

Unnamed: 0,partition
0,product_category=Digital_Software
1,product_category=Digital_Video_Games


## 3 ) Sample Query 수행

In [13]:
# Category to sample; it is matched against the product_category column.
product_category = 'Digital_Software'

# Query template. Placeholders, in order: database, table, category literal.
sample_query_template = """SELECT * FROM {}.{}
    WHERE product_category = '{}' LIMIT 100"""
statement = sample_query_template.format(
    database_name,
    table_name_parquet,
    product_category,
)

print(statement)

SELECT * FROM awsdb_0920.amazon_reviews_parquet
    WHERE product_category = 'Digital_Software' LIMIT 100


In [14]:
# Execute statement using connection cursor
# Run the sample query on a fresh Athena connection.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
)
cursor = conn.cursor()
cursor.execute(statement)

# Materialize the rows as a DataFrame and preview the first five.
df = as_pandas(cursor)
df.head(5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,year,review_date,product_category
0,US,40928960,RDMB52973KQM6,B007SOLCP8,69964513,Checkbook [Download],3,1,1,N,Y,Overpriced,This was purchased as a stopgap to cover the t...,2013,2013-12-29,Digital_Software
1,US,24741189,R1249463V6EWSR,B009G6SVL4,768682282,Adobe Photoshop Elements 11,5,0,0,N,Y,easier to use,Photoshop is much easier to use than it used t...,2013,2013-12-29,Digital_Software
2,US,28307587,R3W50C36R78J7Z,B00FFINOWS,875090538,"TurboTax Deluxe Fed, Efile and State 2013",1,5,13,N,Y,Download didn't work,"How can a company like Intuit, which has been ...",2013,2013-12-29,Digital_Software
3,US,20278496,R3V9QPGY4ECZ48,B008SCNCTI,866682919,Norton Antivirus 2013 - 1 User / 3 PC,4,0,0,N,Y,Always reliable,Been using Norton for 10 years. It's never dis...,2013,2013-12-29,Digital_Software
4,US,31706801,R2HP7BKOTYU49F,B003LJXEPU,134665354,Professor Teaches Windows 7,2,1,1,N,Y,Frustrated,I just purchased this program for my mother's ...,2013,2013-12-29,Digital_Software


In [15]:
# Persist the Parquet S3 path and table name in IPython's store so that
# other notebooks can reload them with `%store -r`.
%store s3_path_parquet table_name_parquet

Stored 's3_path_parquet' (str)
Stored 'table_name_parquet' (str)
