In [5]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import io
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from pyathena import connect
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, \
RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, mean_squared_error,\
precision_score, recall_score, f1_score, accuracy_score,\
confusion_matrix, plot_confusion_matrix, classification_report
from sagemaker.tuner import HyperparameterTuner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import loguniform
import warnings
warnings.filterwarnings('ignore')
# Shared AWS handles reused by the cells below.
region = boto3.Session().region_name
sess = sagemaker.Session()
role = get_execution_role()
# Default bucket of the SageMaker session.
bucket = sess.default_bucket()

In [7]:
# Completion flag for this notebook; starts out False and is only set to
# True by a later cell.
ingest_create_athena_db_passed: bool = False

In [8]:
# Name of the Athena database that all tables in this notebook are created in.
database_name: str = 'ecommerce'

In [9]:
# S3 staging directory: a temporary location used for Athena queries.
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [10]:
# Open the PyAthena connection used by every pd.read_sql call below.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

In [11]:
# Create the Athena database; the IF NOT EXISTS clause lets this cell be
# re-run when the database is already present.
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
print(statement)
pd.read_sql(sql=statement, con=conn)

CREATE DATABASE IF NOT EXISTS ecommerce


In [12]:
# Source data location
#   S3 bucket name: olistteam5
#   S3 URL:         s3://olistteam5/ecommerce/

In [13]:
# S3 prefix under which each table's folder lives. The value ends with "/",
# while the table cells build their LOCATION as '{olist_dir}/{table_name}'.
olist_dir = "s3://olistteam5/ecommerce/"

In [14]:
## Register the Olist customers CSV as an external Athena table and preview it.
# NOTE(review): the recorded run of this cell failed with "Permission denied
# on S3 path" -- presumably the execution role lacks read access to the
# source bucket; confirm the bucket policy / IAM role before re-running.

table_name = 'olist_customers_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                customer_id String,
                customer_unique_id STRING,
                customer_zip_code_prefix int,
                customer_city string,
                customer_state string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

DatabaseError: Execution failed on sql: SELECT * FROM ecommerce.olist_customers_dataset LIMIT 5
Permission denied on S3 path: s3://olistteam5/ecommerce/olist_customers_dataset
unable to rollback

In [None]:
# Register the Olist geolocation CSV as an external Athena table and preview it.
table_name = 'olist_geolocation_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                geolocation_zip_code_prefix String,
                geolocation_lat STRING,
                geolocation_lng string,
                geolocation_city string,
                geolocation_state string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the Olist order-items CSV as an external Athena table and preview it.
table_name = 'olist_order_items_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

# The DDL must be bound to `create_table` (it was previously misspelled
# `ccreate_table`), otherwise the read_sql call below re-runs the statement
# left over from the previous cell and this table is never created.
# NOTE(review): order_id is the first column of the public Olist order-items
# CSV; columns are mapped by position, so omitting it shifts every field.
# Confirm against the file header.
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                order_id STRING,
                order_item_id INT,
                product_id STRING,
                seller_id STRING,
                shipping_limit_date STRING,
                price FLOAT,
                freight_value FLOAT
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the Olist order-payments CSV as an external Athena table and preview it.
table_name = 'olist_order_payments_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                order_id String,
                payment_sequential int,
                payment_type string,
                payment_installments int,
                payment_value float
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the Olist order-reviews CSV as an external Athena table and preview it.
table_name = 'olist_order_reviews_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

# NOTE(review): the comment title/message columns are presumably free text;
# if they contain commas or line breaks, plain comma-delimited parsing will
# split them across columns -- verify the preview output.
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                review_id String,
                order_id string,
                review_score int,
                review_comment_title string,
                review_comment_message string,
                review_creation_date string,
                review_answer_timestamp string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the Olist orders CSV as an external Athena table and preview it.
table_name = 'olist_orders_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                order_id string,
                customer_id string,
                order_status string,
                order_purchase_timestamp string,
                order_approved_at string,
                order_delivered_carrier_date string,
                order_delivered_customer_date string,
                order_estimated_delivery_date string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the Olist products CSV as an external Athena table and preview it.
table_name = 'olist_products_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

# The "lenght" spelling is kept as-is: existing queries may reference these
# column names.
# NOTE(review): product_height_cm and product_width_cm are the last two
# columns of the public Olist products CSV and were missing from the schema,
# so they were silently dropped. Confirm against the file header.
create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                product_id string,
                product_category_name string,
                product_name_lenght float,
                product_description_lenght float,
                product_photos_qty float,
                product_weight_g float,
                product_length_cm float,
                product_height_cm float,
                product_width_cm float
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the Olist sellers CSV as an external Athena table and preview it.
table_name = 'olist_sellers_dataset'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                seller_id string,
                seller_zip_code_prefix int,
                seller_city string,
                seller_state string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# Register the product-category translation CSV as an external Athena table
# and preview it.
table_name = 'product_category_name_translation'

# olist_dir ends with "/", so strip it before joining to avoid a "//" in the
# table location.
table_location = f"{olist_dir.rstrip('/')}/{table_name}"

# Drop any earlier definition so the schema below always takes effect.
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

create_table = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
                product_category_name string,
                product_category_name_english string
                )
                
                ROW FORMAT DELIMITED
                FIELDS TERMINATED BY ','
                LOCATION '{table_location}'
                TBLPROPERTIES ('skip.header.line.count'='1')
"""
pd.read_sql(create_table, conn)

# Preview a few rows; built from the same names as the DDL so the two cannot
# drift apart.
pd.read_sql(f'SELECT * FROM {database_name}.{table_name} LIMIT 5', conn)

In [None]:
# List the databases returned by Athena's SHOW DATABASES and display the
# first rows.
statement = 'SHOW DATABASES'
df_show = pd.read_sql(sql=statement, con=conn)
df_show.head()

In [None]:
# Set the completion flag when the target database appears anywhere in the
# SHOW DATABASES result; otherwise the flag keeps its current value.
database_listed = df_show.isin([database_name]).to_numpy().any()
if database_listed:
    ingest_create_athena_db_passed = True

In [None]:
# Persist the flag with IPython's %store magic so it can be restored in
# another kernel via `%store -r ingest_create_athena_db_passed`.
%store ingest_create_athena_db_passed

In [None]:
%%html
<!--
  Release resources.
  The %%html cell magic has to be the very first line of the cell; a Python
  comment placed above it stops IPython from treating it as a cell magic.
-->

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the hidden shutdown button; any failure (for example, no such
// button on the page) is deliberately ignored.
try {
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>