# Step 01: Import necessary packages

In [None]:
%load_ext sql

In [None]:
# Import necessary packages
import boto3
import configparser
import glob
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import psycopg2

from sql_queries import *
from time import time

# Step 02: Load DB Params from dwh.cfg file

__Create an IAM user in your AWS account:__  
- Attach the `AdministratorAccess` policy to it from the `Attach existing policies directly` tab  
- Take note of the access key and secret  
- Edit the file `dwh.cfg` in the same folder as this notebook and fill in `KEY` and `SECRET` in the `[AWS]` section

In [None]:
# Load the settings from dwh.cfg (expected in the current working directory).
config = configparser.ConfigParser()
with open('dwh.cfg') as cfg_file:
    # The context manager closes the file handle once parsing is done.
    config.read_file(cfg_file)

# [AWS] section: credentials used by every boto3 client in this notebook
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# [CLUSTER] section: database and cluster settings.
# configparser returns strings; numeric values are converted where they are used.
DB_NAME = config.get("CLUSTER", "DB_NAME")
DB_USER = config.get("CLUSTER", "DB_USER")
DB_PASSWORD = config.get("CLUSTER", "DB_PASSWORD")
DB_PORT = config.get("CLUSTER", "DB_PORT")
REGION = config.get("CLUSTER", "REGION")
DB_CLUSTER_TYPE = config.get("CLUSTER", "DB_CLUSTER_TYPE")
DB_NUM_NODES = config.get("CLUSTER", "DB_NUM_NODES")
DB_NODE_TYPE = config.get("CLUSTER", "DB_NODE_TYPE")
DB_CLUSTER_IDENTIFIER = config.get("CLUSTER", "DB_CLUSTER_IDENTIFIER")
DB_IAM_ROLE_NAME = config.get("CLUSTER", "DB_IAM_ROLE_NAME")

# [IAM_ROLE] section
ARN = config.get("IAM_ROLE", "ARN")

# Summary of the loaded parameters. The password is masked so that the
# rendered notebook output does not leak the credential when shared.
pd.DataFrame({"Param":
                  ["DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT", "ARN", "REGION", "DB_CLUSTER_TYPE", "DB_NUM_NODES", "DB_NODE_TYPE", "DB_CLUSTER_IDENTIFIER", "DB_IAM_ROLE_NAME"],
              "Value":
                  [DB_NAME, DB_USER, "********", DB_PORT, ARN, REGION, DB_CLUSTER_TYPE, DB_NUM_NODES, DB_NODE_TYPE, DB_CLUSTER_IDENTIFIER, DB_IAM_ROLE_NAME]
             })

## Step 03: Create an IAM Role

Create an IAM role that allows Redshift to read from S3 buckets (read-only access)

In [None]:
# IAM client built from the KEY / SECRET / REGION settings
iam = boto3.client(
    'iam',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [None]:
from botocore.exceptions import ClientError

# 1.1 Create the role that the Redshift service is allowed to assume.
try:
    print("1.1 Creating a new IAM Role")
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DB_IAM_ROLE_NAME,
        Description="Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
                            'Effect': 'Allow',
                            'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )
except ClientError as e:
    # Re-running this cell when the role already exists is harmless, so that
    # case is only reported. Any other failure (bad credentials, missing
    # permissions, ...) is re-raised instead of being silently swallowed,
    # because the steps below cannot succeed without the role.
    if e.response['Error']['Code'] != 'EntityAlreadyExists':
        raise
    print(e)

# 1.2 Attach the managed S3 read-only policy to the role.
print("1.2 Attaching Policy")
attach_status = iam.attach_role_policy(RoleName=DB_IAM_ROLE_NAME,
                                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                                       )['ResponseMetadata']['HTTPStatusCode']
print("HTTP status:", attach_status)

# 1.3 Look up the ARN of the role; it is passed to create_cluster later on.
print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DB_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

## Step 04: Create a Redshift Cluster

In [None]:
# Redshift client built from the KEY / SECRET / REGION settings
redshift = boto3.client(
    'redshift',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [None]:
# Request a new Redshift cluster using the settings loaded from dwh.cfg.
try:
    response = redshift.create_cluster(
        # Hardware
        ClusterType=DB_CLUSTER_TYPE,
        NodeType=DB_NODE_TYPE,
        NumberOfNodes=int(DB_NUM_NODES),  # config values are strings

        # Identifiers & credentials
        DBName=DB_NAME,
        ClusterIdentifier=DB_CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,

        # Roles (for S3 access)
        IamRoles=[roleArn]
    )
except redshift.exceptions.ClusterAlreadyExistsFault as e:
    # Re-running the cell while the cluster exists is harmless: report it and
    # carry on. Every other error propagates so that a failed creation is not
    # mistaken for success.
    print(e)

### Verify the cluster status
Run this block several times until the cluster status becomes `available`

In [None]:
def prettyRedshiftProps(props):
    """Return a two-column summary of selected Redshift cluster properties.

    Side effect: sets the global pandas option ``display.max_colwidth`` to
    ``None`` so that long values are not truncated when displayed.

    Parameters
    ----------
    props : dict
        Mapping of property name to value; only the keys listed in
        ``keysToShow`` are kept, in the iteration order of ``props``.

    Returns
    -------
    pandas.DataFrame
        One row per retained property, with columns ``Key`` and ``Value``.
    """
    # None means "no truncation". The previous value -1 has been deprecated
    # since pandas 1.0 and is rejected by current pandas releases.
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    rows = [(key, value) for key, value in props.items() if key in keysToShow]
    return pd.DataFrame(data=rows, columns=["Key", "Value"])

# Fetch the description of our cluster and show the selected properties
cluster_description = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)
myClusterProps = cluster_description['Clusters'][0]
prettyRedshiftProps(myClusterProps)

### Take note of the cluster endpoint and role ARN
<font color='red'>DO NOT RUN THIS until the cluster status is "available".</font>

In [None]:
# Fail with an explicit message when the cluster description carries no
# endpoint address yet, instead of an opaque KeyError.
cluster_endpoint = myClusterProps.get('Endpoint') or {}
if 'Address' not in cluster_endpoint:
    raise RuntimeError(
        "Cluster endpoint is not available yet - re-run the status cell "
        "until the cluster is 'available', then run this cell again."
    )

DB_ENDPOINT = cluster_endpoint['Address']
DB_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DB_ENDPOINT :: ", DB_ENDPOINT)
# Print the ARN that was just read from the cluster (the label says
# DB_ROLE_ARN), not the separate roleArn variable from the IAM step.
print("DB_ROLE_ARN :: ", DB_ROLE_ARN)

# Step 05: Open an incoming TCP port to access the cluster endpoint

In [None]:
# EC2 resource handle built from the KEY / SECRET / REGION settings
ec2 = boto3.resource(
    'ec2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name=REGION,
)

In [None]:
# Open the database port on a security group of the cluster's VPC.
vpc = ec2.Vpc(id=myClusterProps['VpcId'])
# NOTE(review): this takes the FIRST security group returned for the VPC; the
# variable name suggests it should be the default group - confirm.
defaultSg = list(vpc.security_groups.all())[0]
print(defaultSg)
try:
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        # NOTE(review): 0.0.0.0/0 exposes the port to every IPv4 address;
        # restrict this to your own IP range outside of a throw-away sandbox.
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DB_PORT),  # config values are strings
        ToPort=int(DB_PORT)
    )
except ClientError as e:
    # The rule already existing (cell re-run) is harmless and only reported.
    # Any other failure is re-raised rather than swallowed, since later cells
    # need the port to be reachable.
    if e.response['Error']['Code'] != 'InvalidPermission.Duplicate':
        raise
    print(e)

# ETL Processes

### Use this notebook to develop the ETL process for each of your tables before completing the etl.py file to load the whole datasets.

Get the params of the created redshift cluster

In [None]:
# Re-read dwh.cfg and pick up the cluster identifier
config = configparser.ConfigParser()
config.read("dwh.cfg")
DB_CLUSTER_IDENTIFIER = config.get(
    section="CLUSTER",
    option="DB_CLUSTER_IDENTIFIER",
)

In [None]:
# Current cluster status - it should read 'available' before continuing
cluster_info = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]
cluster_info["ClusterStatus"]

In [None]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DB_ENDPOINT, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

In [None]:
# Run the create_tables.py script to set up the tables for this project.
# NOTE(review): the original comment says it also creates a database named
# sparkifydb - confirm against create_tables.py.
%run create_tables.py

In [None]:
# Run the etl.py script, which executes the ETL on the cluster
%run etl.py

# Validation

In [None]:
# VALIDATION - total number of rows in the users dimension table
%sql SELECT count(*) as tt FROM users;

In [None]:
# VALIDATION - total number of rows in the songs dimension table
%sql SELECT count(*) as tt FROM songs;

In [None]:
# VALIDATION - total number of rows in the artists dimension table
%sql SELECT count(*) as tt FROM artists;

In [None]:
# VALIDATION - total number of rows in the times dimension table
%sql SELECT count(*) as tt FROM times;

In [None]:
# VALIDATION - total number of rows in the songplays table
%sql SELECT count(*) as tt FROM songplays;

In [None]:
# VALIDATION - number of songplays rows (tt) per
# (user_id, song_id, artist_id, start_time) combination; a tt above 1 means
# that combination occurs more than once in the table.
%sql SELECT user_id, song_id, artist_id, start_time, count(*) as tt \
FROM songplays \
GROUP by user_id, song_id, artist_id, start_time 

In [None]:
# VALIDATION - preview up to 5 rows of the songplays table
%sql SELECT * FROM songplays LIMIT 5

# Clean up your resources

In [None]:
# Request deletion of the cluster without taking a final snapshot
redshift.delete_cluster(
    SkipFinalClusterSnapshot=True,
    ClusterIdentifier=DB_CLUSTER_IDENTIFIER,
)

In [None]:
# Check on the deletion. While the cluster still exists its properties are
# shown; once it is gone describe_clusters raises ClusterNotFoundFault, which
# is reported as a plain message instead of crashing the cell.
try:
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]
    cluster_summary = prettyRedshiftProps(myClusterProps)
except redshift.exceptions.ClusterNotFoundFault:
    cluster_summary = "Cluster '{}' no longer exists - deletion complete.".format(DB_CLUSTER_IDENTIFIER)
cluster_summary

In [None]:
# Detach the S3 read-only policy from the role, then delete the role itself
iam.detach_role_policy(
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
    RoleName=DB_IAM_ROLE_NAME,
)
iam.delete_role(RoleName=DB_IAM_ROLE_NAME)