# Step 01: Import necessary packages

In [1]:
%load_ext sql

In [2]:
# Import necessary packages
import boto3
import configparser
import glob
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import psycopg2

from sql_queries import *
from time import time

# Step 02: Load DB Params from dwh.cfg file
__Create a IAM user in your AWS account:__  
- Give it `AdministratorAccess`, From `Attach existing policies directly` Tab  
- Take note of the access key and secret  
- Edit the file `dwh.cfg` in the same folder as this notebook and fill

In [3]:
# Edit the file dwh.cfg
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

# Define the parameters in AWS
KEY = config.get('AWS','KEY')
SECRET = config.get('AWS','SECRET')

# Define the parameters in CLUSTER
DB_NAME = config.get("CLUSTER","DB_NAME")
DB_USER = config.get("CLUSTER","DB_USER")
DB_PASSWORD = config.get("CLUSTER","DB_PASSWORD")
DB_PORT = config.get("CLUSTER","DB_PORT")
REGION = config.get("CLUSTER","REGION")
DB_CLUSTER_TYPE = config.get("CLUSTER","DB_CLUSTER_TYPE")
DB_NUM_NODES = config.get("CLUSTER","DB_NUM_NODES")
DB_NODE_TYPE = config.get("CLUSTER","DB_NODE_TYPE")
DB_CLUSTER_IDENTIFIER = config.get("CLUSTER","DB_CLUSTER_IDENTIFIER")
DB_IAM_ROLE_NAME = config.get("CLUSTER","DB_IAM_ROLE_NAME")

# Define the parameters in IAM_ROLE
ARN = config.get("IAM_ROLE", "ARN")

(DB_USER, DB_PASSWORD, DB_NAME)

pd.DataFrame({"Param":
                  ["DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT", "ARN", "REGION", "DB_CLUSTER_TYPE", "DB_NUM_NODES", "DB_NODE_TYPE", "DB_CLUSTER_IDENTIFIER", "DB_IAM_ROLE_NAME"],
              "Value":
                  [DB_NAME, DB_USER, DB_PASSWORD, DB_PORT, ARN, REGION, DB_CLUSTER_TYPE, DB_NUM_NODES, DB_NODE_TYPE, DB_CLUSTER_IDENTIFIER, DB_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DB_NAME,dwh
1,DB_USER,dwhuser
2,DB_PASSWORD,Passw0rd
3,DB_PORT,5439
4,ARN,'arn:aws:iam::837754688468:role/dwhRole'
5,REGION,us-west-2
6,DB_CLUSTER_TYPE,multi-node
7,DB_NUM_NODES,4
8,DB_NODE_TYPE,dc2.large
9,DB_CLUSTER_IDENTIFIER,dwhCluster


## Step 03: Create a IAM Role
Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [4]:
iam = boto3.client('iam',aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET,
                     region_name=REGION
                  )

In [5]:
from botocore.exceptions import ClientError

#1.1 Create the role, 
try:
    print("1.1 Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=DB_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )    
except Exception as e:
    print(e)
    
    
print("1.2 Attaching Policy")

iam.attach_role_policy(RoleName=DB_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

print("1.3 Get the IAM role ARN")
roleArn = iam.get_role(RoleName=DB_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::837754688468:role/dwhRole


## Step 04: Create a Redshift Cluster

In [6]:
redshift = boto3.client('redshift',
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET,
                       region_name=REGION                        
                       )

In [7]:
try:
    response = redshift.create_cluster(        
        #HW
        ClusterType=DB_CLUSTER_TYPE,
        NodeType=DB_NODE_TYPE,
        NumberOfNodes=int(DB_NUM_NODES),

        #Identifiers & Credentials
        DBName=DB_NAME,
        ClusterIdentifier=DB_CLUSTER_IDENTIFIER,
        MasterUsername=DB_USER,
        MasterUserPassword=DB_PASSWORD,
        
        #Roles (for s3 access)
        IamRoles=[roleArn]  
    )
except Exception as e:
    print(e)

### Verify the status cluster
Run this block several times until the cluster status becomes Available

In [8]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DB_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-f0188288
7,NumberOfNodes,4


# Step 05: Open an incoming TCP port to access the cluster endpoint

In [9]:
ec2 = boto3.resource('ec2',
                     region_name=REGION,
                     aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET
                     )

In [10]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DB_PORT),
        ToPort=int(DB_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-be6841e4')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


# Step 06: ETL Processes  

Get the params of the created redshift cluster

In [11]:
# Create database sparkifydb and tables for this project
%run create_tables.py

In [12]:
# Execute ETL on cluster
%run etl.py

>>>> Start load_staging_tables >>>>
	staging_events_copy
	staging_songs_copy
>>>> End load_staging_tables >>>>
>>>> Start insert_tables >>>>
	user_table_insert
	song_table_insert
	artist_table_insert
	time_table_insert
	songplay_table_insert
>>>> End insert_tables >>>>


# Step 07: Validation

In [13]:
# Connect to database
DB_ENDPOINT = myClusterProps['Endpoint']['Address']
DB_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']

conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DB_ENDPOINT, DB_PORT, DB_NAME)
%sql $conn_string

'Connected: dwhuser@dwh'

In [14]:
# VALIDATION - VERIFY total of rows in staging_events
%sql select count(*) from dwh_redshift.staging_events;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
8056


In [15]:
# VALIDATION - VERIFY total of rows in staging_songs
%sql select count(*) from dwh_redshift.staging_songs;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
14896


In [16]:
# VALIDATION - VERIFY total of rows in users dimension
%sql SELECT count(*) as tt FROM dwh_redshift.users;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


tt
104


In [17]:
# VALIDATION - VERIFY total of rows in songs dimension
%sql SELECT count(*) as tt FROM dwh_redshift.songs;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


tt
14896


In [18]:
# VALIDATION - VERIFY total of rows in artists dimension
%sql SELECT count(*) as tt FROM dwh_redshift.artists;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


tt
10025


In [19]:
# VALIDATION - VERIFY total of rows in times dimension
%sql SELECT count(*) as tt FROM dwh_redshift.times;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


tt
6813


In [20]:
# VALIDATION - VERIFY total of rows in songplays table
%sql SELECT count(*) as tt FROM dwh_redshift.songplays;

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


tt
6820


In [21]:
# VALIDATION - VERIFY total of rows in songplays table
%sql SELECT * FROM dwh_redshift.songplays LIMIT 5

 * postgresql://dwhuser:***@dwhcluster.cbazftvy98uk.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
1752,2018-11-01 21:24:53.796000,8,free,,,139,"Phoenix-Mesa-Scottsdale, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"""
2095,2018-11-01 21:55:25.796000,26,free,,,169,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
1004,2018-11-02 09:04:16.796000,15,paid,,,172,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
877,2018-11-02 09:35:25.796000,15,paid,,,172,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
1028,2018-11-02 09:42:23.796000,15,paid,,,172,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""


# Clean up your resources

In [23]:
# Delete cluster after your tests
# redshift.delete_cluster(ClusterIdentifier=DB_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)