# <center>Creating Redshift Cluster using the AWS Boto3 SDK</center>

In [1]:
import pandas as pd
import boto3
import json
import configparser

In [2]:
%load_ext sql

### STEP 0: (Prerequisite) Create an IAM User with `AdministratorAccess` and Save AWS Secret and Access Key

The following two steps were completed as prerequisites before running the code to launch the Redshift cluster:

1. A **new IAM user** was created on **AWS Management Console** with an **`AdministratorAccess`** policy attached.

2. An **Access key** and **Secret access key** were created in the **AWS Management Console**, then copied and stored in the `dwh.cfg` file located in the same folder as this notebook.
```bash
KEY= <MY_AWS_KEY>
SECRET= <MY_AWS_SECRET>
```

#### Load DWH Params from a Configuration File

In [8]:
# Import the configparser module to read configuration files
config = configparser.ConfigParser()
# Read the configuration file (dwh.cfg) containing AWS and Redshift settings
config.read_file(open('dwh.cfg'))

# Retrieve AWS Access Key and Secret Access Key from the configuration file
KEY                    = config.get('AWS', 'KEY')
SECRET                 = config.get('AWS', 'SECRET')

# Retrieve Redshift cluster properties from the configuration file
DWH_CLUSTER_TYPE       = config.get("DWH", "DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH", "DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH", "DWH_NODE_TYPE")

# Retrieve Redshift database and cluster identification details
DWH_CLUSTER_IDENTIFIER = config.get("DWH", "DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH", "DWH_DB")
DWH_DB_USER            = config.get("DWH", "DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH", "DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH", "DWH_PORT")

# Retrieve the IAM role name for Redshift to access other AWS services (e.g., S3)
DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

In [9]:
# Check the retrieved key DWH parameter values
pd.DataFrame(
    [
        ['DWH_CLUSTER_TYPE', DWH_CLUSTER_TYPE],
        ['DWH_NUM_NODES', DWH_NUM_NODES],
        ['DWH_NODE_TYPE', DWH_NODE_TYPE],
        ['DWH_CLUSTER_IDENTIFIER', DWH_CLUSTER_IDENTIFIER],
        ['DWH_DB', DWH_DB],
        ['DWH_DB_USER', DWH_DB_USER],
        ['DWH_DB_PASSWORD', DWH_DB_PASSWORD],
        ['DWH_PORT', DWH_PORT],
        ['DWH_IAM_ROLE_NAME', DWH_IAM_ROLE_NAME],
    ],
    columns=['Params', 'Values']
)

Unnamed: 0,Params,Values
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


#### Create resources and clients for EC2, S3, IAM and Redshift

To interact with **EC2** and **S3**, utilize `boto3.resource`; for **IAM** and **Redshift**, use `boto3.client`. 

In [10]:
# Create an EC2 resource to interact with AWS EC2 service
ec2 = boto3.resource(
    'ec2',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# Create an S3 resource to interact with AWS S3 service
s3 = boto3.resource(
    's3',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

In [11]:
# Create an IAM client to interact with AWS IAM service
iam = boto3.client(
    'iam',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

# Create a Redshift client to interact with AWS Redshift service
redshift = boto3.client(
    'redshift',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

### STEP 1: Create IAM Role
- Create an IAM Role that enables Redshift to access S3 bucket (ReadOnly)

In [12]:
# Create a new IAM role that allows Redshift to interact with other AWS services
try:
    print('1.1 Creating a new role')
    iam.create_role(
        Path='/',
        RoleName=DWH_IAM_ROLE_NAME,
        Description='Allows Redshift cluster to call AWS services on my behalf.',
        AssumeRolePolicyDocument=json.dumps(
            {
                'Statement': [
                    {
                        'Action': 'sts:AssumeRole',
                        'Effect': 'Allow',
                        'Principal': {
                            'Service': 'redshift.amazonaws.com'
                        }
                    }
                ],
                'Version': '2012-10-17',
            }
        )
    )
except Exception as e:
    print(e)

# Attach a policy to the IAM role to allow Redshift to have 
# read-only access to S3
print("1.2 Attaching Policy")
iam.attach_role_policy(
    RoleName=DWH_IAM_ROLE_NAME,
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess',  # Attach AmazonS3ReadOnlyAccess policy
)['ResponseMetadata']['HTTPStatusCode']

# Retrieve and print the ARN of the newly created IAM role
print("1.3 Get the Amazon Resource Name (ARN) for the IAM role")
roleArn = iam.get_role(
    RoleName=DWH_IAM_ROLE_NAME
)['Role']['Arn']
print(roleArn)

1.1 Creating a new role
1.2 Attaching Policy
1.3 Get the Amazon Resource Name (ARN) for the IAM role
arn:aws:iam::980658774738:role/dwhRole


### STEP 2:  Launch Redshift Cluster

In [13]:
# Create a Redshift cluster with the specified configuration
try:
    response = redshift.create_cluster(
        # Haredware configuration
        ClusterType = DWH_CLUSTER_TYPE,
        NodeType = DWH_NODE_TYPE,
        NumberOfNodes = int(DWH_NUM_NODES),

        # Identifiers and database credentials
        DBName = DWH_DB,
        ClusterIdentifier = DWH_CLUSTER_IDENTIFIER,
        MasterUsername = DWH_DB_USER,
        MasterUserPassword = DWH_DB_PASSWORD,

        # IAM role for accessing S3
        IamRoles = [roleArn],
    )
except Exception as e:
    print(e)

#### 2.1 *Describe* the cluster to see its status

In [14]:
def prettyRedshiftProps(props):
    """Formats and displays specific Redshift cluster 
    properties in a pandas DataFrame for easier readability.
    
    Args:
    - props: dict
        A dictionary containing the Redshift cluster properties, 
        typically returned from AWS SDK (boto3). This dictionary 
        includes various details about the Redshift cluster, 
        such as cluster identifier, node type, status, username,
        and more.
        
    Returns:
    - A pandas Dataframe with the relevant Redshift cluster 
      properties, where the keys are listed in one column and
      their corresponding values in another.
    """
    # Avoid truncating long text in DataFrame output
    pd.set_option('display.max_colwidth', -1)
    
    # Define the keys to be displayed
    keysToShow = ["ClusterIdentifier", "NodeType", 
                  "ClusterStatus", "MasterUsername", 
                  "DBName", "Endpoint", "NumberOfNodes", 
                  'VpcId']
    
    # Filter the key-value pairs based on keysToShow
    x = [
        (k, v) for k, v in props.items() if k in keysToShow
    ]
    
    # Return the filtered properties as a DataFrame
    return pd.DataFrame(x, columns=["Key", "Value"])

Run this block below several times until the cluster status becomes **Available**.

In [18]:
# Retrieve detailed information about the Redshift cluster
myClusterProps = redshift.describe_clusters(
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
)['Clusters'][0]

# Display the selected Redshift cluster properties in pandas DataFrame
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.cejcerbeak3k.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-08ae7bb554e3c899d
7,NumberOfNodes,4


#### 2.2 Take note of the cluster <font color='red'> Endpoint and Role ARN </font>

<font color='red'>DO NOT RUN THIS until the cluster status becomes **Available**.</font>

In [19]:
# Extract the endpoint address of the Redshift cluster 
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']

# Extract IAM role ARN associated with the Redshift cluster
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']

# Print the Redshift cluster endpoint address
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)

# Print IAM role ARN associated with the Redshift cluster
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.cejcerbeak3k.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::980658774738:role/dwhRole


The `DWH_ENDPOINT` and `DWH_ROLE_ARN` were copied and stored in the `dwh.cfg` file located in the same folder as this notebook.
```bash
[CLUSTER]
HOST=<DWH_ENDPOINT>

[IAM_ROLE]
ARN=<DWH_ROLE_ARN>
```

### STEP 3: Open an incoming TCP port to access the cluster endpoint

In [20]:
# Authorize inbound traffic to the Redshift cluster's security group
try:
    # Retrieve the VPC associated with the Redshift cluster
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    
    # Get the default security group associated with the VPC
    defaultSg = list(vpc.security_groups.all())[0]
    # Print the details of the default security group for verification
    print(defaultSg)
    
    # Authorize inbound traffic on the specified port for the security group
    defaultSg.authorize_ingress(
        GroupName=defaultSg.group_name,
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT),
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-087225a4ebbe1e00e')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


### STEP 4: Verify the connection to the Redshift cluster

In [21]:
# Construct the connection string for PostgreSQL to 
# connect to the Redshift cluster
conn_string = 'postgresql://{}:{}@{}:{}/{}'.format(
    DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB
)
# Print the connection string
print(conn_string)

# Use the SQL magic command to connect to the Redshift cluster
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.cejcerbeak3k.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

### STEP 5: Clean up all AWS resources
* The code below was executed at the end of the project to avoid unexpected AWS usage costs.

In [22]:
# Delete the Redshift cluster with the specified identifier
redshift.delete_cluster(
    ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
    SkipFinalClusterSnapshot=True,
)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.cejcerbeak3k.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2024, 9, 9, 2, 44, 31, 934000, tzinfo=tzlocal()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-087225a4ebbe1e00e',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-08ae7bb554e3c899d',
  'AvailabilityZone': 'us-west-2a',
  'PreferredMaintenanceWindow': 'sat:10:30-sat:11:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible': True,
  'Encrypted': False,
  'Tags': [],
  'EnhancedVpcRoutin

In [27]:
# Retrieve and display the properties of the deleting Redshift cluster
try:
    # Describe the properties of the Redshift cluster
    myClusterProps = redshift.describe_clusters(
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER
    )['Clusters'][0]
    
    # Display the Redshift cluster properties in a pandas DataFrame
    prettyRedshiftProps(myClusterProps)
except Exception as e:
    print(e)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,{'Port': 5439}
6,VpcId,vpc-08ae7bb554e3c899d
7,NumberOfNodes,4


In [28]:
# Detach the AmazonS3ReadOnlyAccess policy from the specified IAM role
iam.detach_role_policy(
    RoleName=DWH_IAM_ROLE_NAME,
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess",
)
# Delete the IAM role after detaching the policy
iam.delete_role(RoleName=DWH_IAM_ROLE_NAME)

{'ResponseMetadata': {'RequestId': 'f9052e9f-f43d-46a1-9189-b1f6cb2927c9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 09 Sep 2024 02:46:30 GMT',
   'x-amzn-requestid': 'f9052e9f-f43d-46a1-9189-b1f6cb2927c9',
   'content-type': 'text/xml',
   'content-length': '200'},
  'RetryAttempts': 0}}