Creating, Inserting Data and Deleting Clusters in Redshift in AWS

In [2]:
#%pip install boto3 --user

Note: you may need to restart the kernel to use updated packages.




In [3]:
import boto3
import pandas as pd
import psycopg2 
import json
import os

In [4]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('cluster.config'))

In [5]:
KEY                     = os.environ.get('AWS_KEY')
SECRET                  = os.environ.get('AWS_SECRET')

DWH_CLUSTER_TYPE        = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES           = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE           = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER  = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                  = config.get("DWH","DWH_DB")
DWH_DB_USER             = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD         = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT                = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME       = config.get("DWH","DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

('awsuser', 'I3t*hjom', 'myfirstdb')

In [6]:
pd.DataFrame({"Param":
                    ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                    [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
            })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,single-node
1,DWH_NUM_NODES,1
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,my-first-redshift
4,DWH_DB,myfirstdb
5,DWH_DB_USER,awsuser
6,DWH_DB_PASSWORD,I3t*hjom
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,redshift-s3-access


In [54]:
ec2 = boto3.resource('ec2',
                        region_name="sa-east-1",
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                        )

In [55]:
s3 = boto3.resource('s3',
                        region_name="sa-east-1",
                        aws_access_key_id=KEY,
                        aws_secret_access_key=SECRET
                        )

iam = boto3.client('iam',
                    region_name="sa-east-1",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

redshift = boto3.client('redshift',
                    region_name="sa-east-1",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
                    )

In [75]:
bucket=s3.Bucket("hfelipini-test-bucket")
log_data_files = [filename.key for filename in bucket.objects.filter(Prefix='')]
log_data_files

['Amazon_logo.svg.png',
 'Partida_do_Motor.m4a',
 'allevents_pipe.txt',
 'allusers_pipe.txt',
 'category_pipe.txt',
 'date2008_pipe.txt',
 'listings_pipe.txt',
 'sales_tab.txt',
 'venue_pipe.txt']

In [59]:
roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']

In [60]:
roleArn

'arn:aws:iam::732284161643:role/redshift-s3-access'

In [61]:
#Create Redshift Cluster using the Free Trial dc2.large with 1 node
try:
    response = redshift.create_cluster(
        ClusterType=DWH_CLUSTER_TYPE,
        NodeType=DWH_NODE_TYPE,

        #Identifiers & Credentials
        DBName=DWH_DB,
        ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
        MasterUsername=DWH_DB_USER,
        MasterUserPassword=DWH_DB_PASSWORD,

        #Roles (for s3 access)
        IamRoles=[roleArn]

    )
except Exception as e:
    print(e)

In [64]:
redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]

{'ClusterIdentifier': 'my-first-redshift',
 'NodeType': 'dc2.large',
 'ClusterStatus': 'available',
 'ClusterAvailabilityStatus': 'Available',
 'MasterUsername': 'awsuser',
 'DBName': 'myfirstdb',
 'Endpoint': {'Address': 'my-first-redshift.cizg6fhqbdnd.sa-east-1.redshift.amazonaws.com',
  'Port': 5439},
 'ClusterCreateTime': datetime.datetime(2022, 12, 4, 20, 19, 24, 985000, tzinfo=tzutc()),
 'AutomatedSnapshotRetentionPeriod': 1,
 'ManualSnapshotRetentionPeriod': -1,
 'ClusterSecurityGroups': [],
 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-90bc42e1',
   'Status': 'active'}],
 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
   'ParameterApplyStatus': 'in-sync'}],
 'ClusterSubnetGroupName': 'default',
 'VpcId': 'vpc-bae6f4dd',
 'AvailabilityZone': 'sa-east-1c',
 'PreferredMaintenanceWindow': 'fri:05:00-fri:05:30',
 'PendingModifiedValues': {},
 'ClusterVersion': '1.0',
 'AllowVersionUpgrade': True,
 'NumberOfNodes': 1,
 'PubliclyAccessible': True,
 'Encr

In [68]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', None)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "VpcId"]
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,my-first-redshift
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,awsuser
4,DBName,myfirstdb
5,Endpoint,"{'Address': 'my-first-redshift.cizg6fhqbdnd.sa-east-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-bae6f4dd


In [69]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
DB_NAME      = myClusterProps['DBName']
DB_USER      = myClusterProps['MasterUsername']
(DWH_ENDPOINT, DWH_ROLE_ARN, DB_NAME, DB_USER)

In [72]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print(defaultSg)

    defaultSg.authorize_ingress(
            GroupName=defaultSg.group_name,
            CidrIp='0.0.0.0/0',
            IpProtocol='TCP',
            FromPort=int(DWH_PORT),
            ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

ec2.SecurityGroup(id='sg-08e5c06b1cf9d87f5')


In [91]:
try:
    conn = psycopg2.connect(host=DWH_ENDPOINT, dbname=DB_NAME, user=DB_USER, password=DWH_DB_PASSWORD, port=5439)
except psycopg2.Error as e:
    print("Error: Could not make connection to the Postgres database")
    print(e)

conn.set_session(autocommit=True)

In [92]:
try:
    cur = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not get cursos to the Database")
    print(e)

In [93]:
try:
    cur.execute("""create table users(
        userid integer not null distkey sortkey,
        username char(8),
        firstname varchar(30),
        lastname varchar(30),
        city varchar(30),
        state char(2),
        email varchar(100),
        phone char(14),
        likesports boolean,
        liketheatre boolean,
        likeconcerts boolean,
        likejazz boolean,
        likeclassical boolean,
        likeopera boolean,
        likerock boolean,
        likevegas boolean,
        likebroadway boolean,
        likemusicals boolean);
    """)
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)

Error: Issue creating table
Relation "users" already exists



In [94]:
try:
    cur.execute("""create table venue(
        venueid smallint not null distkey sortkey,
        venuename varchar(100),
        venuecity varchar(30),
        venuestate char(2),
        venueseats integer);
    """)
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)

Error: Issue creating table
Relation "venue" already exists



In [100]:
try:
    cur.execute("""
        create table category(
        catid smallint not null distkey sortkey,
        catgroup varchar(10),
        catname varchar(50),
        catdesc varchar(50));

        create table date(
        dateid smallint not null distkey sortkey,
        caldate date not null,
        day character(3) not null,
        week smallint not null,
        month character(5) not null,
        qtr character(5) not null,
        year smallint not null,
        holiday boolean default('N'));

        create table event(
        eventid integer not null distkey,
        venueid smallint not null,
        catid smallint not null,
        dateid smallint not null,
        eventname varchar(200),
        starttime timestamp);

        create table listing(
        listid integer not null distkey,
        sellerid integer not null,
        eventid integer not null,
        dateid smallint not null sortkey,
        numtickets smallint not null,
        priceperticket decimal(8,2),
        totalprice decimal(8,2),
        listtime timestamp);
    """)
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)


Error: Issue creating table
Relation "category" already exists



In [102]:
try:
    cur.execute("""
    COPY users FROM 's3://hfelipini-test-bucket/allusers_pipe.txt'
    CREDENTIALS 'aws_iam_role=arn:aws:iam::732284161643:role/redshift-s3-access'
    DELIMITER '|'
    REGION 'sa-east-1'
    """)
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)

In [110]:
try:
    cur.execute("""
        SELECT * FROM USERS;
    """)
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)

In [111]:
row = cur.fetchone()
print(row)

(1, 'JSG99FHE', 'Rafael', 'Taylor', 'Kent', 'WA', 'Etiam.laoreet.libero@sodalesMaurisblandit.edu', '(664) 602-4412', True, True, None, False, True, None, None, True, False, True)


In [112]:
try:
    conn.close()
except psycopg2.Error as e:
    print(e)

In [113]:
redshift.delete_cluster(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER, SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'my-first-redshift',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'awsuser',
  'DBName': 'myfirstdb',
  'Endpoint': {'Address': 'my-first-redshift.cizg6fhqbdnd.sa-east-1.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2022, 12, 4, 20, 19, 24, 985000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-90bc42e1',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-bae6f4dd',
  'AvailabilityZone': 'sa-east-1c',
  'PreferredMaintenanceWindow': 'fri:05:00-fri:05:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 1,
 