# **AWS DATA WAREHOUSE INTERFACE**

User will run code cells in this notebook to set the state of the data warehouse
- PREREQUISITE: IMPORT LIBRARIES
- COLLECT CLUSTER CONFIGURATION
- CONFIGURE ROLE AND POLICY
- CREATE REDSHIFT CLUSTER
- CONFIGURE REDSHIFT CLUSTER
- GET CLUSTER DESCRIPTION
- DELETE REDSHIFT CLUSTER

### **PREREQUISITE: IMPORT LIBRARIES**

In [1]:
import boto3
import configparser
import json
import pandas as pd

Reusable Functions:

In [2]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

### **COLLECT CLUSTER CONFIGURATION**

In [3]:
config = configparser.ConfigParser()
config.read('dwh.cfg')

REGION = config.get('AWS', 'REGION')
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')
CLUSTER_TYPE = config.get('REDSHIFT', 'DWH_CLUSTER_TYPE')
NUM_NODES = config.get('REDSHIFT', 'DWH_NUM_NODES')
NODE_TYPE = config.get('REDSHIFT', 'DWH_NODE_TYPE')
CLUSTER_ID = config.get('REDSHIFT', 'DWH_CLUSTER_IDENTIFIER')
DB_NAME = config.get('REDSHIFT', 'DWH_DB')
USER = config.get('REDSHIFT', 'DWH_DB_USER')
PW = config.get('REDSHIFT', 'DWH_DB_PASSWORD')
PORT = config.get('REDSHIFT', 'DWH_PORT')
IAM_ROLE = config.get('REDSHIFT', 'DWH_IAM_ROLE_NAME')
IAM_ROLE_ARN = config.get('IAM_ROLE', 'ARN')

### **CONFIGURE ROLE AND POLICY**

In [4]:
iam = boto3.client(
    'iam',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
    region_name=REGION
)
print(KEY)
print(SECRET)
print(REGION)

try:
    print("Creating a new IAM Role") 
    dwhRole = iam.create_role(
        Path='/',
        RoleName=IAM_ROLE,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument=json.dumps(
            {
                'Statement': [
                    {
                        'Action': 'sts:AssumeRole',
                        'Effect': 'Allow',
                        'Principal': 
                        {
                            'Service': 'redshift.amazonaws.com'
                        }
                    }
                ],
                'Version': '2012-10-17'
            }
        )
    )    
except Exception as e:
    print(e)

print("Attaching policy to new role")
iam.attach_role_policy(
    RoleName=IAM_ROLE,
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
)['ResponseMetadata']['HTTPStatusCode']

roleArn = iam.get_role(RoleName=IAM_ROLE)['Role']['Arn']

AKIAZXRKYZA6RD2HJ7VZ
0dn1zGU4m90BG27+GLC6QPXpYYJmfJBp2XfsulzX
us-west-2
Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name dwhRole already exists.
Attaching policy to new role


### **CREATE REDSHIFT CLUSTER**

In [5]:
redshift_client = boto3.client(
    'redshift',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

try:
    redshift_client.create_cluster(
        ClusterType=CLUSTER_TYPE,
        NodeType=NODE_TYPE,
        NumberOfNodes=int(NUM_NODES),
        DBName=DB_NAME,
        ClusterIdentifier=CLUSTER_ID,
        MasterUsername=USER,
        MasterUserPassword=PW,
        IamRoles=[roleArn]
    )
    print(redshift_client.describe_clusters(ClusterIdentifier=CLUSTER_ID)['Clusters'][0])
except Exception as error:
    print(error)

{'ClusterIdentifier': 'dwhcluster', 'NodeType': 'dc2.large', 'ClusterStatus': 'creating', 'ClusterAvailabilityStatus': 'Modifying', 'MasterUsername': 'dwhuser', 'DBName': 'dwh', 'AutomatedSnapshotRetentionPeriod': 1, 'ManualSnapshotRetentionPeriod': -1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-088ecfe503cef1bfa', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-0531d48121ad5c297', 'PreferredMaintenanceWindow': 'mon:09:00-mon:09:30', 'PendingModifiedValues': {'MasterUserPassword': '****'}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNodes': 4, 'PubliclyAccessible': True, 'Encrypted': False, 'ClusterNodes': [], 'Tags': [], 'EnhancedVpcRouting': False, 'IamRoles': [{'IamRoleArn': 'arn:aws:iam::669030860861:role/dwhRole', 'ApplyStatus': 'adding'}], 'MaintenanceTrackName': 'current', 'DeferredMaintenanc

### **GET CLUSTER DESCRIPTION**

This will be called repeatedly in this notebook. It should be idempotent.

In [9]:
myClusterProps = redshift_client.describe_clusters(ClusterIdentifier=CLUSTER_ID)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwh
5,Endpoint,"{'Address': 'dwhcluster.ctr1chjrr2qy.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0531d48121ad5c297
7,NumberOfNodes,4


In [10]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.ctr1chjrr2qy.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::669030860861:role/dwhRole


In [11]:
config.set('REDSHIFT', 'DWH_ENDPOINT', DWH_ENDPOINT)
config.set('IAM_ROLE', 'ARN', DWH_ROLE_ARN)
print(config.get('IAM_ROLE', 'ARN'))
with open('dwh.cfg', 'w') as configfile:
    config.write(configfile)

arn:aws:iam::669030860861:role/dwhRole


### **CONFIGURE REDSHIFT CLUSTER**

In [12]:
ec2 = boto3.resource(
    'ec2',
    region_name=REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

cluster_props = redshift_client.describe_clusters(ClusterIdentifier=CLUSTER_ID)['Clusters'][0]
vpc = ec2.Vpc(id=cluster_props['VpcId'])
defaultSg = list(vpc.security_groups.all())[0]
defaultSg.authorize_ingress(
    GroupName=defaultSg.group_name,
    CidrIp='0.0.0.0/0',
    IpProtocol='TCP',
    FromPort=int(PORT),
    ToPort=int(PORT)
)

ClientError: An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists

### **INSPECT REDSHIFT DATABASE**

In [13]:
import psycopg2

HOST = redshift_client.describe_clusters(ClusterIdentifier=CLUSTER_ID)[
        'Clusters'][0]['Endpoint']['Address']
conn = psycopg2.connect(f"host={HOST} dbname={DB_NAME} user={USER} password={PW} port={PORT}")
cur = conn.cursor()
q = """
SELECT DISTINCT tablename
FROM PG_TABLE_DEF
WHERE schemaname='public';
"""
cur.execute(q)
print(cur.fetchall())
#pd.DataFrame(data = cur.fetchall)
conn.close()

[]


In [14]:
%load_ext sql

In [15]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(USER, PW, DWH_ENDPOINT, PORT, DB_NAME)
%sql $conn_string

In [16]:
%sql SELECT * FROM staging_events LIMIT 5

 * postgresql://dwhuser:***@dwhcluster.ctr1chjrr2qy.us-west-2.redshift.amazonaws.com:5439/dwh
(psycopg2.errors.UndefinedTable) relation "staging_events" does not exist

[SQL: SELECT * FROM staging_events LIMIT 5]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [None]:
%sql SELECT * FROM staging_songs LIMIT 5

In [None]:
%sql SELECT * FROM songs LIMIT 5

In [None]:
%sql SELECT * FROM artists LIMIT 5

In [None]:
%sql SELECT * FROM users LIMIT 5

In [None]:
%sql SELECT * FROM time LIMIT 5

In [None]:
%sql SELECT * FROM songplays LIMIT 5

In [None]:
data = %sql SELECT * FROM staging_events WHERE userId = 69 LIMIT 1

In [None]:
df = pd.DataFrame(data, columns=[
        'artist', 'auth', 'firstName', 'gender', 'itemInSession', 'lastName',
        'length', 'location', 'method', 'page', 'registration',
        'sessionId', 'song', 'status', 'ts', 'userAgent', 'userId', 'level'
    ]
)
print(df)

In [None]:
import config

conn = psycopg2.connect(f"""
        host={config.ENDPOINT}
        dbname={config.DB_NAME}
        user={config.USER}
        password={config.PW}
        port={config.PORT}
    """)
cur = conn.cursor()
cur.execute('SELECT * FROM staging_events;')
staging_events_data = cur.fetchall()
staging_events_df = pd.DataFrame(
    staging_events_data,
    columns=[
        'artist', 'auth', 'firstName', 'gender', 'itemInSession', 'lastName',
        'length', 'level', 'location', 'method', 'page', 'registration',
        'sessionId', 'song', 'status', 'ts', 'userAgent', 'userId'
    ]
)
for index, row in staging_events_df.iterrows():
    if row['firstName'] == "Anabelle":
        print(row)
        print(row['userId'])
        check = None
        if type(row['userId']) == float:
            check = int(row['userId'])
        if type(row['userId']) == str:
            check = int(float(row['userId']))
        print(check)
        break

### **DELETE REDSHIFT CLUSTER**

The next cell will delete the cluster. Make sure to run the cell to get cluster description after deleting the cluster to confirm its deletion.

In [None]:
try:
    redshift_client.delete_cluster(ClusterIdentifier=CLUSTER_ID, SkipFinalClusterSnapshot=True)
except Exception as error:
    print(error)

In [None]:
myClusterProps = redshift_client.describe_clusters(ClusterIdentifier=CLUSTER_ID)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Clean up resources

In [None]:
iam.detach_role_policy(RoleName=IAM_ROLE, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=IAM_ROLE)