In [1]:
# Load the `sql` IPython extension; the %sql / %%sql magics used in later cells depend on it.
%load_ext sql

In [3]:
import configparser
from time import time

import boto3
import matplotlib.pyplot as plt
import pandas as pd

## Get the parameters for the created Redshift cluster
We will need the Redshift cluster endpoint and the ARN of the IAM role that allows Redshift to read from S3.

In [5]:
# Read the AWS credentials and the warehouse connection settings from dwh.cfg.
config = configparser.ConfigParser()
# Open the file in a context manager so the handle is closed after parsing
# (a bare open() passed to read_file() is never closed explicitly).
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# [AWS] section: access key pair used by the boto3 cell further down.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# [DWH] section: database name, user, password and port used to build the connection URL.
DWH_DB = config.get('DWH', 'DWH_DB')
DWH_DB_USER = config.get('DWH', 'DWH_DB_USER')
DWH_DB_PASSWORD = config.get('DWH', 'DWH_DB_PASSWORD')
DWH_PORT = config.get('DWH', 'DWH_PORT')

In [6]:
# Host name (endpoint) of the Redshift cluster this notebook connects to.
DWH_ENDPOINT = 'dwhcluster.cva7bq0wyqdz.us-west-2.redshift.amazonaws.com'
# ARN of the IAM role that lets Redshift read from S3; passed to the COPY commands below.
DWH_ROLE_ARN = 'arn:aws:iam::222619198456:role/dwhRole'

## Connect to the Redshift Cluster

In [7]:
conn_string = "postgresql://{}:{}@{}:{}/{}".format(
    DWH_DB_USER,
    DWH_DB_PASSWORD,
    DWH_ENDPOINT,
    DWH_PORT,
    DWH_DB
)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:***@dwhcluster.cva7bq0wyqdz.us-west-2.redshift.amazonaws.com:5439/dwh


In [9]:
# boto3 is imported in the import cell at the top of the notebook.
# Create an S3 resource handle with the key pair read from dwh.cfg.
s3 = boto3.resource(
    's3',
    region_name='us-west-2',
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET
)

# List every object under the "tickets" prefix of the sample bucket to see
# which files (single full file vs. split parts) are available to load.
sampleDbBucket = s3.Bucket('udacity-labs')
for obj in sampleDbBucket.objects.filter(Prefix='tickets'):
    print(obj)

s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/full/full.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00000-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00001-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00002-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00003-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00004-d33afb94-b8af-407d-abd5-59c0ee8f5ee8-c000.csv.gz')
s3.ObjectSummary(bucket_name='udacity-labs', key='tickets/split/part-00005-d33afb94-b8af-407d-abd5-

## Create Tables

In [14]:
%%sql
-- Recreate the table that receives the partitioned (split) ticket files.
-- Dropping first makes the cell safe to re-run.
DROP TABLE IF EXISTS "sporting_event_ticket";
-- Table name quoted here as well, matching the DROP above and the sporting_event_ticket_full cell.
CREATE TABLE "sporting_event_ticket"(
    "id" DOUBLE PRECISION DEFAULT NEXTVAL('sporting_event_ticket_seq') NOT NULL,
    "sporting_event_id" DOUBLE PRECISION NOT NULL,
    "sport_location_id" DOUBLE PRECISION NOT NULL,
    "seat_level" NUMERIC(1,0) NOT NULL,
    "seat_section" CHARACTER VARYING(15) NOT NULL,
    "seat_row" CHARACTER VARYING(10) NOT NULL,
    "seat" CHARACTER VARYING(10) NOT NULL,
    -- the only nullable column
    "ticketholder_id" DOUBLE PRECISION,
    "ticket_price" NUMERIC(8,2) NOT NULL
);


 * postgresql://dwhuser:***@dwhcluster.cva7bq0wyqdz.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.


[]

## Load Partitioned data into the cluster

In [15]:
%%time
query = """
copy sporting_event_ticket from 's3://udacity-labs/tickets/split/part'
credentials 'aws_iam_role={}'
gzip delimiter ';' compupdate off region 'us-west-2'
""".format(DWH_ROLE_ARN)

%sql $query

 * postgresql://dwhuser:***@dwhcluster.cva7bq0wyqdz.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
CPU times: user 7.65 ms, sys: 0 ns, total: 7.65 ms
Wall time: 13.4 s


[]

## Create tables for the non-partitioned data

In [17]:
%%sql 
-- Recreate the table that receives the single non-partitioned ticket file.
-- Same column layout as sporting_event_ticket, including the id default drawn
-- from sporting_event_ticket_seq. Dropping first makes the cell safe to re-run.
DROP TABLE IF EXISTS "sporting_event_ticket_full";
CREATE TABLE "sporting_event_ticket_full"(
    "id" DOUBLE PRECISION DEFAULT NEXTVAL('sporting_event_ticket_seq') NOT NULL,
    "sporting_event_id" DOUBLE PRECISION NOT NULL,
    "sport_location_id" DOUBLE PRECISION NOT NULL,
    "seat_level" NUMERIC(1,0) NOT NULL,
    "seat_section" CHARACTER VARYING(15) NOT NULL,
    "seat_row" CHARACTER VARYING(10) NOT NULL,
    "seat" CHARACTER VARYING(10) NOT NULL,
    -- the only nullable column
    "ticketholder_id" DOUBLE PRECISION,
    "ticket_price" NUMERIC(8,2) NOT NULL
);

 * postgresql://dwhuser:***@dwhcluster.cva7bq0wyqdz.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.


[]

## Load the non-partitioned data into the cluster

In [19]:
%%time
query = """
copy sporting_event_ticket from 's3://udacity-labs/tickets/full/full.csv.gz'
credentials 'aws_iam_role={}'
gzip delimiter ';' compupdate off region 'us-west-2'
""".format(DWH_ROLE_ARN)

%sql $query

 * postgresql://dwhuser:***@dwhcluster.cva7bq0wyqdz.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
CPU times: user 3.44 ms, sys: 3.92 ms, total: 7.35 ms
Wall time: 22.2 s


[]

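Note: Loading the single non-partitioned file is slower than loading the partitioned data (22.2 s vs. 13.4 s wall time above), because Redshift can load the split files in parallel.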