In [1]:
# Load the `sql` IPython extension; later cells rely on its %sql / %%sql magics.
%load_ext sql

## AWS CONFIGURATION

In [2]:
import configparser
from urllib.parse import quote_plus

import boto3

In [3]:
# Read AWS, IAM, cluster and S3 settings from the local dwh.cfg file.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed as soon as parsing is done
# (the bare open() previously left it open until garbage collection).
with open('dwh.cfg') as cfg_file:
    config.read_file(cfg_file)

# AWS access credentials.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# ARN of the IAM role configured for the warehouse.
DWH_ROLE_ARN = config.get("IAM_ROLE", "ARN")

# Cluster connection settings. configparser returns every value as a string,
# so DWH_DB_PORT is a str, not an int.
DWH_DB_USER = config.get("CLUSTER", "DB_USER")
DWH_DB_PASSWORD = config.get("CLUSTER", "DB_PASSWORD")
DWH_ENDPOINT = config.get("CLUSTER", "HOST")
DWH_DB_NAME = config.get("CLUSTER", "DB_NAME")
DWH_DB_PORT = config.get("CLUSTER", "DB_PORT")

# S3 locations of the source data sets.
LOG_DATA = config.get("S3", "LOG_DATA")
LOG_JSONPATH = config.get("S3", "LOG_JSONPATH")
SONG_DATA = config.get("S3", "SONG_DATA")

In [4]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_DB_PORT, DWH_DB_NAME)
%sql $conn_string

'Connected: awsuser@dev'

### Check out the sample data sources on S3

In [5]:
# S3 resource handle for us-west-2, authenticated with KEY / SECRET from the config cell.
s3 = boto3.resource(
    "s3",
    region_name="us-west-2",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

sampleDbBucket = s3.Bucket("udacity-dend")

# Print one ObjectSummary per object whose key starts with "log_data".
log_data_objects = sampleDbBucket.objects.filter(Prefix="log_data")
for obj in log_data_objects:
    print(obj)

s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-04-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-05-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-06-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-07-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-08-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-09-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-10-events.json')
s3.ObjectSummary(b

## ETL 

In [7]:
# Run create_tables.py with python3 in a separate process via the IPython shell escape.
# NOTE(review): presumably (re)creates the tables that the following
# information_schema query lists -- confirm against the script itself.
!python3 create_tables.py

In [8]:
%%sql
/* List every table in the public schema, sorted by name. */
SELECT *
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_name

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
7 rows affected.


table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_name
dev,public,artists,BASE TABLE,,,,,
dev,public,songplays,BASE TABLE,,,,,
dev,public,songs,BASE TABLE,,,,,
dev,public,staging_events,BASE TABLE,,,,,
dev,public,staging_songs,BASE TABLE,,,,,
dev,public,time,BASE TABLE,,,,,
dev,public,users,BASE TABLE,,,,,


In [9]:
# Run etl.py with python3 in a separate process via the IPython shell escape.
# NOTE(review): presumably loads the staging tables and then fills the
# remaining tables -- confirm against the script itself.
!python3 etl.py

In [10]:
%%sql

/* Row count of each of the seven tables, one result row per table.
   UNION ALL replaces UNION because the literal table_name labels already make
   every row distinct, so the duplicate-elimination pass was wasted work.
   ORDER BY gives the summary a stable order between runs. */
SELECT 'staging_events' AS table_name, COUNT(*) AS record_count FROM staging_events
UNION ALL
SELECT 'staging_songs' AS table_name, COUNT(*) AS record_count FROM staging_songs
UNION ALL
SELECT 'songplays' AS table_name, COUNT(*) AS record_count FROM songplays
UNION ALL
SELECT 'users' AS table_name, COUNT(*) AS record_count FROM users
UNION ALL
SELECT 'songs' AS table_name, COUNT(*) AS record_count FROM songs
UNION ALL
SELECT 'artists' AS table_name, COUNT(*) AS record_count FROM artists
UNION ALL
SELECT 'time' AS table_name, COUNT(*) AS record_count FROM time
ORDER BY table_name



 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
7 rows affected.


table_name,record_count
staging_songs,14896
users,6820
staging_events,8056
songplays,333
time,8023
songs,14896
artists,14896


In [11]:
%%sql

/* Preview five rows of staging_events.
   There is no ORDER BY, so which five rows come back is not guaranteed. */
SELECT *
FROM staging_events
LIMIT 5;

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
5 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919166796,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",39
,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1540344794796,139,,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36""",8
Des'ree,Logged In,Kaylee,F,1,Summers,246.0,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36""",8
,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1540344794796,139,,200,1541106132796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36""",8
Mr Oizo,Logged In,Kaylee,F,3,Summers,144.0,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36""",8


In [12]:
%%sql

/* Preview five rows of staging_songs.
   There is no ORDER BY, so which five rows come back is not guaranteed. */
SELECT *
FROM staging_songs
LIMIT 5;

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
5 rows affected.


artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
AR73AIO1187B9AD57B,37.0,"San Francisco, CA",-122.0,Western Addiction,118,1,SOQPWCR12A6D4FB2A3,A Poor Recipe For Civic Cohesion,2005
ARC1IHZ1187FB4E920,,,,Jamie Cullum,246,1,SOXZYWX12A6310ED0C,It's About Time,0
ARGE7G11187FB37E05,,"Brooklyn, NY",,Cyndi Lauper,240,1,SONRWUU12AF72A4283,Into The Nightlife,2008
ARBZIN01187FB362CC,1.0,27,103.0,Paris Hilton,192,1,SOERIDA12A6D4F8506,I Want You (Album Version),2006
ARTC1LV1187B9A4858,51.0,"Goldsmith's College, Lewisham, Lo",0.0,The Bonzo Dog Band,301,1,SOAFBCP12A8C13CC7D,King Of Scurf (2007 Digital Remaster),1972


In [13]:
%%sql

/* Preview five rows of songplays.
   There is no ORDER BY, so which five rows come back is not guaranteed. */
SELECT *
FROM songplays
LIMIT 5;

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,2018-11-24 12:43:00,73,paid,SONQBUB12A6D4F8ED0,ARFCUN31187B9AD578,692,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
6,2018-11-26 08:33:59,44,paid,SOVWWJW12A670206BE,AR3ZL6A1187B995B37,781,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
8,2018-11-05 17:49:42,73,paid,SOHDWWH12A6D4F7F6A,ARC0IOF1187FB3F6E6,255,"Tampa-St. Petersburg-Clearwater, FL","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
12,2018-11-23 15:29:23,53,free,SOARUPP12AB01842E0,ARD46C811C8A414F3F,860,"Klamath Falls, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36"""
14,2018-11-14 15:24:12,80,paid,SOARUPP12AB01842E0,ARD46C811C8A414F3F,574,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""


In [14]:
%%sql

/* Preview five rows of users.
   There is no ORDER BY, so which five rows come back is not guaranteed.
   NOTE(review) - the saved output of this cell lists user_id 101 twice, which
   suggests the users load does not de-duplicate; confirm in the ETL queries. */
SELECT *
FROM users
LIMIT 5;

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
5 rows affected.


user_id,first_name,last_name,gender,level
10,Sylvie,Cruz,F,free
101,Jayden,Fox,M,free
62,Connar,Moreno,M,free
101,Jayden,Fox,M,free
95,Sara,Johnson,F,paid


In [15]:
%%sql

/* Preview five rows of songs.
   There is no ORDER BY, so which five rows come back is not guaranteed.
   NOTE(review) - the saved output shows a column named durataion, likely a
   typo for duration in the table definition; confirm in the DDL. */
SELECT *
FROM songs
LIMIT 5;

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
5 rows affected.


song_id,title,artist_id,year,durataion
SOXZYWX12A6310ED0C,It's About Time,ARC1IHZ1187FB4E920,0,246
SONRWUU12AF72A4283,Into The Nightlife,ARGE7G11187FB37E05,2008,240
SOAFBCP12A8C13CC7D,King Of Scurf (2007 Digital Remaster),ARTC1LV1187B9A4858,1972,301
SOKPKMV12A8C14125E,Catwalk (Black Ink Mix),AR2L9A61187B9ADDBC,1995,492
SOMFRKT12A8C146C67,Without You,ARQVORN11F50C4EFEC,0,165


In [16]:
%%sql

/* Preview five rows of time.
   There is no ORDER BY, so which five rows come back is not guaranteed. */
SELECT *
FROM time
LIMIT 5;

 * postgresql://awsuser:***@redshift-cluster-1.csb0zbeiki8q.us-west-2.redshift.amazonaws.com:5439/dev
5 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-01 20:57:10,20,1,44,11,2018,4
2018-11-01 21:01:46,21,1,44,11,2018,4
2018-11-01 21:05:52,21,1,44,11,2018,4
2018-11-01 21:11:13,21,1,44,11,2018,4
2018-11-01 21:55:25,21,1,44,11,2018,4
