In [None]:
import pandas as pd
import boto3
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import configparser

# Load the cluster connection settings from the local dwh.cfg file.
# The context manager closes the file handle as soon as parsing finishes;
# open() still raises if the file is missing, so a bad path fails loudly.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# Connection parameters, all read from the [CLUSTER] section.
# configparser returns every value as a string (including the port).
DWH_DB                 = config.get("CLUSTER","DB_NAME")
DWH_DB_USER            = config.get("CLUSTER","DB_USER")
DWH_DB_PASSWORD        = config.get("CLUSTER","DB_PASSWORD")
DWH_PORT               = config.get("CLUSTER","DB_PORT")
DWH_ENDPOINT           = config.get("CLUSTER","HOST")

In [None]:
%load_ext sql
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

## Top users by level

In [None]:
%%sql
/* Peek at two distinct song titles from the staging_songs table. */
SELECT DISTINCT
    title
FROM staging_songs
LIMIT 2;

In [None]:
%%sql
/* Show a single raw row from staging_events to inspect its columns. */
SELECT
    *
FROM staging_events
LIMIT 1;

In [None]:
%%sql
/* Songplay count per user and level, most active first.
   user_id is part of the grouping so that two different users who happen to
   share first name, last name and level are not merged into one row.
   The group columns are table-qualified so they always refer to the users
   table rather than to a same-named column or alias. */
SELECT
    users.user_first_name AS first_name,
    users.user_last_name AS last_name,
    users.user_level AS level,
    COUNT(songplays.song_id) AS no_of_songplays
FROM users
JOIN songplays ON (songplays.user_id = users.user_id)
GROUP BY
    users.user_id,
    users.user_first_name,
    users.user_last_name,
    users.user_level
ORDER BY no_of_songplays DESC

## Most popular artists

In [None]:
%%sql
/* Songplay count per artist name, most played first. */
SELECT
    a.artist_name AS artist_name,
    COUNT(sp.song_id) AS no_of_songplays
FROM songplays AS sp
INNER JOIN artists AS a
    ON a.artist_id = sp.artist_id
GROUP BY a.artist_name
ORDER BY no_of_songplays DESC

## Most popular songs

In [None]:
%%sql
/* Songplay count per song title, most played first.
   The join matches songplays to songs on the song_id key; comparing a
   title against an id would never line the two tables up. */
SELECT
    songs.song_title AS song_title,
    COUNT(songplays.song_id) AS no_of_songplays
FROM songs
JOIN songplays ON (songplays.song_id = songs.song_id)
GROUP BY songs.song_title
ORDER BY no_of_songplays DESC

## Highest server load times

In [None]:
%%sql high_server_loads <<
/* Songplays per hour of day, listed in hour order.
   The leading "high_server_loads <<" is the ipython-sql way to store the
   result set in a Python variable; a trailing ">> name" is not magic
   syntax and would be sent to the database as part of the SQL text. */
SELECT
    time.hour AS hour,
    COUNT(songplays.song_id) AS no_of_songplays
FROM time
JOIN songplays ON (songplays.start_time = time.start_time)
GROUP BY time.hour
ORDER BY hour ASC

In [None]:
# Draw a bar chart from the hourly songplay counts.
# Expects `high_server_loads` to hold the result of the hourly query cell;
# presumably an ipython-sql ResultSet, whose .bar() plots the numeric column.
high_server_loads.bar()

## Most popular artist by user gender and level

In [None]:
%%sql gender_artist_distribution <<
/* Songplay count per (user gender, user level, artist name), highest first.
   The leading "gender_artist_distribution <<" stores the result set in a
   Python variable for the plotting cell; a trailing ">> name" is not
   ipython-sql syntax. Explicit JOINs replace the comma-separated table list
   so each join condition sits next to the table it attaches. */
SELECT
    users.user_gender AS gender,
    users.user_level AS user_level,
    artists.artist_name AS artist_name,
    COUNT(songplays.song_id) AS no_of_songplays
FROM songplays
JOIN artists ON (songplays.artist_id = artists.artist_id)
JOIN users ON (songplays.user_id = users.user_id)
GROUP BY
    users.user_gender,
    users.user_level,
    artists.artist_name
ORDER BY
    no_of_songplays DESC

In [None]:
# Draw a bar chart from the gender / level / artist songplay counts.
# Expects `gender_artist_distribution` to hold the result of the query cell;
# presumably an ipython-sql ResultSet — TODO confirm how .bar() labels the
# three key columns.
gender_artist_distribution.bar()