# Execute code to build Data Lakes and query to display the result

## 1. execute pipeline Code

In [5]:
%run etl.py

Finish write parquet to users_table. 2022-12-27 08:02:49.182115
Finish write parquet to time_table. 2022-12-27 08:03:02.783090
Finish write parquet to songplays_table. 2022-12-27 08:08:47.501025


## 2. Analyze the Table Build Result

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek

In [2]:
config = configparser.ConfigParser()
config.read('dl.cfg')

['dl.cfg']

In [3]:
os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [4]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [5]:
input_data = "s3a://aws-emr-resources-860223054232-us-east-1/"

### 2.1 Check songs table schema

In [6]:
songs_data = os.path.join(input_data, "songs/")

In [7]:
songs_df = spark.read.parquet(songs_data)

In [8]:
songs_df.printSchema()
songs_df.show(5)

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOBTCUI12A8AE48B70|Faust: Ballet Mus...| 94.56281|   0|ARSUVLW12454A4C8B8|
|SOVNKJI12A8C13CB0D|Take It To Da Hou...|227.10812|2001|ARWUNH81187FB4A3E0|
|SOYVBGZ12A6D4F92A8|Piano Sonata No. ...|221.70077|   0|ARLRWBW1242077EB29|
|SODBHKO12A58A77F36|Fingers Of Love (...|335.93424|   0|ARKGS2Z1187FB494B5|
|SOGXFIF12A58A78CC4|Hanging On (Mediu...|204.06812|   0|AR5LZJD1187FB4C5E5|
+------------------+--------------------+---------+----+------------------+
only showing top 5 rows



### 2.2 Check artists table schema

In [9]:
artists_data = os.path.join(input_data, "artists/")

In [10]:
artists_df = spark.read.parquet(artists_data)

In [12]:
artists_df.printSchema()
artists_df.show(5)

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- lattitude: double (nullable = true)
 |-- longitude: double (nullable = true)

+------------------+--------------------+--------------------+---------+---------+
|         artist_id|                name|            location|lattitude|longitude|
+------------------+--------------------+--------------------+---------+---------+
|ARSUVLW12454A4C8B8|Royal Philharmoni...|           Tennessee| 35.83073|-85.97874|
|ARXQC081187FB4AD42|William Shatner_ ...|                  UK| 54.31407| -2.23001|
|ARWUNH81187FB4A3E0|         Trick Daddy|     Miami , Florida|     null|     null|
|ARTC1LV1187B9A4858|  The Bonzo Dog Band|Goldsmith's Colle...|  51.4536| -0.01802|
|ARA23XO1187B9AF18F|     The Smithereens|Carteret, New Jersey| 40.57885|-74.21956|
+------------------+--------------------+--------------------+---------+---------+
only showing top 5 rows



### 2.3 Check users table schema

In [14]:
users_data = os.path.join(input_data, "users/")
users_df = spark.read.parquet(users_data)

In [15]:
users_df.printSchema()
users_df.show(5)

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     88|  Mohammad|Rodriguez|     M| free|
|     75|    Joseph|Gutierrez|     M| free|
|     69|  Anabelle|  Simpson|     F| free|
|     29|Jacqueline|    Lynch|     F| free|
|     68|    Jordan|Rodriguez|     F| free|
+-------+----------+---------+------+-----+
only showing top 5 rows



### 2.4 Check time table schema

In [17]:
time_data = os.path.join(input_data, "time/")
time_df = spark.read.parquet(time_data)

In [18]:
time_df.printSchema()
time_df.show(5)

root
 |-- start_time: long (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

+-------------+----+---+----+-------+----+-----+
|   start_time|hour|day|week|weekday|year|month|
+-------------+----+---+----+-------+----+-----+
|1542241826796|   0| 15|  46|      5|2018|   11|
|1542242481796|   0| 15|  46|      5|2018|   11|
|1542242741796|   0| 15|  46|      5|2018|   11|
|1542253449796|   3| 15|  46|      5|2018|   11|
|1542260935796|   5| 15|  46|      5|2018|   11|
+-------------+----+---+----+-------+----+-----+
only showing top 5 rows



### 2.5 Check songplays table schema

In [21]:
songplays_data = os.path.join(input_data, "songplays/*/*/*.parquet")
songplays_df = spark.read.parquet(songplays_data)

In [22]:
songplays_df.printSchema()
songplays_df.show(5)

root
 |-- start_time: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)

+-------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+
|   start_time|user_id|level|           song_id|         artist_id|session_id|            location|          user_agent|
+-------------+-------+-----+------------------+------------------+----------+--------------------+--------------------+
|1542837407796|     15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|       818|Chicago-Napervill...|"Mozilla/5.0 (X11...|
|1541440182796|     73| paid|SOHDWWH12A6D4F7F6A|ARC0IOF1187FB3F6E6|       255|Tampa-St. Petersb...|"Mozilla/5.0 (Mac...|
|1542148779796|     55| free|SOXQYSC12A6310E908|AR0L04E1187B9AE90C|    

### 2.6 Analyze song event count number in Fact table

In [32]:
songs_df.createOrReplaceTempView("songs")
songplays_df.createOrReplaceTempView("songplays")

In [34]:
spark.sql("""
SELECT c.title as song_title, c.year as song_year, a.level, a.location, a.user_agent, count(*) 
FROM songplays a
JOIN songs c on a.song_id = c.song_id 
GROUP BY c.title, c.year, a.level, a.location, a.user_agent
""").show(5)

+--------------------+---------+-----+--------------------+--------------------+--------+
|          song_title|song_year|level|            location|          user_agent|count(1)|
+--------------------+---------+-----+--------------------+--------------------+--------+
|Angie (1993 Digit...|        0| free|       Palestine, TX|Mozilla/5.0 (Maci...|       1|
|   I Want A New Drug|     1983| paid|Tampa-St. Petersb...|"Mozilla/5.0 (Mac...|       1|
|Let's Get It Started|     2004| free|Nashville-Davidso...|"Mozilla/5.0 (Mac...|       1|
|Angie (1993 Digit...|        0| paid|Tampa-St. Petersb...|"Mozilla/5.0 (Mac...|       1|
|The Boy With The ...|     1985| free|San Francisco-Oak...|Mozilla/5.0 (Wind...|       1|
+--------------------+---------+-----+--------------------+--------------------+--------+
only showing top 5 rows

