In [1]:
from pyspark.sql import SparkSession, functions as F
import os
import configparser

In [2]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [4]:
# get filepath to song data file
input_data = 's3a://udacity-dend/'
song_data = f'{input_data}/song_data/A/A/*/*.json'

# read song data file
df = spark.read.json(song_data)
song_df = df

In [5]:
df.printSchema()


root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [6]:
# extract columns to create songs table
songs_table = df.select(["song_id","title","artist_id","year","duration"]).dropDuplicates()
songs_table.printSchema()

# write songs table to parquet files partitioned by year and artist
songs_table.write.partitionBy("year", "artist_id").mode('overwrite').parquet("s3a://datalakedendudacity1/test/songs.parquet")

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- year: long (nullable = true)
 |-- duration: double (nullable = true)



In [26]:
# extract columns to create artists table
artists_table = song_df.select(["artist_id","artist_name","artist_location","artist_latitude","artist_longitude"]).dropDuplicates()
artists_table.printSchema()

# write artists table to parquet files
artists_table.write.mode('overwrite').parquet("s3a://datalakedendudacity1/test/artists_table")

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)



In [8]:
# get filepath to log data file
log_data = f'{input_data}/log_data/*/*/*.json'

# read log data file
df = spark.read.json(log_data)

# filter by actions for song plays
df = df.filter(df.page == 'NextSong')

In [9]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [10]:
# extract columns for users table    
users_table = df.select(["userId","firstName","lastName","gender","level"]).dropDuplicates()
users_table.printSchema()

# write users table to parquet files
users_table.write.mode('overwrite').parquet("s3a://datalakedendudacity1/test/songs.parquet")

root
 |-- userId: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [11]:
# create timestamp column from original timestamp column
df = df.withColumn("start_time", F.to_timestamp(df.ts/1000))

# create datetime column from original timestamp column
df.printSchema()
df.select("start_time").show(5)

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- start_time: timestamp (nullable = true)

+--------------------+
|          start_time|
+--------------------+
|2018-11-15 00:30:...|
|2018-11-15 00:41:...|
|2018-11-15 00:45:...|
|2018-11-15 03:44:...|
|2018-11-15 05:48:...|
+--------------------+
only showing top 5 rows



In [12]:
# extract columns to create time table

time_table = df.select("start_time")\
            .withColumn("hour", F.hour("start_time"))\
            .withColumn("day", F.dayofweek("start_time"))\
            .withColumn("week", F.weekofyear("start_time"))\
            .withColumn("month", F.month("start_time"))\
            .withColumn("year", F.year("start_time"))\
            .dropDuplicates()

time_table.show(5)

# write time table to parquet files partitioned by year and month
time_table.write.partitionBy("year", "month").mode('overwrite').parquet("s3a://datalakedendudacity1/test/songs.parquet")

+--------------------+----+---+----+-----+----+
|          start_time|hour|day|week|month|year|
+--------------------+----+---+----+-----+----+
|2018-11-15 18:03:...|  18|  5|  46|   11|2018|
|2018-11-21 18:02:...|  18|  4|  47|   11|2018|
|2018-11-21 19:48:...|  19|  4|  47|   11|2018|
|2018-11-14 05:08:...|   5|  4|  46|   11|2018|
|2018-11-14 06:04:...|   6|  4|  46|   11|2018|
+--------------------+----+---+----+-----+----+
only showing top 5 rows



In [13]:
# read in song data to use for songplays table
# song_df = spark.read.json(song_data)

song_df.createOrReplaceTempView("songs_table")
df.createOrReplaceTempView("log_table")

# extract columns from joined song and log datasets to create songplays table 
songplays_table = spark.sql("""
          SELECT SE.ts, 
            SE.userId,
            SE.level, 
            SS.song_id,                      
            SS.artist_id, 
            SE.sessionId,
            SS.artist_location,
            SE.userAgent
          FROM songs_table SS , log_table SE
          WHERE SE.song = SS.title
                AND SE.artist = SS.artist_name
          """)

In [None]:
# # write songplays table to parquet files partitioned by year and month
songplays_table.select('ts')

In [14]:
songplays_table.show(5)

+-------------+------+-----+------------------+------------------+---------+--------------------+--------------------+
|           ts|userId|level|           song_id|         artist_id|sessionId|     artist_location|           userAgent|
+-------------+------+-----+------------------+------------------+---------+--------------------+--------------------+
|1542837407796|    15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|      818|           Dubai UAE|"Mozilla/5.0 (X11...|
|1541440182796|    73| paid|SOHDWWH12A6D4F7F6A|ARC0IOF1187FB3F6E6|      255|                 108|"Mozilla/5.0 (Mac...|
|1542148779796|    55| free|SOXQYSC12A6310E908|AR0L04E1187B9AE90C|      415|Wigan, Lancashire...|"Mozilla/5.0 (Mac...|
|1542378072796|    85| paid|SOLRYQR12A670215BF|ARNLO5S1187B9B80CC|      436|        Pasadena, CA|"Mozilla/5.0 (Mac...|
|1542735998796|    49| paid|SOCHRXB12A8AE48069|ARTDQRC1187FB4EFD4|      758|     Los Angeles, CA|Mozilla/5.0 (Wind...|
+-------------+------+-----+------------------+-

In [15]:
songplays_table = songplays_table.withColumn("start_time", F.to_timestamp(songplays_table.ts/1000))

songplays_table = songplays_table.select(['start_time','ts','userId','level','song_id','artist_id','sessionId','artist_location','userAgent']).withColumn("month", F.month("start_time")).withColumn("year", F.year("start_time"))
songplays_table.show(5)

+--------------------+-------------+------+-----+------------------+------------------+---------+--------------------+--------------------+-----+----+
|          start_time|           ts|userId|level|           song_id|         artist_id|sessionId|     artist_location|           userAgent|month|year|
+--------------------+-------------+------+-----+------------------+------------------+---------+--------------------+--------------------+-----+----+
|2018-11-21 21:56:...|1542837407796|    15| paid|SOZCTXZ12AB0182364|AR5KOSW1187FB35FF4|      818|           Dubai UAE|"Mozilla/5.0 (X11...|   11|2018|
|2018-11-05 17:49:...|1541440182796|    73| paid|SOHDWWH12A6D4F7F6A|ARC0IOF1187FB3F6E6|      255|                 108|"Mozilla/5.0 (Mac...|   11|2018|
|2018-11-13 22:39:...|1542148779796|    55| free|SOXQYSC12A6310E908|AR0L04E1187B9AE90C|      415|Wigan, Lancashire...|"Mozilla/5.0 (Mac...|   11|2018|
|2018-11-16 14:21:...|1542378072796|    85| paid|SOLRYQR12A670215BF|ARNLO5S1187B9B80CC|      4

In [None]:
songplays_table.write.partitionBy("year", "month").mode('overwrite').parquet("s3a://datalakedendudacity1/songplays_table")

In [19]:
songplays_table.select(['start_time','year','month']).show(1)


+--------------------+----+-----+
|          start_time|year|month|
+--------------------+----+-----+
|2018-11-21 21:56:...|2018|   11|
+--------------------+----+-----+
only showing top 1 row

