In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
# from pyspark.sql.functions import udf, col
# from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql import functions as F


# Load the local settings file ('dl.cfg' is the alternative config file name
# that was used previously).
config = configparser.ConfigParser()
config.read('test1.cfg')

# Export the AWS credentials from the [AWS] section as environment variables.
aws_section = config['AWS']
os.environ['AWS_ACCESS_KEY_ID'] = aws_section['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = aws_section['AWS_SECRET_ACCESS_KEY']

In [2]:
# Create (or reuse) the Spark session, requesting the hadoop-aws package
# that the s3a:// paths used below rely on.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import *

In [4]:
def build_song_schema():
    """Build and return a schema to use for the song data.

    The numeric measurements (latitude, longitude, duration) are declared as
    DoubleType. A bare DecimalType() defaults to precision 10 and scale 0,
    which would drop the fractional part of these values and prevent
    `duration` from ever equalling the fractional `length` values it is
    compared against when joining with the log data.

    Returns
        schema: StructType object, a representation of schema and defined fields
    """
    schema = StructType(
        [
            StructField('artist_id', StringType(), True),
            StructField('artist_latitude', DoubleType(), True),
            StructField('artist_longitude', DoubleType(), True),
            StructField('artist_location', StringType(), True),
            StructField('artist_name', StringType(), True),
            StructField('duration', DoubleType(), True),
            StructField('num_songs', IntegerType(), True),
            StructField('song_id', StringType(), True),
            StructField('title', StringType(), True),
            StructField('year', IntegerType(), True)
        ]
    )
    return schema

In [5]:
# input_data = "s3a://udacity-dend/"

# song_data = f'{input_data}/song_data/*/*/*/*.json'

# log_data = f'{input_data}/log_data/'

In [6]:
# Load the song metadata JSON files using the explicit song schema
# (no schema inference).
df_song = spark.read.json(
    "s3a://udacity-dend/song_data/A/A/A/*.json",
    schema=build_song_schema(),
)

In [30]:
# Songs table: the distinct song-level columns from the song data.
songs_table = df_song.select('song_id', 'title', 'artist_id', 'year', 'duration').dropDuplicates()

# Persist songs as parquet, partitioned by year and artist; any earlier
# output at this path is replaced.
songs_table.write.mode('overwrite').partitionBy('year', 'artist_id').parquet('song_table')

# Artists table: the distinct artist-level columns from the song data.
artists_table = df_song.select(
    'artist_id',
    'artist_name',
    'artist_location',
    'artist_latitude',
    'artist_longitude',
).dropDuplicates()

# Persist artists as unpartitioned parquet, replacing any earlier output.
artists_table.write.mode('overwrite').parquet('artists_table')

In [9]:
# Load the event logs and keep only the song-play events (page == 'NextSong').
# NOTE(review): this path uses 'log-data' while the commented-out config cell
# above uses 'log_data' - confirm which prefix is intended.
df_log_data = (
    spark.read.json("s3a://udacity-dend/log-data/*/*/*.json")
    .filter(F.col('page') == 'NextSong')
)

# Users table: the distinct user attribute combinations seen in the play events.
users_table = df_log_data.select('userId', 'firstName', 'lastName', 'gender', 'level').dropDuplicates()

# Persist users as unpartitioned parquet, replacing any earlier output.
users_table.write.mode('overwrite').parquet('users_table')

# Derive a 'timestamp' column from the epoch-millisecond 'ts' column
# (ts / 1000 gives seconds), then break it into calendar parts.
event_time = F.col('timestamp')
df_log_data = (
    df_log_data
    .withColumn('timestamp', F.from_unixtime(F.col('ts') / 1000))
    .withColumn('hour', F.hour(event_time))
    .withColumn('day', F.dayofmonth(event_time))
    .withColumn('month', F.month(event_time))
    .withColumn('year', F.year(event_time))
    .withColumn('weekofyear', F.weekofyear(event_time))
    .withColumn('dayofweek', F.dayofweek(event_time))
)

# Time table: one row per distinct timestamp with its calendar parts.
time_table = df_log_data.select(
    'timestamp', 'hour', 'day', 'month', 'year', 'weekofyear', 'dayofweek'
).dropDuplicates()

# Persist as parquet partitioned by year and month, replacing any earlier output.
time_table.write.mode('overwrite').partitionBy('year', 'month').parquet('time_table')

# read in song data to use for songplays table
df_song = spark.read.json("s3a://udacity-dend/song_data/A/A/A/*.json", schema=build_song_schema())

# A play event matches a song when title, artist name and duration all agree.
song_match = (
    (df_song['title'] == df_log_data['song'])
    & (df_song['artist_name'] == df_log_data['artist'])
    & (df_song['duration'] == df_log_data['length'])
)

# extract columns from joined song and log datasets to create songplays table.
# Both inputs carry a 'year' column (the song's release year and the year of
# the play event), so writing the raw join would hit duplicate column names
# and make partitionBy('year') ambiguous. Keep every log-event column
# (including the event's year/month) and add only the song and artist
# identifiers from the song data; the remaining song columns are either
# equal to log columns by the join condition or belong to the artists table.
songplays_table = (
    df_log_data
    .join(df_song, on=song_match)
    .select(df_log_data['*'], df_song['song_id'], df_song['artist_id'])
)

# write songplays table to parquet files partitioned by year and month
# (of the play event), replacing any earlier output
songplays_table.write.save(path='songplays_table',
                           format='parquet',
                           mode='overwrite',
                           partitionBy=['year', 'month'])

In [12]:
# Write a two-row sample of the log data to S3 as parquet, replacing any
# earlier output at that location.
log_sample = df_log_data.limit(2)
log_sample.write.mode('overwrite').parquet('s3a://my-sparkify-data-lake/df_log_data')
# .write.mode("overwrite").parquet("s3a://my-sparkify-data-lake/df_log_data/parquet/myFile")