In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, unix_timestamp, to_utc_timestamp, from_unixtime, to_date, dayofweek
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, weekofyear
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType, DateType, LongType, StructField, TimestampType

In [2]:
# the song_data and log_data are in a cpmpressed format in the workspace so we have to uncompress them so that we can read themtheir 
# this block of code uncopress the zipped file in their directory that is we have loaded them into our notebook yet
#import zipfile
#with zipfile.ZipFile('data/log-data.zip', 'r') as zip_ref:
#    zip_ref.extractall('data/log-data')
#import zipfile
#with zipfile.ZipFile('data/song-data.zip', 'r') as zip_ref:
#    zip_ref.extractall('data/song-data')

In [3]:
#instatiting a sparksession
spark = SparkSession \
    .builder \
    .appName("Practice_for_a_notebook") \
    .getOrCreate()

In [4]:
#showing configuration of spark context
#spark.sparkContext.getConf().getAll()

In [4]:
# first we read the log_data
# the are in a json file format

def get_files(filepath = "data/log-data/"):
    """ Gets the whole files (as paths) in a certain folder
    and saves them in a variable called all_files
    INPUT : folder path
    OUTPUT : a lsit containg the files in a certain folder
    """
    # we need to import os , glob module
    import os
    import glob
    all_files = []
    for root, dirs, files in os.walk(filepath):
        files = glob.glob(os.path.join(root,'*.json'))
        for f in files :
            all_files.append(os.path.abspath(f))
    
    return all_files


In [5]:
len(get_files())

30

In [6]:
list_of_log_files = get_files()
alls = spark.read.json(list_of_log_files)

In [7]:
alls.count()

8056

In [9]:
# this cell is responsible for counting how many records are existed in all the file in log-data folder
def log_data_row_counter():
    """
    THIS FUNCTION RETURNS HOW MANY ROWS ARE EXISTED IN ALL THE LOG-DATA JSON FILES
    """
    csv_file_path = "data/total_log_data_csv/"
    total_count_log = 0
    for i in get_files():
        df = spark.read.json(i)
        #df_csv = df.write.save(csv_file_path ,format = "csv" , header = True)
        current_file_rows_count = df.count()
        total_count_log += current_file_rows_count
    return total_count_log
ab = log_data_row_counter()
ab

8056

In [35]:
# this cell is responsible for making a directory in which csv file are loaded
def directory_maker(directory,parent_dir):
    """THIS FUNCTION CRETAES A DIRECTORY
    INPUTS:
    PARENT DIRECTORY
    DIRECTORY
    """
    #directory = "total_log_data_csv"
    import os
    # Parent Directory path
    #parent_dir = "data/"
  
    # Path
    path = os.path.join(parent_dir, directory)
  
    os.mkdir(path)
    

Directory 'total_log_data_csv' created


In [8]:
# before reading the file i had to specify my schema (explicit schema definintion)
# first we have to import the types that we may need in our schema types
from pyspark.sql.types import StructType, StringType, IntegerType, DoubleType, DateType, LongType, StructField, TimestampType
# we need also to convert our json file into one csv so that to load it once and use the schema prperty to explicitly define our schema

log_types = StructType([
    StructField("artist",StringType(),True),
    StructField("auth",StringType(),True),
    StructField("firstName",StringType(),True),
    StructField("gender",StringType(),True),
    StructField("itemInSession",LongType(),True),
    StructField("lastName",StringType(),True),
    StructField("length",DoubleType(),True),
    StructField("level",StringType(),True),
    StructField("location",StringType(),True),
    StructField("method",StringType(),True),
    StructField("page",StringType(),True),
    StructField("registration",DoubleType(),True),
    StructField("sessionId",IntegerType(),True),
    StructField("song",StringType(),True),
    StructField("status",IntegerType(),True),
    StructField("ts",LongType(),True),
    StructField("userAgent",StringType(),True),
    StructField("userId",IntegerType(),True)
])




In [5]:
log_data = spark.read.json("data/log-data/*.json" )

In [6]:
log_data.count()

8056

In [7]:
log_data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [8]:
log_data.take(2)

[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', itemInSession=0, lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='Sehr kosmisch', status=200, ts=1542241826796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26'),
 Row(artist='The Prodigy', auth='Logged In', firstName='Ryan', gender='M', itemInSession=1, lastName='Smith', length=260.07465, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, sessionId=583, song='The Big Gundown', status=200, ts=1542242481796, userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', userId='26')]

In [9]:
# i have to cast the data types
# _c means casted
log_data = log_data.withColumn("itemInSession_c", log_data["itemInSession"].cast(IntegerType())).drop("itemInSession")
log_data = log_data.withColumn("sessionId_c", log_data["sessionId"].cast(IntegerType())).drop("sessionId")
log_data = log_data.withColumn("status_c", log_data["status"].cast(IntegerType())).drop("status")


log_data = log_data.withColumn("as_date_C", to_utc_timestamp(from_unixtime(col("ts")/1000,'yyyy-MM-dd HH:mm:ss'),'EST')).drop("ts")


#df = log_data.withColumn("timestamp",unix_timestamp(log_data.ts, 'yyyy/MM/dd HH:mm:ss').cast(TimestampType()))
log_data = log_data.withColumn("userId_C", log_data["userId"].cast(IntegerType())).drop("userId")

In [10]:
log_data.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- song: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- itemInSession_c: integer (nullable = true)
 |-- sessionId_c: integer (nullable = true)
 |-- status_c: integer (nullable = true)
 |-- as_date_C: timestamp (nullable = true)
 |-- userId_C: integer (nullable = true)



In [11]:
# this cell shows that if we did not use distinct in the select statement that
#loads data in the user table we would get one users many times and that is not needed
log_data.where(log_data.userId_C=='26').take(2)

[Row(artist='Harmonia', auth='Logged In', firstName='Ryan', gender='M', lastName='Smith', length=655.77751, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, song='Sehr kosmisch', userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', itemInSession_c=0, sessionId_c=583, status_c=200, as_date_C=datetime.datetime(2018, 11, 15, 5, 30, 26), userId_C=26),
 Row(artist='The Prodigy', auth='Logged In', firstName='Ryan', gender='M', lastName='Smith', length=260.07465, level='free', location='San Jose-Sunnyvale-Santa Clara, CA', method='PUT', page='NextSong', registration=1541016707796.0, song='The Big Gundown', userAgent='"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"', itemInSession_c=1, sessionId_c=583, status_c=200, as_date_C=datetime.da

In [12]:
# selecting for user table
user_table_column_list = ["userId_C","firstName","lastName","gender","level"]
user_table_df = log_data.select(*user_table_column_list).distinct()
user_table_df.count()

107

In [20]:
#selecting for songplays tables so that the page is NextSong
song_play_column_list = ["","as_date_C","userId_C","level","","","sessionId_c","location","userAgent"]
song_play_df = log_data.select(*song_play_column_list).where(log_data.page=='NextSong')
song_play_df.count()

6820

In [13]:
# selecting for time table
log_data.\
withColumn("star_time", date_format(log_data.as_date_C,"HH:mm:ss")).\
withColumn('hour',date_format(log_data.as_date_C ,"h")).\
withColumn('day',date_format(log_data.as_date_C ,"d")).\
withColumn('week',weekofyear(log_data.as_date_C)).\
withColumn('month',month(log_data.as_date_C)).\
withColumn('year',year(log_data.as_date_C)).\
withColumn('week_day',dayofweek(log_data.as_date_C)).\
select("star_time","hour","day","week","month","year","week_day").show(10)

#withColumn('minute',date_format(log_data.as_date_C ,"m")).\
#withColumn('seconds',date_format(log_data.as_date_C ,"s")).\

+---------+----+---+----+-----+----+--------+
|star_time|hour|day|week|month|year|week_day|
+---------+----+---+----+-----+----+--------+
| 05:30:26|   5| 15|  46|   11|2018|       5|
| 05:41:21|   5| 15|  46|   11|2018|       5|
| 05:45:41|   5| 15|  46|   11|2018|       5|
| 06:57:51|   6| 15|  46|   11|2018|       5|
| 08:29:37|   8| 15|  46|   11|2018|       5|
| 08:44:09|   8| 15|  46|   11|2018|       5|
| 08:44:20|   8| 15|  46|   11|2018|       5|
| 10:34:34|  10| 15|  46|   11|2018|       5|
| 10:37:57|  10| 15|  46|   11|2018|       5|
| 10:48:55|  10| 15|  46|   11|2018|       5|
+---------+----+---+----+-----+----+--------+
only showing top 10 rows



# here i will start reading song data

In [14]:
# before reading the file i had to specify my schema (explicit schema definintion)

song_types = StructType([
    StructField("artist_id",StringType()),
    StructField("artist_latitude",StringType()),
    StructField("artist_location",StringType()),
    StructField("artist_longitude",StringType()),
    StructField("artist_name",StringType()),
    StructField("duration",DoubleType()),
    StructField("num_songs",IntegerType()),
    StructField("song_id",StringType()),
    StructField("title",StringType()),
    StructField("year",IntegerType())
])


In [15]:

song_data = spark.read.json("data/song-data/song_data/*/*/*/*.json", schema = song_types)

#data/song-data/song_data/A/A/A/TRAAAAW128F429D538.json

In [16]:
song_data.count()

71

In [17]:
song_data.take(1)

[Row(artist_id='ARDR4AC1187FB371A1', artist_latitude=None, artist_location='', artist_longitude=None, artist_name='Montserrat Caballé;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti', duration=511.16363, num_songs=1, song_id='SOBAYLL12A8C138AF9', title='Sono andati? Fingevo di dormire', year=0)]

In [18]:
song_data.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



# choosing artist data

In [22]:
artist_table_column_list = ["artist_id","artist_name","artist_location","artist_longitude","artist_latitude"]
artist_table_df = song_data.select(*artist_table_column_list).distinct()
artist_table_df.count()

69

# choosing song data

In [23]:
song_table_column_list = ["song_id","title","artist_id","year","duration"]
song_table_df = song_data.select(*song_table_column_list).distinct()
song_table_df.count()

71

In [2]:
spark.stop()

NameError: name 'spark' is not defined