# EDA on time-based features

In [None]:
# Notebook-wide configuration for locating the input data on S3.

# NOTE(review): not referenced anywhere else in the visible part of this notebook.
S3_RESOURCE = 's3'
# Default URI scheme used by get_s3_path() when building object paths.
SCHEME = 's3a'
# TODO: fill in your own bucket name and key prefix.
BUCKET_NAME = ''
PREFIX = ''
# The values below overwrite the empty placeholders above.
# NOTE(review): the STRIP markers presumably delimit solution code that is
# removed from the student version of the notebook -- confirm with the tooling.
### BEGIN STRIP ###
BUCKET_NAME = 'nibble-clients'
# Note the trailing '/': take care when joining further key segments onto it.
PREFIX = 'jedha/PySpark-Cloud/data/'
### END STRIP ###
# File name of the dataset that the load cell reads from under `interim/`.
INPUT_FILENAME = 'playlog_processed.parquet'

In [None]:
def get_s3_path(key, bucket_name=BUCKET_NAME, scheme=SCHEME):
    """Build the full URI ``<scheme>://<bucket_name>/<key>`` for an object key.

    Args:
        key: Object key inside the bucket. It is inserted as-is (no slash
            normalisation is performed).
        bucket_name: Bucket to address; defaults to the notebook-level
            ``BUCKET_NAME`` as it was when this function was defined.
        scheme: URI scheme; defaults to the notebook-level ``SCHEME`` as it
            was when this function was defined.

    Returns:
        The assembled URI as a string.
    """
    return "{}://{}/{}".format(scheme, bucket_name, key)

In [None]:
# TODO: load data
### BEGIN STRIP ###
import posixpath

# Build the object key with posixpath.join rather than an f-string: PREFIX may
# already end with '/' (or be empty), and naive concatenation would then put
# an empty '//' segment (or a leading '/') into the key.
input_key = posixpath.join(PREFIX, 'interim', INPUT_FILENAME)

# No explicit format is passed, so load() relies on the session's default
# data source.
df_raw = spark.read.load(get_s3_path(input_key))

# Number of rows loaded (this triggers an actual read of the data).
df_raw.count()
### END STRIP ###

NameError: ignored

In [None]:
# As a refresher: peek at the first rows of the raw data.
# limit(5) keeps the amount of data collected by toPandas() down to five rows.
df_raw.limit(5).toPandas()

Unnamed: 0,timestamp,user,song
0,1415093932,22406,5CtwSKr1ZuQ
1,1415093933,312,L_1zmwZswS8
2,1415093934,14586,aaceHGDhUiI
3,1415093934,19635,cYMCLz5PQVw
4,1415093937,11272,eMrI0yIUXFg


In [None]:
# TODO: compute a new column `datetime`
#       that converts the timestamp to a datetime
#       drop the `timestamp` column
#       and order by `datetime`
#       save this as a new DataFrame `df`
#       show the first 5 rows of `df`
### BEGIN STRIP ###
from pyspark.sql import functions as F

# from_unixtime() yields a formatted *string* ('yyyy-MM-dd HH:mm:ss'), so it is
# cast to `timestamp` to make `datetime` a real timestamp column instead of
# relying on implicit string-to-date casts in the cells further down.
df = (
    df_raw
    .withColumn('datetime', F.from_unixtime('timestamp').cast('timestamp'))
    .drop('timestamp')
    .orderBy('datetime')
    # The final select only fixes the column order (datetime first).
    .select('datetime', 'user', 'song')
)
df.show(5)
### END STRIP ###

Now that we have a datetime column, we can compute new columns, namely:
- [year](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.year.html)
- [month](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.month.html)
- [dayofmonth](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.dayofmonth.html)
- [dayofweek](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.dayofweek.html)
- [dayofyear](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.dayofyear.html)
- [weekofyear](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.weekofyear.html)

We will put the resulting DataFrame in a variable called `df_enriched`.

In [None]:
# TODO: follow previous instructions
### BEGIN STRIP ###
from functools import reduce

# Date-part extractors; each one becomes a column named after the function.
funcs = [F.year, F.month, F.dayofmonth, F.dayofweek, F.dayofyear, F.weekofyear]


def _add_date_part(frame, func):
    """Return `frame` plus a column named after `func`, computed from `datetime`."""
    return frame.withColumn(func.__name__, func('datetime'))


# Fold over the extractors, starting from `df` and adding one column per step.
df_enriched = reduce(_add_date_part, funcs, df)

# Or, alternatively
# df_enriched = df.select('*', *(f('datetime').alias(f.__name__) for f in funcs))

df_enriched.show()
### END STRIP ###

In [None]:
# TODO: print out the schema of the new dataframe
### BEGIN STRIP ###
# Lets us check that the date-part columns were added, and with which types.
df_enriched.printSchema()
### END STRIP ###

In [None]:
# Plot the number of rows per month (all years combined), ordered by month.
# NOTE(review): this heading used to read "average of monthly counts", but the
# code computes plain totals per month, not an average -- confirm the intent.
### BEGIN STRIP ###
display(df_enriched.groupBy('month').count().orderBy('month'))
### END STRIP ###

month,count
1,2010671
2,2151021
3,2523459
4,2274812
5,2389084
6,2092306
7,1958010
8,1780897
9,1988767
10,2402561


In [None]:
# TODO: clean?
### BEGIN STRIP ###
def count_by_period(col_name, df):
    """Count the rows of `df` per distinct value of `col_name`.

    `col_name` comes first so the function can be partially applied and then
    handed a DataFrame as its only remaining argument.

    Args:
        col_name: Name of the column to group by.
        df: DataFrame to aggregate.

    Returns:
        The grouped counts, ordered by `col_name`.
    """
    counts = df.groupBy(col_name).count()
    return counts.orderBy(col_name)

from functools import partial
### END STRIP ###

In [None]:
# TODO: bar plot by year
### BEGIN STRIP ###
# Bind the grouping column first, then apply the resulting one-argument
# function to the DataFrame through transform().
count_by_year = partial(count_by_period, 'year')
display(df_enriched.transform(count_by_year))
### END STRIP ###

year,count
2014,8136227
2015,8158803
2016,6024269
2017,2471406
2018,823473
2019,125358


In [None]:
# TODO: bar plot by month
### BEGIN STRIP ###
# Bind the grouping column first, then apply the resulting one-argument
# function to the DataFrame through transform().
count_by_month = partial(count_by_period, 'month')
display(df_enriched.transform(count_by_month))
### END STRIP ###

month,count
1,2010671
2,2151021
3,2523459
4,2274812
5,2389084
6,2092306
7,1958010
8,1780897
9,1988767
10,2402561


In [None]:
# TODO: bar plot by weekofyear
### BEGIN STRIP ###
# Bind the grouping column first, then apply the resulting one-argument
# function to the DataFrame through transform().
count_by_weekofyear = partial(count_by_period, 'weekofyear')
display(df_enriched.transform(count_by_weekofyear))
### END STRIP ###

weekofyear,count
1,435209
2,424728
3,471778
4,497088
5,474106
6,477908
7,506421
8,582808
9,602185
10,569273


In [None]:
# TODO: bar plot by dayofmonth
### BEGIN STRIP ###
# Bind the grouping column first, then apply the resulting one-argument
# function to the DataFrame through transform().
count_by_dayofmonth = partial(count_by_period, 'dayofmonth')
display(df_enriched.transform(count_by_dayofmonth))
### END STRIP ###

dayofmonth,count
1,834271
2,861093
3,823726
4,851305
5,853476
6,886509
7,842203
8,836104
9,845744
10,848751


In [None]:
# TODO: bar plot by dayofyear
### BEGIN STRIP ###
# Bind the grouping column first, then apply the resulting one-argument
# function to the DataFrame through transform().
count_by_dayofyear = partial(count_by_period, 'dayofyear')
display(df_enriched.transform(count_by_dayofyear))
### END STRIP ###

dayofyear,count
1,41515
2,48633
3,45255
4,60595
5,53693
6,61925
7,60638
8,62852
9,79243
10,55055
