# Answer Key to the Data Frame Programming Quiz

Helpful resources:
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html

In [1]:
import findspark
import os

# Locate the Spark installation and put pyspark on sys.path.
# Using .get() means an unset SPARK_HOME is passed as None, which lets
# findspark fall back to its own search of common install locations instead
# of this cell failing with a KeyError.
findspark.init(os.environ.get('SPARK_HOME'))

In [7]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# Reuse the active session if there is one; otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName('Spark Data Frame Practice')
    .getOrCreate()
)

In [4]:
# Load the event log (JSON records) into a DataFrame; the schema is inferred.
df = spark.read.format('json').load('data/sparkify_log_small.json')

In [5]:
# Print the inferred column names, types and nullability of the log DataFrame.
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



# Question 1

Which pages did the user with id "" (empty string) NOT visit?

In [13]:
# Distinct pages seen anywhere in the log, and distinct pages seen for the
# empty-string user id.
all_pages = df.select('page').distinct()
blank_user_pages = df.filter(F.col('userId') == '').select('page').distinct()

# The anti-join keeps only the pages that have no match among the blank user's pages.
all_pages.join(blank_user_pages, on='page', how='leftanti').show()

+----------------+
|            page|
+----------------+
|Submit Downgrade|
|       Downgrade|
|          Logout|
|   Save Settings|
|        Settings|
|        NextSong|
|         Upgrade|
|           Error|
|  Submit Upgrade|
+----------------+



# Question 2 - Reflect

What type of user does the empty string user id most likely refer to?


Users who are only visiting the About, Home and Login pages are probably unregistered visitors.

# Question 3

How many female users do we have in the data set?

In [17]:
# List the distinct values present in the gender column.
df.select('gender').distinct().show()

+------+
|gender|
+------+
|     F|
|  null|
|     M|
+------+



In [19]:
# Count distinct user ids among the rows whose gender is 'F'.
df.filter(F.col('gender') == 'F').select('userId').distinct().count()

462

# Question 4

How many songs were played by the most played artist?

In [37]:
# Rank artists by number of log rows. Rows without an artist are excluded with
# an explicit null check rather than a comparison against the string 'null'
# (which only dropped them as a side effect of SQL null semantics and would
# also have discarded an artist literally named "null").
top_artist_row = df.where(F.col('artist').isNotNull()) \
                   .groupBy('artist').count() \
                   .sort(F.desc('count')).head()
most_played_artist = top_artist_row.artist

# Number of distinct song titles logged for that artist.
# NOTE(review): if the question means total plays rather than distinct titles,
# that figure is top_artist_row['count'] (use item access: Row.count is the
# built-in tuple method, not the column).
df.where(F.col('artist') == most_played_artist).select('song').dropDuplicates().count()

24

# Question 5 (challenge)

How many songs do users listen to on average between visits to our home page? Please round your answer to the nearest integer.



In [38]:
from pyspark.sql import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [40]:
# Preview a handful of rows; limiting before toPandas() keeps the driver-side copy small.
df.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Showaddywaddy,Logged In,Kenneth,M,112,Matthews,232.93342,paid,"Charlotte-Concord-Gastonia, NC-SC",PUT,NextSong,1509380319284,5132,Christmas Tears Will Fall,200,1513720872284,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",1046
1,Lily Allen,Logged In,Elizabeth,F,7,Chase,195.23873,free,"Shreveport-Bossier City, LA",PUT,NextSong,1512718541284,5027,Cheryl Tweedy,200,1513720878284,"""Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537....",1000
2,Cobra Starship Featuring Leighton Meester,Logged In,Vera,F,6,Blackwell,196.20526,paid,"Racine, WI",PUT,NextSong,1499855749284,5516,Good Girls Go Bad (Feat.Leighton Meester) (Alb...,200,1513720881284,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",2219
3,Alex Smoke,Logged In,Sophee,F,8,Barker,405.99465,paid,"San Luis Obispo-Paso Robles-Arroyo Grande, CA",PUT,NextSong,1513009647284,2372,Don't See The Point,200,1513720905284,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",2373
4,,Logged In,Jordyn,F,0,Jones,,free,"Syracuse, NY",GET,Home,1513648531284,1746,,200,1513720913284,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",1747


In [68]:
# Keep only song plays and home-page visits from users with a non-empty id,
# ordered per user by event time, and reduced to the columns needed below.
has_user_id = F.col('userId') != ''
is_song_or_home = F.col('page').isin('NextSong', 'Home')

df_output = (
    df.filter(has_user_id & is_song_or_home)
      .orderBy(F.col('userId'), F.col('ts'))
      .select('userId', 'ts', 'page', 'song')
)
df_output.limit(20).toPandas()

Unnamed: 0,userId,ts,page,song
0,10,1513790894284,NextSong,Secrets
1,10,1513828388284,NextSong,Overdue
2,100,1513750214284,NextSong,1972
3,100,1513750442284,NextSong,Secrets
4,100,1513775431284,Home,
5,100,1513775556284,NextSong,Don't It Make My Brown Eyes Blue
6,100,1513775710284,NextSong,Clouds (Of Color Bright Album Version)
7,100,1513776194284,Home,
8,100,1513776308284,NextSong,0010
9,100,1513839673284,Home,


In [69]:
def is_home(page):
    """Flag home-page rows: 1 where the page equals 'Home', 0 otherwise.

    Built from native Spark column expressions instead of a Python UDF, so
    rows do not have to be shipped to a Python worker to be evaluated.

    Args:
        page: A Column, or the name of a column, holding the page value.

    Returns:
        An integer-typed Column. Rows with a null page evaluate to 0, the same
        result the lambda-based UDF produced.
    """
    # A UDF can be called with either a Column or a column name; keep both working.
    page_col = F.col(page) if isinstance(page, str) else page
    return F.when(page_col == 'Home', 1).otherwise(0).cast(T.IntegerType())

In [70]:
# Per-user window in event-time order, covering every row from the user's
# first event up to and including the current row.
events_so_far = (
    Window.partitionBy('userId')
          .orderBy('ts')
          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Flag home visits, then take the running total of those flags: the total
# increases by one at each home visit, so it labels the stretch of events
# that follows that visit.
df_output = df_output.withColumn('is_home', is_home(F.col('page')))
df_output = df_output.withColumn('session', F.sum('is_home').over(events_so_far))

df_output.limit(20).toPandas()

Unnamed: 0,userId,ts,page,song,is_home,session
0,10,1513790894284,NextSong,Secrets,0,0
1,10,1513828388284,NextSong,Overdue,0,0
2,100,1513750214284,NextSong,1972,0,0
3,100,1513750442284,NextSong,Secrets,0,0
4,100,1513775431284,Home,,1,1
5,100,1513775556284,NextSong,Don't It Make My Brown Eyes Blue,0,1
6,100,1513775710284,NextSong,Clouds (Of Color Bright Album Version),0,1
7,100,1513776194284,Home,,1,2
8,100,1513776308284,NextSong,0010,0,2
9,100,1513839673284,Home,,1,3


In [71]:
# Drop the home-page rows so only song plays remain, then count the plays in
# each (user, session) stretch.
df_output = df_output.where("page <> 'Home'")
df_output = df_output.groupBy(['userId', 'session']).count()

# Preview only: limit in Spark before converting, so the full grouped result
# is not collected to the driver just to display 20 rows.
df_output.limit(20).toPandas()

Unnamed: 0,userId,session,count
0,10,0,2
1,100,0,2
2,100,1,2
3,100,2,1
4,1000,0,1
5,1003,1,1
6,1005,0,1
7,1006,0,3
8,1017,0,6
9,1017,1,3


In [77]:
# Average the per-(user, session) song counts and round with Python's built-in round().
round(df_output.agg(F.avg('count')).first()[0])

7

Which 5 users listen to the most songs on average between visits to the home page?

In [85]:
# Average songs per session for each user, highest first; show the top five.
per_user_avg = df_output.groupBy('userId').agg(F.avg('count'))
per_user_avg.orderBy(F.col('avg(count)').desc()).show(5)

+------+----------+
|userId|avg(count)|
+------+----------+
|  1579|      60.0|
|   462|      58.5|
|  2867|      56.0|
|   445|      49.0|
|  2002|      49.0|
+------+----------+
only showing top 5 rows

