# Spark SQL Examples

Run the code cells below. This is the same code from the previous screencast.

In [1]:
import findspark
import os

# Point findspark at the Spark installation. When SPARK_HOME is unset this
# passes None, which lets findspark run its own search for an installation
# instead of failing right here with a bare KeyError.
findspark.init(os.environ.get('SPARK_HOME'))

In [13]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T

In [3]:
# Obtain the Spark session used by every cell below (an existing session is
# reused; otherwise one is created under this application name).
spark = (
    SparkSession.builder
    .appName('Data wrangling with Spark SQL')
    .getOrCreate()
)

In [4]:
# Read the JSON event log into a Spark DataFrame; no schema is supplied here,
# so column types come from Spark's reader.
log_path = 'data/sparkify_log_small.json'
user_log = spark.read.json(log_path)

In [5]:
# Print each column's name, type and nullability for the loaded DataFrame.
user_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



# Create a View and Run Queries

The code below creates a temporary view against which you can run SQL queries.

In [6]:
# Register the DataFrame as a temporary view named 'user_log_sql' so it can be
# queried through spark.sql(); a view already registered under that name is
# replaced.
user_log.createOrReplaceTempView('user_log_sql')

In [7]:
# Peek at the first two rows of the view, all columns included.
first_rows_query = """SELECT *
          FROM user_log_sql
          LIMIT 2
          """
spark.sql(first_rows_query).show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

In [9]:
# Count every row in the view.
row_count_query = """SELECT COUNT(*)         
          FROM user_log_sql
          """
spark.sql(row_count_query).show()

+--------+
|count(1)|
+--------+
|   10000|
+--------+



In [10]:
# Fetch selected columns for every event logged by user 1046.
# userId is a string column, so it is compared against a quoted literal.
# The comparison uses the standard SQL `=` operator, consistent with the
# `page = 'NextSong'` filter used later in this notebook.
spark.sql("""SELECT userId, firstName, page, song
          FROM user_log_sql
          WHERE userId = '1046'
          """).collect()

[Row(userId='1046', firstName='Kenneth', page='NextSong', song='Christmas Tears Will Fall'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Be Wary Of A Woman'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Public Enemy No.1'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Reign Of The Tyrants'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Father And Son'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='No. 5'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Seventeen'),
 Row(userId='1046', firstName='Kenneth', page='Home', song=None),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='War on war'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Killermont Street'),
 Row(userId='1046', firstName='Kenneth', page='NextSong', song='Black & Blue'),
 Row(userId='1046', firstName='Kenneth', page='Logout', song=None),
 Row(userId='1046', firstName='Kenneth'

In [11]:
# List each distinct page value once, in alphabetical order.
distinct_pages_query = """SELECT DISTINCT page
          FROM user_log_sql
          ORDER BY page
          """
spark.sql(distinct_pages_query).show()

+----------------+
|            page|
+----------------+
|           About|
|       Downgrade|
|           Error|
|            Help|
|            Home|
|           Login|
|          Logout|
|        NextSong|
|   Save Settings|
|        Settings|
|Submit Downgrade|
|  Submit Upgrade|
|         Upgrade|
+----------------+



# User-Defined Functions

In [12]:
from datetime import datetime

In [14]:
def _hour_of_epoch_ms(ts_ms):
    """Return the hour of day for a timestamp given in epoch milliseconds.

    Args:
        ts_ms: Timestamp in milliseconds since the Unix epoch, or None.

    Returns:
        The hour (0-23) as an int, or None when ``ts_ms`` is None so that a
        null ``ts`` produces a null hour instead of failing the whole query.
    """
    if ts_ms is None:
        return None
    # NOTE: fromtimestamp() without a tz argument converts to the local
    # timezone of the machine running the UDF, so the hour is
    # environment-dependent.
    return datetime.fromtimestamp(ts_ms / 1000).hour


# Expose the function to Spark SQL as get_hour(...), returning an integer.
spark.udf.register('get_hour', _hour_of_epoch_ms, T.IntegerType())

<function __main__.<lambda>(x)>

In [15]:
# Add an `hour` column computed by the get_hour UDF and return the single row
# with the largest hour value.
latest_hour_query = '''SELECT *, get_hour(ts) AS hour
          FROM user_log_sql
          ORDER BY hour DESC
          LIMIT 1
          '''
spark.sql(latest_hour_query).collect()

[Row(artist='Coldplay', auth='Logged In', firstName='Kaden', gender='M', itemInSession=33, lastName='Campbell', length=249.23383, level='paid', location='Detroit-Warren-Dearborn, MI', method='PUT', page='NextSong', registration=1499474856284, sessionId=4099, song='Strawberry Swing', status=200, ts=1513771206284, userAgent='"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"', userId='445', hour=23)]

In [16]:
# Count 'NextSong' events per hour of day, ordered by hour.
plays_per_hour_query = """SELECT get_hour(ts) AS hour, COUNT(*) AS play_per_hour                       
    FROM user_log_sql
    WHERE page = 'NextSong'
    GROUP BY hour
    ORDER BY hour
    """
song_in_hour = spark.sql(plays_per_hour_query)

In [18]:
# Display only the first 5 rows of the hourly play counts.
song_in_hour.show(5)

+----+-------------+
|hour|play_per_hour|
+----+-------------+
|   0|          339|
|   1|          462|
|   2|          479|
|   3|          484|
|   4|          430|
+----+-------------+
only showing top 5 rows



# Converting Results to Pandas

In [21]:
# Keep just the first five hourly rows, then convert that small result to a
# pandas DataFrame.
first_five_hours = song_in_hour.limit(5)
song_in_hour_pd = first_five_hours.toPandas()

In [22]:
# Bare last expression: the notebook renders the pandas DataFrame as a table.
song_in_hour_pd

Unnamed: 0,hour,play_per_hour
0,0,339
1,1,462
2,2,479
3,3,484
4,4,430
