In [0]:
# let’s start by creating a SparkSession, which is the entry point for PySpark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

sparkSQL = SparkSession.builder.appName("Implementing SparkSQL").getOrCreate()

data=[("Manoj", 23), ("Sneha", 24), ("Family", 45)]

dfSQL = sparkSQL.createDataFrame(data, ["Name", "Age"])
dfSQL.show()
dfSQL.printSchema()



+------+---+
|  Name|Age|
+------+---+
| Manoj| 23|
| Sneha| 24|
|Family| 45|
+------+---+

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [0]:
# Now that you have a Spark DataFrame, we can use SparkSQL to perform some basic operations on the data.
# First, you need to register the pyspark dataframe before it can be queried with SparkSQL
# ===================================================================================================================
# NOTE - SparkSQL provides a SQL interface for quering structured and semi-structured data. If you have a DataFrame and want to apply SQL queries to it.You can create a temp view to expose the DataFrame as a table in SQL namespace. Creating a temp view provide a convenient bridge btn DataFrame and SQL Queries.
# ===================================================================================================================
# Register the DataFrame
dfSQL.createOrReplaceTempView("personal_info")



In [0]:
# “The table named ‘people’ is created, on which you can run your SQL queries.”
# You can now use the spark.sql() method to query the data

# let's perform select query on "personal_info"
query_result = spark.sql("""
                            select * from personal_info
                         """)
query_result.show()
query_result.printSchema()



+------+---+
|  Name|Age|
+------+---+
| Manoj| 23|
| Sneha| 24|
|Family| 45|
+------+---+

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [0]:
sample_df = sparkSQL.read.csv("/FileStore/tables/sales.csv", header=True, inferSchema=True)
sample_df.show(10, truncate=False)
sample_df.printSchema() 
sample_df.columns



+---------------------------------+---------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Region                           |Country              |Item Type      |Sales Channel|Order Priority|Order Date|Order ID |Ship Date |Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|
+---------------------------------+---------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Australia and Oceania            |Tuvalu               |Baby Food      |Offline      |H             |2010-05-28|669165933|2010-06-27|9925      |255.28    |159.42   |2533654.0    |1582243.5 |951410.5    |
|Central America and the Caribbean|Grenada              |Cereal         |Online       |C             |2012-08-22|963881480|2012-09-15|2804      |205.7     |117.11   |576782.8     |

['Region',
 'Country',
 'Item Type',
 'Sales Channel',
 'Order Priority',
 'Order Date',
 'Order ID',
 'Ship Date',
 'Units Sold',
 'Unit Price',
 'Unit Cost',
 'Total Revenue',
 'Total Cost',
 'Total Profit']

In [0]:
# let's remove the white space in columns and covert to lower case
new_sample_df = sample_df.select([col(x).alias(x.replace(" ", "_").lower()) for x in sample_df.columns])
new_sample_df.show(5, truncate=False)
sample_df.show(5, truncate=False)



+---------------------------------+---------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|region                           |country              |item_type      |sales_channel|order_priority|order_date|order_id |ship_date |units_sold|unit_price|unit_cost|total_revenue|total_cost|total_profit|
+---------------------------------+---------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Australia and Oceania            |Tuvalu               |Baby Food      |Offline      |H             |2010-05-28|669165933|2010-06-27|9925      |255.28    |159.42   |2533654.0    |1582243.5 |951410.5    |
|Central America and the Caribbean|Grenada              |Cereal         |Online       |C             |2012-08-22|963881480|2012-09-15|2804      |205.7     |117.11   |576782.8     |

In [0]:
# creating a temp view, so we can run our qurey on dataframe
sample_df.createOrReplaceTempView("sales_data")
new_sample_df.createOrReplaceTempView("new_sales_data")



In [0]:
# Start performing SQL queries on DataFrame

#  “Select” Operation
sparkSQL.sql("""
                select * from sales_data

            """).show(5, truncate=False)

sparkSQL.sql("""
                select * from new_sales_data

            """).show(5, truncate=False)

sparkSQL.sql("""
                select order_id, region, country, item_type from new_sales_data

            """).show(5, truncate=False)



+---------------------------------+---------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Region                           |Country              |Item Type      |Sales Channel|Order Priority|Order Date|Order ID |Ship Date |Units Sold|Unit Price|Unit Cost|Total Revenue|Total Cost|Total Profit|
+---------------------------------+---------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Australia and Oceania            |Tuvalu               |Baby Food      |Offline      |H             |2010-05-28|669165933|2010-06-27|9925      |255.28    |159.42   |2533654.0    |1582243.5 |951410.5    |
|Central America and the Caribbean|Grenada              |Cereal         |Online       |C             |2012-08-22|963881480|2012-09-15|2804      |205.7     |117.11   |576782.8     |

In [0]:
#  “When” Operation
sparkSQL.sql("""
                select order_id, item_type, sales_channel,
                    case
                        when order_priority == "H" then "High"
                        when order_priority == "L" then "LOW"
                        when order_priority == "M" then "Medium"
                        else "Not Decide"
                    end as set_order_priority
                    from new_sales_data

            """).show(10, truncate=False)



+---------+---------------+-------------+------------------+
|order_id |item_type      |sales_channel|set_order_priority|
+---------+---------------+-------------+------------------+
|669165933|Baby Food      |Offline      |High              |
|963881480|Cereal         |Online       |Not Decide        |
|341417157|Office Supplies|Offline      |LOW               |
|514321792|Fruits         |Online       |Not Decide        |
|115456712|Office Supplies|Offline      |LOW               |
|547995746|Baby Food      |Online       |Not Decide        |
|135425221|Household      |Offline      |Medium            |
|871543967|Vegetables     |Online       |High              |
|770463311|Personal Care  |Offline      |Medium            |
|616607081|Cereal         |Online       |High              |
+---------+---------------+-------------+------------------+
only showing top 10 rows



In [0]:
#  “Like” Operation
sparkSQL.sql("""
                select * from new_sales_data
                where region like "A%"
                order by region asc ;

            """).show(truncate=False)



+---------------------+------------------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|region               |country                       |item_type      |sales_channel|order_priority|order_date|order_id |ship_date |units_sold|unit_price|unit_cost|total_revenue|total_cost|total_profit|
+---------------------+------------------------------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+
|Asia                 |Kyrgyzstan                    |Vegetables     |Online       |H             |2011-06-24|814711606|2011-07-12|124       |154.06    |90.93    |19103.44     |11275.32  |7828.12     |
|Asia                 |Bangladesh                    |Clothes        |Online       |L             |2017-01-13|187310731|2017-03-01|8263      |109.28    |35.84    |902980.64    |296145.92 |6068

In [0]:
#  “Substring” Operation

sparkSQL.sql("""
                select *, substring(country, 0, 3) as country_code
                from new_sales_data 
                order by country asc;
            """).show(10, truncate=False)



+---------------------------------+----------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------+
|region                           |country   |item_type      |sales_channel|order_priority|order_date|order_id |ship_date |units_sold|unit_price|unit_cost|total_revenue|total_cost|total_profit|country_code|
+---------------------------------+----------+---------------+-------------+--------------+----------+---------+----------+----------+----------+---------+-------------+----------+------------+------------+
|Europe                           |Albania   |Clothes        |Online       |C             |2010-02-02|385383069|2010-03-18|2269      |109.28    |35.84    |247956.32    |81320.96  |166635.36   |Alb         |
|Sub-Saharan Africa               |Angola    |Household      |Offline      |M             |2011-04-23|135425221|2011-04-27|4187      |668.27    |502.54   |2798046.49   |210