In [1]:
# Initialization: describe the local Spark installation through environment
# variables and make its bundled Python libraries importable from this kernel.
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

os.environ.update({
    # Use /usr/bin/python2.7 for both interpreter entries if you want Python 2.
    # NOTE(review): worker and driver interpreters are spelled differently here;
    # presumably /usr/bin/python3 resolves to 3.6 on this machine - confirm.
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    # Extra packages go here as one comma-separated list of Maven coordinates;
    # keep only the coordinates you need (currently spark-xml and spark-avro).
    "PYSPARK_SUBMIT_ARGS": (
        "--packages "
        "com.databricks:spark-xml_2.11:0.6.0,"
        "org.apache.spark:spark-avro_2.11:2.4.3 "
        "pyspark-shell"
    ),
})

# Prepend both archives in one step; pyspark.zip ends up ahead of the py4j archive.
sys.path[0:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

In [2]:
# Entry point for Spark 2.x: build (or reuse) a Hive-enabled SparkSession.
from pyspark.sql import SparkSession

# To run on YARN instead, add .master("yarn") to the chain before .getOrCreate().
spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle on the session's underlying SparkContext.
sc = spark.sparkContext

## Spark context version check

In [3]:
from pyspark import SparkContext, SparkConf

# Print the Spark version and the Python version reported by the SparkContext,
# one value per line.
print(sc.version, sc.pythonVer, sep="\n")

2.4.5
3.6


## Loading XML data

In [4]:
# Load books.xml through the "xml" data source, using <book> as the row tag,
# then preview the resulting DataFrame.
books_xml_path = "file:///home/talentum/test-jupyter/test/books.xml"
xml_reader = spark.read.format("xml").option("rowTag", "book")
df = xml_reader.load(books_xml_path)
df.show()

+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|  _id|              author|         description|          genre|price|publish_date|               title|
+-----+--------------------+--------------------+---------------+-----+------------+--------------------+
|bk101|Gambardella, Matthew|An in-depth look ...|       Computer|44.95|  2000-10-01|XML Developer's G...|
|bk102|          Ralls, Kim|A former architec...|        Fantasy| 5.95|  2000-12-16|       Midnight Rain|
|bk103|         Corets, Eva|After the collaps...|        Fantasy| 5.95|  2000-11-17|     Maeve Ascendant|
|bk104|         Corets, Eva|In post-apocalyps...|        Fantasy| 5.95|  2001-03-10|     Oberon's Legacy|
|bk105|         Corets, Eva|The two daughters...|        Fantasy| 5.95|  2001-09-10|  The Sundered Grail|
|bk106|    Randall, Cynthia|When Carla meets ...|        Romance| 4.95|  2000-09-02|         Lover Birds|
|bk107|      Thurman, Paula|A deep sea diver .

## Loading JSON data

In [5]:
# Load people.json into a DataFrame and preview it.
people_json_path = "file:///home/talentum/test-jupyter/test/people.json"
df = spark.read.json(people_json_path)
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



## Loading Avro data

In [6]:
# Read episodes.avro with Spark's Avro data source and preview it.
# This notebook starts the session with org.apache.spark:spark-avro_2.11 on
# --packages, and that module registers the short format name "avro". The older
# "com.databricks.spark.avro" name belongs to the separate Databricks package and
# is only accepted here through Spark's legacy compatibility mapping, so the
# native name is used instead.
# Equivalent pyspark shell invocation:
#   pyspark --packages org.apache.spark:spark-avro_2.11:2.4.3 --master local[2]
df = spark.read.format("avro").load("file:///home/talentum/test-jupyter/test/episodes.avro")
df.show()

+--------------------+----------------+------+
|               title|        air_date|doctor|
+--------------------+----------------+------+
|   The Eleventh Hour|    3 April 2010|    11|
|   The Doctor's Wife|     14 May 2011|    11|
| Horror of Fang Rock|3 September 1977|     4|
|  An Unearthly Child|23 November 1963|     1|
|The Mysterious Pl...|6 September 1986|     6|
|                Rose|   26 March 2005|     9|
|The Power of the ...| 5 November 1966|     2|
|          Castrolava|  4 January 1982|     5|
+--------------------+----------------+------+



## Loading Parquet data

In [7]:
# Read users.parquet and preview it.
# The format is named explicitly: a bare spark.read.load(path) falls back to the
# session's spark.sql.sources.default setting, which is parquet only while that
# setting is left at its default value.
df = spark.read.parquet("file:///home/talentum/test-jupyter/test/users.parquet")
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [8]:
from pyspark.sql.functions import expr

# Load the raw invoice JSON, then inspect its schema and a couple of nested columns.
invoice_path = "file:///home/talentum/test-jupyter/test/Invoice-set1.json"
raw_df = spark.read.json(invoice_path)
raw_df.printSchema()
raw_df.select("InvoiceNumber", "DeliveryAddress", "InvoiceLineItems").show(2, truncate=False)

# Keep the invoice-level columns (pulling City/State/PinCode out of the
# DeliveryAddress struct) and explode the InvoiceLineItems array so that every
# line item gets its own row in a LineItem struct column.
invoice_columns = [
    "InvoiceNumber", "CreatedTime", "StoreID", "PosID",
    "CustomerType", "PaymentMethod", "DeliveryType",
    "DeliveryAddress.City", "DeliveryAddress.State", "DeliveryAddress.PinCode",
]
explode_df = raw_df.selectExpr(*invoice_columns, "explode(InvoiceLineItems) as LineItem")
explode_df.printSchema()
explode_df.select("InvoiceNumber", "City", "State", "LineItem").show(5, truncate=False)

# Promote each LineItem field to a top-level column of the same name, in this
# order, then drop the now-redundant struct column.
flattened_df = explode_df
for item_field in ("ItemCode", "ItemDescription", "ItemPrice", "ItemQty", "TotalValue"):
    flattened_df = flattened_df.withColumn(item_field, expr("LineItem." + item_field))
flattened_df = flattened_df.drop("LineItem")
flattened_df.printSchema()
flattened_df.select(
    "InvoiceNumber", "City", "State",
    "ItemCode", "ItemDescription", "ItemPrice", "ItemQty",
).show(5, truncate=False)



root
 |-- CESS: double (nullable = true)
 |-- CGST: double (nullable = true)
 |-- CashierID: string (nullable = true)
 |-- CreatedTime: long (nullable = true)
 |-- CustomerCardNo: string (nullable = true)
 |-- CustomerType: string (nullable = true)
 |-- DeliveryAddress: struct (nullable = true)
 |    |-- AddressLine: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- ContactNumber: string (nullable = true)
 |    |-- PinCode: string (nullable = true)
 |    |-- State: string (nullable = true)
 |-- DeliveryType: string (nullable = true)
 |-- InvoiceLineItems: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ItemCode: string (nullable = true)
 |    |    |-- ItemDescription: string (nullable = true)
 |    |    |-- ItemPrice: double (nullable = true)
 |    |    |-- ItemQty: long (nullable = true)
 |    |    |-- TotalValue: double (nullable = true)
 |-- InvoiceNumber: string (nullable = true)
 |-- NumberOfItems: long (nullable = t