In [0]:
# These are the questions that can be asked

'''
1) What happens if most records have 3 key-value pairs and one record has 4?
2) What are multiline and line-delimited JSON, and which one is processed faster?
3) How to convert nested data into a Spark DataFrame (very important; the full explanation is in the last cell block of the other Spark folders)
4) What will happen if I have a corrupt JSON file?

line delimited json:
{.........},
{.........},
{.........}

multiline:
[
    {
        ...,
        ...,
        ...
    },
    {
        ...,
        ...,
        ...
    }
]
'''

In [0]:
'''
By default Spark reads JSON in line-delimited format, so loading multiline JSON requires an extra step.

Multiline JSON takes more time to process but is widely used in industry; the fifth cell shows how to read it.
'''

In [0]:
# Read a JSON file into a DataFrame, explicitly requesting PERMISSIVE mode.
# NOTE(review): "inferschema" is a CSV reader option and does not appear among
# the documented JSON options -- confirm it is a no-op here and drop it if so.

df_json = (
    spark.read.format("json")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/tables/json_data.txt")
)

df_json.show()

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [0]:
# What happens when the JSON records do not all have the same fields?

df_json_ex = (
    spark.read.format("json")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/tables/json_diff_field.txt")
)

df_json_ex.show()

# Spark creates a column for the extra field and fills it only for the records that have it; every other record gets null.

+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  null|  Manish| 20000|
| 25|  null|  Nikita| 21000|
| 16|  null|  Pritam| 22000|
| 35|  null|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+



In [0]:
# Read a multiline JSON file: same reader settings as before plus multiline=true.
df_json_multi = (
    spark.read.format("json")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .option("multiline", "true")
    .load("/FileStore/tables/multiline_correct.txt")
)

df_json_multi.show()

# The multiline option must be set to true for this file to be read correctly; in a multiline JSON file the records are always wrapped in [].

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [0]:
# Detecting corrupted records while reading JSON: no extra option is set here,
# the reader surfaces them on its own (see the _corrupt_record column in the output).

df_corrupt = (
    spark.read.format("json")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/tables/corrupt_json.txt")
)

df_corrupt.show()


+--------------------+----+--------+------+
|     _corrupt_record| age|    name|salary|
+--------------------+----+--------+------+
|                null|  20|  Manish| 20000|
|                null|  25|  Nikita| 21000|
|                null|  16|  Pritam| 22000|
|                null|  35|Prantosh| 25000|
|{"name":"Vikash",...|null|    null|  null|
+--------------------+----+--------+------+



In [0]:
# Working with nested JSON: load the multiline file, then inspect a few rows
# and the inferred (nested) schema.


df_nest = (
    spark.read.format("json")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .option("multiline", "true")
    .load("/FileStore/tables/nested_json.json")
)

df_nest.show(5)        # first five rows only
df_nest.printSchema()  # arrays and structs show up as nested entries

+----+-------+--------------------+-------------+-------------+-------------+------+
|code|message|         restaurants|results_found|results_shown|results_start|status|
+----+-------+--------------------+-------------+-------------+-------------+------+
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|[{{{17066603}, b9...|         6835|           20|            1|  null|
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|                  []|            0|            0|            1|  null|
|null|   null|[{{{17093124}, b9...|         8680|           20|            1|  null|
+----+-------+--------------------+-------------+-------------+-------------+------+
only showing top 5 rows

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- restaurants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- restaurant: struct (nullabl

In [0]:
# In the output above the nested columns are hard to read. To make the data readable and usable for gathering information, we need to flatten it into a proper DataFrame.

# Flattening deals with two kinds of nested types, array and struct; depending on which fields you want, the arrays have to be exploded.

# Arrays need to be exploded. Use explode_outer when an array can be null or empty: plain explode drops those rows entirely, whereas explode_outer keeps the row and puts null in the exploded column.

# For a struct, use dot notation (.) to traverse into it and reach the field you want; Spark creates a column for that field when you select it.

from pyspark.sql.functions import *
from pyspark.sql.types import *

# Suppose I want res_id, name, each element of establishment_types, and each element of offers; the code is in the following cells.

In [0]:
# Explode the restaurants array into a single struct column named "res".
# printSchema() only prints and returns None, so it must not be chained onto
# the assignment -- otherwise df_temp would be bound to None, not the DataFrame.
df_temp = df_nest.select(explode("restaurants").alias("res"))
df_temp.printSchema()


root
 |-- res: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |-- has_table_booking: long (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- is_delivering_now: long (nullable = true)
 |    |    |-- location: struct (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |  

In [0]:
# Keep only the exploded restaurants array, exposed as the column "res".
df_temp = df_nest.select(
    explode("restaurants").alias("res"),
)

# Show the rows with full cell contents (no truncation).
df_temp.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Preview the schema after additionally exploding establishment_types.
# The result is deliberately NOT assigned back to df_temp: printSchema()
# returns None, and overwriting df_temp with None would break the later cell
# that still calls df_temp.select(...).
df_temp.select(
    "*",
    explode("res.restaurant.establishment_types").alias("establishment_element"),
).printSchema()

# Using this schema as a reference, the next cell builds the flattening logic.

root
 |-- res: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |-- has_table_booking: long (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- is_delivering_now: long (nullable = true)
 |    |    |-- location: struct (nullable = true)
 |    |    |    |-- address: string (nullable = true)
 |    |    |  

In [0]:
# Two explodes in a single select raised an error, so each array is exploded
# in its own select and the wanted fields are projected at the end.
# NOTE(review): this cell rebinds df_temp to a frame without the "res" column,
# so running it a second time without re-running the cell that builds df_temp will fail.
df_temp = (
    df_temp
    # step 1: explode establishment_types, keeping every existing column
    .select(
        "*",
        explode_outer("res.restaurant.establishment_types").alias("establishment_element"),
    )
    # step 2: explode offers the same way
    .select(
        "*",
        explode_outer("res.restaurant.offers").alias("offer_element"),
    )
    # step 3: keep only the four fields of interest
    .select(
        "res.restaurant.R.res_id",
        "res.restaurant.name",
        "establishment_element",
        "offer_element",
    )
)


df_temp.show()


# We use explode_outer because plain explode drops every row whose array is null or empty, which can leave the whole result empty; explode_outer keeps those rows and shows null for the exploded column instead.

# This is how you can traverse nested JSON and create a clean DataFrame according to your needs.

+--------+--------------------+---------------------+-------------+
|  res_id|                name|establishment_element|offer_element|
+--------+--------------------+---------------------+-------------+
|17066603|            The Coop|                 null|         null|
|17059541|Maggiano's Little...|                 null|         null|
|17064405|Tako Cheena by Po...|                 null|         null|
|17057797|Bosphorous Turkis...|                 null|         null|
|17057591|Bahama Breeze Isl...|                 null|         null|
|17064266|Hawkers Asian Str...|                 null|         null|
|17060516|Seasons 52 Fresh ...|                 null|         null|
|17060320|Raglan Road Irish...|                 null|         null|
|17059060|           Hillstone|                 null|         null|
|17059012|Hollerbach's Will...|                 null|         null|
|17060869|     Texas de Brazil|                 null|         null|
|17061231|    The Ravenous Pig|                 