# Processing JSON Data

- Reference: [Read JSON using PySpark (Medium article)](https://ch-nabarun.medium.com/read-json-using-pyspark-f792bda95741)

In [52]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import functions as F
from urllib.request import Request, urlopen

In [53]:
# Create the SparkSession for this notebook, or reuse one that is already running.
spark = (SparkSession
        .builder
        .appName("json-processing")
        .getOrCreate()
        )

# SparkContext behind the session, obtained through the public accessor
# instead of the private `_sc` attribute.
sc = spark.sparkContext

## #1 Simple JSON (Each line contains a JSON Document)

## #2 Multi line JSON

In [54]:
# Read a JSON file whose document spans several lines; multiLine tells Spark
# not to treat every line as a separate JSON record.
multiline_json_path = '/Users/rk/Desktop/data/multi-line-json.json'
df = spark.read.options(multiLine="true").json(multiline_json_path)

In [55]:
# Print the schema Spark inferred from the JSON file as a tree.
df.printSchema()

root
 |-- nationality: string (nullable = true)
 |-- results: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- user: struct (nullable = true)
 |    |    |    |-- cell: string (nullable = true)
 |    |    |    |-- dob: long (nullable = true)
 |    |    |    |-- email: string (nullable = true)
 |    |    |    |-- gender: string (nullable = true)
 |    |    |    |-- location: struct (nullable = true)
 |    |    |    |    |-- city: string (nullable = true)
 |    |    |    |    |-- state: string (nullable = true)
 |    |    |    |    |-- street: string (nullable = true)
 |    |    |    |    |-- zip: long (nullable = true)
 |    |    |    |-- md5: string (nullable = true)
 |    |    |    |-- name: struct (nullable = true)
 |    |    |    |    |-- first: string (nullable = true)
 |    |    |    |    |-- last: string (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- password: string (nullable = true)
 |    |    

In [56]:
# List the top-level column names of the DataFrame.
df.columns

['nationality', 'results', 'seed', 'version']

In [57]:
# Explode the `results` array so that every user becomes its own row.
# The result is bound to a new name instead of overwriting `df`: exploding an
# already-exploded (struct) column raises an AnalysisException, so reassigning
# `df` would make this cell fail when it is run a second time.
users_df = df.withColumn('results', F.explode('results'))

# Read location and name: `.*` expands each struct into one column per field.
users_df.select("results.user.location.*", "results.user.name.*").show()

+----------+-----------------+--------------------+-----+--------+---------+-----+
|      city|            state|              street|  zip|   first|     last|title|
+----------+-----------------+--------------------+-----+--------+---------+-----+
|tuttlingen|   sachsen-anhalt|   1307 kastanienweg|39587|  daniel|    heinz|   mr|
| straubing|      brandenburg|   4916 wiesenstraße|42722|  amanda|    bruns| miss|
|     essen|baden-württemberg|      8948 gartenweg|31250| bastian|schreiner|   mr|
|      leer|    niedersachsen|     7017 kirchplatz|80447|frederik|     adam|   mr|
|      daun|baden-württemberg|       7359 kirchweg|80580| laurenz| dittrich|   mr|
|  dortmund|        thüringen|     5162 drosselweg|23302|   tommi| gebhardt|   mr|
| helmstedt|           bayern|5703 berliner straße|17288|   fritz|     lang|   mr|
|     börde|      brandenburg|     6664 poststraße|66107|angelina|     fink| miss|
| offenbach|baden-württemberg|   4697 uhlandstraße|29392|  robert|     pohl|   mr|
|bur

## #3 Read JSON Data from HTTP

In [58]:
# Online data source
url = 'https://randomuser.me/api/0.8/?results=10'

# Read the online data file. The response is used as a context manager so the
# connection is closed as soon as the body has been read, and the timeout
# (in seconds) keeps the cell from hanging if the server does not answer.
with urlopen(url, timeout=30) as response:
    data = response.read().decode('utf-8')

In [59]:
# Print the raw response text to inspect the JSON structure.
print(data)

{
    "results": [
        {
            "user": {
                "gender": "male",
                "name": {
                    "title": "mr",
                    "first": "liam",
                    "last": "johnson"
                },
                "location": {
                    "street": "8524 grand ave",
                    "city": "kingston",
                    "state": "alberta",
                    "zip": 84469
                },
                "email": "liam.johnson@example.com",
                "username": "whitekoala531",
                "password": "ursitesux",
                "salt": "0qkdl10I",
                "md5": "2a9e865c19740317496edf5475d9b451",
                "sha1": "4a344da5552157239882eacc9c79da87ab615b64",
                "sha256": "bf5dca2f43a33cccb2d41f5d5db7585e573f3e9debdff04e34617af45d02d08f",
                "registered": 1419353813,
                "dob": 233012308,
                "phone": "438-239-0752",
                "cell": "038-465-4170

In [60]:
# Wrap the downloaded JSON text in a single-element RDD of strings.
rdd = sc.parallelize([data])

# create a Dataframe
# spark.read.json parses the JSON string held in the RDD and infers the schema.
jsonDF = spark.read.json(rdd)

In [61]:
# Preview the parsed DataFrame; long nested values are truncated in the output.
jsonDF.show()

+-----------+--------------------+------------------+-------+
|nationality|             results|              seed|version|
+-----------+--------------------+------------------+-------+
|         CA|[[[038-465-4170, ...|bf63979a09212ed602|    0.8|
+-----------+--------------------+------------------+-------+



In [62]:
# read all the users name:
results = (
    jsonDF
    .withColumn('results', F.explode('results'))
    .select('results.user.name.*', 
            'results.user.location.*', 
            'results.user.email',
            'results.user.username',
            'results.user.password',
            'results.user.salt',
            'results.user.md5',
            'results.user.sha1',
            'results.user.sha256',
            'results.user.dob',
            'results.user.phone',
            'results.user.cell',
            'results.user.picture.*'
           )
)

results.toPandas()

Unnamed: 0,first,last,title,city,state,street,zip,email,username,password,salt,md5,sha1,sha256,dob,phone,cell,large,medium,thumbnail
0,liam,johnson,mr,kingston,alberta,8524 grand ave,84469,liam.johnson@example.com,whitekoala531,ursitesux,0qkdl10I,2a9e865c19740317496edf5475d9b451,4a344da5552157239882eacc9c79da87ab615b64,bf5dca2f43a33cccb2d41f5d5db7585e573f3e9debdff0...,233012308,438-239-0752,038-465-4170,https://randomuser.me/api/portraits/men/51.jpg,https://randomuser.me/api/portraits/med/men/51...,https://randomuser.me/api/portraits/thumb/men/...
1,charles,lavigne,mr,princeton,ontario,6512 dufferin st,94538,charles.lavigne@example.com,purpledog669,olemiss,w8wNictT,f5b8c3699eff312807090296d64e7674,66b6bdebf33191ed0e56486d4aa6fadda4ddecc1,597d7cb1dde9bbcbc258540d7da1a28e095f7bfa0fde1b...,395571170,954-122-6907,903-834-1659,https://randomuser.me/api/portraits/men/13.jpg,https://randomuser.me/api/portraits/med/men/13...,https://randomuser.me/api/portraits/thumb/men/...
2,addison,lavoie,ms,charlottetown,ontario,7386 college ave,30862,addison.lavoie@example.com,brownladybug283,everett,pKYUDwlD,f5d2d47491cb4930c51ecd6b15b044ef,81d5d191b39cd22cac2a21bb9e4e981b1e1d0c4c,e32b5a385488dc1e8830a049510a5c76b9634d47719161...,1103302093,937-302-6434,084-892-3668,https://randomuser.me/api/portraits/women/10.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
3,leo,sirko,mr,windsor,alberta,3816 elgin st,58543,leo.sirko@example.com,lazypanda101,deep,pMztZJ0g,5a20418639ede5a076b5f935628bc2db,4d5abc2915636b58089646941172224102ddbdb3,91dd773682459d550618c1a7a6e10e56527ebab3526ef1...,520364950,122-292-6207,277-725-4243,https://randomuser.me/api/portraits/men/70.jpg,https://randomuser.me/api/portraits/med/men/70...,https://randomuser.me/api/portraits/thumb/men/...
4,ariane,anderson,ms,trout lake,nova scotia,8247 parliament st,63080,ariane.anderson@example.com,redmeercat366,putter,d1PKAnrP,b3719ba724afe91129f5d5e5b8f84655,71813c7e1a752fb1b7655de604026a2d15f92f11,370a8218eb5681766cc11c7d58f6b803c51f8c5418a4a8...,317019928,171-557-8902,305-857-4016,https://randomuser.me/api/portraits/women/72.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
5,benjamin,wong,mr,sandy lake,new brunswick,1243 concession road 6,17446,benjamin.wong@example.com,greenostrich864,bdsm,JTSHfeJh,c04131da188319fc2b1002fdb11633cc,558dce6b1ccf6d026043dfcae8012a00caf25b99,4746d4ae2505350b1bc440f09ab06941913bbd25d54d05...,538100414,353-620-8240,204-828-1143,https://randomuser.me/api/portraits/men/99.jpg,https://randomuser.me/api/portraits/med/men/99...,https://randomuser.me/api/portraits/thumb/men/...
6,noah,gill,mr,inwood,british columbia,6769 oak st,59273,noah.gill@example.com,organicmeercat501,danzig,Wmvc55ln,8824b55f32f5f09a0225b145e60fadf4,0a43fb846ab63708c84723e0268feedb01f12db0,3c0d40782524a95ca12b1090912876d772dea421f07494...,944943006,226-319-8922,489-376-2488,https://randomuser.me/api/portraits/men/35.jpg,https://randomuser.me/api/portraits/med/men/35...,https://randomuser.me/api/portraits/thumb/men/...
7,brielle,lo,ms,havelock,northwest territories,3802 king st,19965,brielle.lo@example.com,yellowkoala285,hobbes,n1Y92rDT,2d3511f99b0d6bccd6fd336e29984297,0578af9bcf9c455f97052658ec985aa7b3b2010d,c4e153378cff538ed9e04d46502ee91805bc976ad77405...,1307415460,516-253-7356,263-768-2543,https://randomuser.me/api/portraits/women/46.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
8,alice,french,miss,kingston,northwest territories,4837 dufferin st,99262,alice.french@example.com,ticklishwolf879,tiger2,2pwHjKEI,257bffc048ec4910f3647fcb0d2800ec,fae1ad2dabbfd97522ddad19fe7b4fc89b32389a,ccc17f9bd35caebf5c509fe965024352cacede10157dfa...,616903456,713-158-9787,194-157-1602,https://randomuser.me/api/portraits/women/34.jpg,https://randomuser.me/api/portraits/med/women/...,https://randomuser.me/api/portraits/thumb/wome...
9,carter,chow,mr,princeton,alberta,9109 arctic way,77910,carter.chow@example.com,blackpanda365,zelda,sJBLHwtg,c04a36791c439d84721fd90a905a454d,f1696c02fe5ab0a5668ad7d298e839056bb8af1c,fb9804b12824212f7cc5db611e502ce7a4374469503b72...,664707734,631-466-5373,342-323-2238,https://randomuser.me/api/portraits/men/1.jpg,https://randomuser.me/api/portraits/med/men/1.jpg,https://randomuser.me/api/portraits/thumb/men/...


## CSV File containing JSON Data

Note: **I could not find a CSV file with a JSON column, so the code below is untested.**

In [69]:
# Load the CSV without a header row. multiLine allows a record to span several
# lines, and the double quote is set as the escape character.
csv_options = {'multiLine': "true", 'escape': '\"', 'header': False}
df = spark.read.options(**csv_options).csv('/Users/rk/Desktop/data/csv-with-json-data.csv')

In [70]:
# Preview the rows parsed from the CSV file.
df.show()

+--------------------+----+----+
|                 _c0| _c1| _c2|
+--------------------+----+----+
|                 Tom|   B|  { |
|         "student":{|null|null|
|          "name":...|null|null|
|                   }|null|null|
|                  } |null|null|
+--------------------+----+----+



In [None]:
# Read the CSV with a header row; the double quote is the escape character so
# that quotes inside the JSON content are read properly.
readCSVFileDF = (
    spark
    .read
    .option("multiLine", "true")
    .option('escape', "\"")
    .option('header', True)
    .csv('mixJSON4.csv')
)
readCSVFileDF.printSchema()
readCSVFileDF.show(truncate=False)

# Step 1: json_tuple extracts student, nationality and subject from the Info
# column; the extracted columns are referred to below as c0, c1 and c2.
readJSONContentDF1 = (
    readCSVFileDF
    .select("*", F.json_tuple("Info", "student", "nationality", "subject"))
    .drop('Info')
)
readJSONContentDF1.show(truncate=False)

# Step 2: json_tuple on the student JSON (c0) yields gender, city and email;
# c1 is kept under the name Nationality.
readJSONContentDF2 = (
    readJSONContentDF1
    .select(
        '*',
        F.col('c1').alias('Nationality'),
        F.json_tuple('c0', 'gender', 'city', 'email').alias('Gender', 'City', 'MailId'),
    )
    .drop('c0', 'c1')
)
readJSONContentDF2.show(truncate=False)

# Step 3: json_tuple on the subject JSON (c2) yields the main and optional subjects.
finalDF = (
    readJSONContentDF2
    .select(
        '*',
        F.json_tuple('c2', 'main', 'optional').alias('MainSubject', 'OptionalSubject'),
    )
    .drop('c2')
)
finalDF.show(truncate=False)


# Alternative: the same extraction, adding one column at a time with withColumn.
readJSONDF = (
    readCSVFileDF
    .select("*", F.json_tuple("Info", "student", "nationality", "subject"))
    .drop('Info')
)
readJSONDF.show(truncate=False)

finalDF1 = (
    readJSONDF
    .withColumn('Gender', F.json_tuple('c0', 'gender'))
    .withColumn('City', F.json_tuple('c0', 'city'))
    .withColumn('MailId', F.json_tuple('c0', 'email'))
    .withColumn('MainSubject', F.json_tuple('c2', 'main'))
    .withColumn('OptionalSubject', F.json_tuple('c2', 'optional'))
    .withColumn('Nationality', F.col('c1'))
    .drop('c0', 'c1', 'c2')
)

finalDF1.show(truncate=False)