Work with StructType and StructField for nested schema definitions.
- Define a nested schema using StructType and StructField.
- Create a JSON-like dataset with nested fields.
- Load the dataset into a PySpark DataFrame.
- Extract and transform nested fields using selectExpr().

In [None]:
from pyspark.sql import SparkSession,Row

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Assessment3')
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructField ,StructType

In [None]:
# assess3_data = [Row(id = 1, name = 'Akash' , dept = ['CSE','AIML']),
#                 Row(id = 2, name = 'Venkatesh' , dept = ['CSE','CORE']),
#                 Row(id = 3, name = 'Rohith' , dept = ['CSE','CS'])]


Why does setting multiLine=True solve the issue?



*   Without **multiLine=True**, Spark reads each line separately and expects it to be a valid JSON object.


*   With **multiLine=True**, Spark understands that the entire file is one JSON object, even if it spans multiple lines.




In [None]:
# Load the sample file with the multiline option enabled so that a JSON
# document spanning several lines is parsed as one record.
json_data = (
    spark.read
    .option('multiline', 'True')
    .json(path="/content/sample1.json")
)

In [None]:
# Preview the raw DataFrame exactly as it was loaded from the JSON file.
json_data.show()

+--------------------+--------------------+
|             brewing|              coffee|
+--------------------+--------------------+
|{{Brewing Co., 10...|{{Coffee Co., 101...|
+--------------------+--------------------+



The data is nested JSON, so to turn it into a well-defined table we'll use explode().



Explode() - The explode() function in PySpark is used to transform an array column into multiple rows, where each element in the array becomes its own row.



In [None]:
# Print the schema to see how the struct and array fields are nested.
json_data.printSchema()

root
 |-- brewing: struct (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- company: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |-- region: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |-- coffee: struct (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- company: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |-- region: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)



In [None]:
from pyspark.sql.functions import col,explode

In [None]:
# Explode both region arrays and pull the country fields out of their structs.
# With two explode() calls in one select, every coffee region is paired with
# every brewing region.
cleaned_json_data = json_data.select(
    explode(col('coffee.region')).alias('Coffee_region'),
    explode(col('brewing.region')).alias('brewing_region'),
    *[
        col(field_path).alias(column_label)
        for field_path, column_label in (
            ('coffee.country.id', 'Coffee_country_id'),
            ('coffee.country.company', 'Coffee_country_name'),
            ('brewing.country.id', 'Brewing_country_id'),
            ('brewing.country.company', 'Brewing_country_name'),
        )
    ],
)

In [None]:
# Display the result of exploding both region arrays.
cleaned_json_data.show()

+------------------+--------------+-----------------+-------------------+------------------+--------------------+
|     Coffee_region|brewing_region|Coffee_country_id|Coffee_country_name|Brewing_country_id|Brewing_country_name|
+------------------+--------------+-----------------+-------------------+------------------+--------------------+
|{1, Latin America}|   {3, Europe}|              101|         Coffee Co.|               102|         Brewing Co.|
|{1, Latin America}|     {4, Asia}|              101|         Coffee Co.|               102|         Brewing Co.|
|       {2, Africa}|   {3, Europe}|              101|         Coffee Co.|               102|         Brewing Co.|
|       {2, Africa}|     {4, Asia}|              101|         Coffee Co.|               102|         Brewing Co.|
+------------------+--------------+-----------------+-------------------+------------------+--------------------+



In [None]:
# new_json_data = json_data.select(
#     explode(col('coffee.region')).alias('Coffee_region'),
#     col("Coffee_region.id").alias("Coffee_region_id"),
#     col("Coffee_region.name").alias("Coffee_region_name"),
#     explode(col('brewing.region')).alias('Brewing_region'),
#     col("Brewing_region.id").alias("Brewing_region_id"),
#     col("Brewing_region.name").alias("Brewing_region_name"),
#     col('coffee.country.id').alias('Coffee_country_id'),
#     col('coffee.country.company').alias('Coffee_country_name'),
#     col('brewing.country.id').alias('Brewing_country_id'),
#     col('brewing.country.company').alias('Brewing_country_name')
# )

In [None]:
# Flatten the nested structure step by step: explode each region array into a
# helper column, copy the nested fields into top-level columns, then drop the
# helper columns and the original structs.
new_json_data1 = (
    json_data
    .withColumn('coffee_exploded', explode(col('coffee.region')))
    .withColumn('brewing_exploded', explode(col('brewing.region')))
    .withColumn('coffee_region_id', col('coffee_exploded.id'))
    .withColumn('coffee_region_name', col('coffee_exploded.name'))
    .withColumn('coffee_country_id', col('coffee.country.id'))
    .withColumn('coffee_country_name', col('coffee.country.company'))
    .withColumn('brewing_region_id', col('brewing_exploded.id'))
    .withColumn('brewing_region_name', col('brewing_exploded.name'))
    .withColumn('brewing_country_id', col('brewing.country.id'))
    .withColumn('brewing_country_name', col('brewing.country.company'))
    .drop('coffee_exploded', 'brewing_exploded', 'coffee', 'brewing')
)

In [None]:
# Display the flattened DataFrame built with withColumn() and drop().
new_json_data1.show()

+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+
|coffee_region_id|coffee_region_name|coffee_country_id|coffee_country_name|brewing_region_id|brewing_region_name|brewing_country_id|brewing_country_name|
+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+
|               1|     Latin America|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|
|               1|     Latin America|              101|         Coffee Co.|                4|               Asia|               102|         Brewing Co.|
|               2|            Africa|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|
|               2|            Africa|              101|         Coffee Co.| 

In [None]:
# Flatten the nested structure with a single select(): explode both region
# arrays into helper columns, then project and rename only the needed fields.
new_json_data = (
    json_data
    .withColumn('coffee_exploded', explode(col('coffee.region')))
    .withColumn('brewing_exploded', explode(col('brewing.region')))
    .select(
        *[
            col(field_path).alias(column_label)
            for field_path, column_label in (
                ('coffee_exploded.id', 'coffee_region_id'),
                ('coffee_exploded.name', 'coffee_region_name'),
                ('coffee.country.id', 'coffee_country_id'),
                ('coffee.country.company', 'coffee_country_name'),
                ('brewing_exploded.id', 'brewing_region_id'),
                ('brewing_exploded.name', 'brewing_region_name'),
                ('brewing.country.id', 'brewing_country_id'),
                ('brewing.country.company', 'brewing_country_name'),
            )
        ]
    )
)

In [None]:
# Display the flattened DataFrame built with select().
new_json_data.show()

+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+
|coffee_region_id|coffee_region_name|coffee_country_id|coffee_country_name|brewing_region_id|brewing_region_name|brewing_country_id|brewing_country_name|
+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+
|               1|     Latin America|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|
|               1|     Latin America|              101|         Coffee Co.|                4|               Asia|               102|         Brewing Co.|
|               2|            Africa|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|
|               2|            Africa|              101|         Coffee Co.| 

**selectExpr()** is a method in PySpark that allows you to write SQL-like expressions inside select().

It is useful when you want to:

1.   Perform column selection using SQL expressions.
2.  Apply transformations like alias, CAST, and CASE WHEN directly.
3.  Avoid using col() or withColumn() for simple expressions.

- In these cases selectExpr() is more convenient than select().



In [None]:
# Check the column types of the flattened DataFrame.
new_json_data1.printSchema()

root
 |-- coffee_region_id: long (nullable = true)
 |-- coffee_region_name: string (nullable = true)
 |-- coffee_country_id: long (nullable = true)
 |-- coffee_country_name: string (nullable = true)
 |-- brewing_region_id: long (nullable = true)
 |-- brewing_region_name: string (nullable = true)
 |-- brewing_country_id: long (nullable = true)
 |-- brewing_country_name: string (nullable = true)



- All the ID columns are of type long, so we'll cast them to integer.

In [None]:
# new_json_data1.selectExpr('cast(coffee_region_id as int)',
#                           'cast(coffee_country_id as int)',
#                           'cast(brewing_region_id as int)',
#                           'cast(brewing_country_id as int)')

In [None]:
# Cast the four ID columns to int with SQL expressions. The name columns are
# listed unchanged so the column order stays the same.
new_json_data1 = new_json_data1.selectExpr([
    'CAST(coffee_region_id as int) as coffee_region_id',
    'coffee_region_name',
    'cast(coffee_country_id as int) as coffee_country_id',
    'coffee_country_name',
    'cast(brewing_region_id as int) as brewing_region_id',
    'brewing_region_name',
    'cast(brewing_country_id as int) as brewing_country_id',
    'brewing_country_name',
])

In [None]:
# Print the schema again to check the ID column types after the cast.
new_json_data1.printSchema()

root
 |-- coffee_region_id: integer (nullable = true)
 |-- coffee_region_name: string (nullable = true)
 |-- coffee_country_id: integer (nullable = true)
 |-- coffee_country_name: string (nullable = true)
 |-- brewing_region_id: integer (nullable = true)
 |-- brewing_region_name: string (nullable = true)
 |-- brewing_country_id: integer (nullable = true)
 |-- brewing_country_name: string (nullable = true)



In [None]:
# Display the rows after the cast.
new_json_data1.show()

+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+
|coffee_region_id|coffee_region_name|coffee_country_id|coffee_country_name|brewing_region_id|brewing_region_name|brewing_country_id|brewing_country_name|
+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+
|               1|     Latin America|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|
|               1|     Latin America|              101|         Coffee Co.|                4|               Asia|               102|         Brewing Co.|
|               2|            Africa|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|
|               2|            Africa|              101|         Coffee Co.| 

**SYNTAX**

```
df.selectExpr(
    "CASE " +
    "WHEN column_name = 'value1' THEN 'result1' " +
    "WHEN column_name = 'value2' THEN 'result2' " +
    "ELSE 'default_result' " +
    "END AS new_column_name"
).show()
```




In [None]:
# Derive a short code for each coffee region with a SQL CASE expression.
# Python joins the adjacent string literals into one expression string.
new_json_data1.selectExpr(
    "CASE "
    "WHEN coffee_region_name = 'Latin America' THEN 'LA' "
    "WHEN coffee_region_name = 'Africa' THEN 'AF' "
    "ELSE 'NA' "
    "END AS coffee_shrt_name"
).show()


+----------------+
|coffee_shrt_name|
+----------------+
|              LA|
|              LA|
|              AF|
|              AF|
+----------------+



In [None]:
# final_new_json_data = new_json_data1.union(new_json_data1.selectExpr(
#     "CASE " +
#     "WHEN coffee_region_name = 'Latin America' THEN 'LA' " +
#     "WHEN coffee_region_name = 'Africa' THEN 'AF' " +
#     "ELSE 'NA' " +
#     "END AS coffee_shrt_name"
# ))

# This approach does not work: union() requires both DataFrames to have the same number of columns.

'''perform a union between two DataFrames with different numbers
 of columns. new_json_data1 has 8 columns,
  while new_json_data1.selectExpr(...) results in a DataFrame
  with only 1 column (coffee_shrt_name).
  The union operation in PySpark requires both DataFrames
   to have the same number of columns.'''

'perform a union between two DataFrames with different numbers\n of columns. new_json_data1 has 8 columns,\n  while new_json_data1.selectExpr(...) results in a DataFrame \n  with only 1 column (coffee_shrt_name). \n  The union operation in PySpark requires both DataFrames\n   to have the same number of columns.'

In [None]:
# new_json_data=new_json_data.withColumn(
#     "coffee_shrt_name",
#     (
#         "CASE " +
#         "WHEN coffee_region_name = 'Latin America' THEN 'LA' " +
#         "WHEN coffee_region_name = 'Africa' THEN 'AF' " +
#         "ELSE 'NA' " +
#         "END "
#     )
# )

# The second argument of withColumn() must be a Column object, but here the
# CASE WHEN expression is passed as a plain string, which is why this does not work.

In [None]:
from pyspark.sql.functions import expr


# Add the short region code as a new column. expr() turns the SQL CASE string
# into a Column, which is what withColumn() expects as its second argument.
new_json_data1 = new_json_data1.withColumn(
    "coffee_shrt_name",
    expr(
        "CASE "
        "WHEN coffee_region_name = 'Latin America' THEN 'LA' "
        "WHEN coffee_region_name = 'Africa' THEN 'AF' "
        "ELSE 'NA' "
        "END "
    ),
)

In [None]:
# Display the DataFrame with the new coffee_shrt_name column.
new_json_data1.show()

+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+----------------+
|coffee_region_id|coffee_region_name|coffee_country_id|coffee_country_name|brewing_region_id|brewing_region_name|brewing_country_id|brewing_country_name|coffee_shrt_name|
+----------------+------------------+-----------------+-------------------+-----------------+-------------------+------------------+--------------------+----------------+
|               1|     Latin America|              101|         Coffee Co.|                3|             Europe|               102|         Brewing Co.|              LA|
|               1|     Latin America|              101|         Coffee Co.|                4|               Asia|               102|         Brewing Co.|              LA|
|               2|            Africa|              101|         Coffee Co.|                3|             Europe|               102|         Brew