<a href="https://colab.research.google.com/github/JarekMaleszyk/data-science-project-sandbox/blob/main/apache_spark_with_python_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [120]:
try:
  import pyspark
except:
  !pip install pyspark
  import pyspark
finally:
  from pyspark.sql import SparkSession

1. Simple JSON file

In [121]:
import os.path
FILE_PATH = "/content/jsondata/data.json"

if not os.path.isfile(FILE_PATH):
  !rm -rf /content/jsondata
  !wget -P "/content/jsondata/" "https://raw.githubusercontent.com/JarekMaleszyk/data-science-project-sandbox/refs/heads/main/data.json"

In [122]:
# Single SparkSession used by every cell below; built through the builder's
# appName(...) / getOrCreate() chain.
# NOTE(review): 'Pratice' looks like a typo for 'Practice'; kept verbatim here
# because the app name is a runtime string.
sparkSession = (
    SparkSession.builder
    .appName('Pratice with pyspark basics')
    .getOrCreate()
)

In [123]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, DoubleType, BooleanType

# Explicit schema for the products file. Every column is declared nullable
# (third argument True), matching the printSchema() output further down.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("category", StringType(), True)
    .add("quantity", IntegerType(), True)
    .add("price", DoubleType(), True)
)

In [124]:
# Load the products file using the explicit schema defined above.
# The "multiline" option is enabled on the reader before parsing FILE_PATH.
pyspark_df_products = (
    sparkSession.read
    .option("multiline", True)
    .schema(schema)
    .json(FILE_PATH)
)

In [125]:
# Print the DataFrame's schema as a tree to confirm the explicit schema was applied.
pyspark_df_products.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)



In [126]:
# Render the loaded product rows as a text table for a quick visual check;
# long string values are shown truncated with "...".
pyspark_df_products.show()

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
|  6|            Yoga Mat|         Sports|      30| 29.99|
|  7| Samsung 4K Smart TV|    Electronics|       8|799.99|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|
| 10| Harry Potter Series|          Books|      20| 15.99|
+---+--------------------+---------------+--------+------+



2. More complex JSON file

In [127]:
import os.path
USER_FILE_PATH = "/content/jsondata/users.json"

if not os.path.isfile(USER_FILE_PATH):
  !rm -f /content/jsondata/users.json
  !wget -P "/content/jsondata/" "https://jsonplaceholder.typicode.com/users"
  !mv /content/jsondata/users /content/jsondata/users.json

In [128]:
# Load the users file without an explicit schema, so the JSON reader infers
# one from the data (nested objects become struct columns).
# The former `.option("inferSchema", True)` was dropped: it is a CSV reader
# option, not a JSON one, and only suggested inference was opt-in here.
pyspark_df_users = (
    sparkSession.read
    .option("multiline", True)
    .json(USER_FILE_PATH)
)

In [129]:
# Print the inferred schema tree, including the nested address/company structs.
pyspark_df_users.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- geo: struct (nullable = true)
 |    |    |-- lat: string (nullable = true)
 |    |    |-- lng: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- suite: string (nullable = true)
 |    |-- zipcode: string (nullable = true)
 |-- company: struct (nullable = true)
 |    |-- bs: string (nullable = true)
 |    |-- catchPhrase: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- username: string (nullable = true)
 |-- website: string (nullable = true)



In [130]:
# Render the user rows as a text table; nested structs and long strings
# appear truncated with "..." in the output.
pyspark_df_users.show()

+--------------------+--------------------+--------------------+---+--------------------+--------------------+----------------+-------------+
|             address|             company|               email| id|                name|               phone|        username|      website|
+--------------------+--------------------+--------------------+---+--------------------+--------------------+----------------+-------------+
|{Gwenborough, {-3...|{harness real-tim...|   Sincere@april.biz|  1|       Leanne Graham|1-770-736-8031 x5...|            Bret|hildegard.org|
|{Wisokyburgh, {-4...|{synergize scalab...|   Shanna@melissa.tv|  2|        Ervin Howell| 010-692-6593 x09125|       Antonette|anastasia.net|
|{McKenziehaven, {...|{e-enable strateg...|  Nathan@yesenia.net|  3|    Clementine Bauch|      1-463-123-4447|        Samantha|  ramiro.info|
|{South Elvis, {29...|{transition cutti...|Julianne.OConner@...|  4|    Patricia Lebsack|   493-170-9623 x156|        Karianne|     kale.biz|
|{Rosc