In [39]:
try:
  import pyspark
except:
  !pip install pyspark
  import pyspark
finally:
  from pyspark.sql import SparkSession

In [40]:
from pathlib import Path

def create_folder(directory_name: str) -> None:
    """Create a single directory and report the outcome on stdout.

    No exception derived from ``Exception`` propagates to the caller; every
    outcome (success or failure) is reported with a printed message instead.

    Args:
        directory_name: Path of the directory to create. Missing parent
            directories are not created; that case is reported through the
            generic error message.
    """
    try:
        # Path.mkdir() raises the same exceptions as os.mkdir() but does not
        # depend on `os`, which this notebook only imports in a later cell.
        Path(directory_name).mkdir()
        print(f"Directory '{directory_name}' created successfully.")
    except FileExistsError:
        print(f"Directory '{directory_name}' already exists.")
    except PermissionError:
        print(f"Permission denied: Unable to create '{directory_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

def remove_file(directory_name: str, file_name: str) -> None:
    """Delete one file inside a directory and report the outcome on stdout.

    No exception derived from ``Exception`` propagates to the caller; every
    outcome (success or failure) is reported with a printed message instead.

    Args:
        directory_name: Directory that contains the file.
        file_name: Name of the file to delete, relative to ``directory_name``.
    """
    # Built once so the path that is deleted and the path shown in the
    # messages can never drift apart.
    target = f"{Path(directory_name)}/{file_name}"
    try:
        # Path.unlink() avoids relying on `os`, which this notebook only
        # imports in a later cell.
        Path(target).unlink()
        print(f"File '{target}' removed successfully.")
    except FileNotFoundError:
        # A missing file raises FileNotFoundError (not FileExistsError), so
        # this is the branch that makes the "doesn't exists" message reachable.
        print(f"File '{target}' doesn't exists.")
    except PermissionError:
        print(f"Permission denied: Unable to remove file '{target}'.")
    except Exception as e:
        print(f"An error occurred: {e}")



In [41]:
import os.path

try:
  import wget
except:
  !pip install wget -q
  import wget

try:
  import shutil
except:
  !pip install shutil -q
  import shutil

FILE_PATH = "./tmp/"
FILE_NAME = "electro_product_mock.json"
URL = "https://raw.githubusercontent.com/JarekMaleszyk/data-science-project-sandbox/refs/heads/main/data.json"

if not os.path.isfile(FILE_PATH):
    create_folder(FILE_PATH)
    remove_file(FILE_PATH, FILE_NAME) 
    filename = wget.download(URL)
    shutil.move(filename, f"{Path(FILE_PATH)}/{FILE_NAME}")

filename

Directory './tmp/' already exists.
File 'tmp/electro_product_mock.json' removed successfully.
100% [................................................................................] 1537 / 1537

'data.json'

In [42]:
# Obtain the notebook's Spark session (an existing one is reused if present)
# and set the Spark log level to WARN.
sparkSession = (
    SparkSession.builder
    .appName('Spark basics')
    .getOrCreate()
)
sparkSession.sparkContext.setLogLevel("WARN")

In [43]:
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType,
                               ArrayType, DoubleType, BooleanType)

# Explicit schema for the product records: one (column name, Spark type) pair
# per field, every field nullable.
schema = StructType([
    StructField(column_name, column_type, True)  # True = nullable
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("category", StringType()),
        ("quantity", IntegerType()),
        ("price", DoubleType()),
    )
])

In [44]:
# Load the JSON file into a DataFrame using the explicit schema. The
# "multiline" option lets a single JSON document span several lines.
pyspark_df_products = (
    sparkSession.read
    .schema(schema)
    .option("multiline", True)
    .json(f"{Path(FILE_PATH)}/{FILE_NAME}")
)

In [45]:
# Print each column's name, type and nullability as a tree.
pyspark_df_products.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)



In [46]:
# Render the loaded rows as a text table. Long string values are abbreviated
# with "..." in this view; pass truncate=False to show() to see them in full.
pyspark_df_products.show()

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
|  6|            Yoga Mat|         Sports|      30| 29.99|
|  7| Samsung 4K Smart TV|    Electronics|       8|799.99|
|  8|        Levi's Jeans|       Clothing|      15| 49.99|
|  9|Dyson Vacuum Cleaner|Home Appliances|       3|399.99|
| 10| Harry Potter Series|          Books|      20| 15.99|
+---+--------------------+---------------+--------+------+

