In [2]:
import os
# Environment variables for PySpark; set before the SparkSession is created.
os.environ.update({
    'SPARK_HOME': "/opt/spark",              # Spark installation directory
    'PYSPARK_DRIVER_PYTHON': "jupyter",      # driver-side Python launcher
    'PYSPARK_DRIVER_PYTHON_OPTS': "lab",     # options passed to the driver launcher
    'PYSPARK_PYTHON': "python",              # Python executable name for PySpark
})

In [3]:
from pyspark.sql import SparkSession

# Create the SparkSession used by every cell below, or reuse one that is
# already active (getOrCreate). appName() sets the application's name.
spark = SparkSession.builder.appName("Create-DataFrame").getOrCreate()

# **Read CSV file into DataFrame**

In [None]:
%%bash
# Peek at the first 10 lines of the raw CSV before loading it with Spark
head -n 10 ./data/products.csv

In [5]:
# Load the products CSV, treating the first line as the column header
csv_file_path = "./data/products.csv"
df = spark.read.option("header", True).csv(csv_file_path)

In [None]:
# Print the column names and types of the DataFrame
df.printSchema()

# Preview the first 5 rows
df.show(n=5)

# **Read CSV with an explicit schema definition**

In [7]:
# import necessary types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [8]:
# Explicit schema for the products data: one nullable field per column,
# listed as (column name, Spark data type) pairs in column order.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("category", StringType()),
        ("quantity", IntegerType()),
        ("price", DoubleType()),
    )
])

In [9]:
# Load the products CSV again, this time applying the explicit schema
# defined above; the first line is still treated as the header.
csv_file_path = "./data/products.csv"
df = spark.read.schema(schema).option("header", True).csv(csv_file_path)

In [None]:
# Print the column names and types of the DataFrame
df.printSchema()

# Preview the first 5 rows
df.show(n=5)

# **Read CSV with inferSchema**

In [11]:
# Load the products CSV with the header row and the inferSchema option enabled
csv_file_path = "./data/products.csv"
df = spark.read.options(header=True, inferSchema=True).csv(csv_file_path)

In [None]:
# Print the column names and types of the DataFrame
df.printSchema()

# Preview the first 5 rows
df.show(n=5)

# **Read JSON file into DataFrame**

## Single-line JSON

In [None]:
%%bash
# Peek at the first 10 lines of the single-line JSON file
head -n 10 data/products_singleline.json

In [14]:
# Load single-line JSON: every line of the file holds one complete JSON
# record, and records are separated by newlines.
json_file_path = "./data/products_singleline.json"
df = spark.read.format("json").load(json_file_path)

In [None]:
# Print the column names and types of the DataFrame
df.printSchema()

# Preview the first 5 rows
df.show(n=5)

## Multi-line JSON

In [None]:
%%bash
# Peek at the first 20 lines of the multi-line JSON file
head -n 20 data/products_multiline.json

In [17]:
# Load multi-line JSON: the file is a JSON array of records separated by
# commas, and each record spans several lines, so multiLine must be enabled.
json_file_path = "./data/products_multiline.json"
df = spark.read.option("multiLine", True).json(json_file_path)

In [None]:
# Print the column names and types of the DataFrame
df.printSchema()

# Preview the first 5 rows
df.show(n=5)

# **Writing into parquet file**

In [20]:
# Save the current DataFrame as Parquet, overwriting any existing output
# at the target path.
parquet_file_path = "./data/products.parquet"
df.write.parquet(parquet_file_path, mode="overwrite")

                                                                                

# **Read parquet file into DataFrame**

In [21]:
# Load the Parquet output written above back into a DataFrame
df = spark.read.format("parquet").load(parquet_file_path)

In [None]:
# Print the column names and types of the DataFrame
df.printSchema()

# Preview the first 5 rows
df.show(n=5)

In [23]:
# Stop the SparkSession created at the top of the notebook now that all
# examples have run.
spark.stop()