### Reading CSV data with an inferred schema

In [0]:
# Read the CSV file into a DataFrame.
#   header=true      -> take the column names from the first line of the file
#   inferSchema=true -> let Spark scan the data and choose a type for each column
#                       (without it every column is read as a string)
df = (spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/netflix_titles.csv"))

# Alternatively, if your CSV file does not have a header row, read it with
# header=false; Spark then auto-generates the column names (_c0, _c1, ...).
# This variant is left commented out so that it does not overwrite `df`:
# applied to a file that does have a header row, it would load that row as data.
#
# df = (spark.read
#       .format("csv")
#       .option("header", "false")  # When the CSV file does not have any headers
#       .option("inferSchema", "true")
#       .load("/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/netflix_titles.csv"))

In [0]:
# Preview the DataFrame using show()'s defaults, spelled out explicitly:
# the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

# Other ways to call show():
#   df.show(50)                  -> first 50 rows
#   df.show(10, truncate=False)  -> first 10 rows, cell values shown in full

In [0]:
# Print the schema of the DataFrame as a tree:
# one line per column with its name, data type, and nullability
df.printSchema()

### Reading CSV data with explicit schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Explicit schema for the CSV file: one nullable field per column, listed in
# the order the fields are applied to the file's columns.
schema = StructType([
    StructField(column_name, spark_type, True)
    for column_name, spark_type in [
        ("show_id", StringType()),
        ("type", StringType()),
        ("title", StringType()),
        ("director", StringType()),
        ("cast", StringType()),
        ("country", StringType()),
        ("date_added", DateType()),
        ("release_year", IntegerType()),
        ("rating", StringType()),
        ("duration", StringType()),
        ("listed_in", StringType()),
        ("description", StringType()),
    ]
])


In [0]:
# Load the CSV file with the explicit schema defined above; the first line of
# the file is treated as a header row rather than as data.
df = spark.read.load(
    "/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/netflix_titles.csv",
    format="csv",
    schema=schema,
    header="true",
)

In [0]:
# Preview the first 20 rows (show()'s defaults, written out explicitly)
df.show(n=20, truncate=True)

### Common issues faced while working with CSV data

In [0]:
# Read CSV file into a DataFrame, handling missing values and quoted fields.
#   nullValue="null" -> the literal text "null" is read as a missing value
#   quote='"'        -> fields may be wrapped in double quotes
#   escape='"'       -> a doubled quote ("") inside a quoted field is read as
#                       one literal quote character
# Note: "escapeQuotes" is a CSV *writer* option and has no effect on reads,
# so the reader options "quote" and "escape" are used here instead.
df = (spark.read.format("csv")
      .option("header", "true")
      .option("nullValue", "null")
      .option("quote", "\"")
      .option("escape", "\"")
      .schema(schema)
      .load("/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/netflix_titles.csv"))

In [0]:
# Preview only the first 5 rows of the DataFrame
df.show(n=5)

In [0]:
# Read CSV file into a DataFrame, controlling how missing and empty values are read.
#   nullValue="null" -> the literal text "null" is read as a missing value
#   emptyValue=""    -> string representation of an empty value
# Note: the option name is "emptyValue" (singular); a misspelled option name
# such as "emptyValues" is silently ignored by the CSV reader.
df = (spark.read
      .format("csv")
      .option("header", "true")
      .option("nullValue", "null")
      .option("emptyValue", "")
      .schema(schema)
      .load("/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/netflix_titles.csv"))

In [0]:
# Show just the top 5 rows to check the result of the read
df.show(n=5)

In [0]:
# Read CSV file into a DataFrame, telling Spark how DateType columns are written.
# The pattern "MMMM d, y" matches a full month name, day of month, a comma,
# then the year. "MMMM" is the month form meant for use inside a complete date;
# "LLLL" is the stand-alone form intended for a month with no other date fields.
df = (spark.read.format("csv")
      .option("header", "true")
      .option("nullValue", "null")
      .option("dateFormat", "MMMM d, y")
      .schema(schema)
      .load("/Volumes/mycatalog/myschema/myvolume/repofiles/Data-Engineering-with-Databricks-Cookbook-main/data/netflix_titles.csv"))



In [0]:
# Preview the first 20 rows (show()'s defaults, written out explicitly)
df.show(n=20, truncate=True)