In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Reader settings for the raw Netflix titles CSV:
#   header     -> take column names from the first line of the file
#   nullValue  -> read the literal text "null" as a missing value
#   dateFormat -> pattern handed to the CSV reader for date columns
# NOTE(review): no schema and no inferSchema option are supplied here, so
# presumably every column is loaded as a string and dateFormat has no
# visible effect -- confirm before relying on typed dates.
csv_read_options = {
    "header": "true",
    "nullValue": "null",
    "dateFormat": "LLLL d, y",
}

df = (
    spark.read
    .format("csv")
    .options(**csv_read_options)
    .load("../data/netflix_titles.csv")
)

In [0]:
# Persist the DataFrame as comma-delimited CSV with a header row,
# replacing whatever already exists at the target location.
csv_writer = df.write.format("csv").mode("overwrite")
csv_writer = csv_writer.option("header", "true").option("delimiter", ",")
csv_writer.save("../data/data_lake/netflix_csv_data")

In [0]:
# Write the same DataFrame in JSON format, overwriting any previous output
# at the target location.
df.write.save(
    "../data/data_lake/netflix_json_data",
    format="json",
    mode="overwrite",
)

In [0]:
# Write the same DataFrame in Parquet format, overwriting any previous
# output at the target location.
parquet_target = "../data/data_lake/netflix_parquet_data"
df.write.mode("overwrite").format("parquet").save(parquet_target)

### Write Compressed Data

In [0]:
# Write the DataFrame as gzip-compressed, comma-delimited CSV with a header
# row, overwriting any previous output at the target location.
#
# The documented CSV writer option for compression is "compression" with a
# short codec name ("gzip"); it replaces the legacy "codec" key that was
# given a fully qualified Hadoop class name.
# NOTE(review): save() presumably creates a directory of part files, so the
# ".gz" suffix in the path names a folder rather than a single archive --
# confirm that downstream readers expect this.
(df.write
 .format("csv")
 .mode("overwrite")
 .option("header", "true")
 .option("delimiter", ",")
 .option("compression", "gzip")
 .save("../data/data_lake/netflix_csv_data.gz"))

### Specify the Number of Partitions

In [0]:
# Redistribute the DataFrame into 4 partitions, then write it as
# comma-delimited CSV with a header row, overwriting any previous output.
# NOTE(review): presumably this yields four part files in the target
# folder -- confirm against the written output.
four_partition_df = df.repartition(4)

(
    four_partition_df.write
    .options(header="true", delimiter=",")
    .format("csv")
    .mode("overwrite")
    .save("../data/data_lake/netflix_csv_data_4_part")
)

### Use `coalesce()` to Reduce the Number of Partitions

In [0]:
# Collapse the DataFrame to a single partition before writing it as
# comma-delimited CSV with a header row, overwriting any previous output.
single_partition_df = df.coalesce(1)

single_partition_df.write.save(
    "../data/data_lake/netflix_csv_data_whole",
    format="csv",
    mode="overwrite",
    header="true",
    delimiter=",",
)

### Use `partitionBy()` to write partitions based on a column

In [0]:
# Write comma-delimited CSV with a header row, partitioned on disk by the
# 'release_year' column; any previous output at the target is overwritten.
df.write.save(
    "../data/data_lake/netflix_csv_data_partitioned",
    format="csv",
    mode="overwrite",
    partitionBy="release_year",
    header="true",
    delimiter=",",
)

In [0]:
# Shut down the `spark` object (created outside this view) now that every
# write cell above has run; cells that use `spark` cannot be re-run after
# this without recreating it.
spark.stop()