In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse, via getOrCreate) the SparkSession used by every cell below,
# running locally with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

## Create DataFrame from CSV with no headers

In [3]:
# Read the CSV with default options: no header handling, so the columns are
# named _c0, _c1, ... and the file's first line shows up as an ordinary data row.
df = spark.read.csv("../Example_Sources/zipcodes.csv")
df.printSchema()
# Limit in Spark *before* converting to pandas, so only the previewed rows are
# sent to the driver instead of collecting the whole DataFrame to show five rows.
df.limit(5).toPandas()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)



Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16,_c17,_c18,_c19
0,RecordNumber,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Xaxis,Yaxis,Zaxis,WorldRegion,Country,LocationText,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages,Notes
1,1,704,STANDARD,PARC PARQUE,PR,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Parc Parque, PR",NA-US-PR-PARC PARQUE,FALSE,,,,
2,2,704,STANDARD,PASEO COSTA DEL SUR,PR,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Paseo Costa Del Sur, PR",NA-US-PR-PASEO COSTA DEL SUR,FALSE,,,,
3,10,709,STANDARD,BDA SAN LUIS,PR,NOT ACCEPTABLE,18.14,-66.26,0.38,-0.86,0.31,,US,"Bda San Luis, PR",NA-US-PR-BDA SAN LUIS,FALSE,,,,
4,61391,76166,UNIQUE,CINGULAR WIRELESS,TX,NOT ACCEPTABLE,32.72,-97.31,-0.1,-0.83,0.54,,US,"Cingular Wireless, TX",NA-US-TX-CINGULAR WIRELESS,FALSE,,,,


Other possible syntax
df = spark.read.format("csv").load("../Example_Sources/zipcodes.csv")
df = spark.read.format("org.apache.spark.sql.csv").load("../Example_Sources/zipcodes.csv")

org.apache.spark.sql.csv is the fully qualified name for csv

## Create DataFrame from CSV with 1st row as header

In [4]:
# header=True makes Spark use the first line of the file as column names
# instead of reading it as a data row.
df2 = spark.read.option("header", True).csv("../Example_Sources/zipcodes.csv")
df2.printSchema()
# Limit in Spark *before* converting to pandas, so only the previewed rows are
# sent to the driver instead of collecting the whole DataFrame to show five rows.
df2.limit(5).toPandas()

root
 |-- RecordNumber: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: string (nullable = true)
 |-- Long: string (nullable = true)
 |-- Xaxis: string (nullable = true)
 |-- Yaxis: string (nullable = true)
 |-- Zaxis: string (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: string (nullable = true)
 |-- TaxReturnsFiled: string (nullable = true)
 |-- EstimatedPopulation: string (nullable = true)
 |-- TotalWages: string (nullable = true)
 |-- Notes: string (nullable = true)



Unnamed: 0,RecordNumber,Zipcode,ZipCodeType,City,State,LocationType,Lat,Long,Xaxis,Yaxis,Zaxis,WorldRegion,Country,LocationText,Location,Decommisioned,TaxReturnsFiled,EstimatedPopulation,TotalWages,Notes
0,1,704,STANDARD,PARC PARQUE,PR,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Parc Parque, PR",NA-US-PR-PARC PARQUE,False,,,,
1,2,704,STANDARD,PASEO COSTA DEL SUR,PR,NOT ACCEPTABLE,17.96,-66.22,0.38,-0.87,0.3,,US,"Paseo Costa Del Sur, PR",NA-US-PR-PASEO COSTA DEL SUR,False,,,,
2,10,709,STANDARD,BDA SAN LUIS,PR,NOT ACCEPTABLE,18.14,-66.26,0.38,-0.86,0.31,,US,"Bda San Luis, PR",NA-US-PR-BDA SAN LUIS,False,,,,
3,61391,76166,UNIQUE,CINGULAR WIRELESS,TX,NOT ACCEPTABLE,32.72,-97.31,-0.1,-0.83,0.54,,US,"Cingular Wireless, TX",NA-US-TX-CINGULAR WIRELESS,False,,,,
4,61392,76177,STANDARD,FORT WORTH,TX,PRIMARY,32.75,-97.33,-0.1,-0.83,0.54,,US,"Fort Worth, TX",NA-US-TX-FORT WORTH,False,2126.0,4053.0,122396986.0,


### Read Multiple CSV Files
df = spark.read.csv("path1,path2,path3")

### Read all CSV files in a Directory
df = spark.read.csv("Folder path")

## Provide Options

### How

#### Chaining option(self, key, value) to use multiple options
df4 = spark.read.option("inferSchema",True).option("delimiter",",").csv("../Example_Sources/zipcodes.csv")

#### Use multiple options at once options(self, **options) method
df4 = spark.read.options(inferSchema='True',delimiter=',').csv("../Example_Sources/zipcodes.csv")

### Options

`delimiter` - used to specify the column delimiter of the CSV file. By default, it is comma (,) character,
but can be set to any character like pipe(|), tab (\t), space using this option.

`inferSchema` - The default value set to this option is False when setting to true it automatically infers column 
types based on the data. Note that, it requires reading the data one more time to infer the schema.

`header` - This option is used to read the first line of the CSV file as column names. By default the value of this 
option is False , and all column types are assumed to be a string.

`quotes` - When you have a column with a delimiter that used to split the columns, use quotes option to specify 
the quote character, by default it is ” and delimiters inside quotes are ignored. but using this option you 
can set any character

`nullValues` - Using nullValues option you can specify the string in a CSV to consider as null. 
For example, if you want to consider a date column with a value "1900-01-01" set null on DataFrame.

`dateFormat` - dateFormat option to used to set the format of the input DateType and TimestampType
columns. Supports all java.text.SimpleDateFormat formats.
 
`Full Documentation` - https://docs.databricks.com/data/data-sources/read-csv.html

## Reading CSV files with a user-specified custom schema

schema = StructType().add("RecordNumber",IntegerType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Zipcode",IntegerType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("ZipCodeType",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("City",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("State",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("LocationType",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Lat",DoubleType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Long",DoubleType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Xaxis",IntegerType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Yaxis",DoubleType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Zaxis",DoubleType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("WorldRegion",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Country",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("LocationText",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Location",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Decommisioned",BooleanType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("TaxReturnsFiled",StringType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("EstimatedPopulation",IntegerType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("TotalWages",IntegerType(),True) \
      &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;.add("Notes",StringType(),True)
      
df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/tmp/resources/zipcodes.csv")

## Write PySpark DataFrame to CSV file

df.write.option("header", True).csv('../Example_Sources/zipcodes.csv')

### Saving Modes

`overwrite` – mode is used to overwrite the existing file.

`append` – To add the data to the existing file.

`ignore` – Ignores write operation when the file already exists.

`error` – This is a default option when the file already exists, it returns an error.

### Example

df2.write.mode('overwrite').csv("../Example_Sources/zipcodes.csv")

*OR*

df2.write.format("csv").mode('overwrite').save("../Example_Sources/zipcodes.csv")