# Read/Write File DataFrame Operations

In [None]:
## Fundamentals

### While reading  

## Use below parameters if required; not mandatory to use
# To read from the local filesystem use "file:///path"; to read from Hadoop use either "hdfs:///path" or "/path"
# Use option('inferSchema','true') or inferSchema=True when column datatypes should be inferred from the data in the file
# Use option('header','True') or header=True when the first row should be used as the column names of the dataframe
# Use option('delimiter',';') or sep=';' when the file uses a delimiter to separate columns, in this case ';'; default is ','

### While writing  

## Use below parameters if required; not mandatory to use
# For CSV uncompressed is default, we can use the following: snappy/uncompressed/gzip/lz4/bzip2
# For JSON uncompressed is default, we can use the following: snappy/uncompressed/gzip/lz4
# For ORC snappy is default, we can use the following: snappy/uncompressed/zlib (ORC's gzip-style codec is named zlib)
# For Parquet snappy is default, we can use the following: snappy/uncompressed/gzip
# For Avro snappy is default, we can use the following: snappy/uncompressed/deflate
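# Use coalesce(n) when asked to store the file in a specific number of partitions; 3 is default (use repartition(n) if coalesce doesn't work, e.g. when n is larger than the current partition count, since coalesce can only reduce partitions)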
# Use option('compression','snappy') or compression='snappy' when asked to compress file; in this case snappy compression
# Use option('delimiter','\t') or sep='\t' to separate each column of the file; in this case the file is tab delimited; default is ','
# Use .mode('overwrite') on the writer or mode='overwrite' in the write method when required to overwrite the existing file (option('mode','overwrite') does not set the save mode); 'error' is default; mode = overwrite/append/ignore/error

## CSV File

### Read CSV File

#### Method 1

In [None]:
# Read a semicolon-delimited CSV with a header row, asking Spark to infer the column types.
df = (
    spark.read.format("csv")
    .options(inferSchema='True', header='True', delimiter=';')
    .load("/path")
)
df.show()         # Display rows to confirm the read succeeded
df.printSchema()  # Inspect the resulting column types

#### Method 2

In [None]:
# Read a comma-delimited CSV through the csv() shortcut, passing the options as keyword arguments.
df = spark.read.csv(
    path="/path",
    sep=",",
    inferSchema=True,
    header=True,
)
df.show()         # Display rows to confirm the read succeeded
df.printSchema()  # Inspect the resulting column types

### Write CSV File

#### Method 1

In [None]:
# Write the DataFrame as semicolon-delimited, snappy-compressed CSV using at most 4 partitions.
# The save mode has to be set with .mode(); .option('mode', ...) is handed to the data source
# as an ordinary option and leaves the save mode at its default, so the write would still
# fail when the target path already exists.
(
    df.coalesce(4)
    .write.format("csv")
    .mode("overwrite")
    .option("delimiter", ";")
    .option("compression", "snappy")
    .save("/path")
)

#### Method 2

In [None]:
# Write comma-delimited, snappy-compressed CSV through the csv() shortcut, replacing any existing output.
df.coalesce(4).write.csv(
    path="/path",
    mode="overwrite",
    compression="snappy",
    sep=",",
)

## JSON File

### Read JSON File

#### Method 1

In [None]:
# Read JSON via the generic format()/load() API.
# The CSV-only options (inferSchema, header, delimiter) are dropped: JSON has no header row
# or column delimiter, and Spark infers the JSON schema from the data without being asked.
df = spark.read.format("json").load("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

#### Method 2

In [None]:
# Read JSON via the json() shortcut.
# The CSV-only options (inferSchema, header, delimiter) are dropped because they have no
# meaning for JSON input.
df = spark.read.json("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

### Write JSON File

#### Method 1

In [None]:
# Write snappy-compressed JSON, replacing any existing output.
# The save mode is set with .mode(); .option('mode', ...) does not change the save mode.
# The 'delimiter' option is dropped because it only applies to CSV.
(
    df.write.format('json')
    .mode('overwrite')
    .option("compression", "snappy")
    .save('/path')
)

#### Method 2

In [None]:
# Write snappy-compressed JSON using at most 4 partitions, replacing any existing output.
# The 'delimiter' option is dropped because it only applies to CSV.
df.coalesce(4).write.json("/path", compression="snappy", mode="overwrite")

## ORC File

### Read ORC File

#### Method 1

In [None]:
# Read ORC via the generic format()/load() API.
# ORC files carry their own schema, so the CSV-only options (inferSchema, header,
# delimiter) are dropped.
df = spark.read.format("orc").load("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

#### Method 2

In [None]:
# Read ORC via the orc() shortcut.
# ORC files carry their own schema, so the CSV-only options (inferSchema, header,
# delimiter) are dropped.
df = spark.read.orc("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

### Write ORC File

#### Method 1

In [None]:
# Write snappy-compressed ORC, replacing any existing output.
# The save mode is set with .mode(); .option('mode', ...) does not change the save mode.
# The 'delimiter' option is dropped because it only applies to CSV.
(
    df.write.format('orc')
    .mode('overwrite')
    .option("compression", "snappy")
    .save('/path')
)

#### Method 2

In [None]:
# Write snappy-compressed ORC using at most 4 partitions, replacing any existing output.
# The 'delimiter' option is dropped because it only applies to CSV.
df.coalesce(4).write.orc("/path", compression="snappy", mode="overwrite")

## Parquet File

### Read Parquet File

#### Method 1

In [None]:
# Read Parquet via the generic format()/load() API.
# Parquet files carry their own schema, so the CSV-only options (inferSchema, header,
# delimiter) are dropped.
df = spark.read.format("parquet").load("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

#### Method 2

In [None]:
# Read Parquet via the parquet() shortcut.
# Parquet files carry their own schema, so the CSV-only options (inferSchema, header,
# delimiter) are dropped.
df = spark.read.parquet("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

### Write Parquet File

#### Method 1

In [None]:
# Write snappy-compressed Parquet, replacing any existing output.
# The save mode is set with .mode(); .option('mode', ...) does not change the save mode.
# The 'delimiter' option is dropped because it only applies to CSV.
(
    df.write.format('parquet')
    .mode('overwrite')
    .option("compression", "snappy")
    .save('/path')
)

#### Method 2

In [None]:
# Write snappy-compressed Parquet using at most 4 partitions, replacing any existing output.
# The 'delimiter' option is dropped because it only applies to CSV.
df.coalesce(4).write.parquet("/path", compression="snappy", mode="overwrite")

## Avro File

### Read Avro File

In [None]:
# Read Avro via the generic format()/load() API.
# Avro files carry their own schema, so the CSV-only options (inferSchema, header,
# delimiter) are dropped.
# NOTE(review): the "avro" format presumably needs the external spark-avro module on the
# classpath - confirm for the target cluster.
df = spark.read.format("avro").load("/path")
df.show()         # Confirm the dataframe read
df.printSchema()  # Verify the schema

### Write Avro File

In [None]:
# Write snappy-compressed Avro, replacing any existing output.
# The save mode is set with .mode(); .option('mode', ...) does not change the save mode.
# The 'delimiter' option is dropped because it only applies to CSV.
(
    df.write.format('avro')
    .mode('overwrite')
    .option("compression", "snappy")
    .save('/path')
)

In [None]:
## Hive 

In [None]:
### Read From Hive

In [None]:
#### Method 1

In [None]:
# Load a Hive table through a reader configured with the 'hive' format.
# If the database is default, just use table_name.
df = (
    spark.read
    .format('hive')
    .table('database_name.table_name')
)

In [None]:
#### Method 2

In [None]:
# Load a Hive table directly by name.
# If the database is default, just use table_name.
df = spark.read.table(
    tableName='database_name.table_name',
)

In [None]:
### Write To Hive

In [None]:
#### Method 1

In [None]:
# Save the DataFrame as a snappy-compressed Parquet table after repartitioning to 5 partitions,
# overwriting the table if it is already present.
(
    df.repartition(5)
    .write
    .format('parquet')
    .option('compression', 'snappy')
    .saveAsTable(name='database_name.table_name', mode='overwrite')
)

In [None]:
#### Method 2

In [None]:
df.write.insertInto('database_name.table_name', overwrite=True)
# The table must already exist before using insertInto, whereas saveAsTable (method 1) creates a permanent table in Hive and loads the data
# insertInto appends by default; with overwrite=True (as here) it replaces the existing data instead
# insertInto matches columns by position, not by name, so the DataFrame column order must match the table
# saveAsTable errors by default if the table already exists; pass mode='append' to add rows to an existing table or mode='overwrite' to replace it