In [0]:
# Load the 2010 flight summary CSV: the first line supplies the column names,
# column types are inferred from the data, and FAILFAST aborts the read as
# soon as a malformed record is encountered.
flight_df = (
    spark.read.format("csv")
    .options(header="true", inferschema="true", mode="FAILFAST")
    .load("/FileStore/tables/2010_summary.csv")
)

# Preview the first 10 rows.
flight_df.show(10)

''' 
	=> spark is the SparkSession object, the entry point used to read data
		
	=> format() specifies the source format (CSV, JSON, JDBC, etc.). It is optional; if it is not given, Spark defaults to Parquet
		
	=> option() sets reader parameters as key/value pairs, e.g. ("header","true"), ("inferSchema","true")
		
	=> schema() is used when you want to specify the schema explicitly instead of letting Spark infer it
		
	=> load() takes the path of the data and returns it as a DataFrame

MODE : the "mode" option controls how corrupt (malformed) records are handled
			a) FAILFAST: fail immediately when any corrupt record is detected; the
                        whole read is aborted with an error (e.g. a column declared as integer contains a non-numeric value such as "abc")
  
			b) DROPMALFORMED: drops only the corrupted records and keeps the rest

			c) PERMISSIVE: the default when no mode is given. It keeps corrupted records and substitutes null for the fields that cannot be parsed

'''


+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows



In [0]:
# Print the inferred schema as a tree (column name, type, nullability).
flight_df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
# How to define a schema manually instead of inferring it (i.e. with inferSchema set to false)
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Hand-written schema for the flight summary file. Each add() call declares
# one column as (name, data type, nullable); under the hood this appends a
# StructField(name, dataType, nullable) to the StructType.
my_schema = (
    StructType()
    .add("destination_country", StringType(), True)
    .add("origin_country", StringType(), True)
    .add("count", IntegerType(), True)
)

In [0]:
# Read the same CSV again, this time applying the hand-written schema
# (my_schema) instead of inferring one.
# - header=true makes Spark skip the file's header line. Because a schema is
#   supplied, the column names come from my_schema, not from the header.
#   Avoid header=false combined with skipRows=1: skipRows is not part of the
#   open-source Spark CSV options, so outside Databricks it is ignored and
#   the header line is loaded as a data row.
# - inferschema=false is kept for explicitness; no inference is needed when a
#   schema is provided.
# - PERMISSIVE (the default mode) keeps malformed records and sets the fields
#   that cannot be parsed to null.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferschema", "false")
    .schema(my_schema)
    .option("mode", "PERMISSIVE")
    .load("/FileStore/tables/2010_summary.csv")
)

# Preview the first 5 rows; the columns carry the names defined in my_schema.
df.show(5)

+-------------------+--------------+-----+
|destination_country|origin_country|count|
+-------------------+--------------+-----+
|      United States|       Romania|    1|
|      United States|       Ireland|  264|
|      United States|         India|   69|
|              Egypt| United States|   24|
|  Equatorial Guinea| United States|    1|
+-------------------+--------------+-----+
only showing top 5 rows

