## Ingest data from circuits.csv

#### Step 1 - Read the CSV file using the Spark DataFrameReader

In [0]:
%python
## get file source

display(dbutils.fs.mounts())


In [0]:
# List the contents of the raw folder (same listing as `%fs ls`).
display(dbutils.fs.ls('/mnt/formula19533dl/raw'))

#### Read the file from the path listed above

#### spark.read.csv()

In [0]:
# First attempt: read the file with no options set, to see what the defaults give.
circuits_df = spark.read.format("csv").load('dbfs:/mnt/formula19533dl/raw/circuits.csv')

In [0]:
# Show the result of the option-less read. To inspect only the column types,
# use: print(circuits_df.dtypes)
display(circuits_df)

#### option - `header=True`

In [0]:
# In the output above the header line was read as an ordinary data row.
# Setting the `header` option makes Spark use that first line as column names.
circuits_df = (
    spark.read
    .option("header", True)
    .csv('dbfs:/mnt/formula19533dl/raw/circuits.csv')
)

# Equivalent keyword form:
# circuits_df = spark.read.csv('dbfs:/mnt/formula19533dl/raw/circuits.csv', header=True)
display(circuits_df)

#### .printSchema() - will display the datatypes of the columns

In [0]:
# Print the DataFrame's schema: each column's name, data type and nullability.
circuits_df.printSchema()

- Here every column is treated as a string; we can change that by adding `inferSchema=True`.

In [0]:
# Show a statistical profile of the DataFrame's columns:
# count, mean, stddev, min and max.
circuits_df.describe().show()

### option - `inferSchema=True`
- Spark determines each column's data type by looking at the data

In [0]:
# Read with the first line as column names and let Spark infer each column's
# data type from the data.
#
# Only one of the two equivalent forms is executed: with inferSchema enabled,
# each read makes an additional pass over the file, so running both would
# repeat that work only to produce an identical DataFrame.
circuits_df = spark.read \
  .option("header", True) \
  .option("inferSchema", True) \
  .csv('dbfs:/mnt/formula19533dl/raw/circuits.csv')

# Equivalent keyword form:
# circuits_df = spark.read.csv('dbfs:/mnt/formula19533dl/raw/circuits.csv', header=True, inferSchema=True)

In [0]:
# Print the schema again; the columns should now show the inferred data types
# instead of all being strings.
circuits_df.printSchema()

### Using custom Schema for the dataframe

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [0]:
# Explicit schema for circuits.csv, one entry per column in file order:
# (column name, data type, nullable).
# NOTE(review): Spark's file readers may relax nullable=False to nullable on
# load - confirm with printSchema() after reading.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Read the file using the explicit circuits_schema instead of inferring types;
# the header line is still consumed as column names rather than data.
circuits_df = (
    spark.read
    .schema(circuits_schema)
    .option("header", True)
    .csv('dbfs:/mnt/formula19533dl/raw/circuits.csv')
)

In [0]:
# Show the data loaded with the explicit schema.
display(circuits_df)

In [0]:
# Print the resulting schema to compare it with the types declared in circuits_schema.
circuits_df.printSchema()