# ORIGINAL: INFERRED SCHEMA

# Data
### create database

In [0]:
%sql
-- Rebuild the u23_2_3 database from scratch (CASCADE also drops its tables),
-- then list the databases so the result can be checked.
DROP DATABASE IF EXISTS u23_2_3 CASCADE;
CREATE DATABASE u23_2_3;
SHOW DATABASES;

### load data

In [0]:
# DBFS locations of the four source CSV files.
file_location_large = "/FileStore/tables/u23_2_3/flights_larger.csv"
file_location_small = "/FileStore/tables/u23_2_3/flights_small.csv"
file_location_planes = "/FileStore/tables/u23_2_3/planes.csv"
file_location_airports = "/FileStore/tables/u23_2_3/airports.csv"

# Reader settings shared by every file.
file_type = "csv"
infer_schema = "true"          # value passed to the reader's "inferSchema" option
first_row_is_header = "true"   # value passed to the reader's "header" option
delimiter = ","                # value passed to the reader's "sep" option


def _read_source_file(location):
    """Load the file at ``location`` into a DataFrame using the shared settings above."""
    reader = spark.read.format(file_type)
    reader = reader.option("inferSchema", infer_schema)
    reader = reader.option("header", first_row_is_header)
    reader = reader.option("sep", delimiter)
    return reader.load(location)


flights_large_data = _read_source_file(file_location_large)
flights_small_data = _read_source_file(file_location_small)
planes_data = _read_source_file(file_location_planes)
airports_data = _read_source_file(file_location_airports)

### check data

In [0]:
# Inspect the large flights DataFrame: Python type, row count, schema, first 5 rows.
print("type: ", type(flights_large_data))
print("number rows =", flights_large_data.count())
flights_large_data.printSchema()
flights_large_data.show(n=5)

In [0]:
# Inspect the small flights DataFrame: Python type, row count, schema, first 5 rows.
print("type: ", type(flights_small_data))
print("number rows =", flights_small_data.count())
flights_small_data.printSchema()
flights_small_data.show(n=5)

In [0]:
# Inspect the planes DataFrame: Python type, row count, schema, first 5 rows.
print("type: ", type(planes_data))
print("number rows =", planes_data.count())
planes_data.printSchema()
planes_data.show(n=5)

In [0]:
# Inspect the airports DataFrame: Python type, row count, schema, first 5 rows.
print("type: ", type(airports_data))
print("number rows =", airports_data.count())
airports_data.printSchema()
airports_data.show(n=5)

### create permanent tables

In [0]:
# Persist each inferred-schema DataFrame as a Parquet-backed table in the
# u23_2_3 database.
# mode("overwrite") replaces a table left over from an earlier run, so this
# cell can be re-executed on its own; without it the writer's default save
# mode raises an error when the target table already exists.
flights_large_data.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3.flights_larger")
flights_small_data.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3.flights_small")
planes_data.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3.planes")
airports_data.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3.airports")

# NEW: SPECIFY SCHEMA

In [0]:
from pyspark.sql.types import *

### create database

In [0]:
%sql
-- Rebuild the u23_2_3_schema database from scratch (CASCADE also drops its
-- tables), then list the databases so the result can be checked.
DROP DATABASE IF EXISTS u23_2_3_schema CASCADE;
CREATE DATABASE u23_2_3_schema;
SHOW DATABASES;

namespace
country_club
default
u23_2_2
u23_2_3
u23_2_3_schema


### read flight data

In [0]:
# Explicit schema for the flights CSV; every column is declared nullable.
the_schema = StructType([
    StructField("year", IntegerType(), True),
    StructField("month", IntegerType(), True),
    StructField("day", IntegerType(), True),
    StructField("dep_time", IntegerType(), True),
    StructField("dep_delay", IntegerType(), True),
    StructField("arr_time", IntegerType(), True),
    StructField("arr_delay", IntegerType(), True),
    StructField("carrier", StringType(), True),
    StructField("tailnum", StringType(), True),
    StructField("flight", IntegerType(), True),
    StructField("origin", StringType(), True),
    StructField("dest", StringType(), True),
    StructField("air_time", IntegerType(), True),
    StructField("distance", IntegerType(), True),
    StructField("hour", IntegerType(), True),
    StructField("minute", IntegerType(), True),
])

# Read the small flights file with the declared schema instead of inferring
# one, then preview the first 5 rows.
spark_df_flights = spark.read.csv(
    "/FileStore/tables/u23_2_3/flights_small.csv",
    schema=the_schema,
    header=True,
)
spark_df_flights.show(n=5)

### read plane data

In [0]:
# Explicit schema for the planes CSV; every column is declared nullable.
the_schema = StructType([
    StructField("tailnum", StringType(), True),
    StructField("year", IntegerType(), True),
    StructField("type", StringType(), True),
    StructField("manufacturer", StringType(), True),
    StructField("model", StringType(), True),
    StructField("engines", IntegerType(), True),
    StructField("seats", IntegerType(), True),
    StructField("speed", StringType(), True),
    StructField("engine", StringType(), True),
])

# Read the planes file with the declared schema instead of inferring one,
# then preview the first 5 rows.
spark_df_planes = spark.read.csv(
    "/FileStore/tables/u23_2_3/planes.csv",
    schema=the_schema,
    header=True,
)
spark_df_planes.show(n=5)

### read airport data

In [0]:
# Explicit schema for the airports CSV; every column is declared nullable.
the_schema = StructType([
    StructField("faa", StringType(), True),
    StructField("name", StringType(), True),
    StructField("lat", DoubleType(), True),
    StructField("lon", DoubleType(), True),
    StructField("alt", IntegerType(), True),
    StructField("tz", IntegerType(), True),
    StructField("dst", StringType(), True),
])

# Read the airports file with the declared schema instead of inferring one,
# then preview the first 5 rows.
spark_df_airports = spark.read.csv(
    "/FileStore/tables/u23_2_3/airports.csv",
    schema=the_schema,
    header=True,
)
spark_df_airports.show(n=5)

### create permanent tables

In [0]:
# Persist each explicit-schema DataFrame as a Parquet-backed table in the
# u23_2_3_schema database.
# mode("overwrite") replaces a table left over from an earlier run, so this
# cell can be re-executed on its own; without it the writer's default save
# mode raises an error when the target table already exists.
spark_df_flights.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3_schema.flights")
spark_df_planes.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3_schema.planes")
spark_df_airports.write.format("parquet").mode("overwrite").saveAsTable("u23_2_3_schema.airports")