### Load datasets
- Load CSV file
- Use Headers
- Infer Schema
- Set field types
- Load JSON file
- Load Parquet file
- Construct DataFrame manually

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import types as T
# Get (or create) the Spark session used by every cell below:
# application name "df", master "local[4]" (local mode).
spark = (
    SparkSession.builder
    .appName("df")
    .master("local[4]")
    .getOrCreate()
)

In [None]:
# Import the penguins CSV with the reader's default options
# (no header or schema options are passed here).
df_penguins = spark.read.csv(
    path="data/penguins.csv",
)

In [None]:
# Preview the first three rows of the loaded data
df_penguins.show(n=3)

In [None]:
# Reload the CSV with header=True so the first line supplies the
# column names, then preview three rows.
df_penguins = spark.read.csv(
    path="data/penguins.csv",
    header=True,
)
df_penguins.show(n=3)

In [None]:
# Show the schema (column names and types) of the DataFrame loaded above
df_penguins.printSchema()

In [None]:
# Reload the CSV, this time asking Spark to infer the column types
# from the data, and print the resulting schema.
df_penguins = spark.read.csv(
    path="data/penguins.csv",
    header=True,
    inferSchema=True,
)
df_penguins.printSchema()


In [None]:
# Manually set the schema: one nullable field per CSV column, listed
# in column order as (name, Spark type) pairs.
schema = T.StructType(
    [
        T.StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("species", T.StringType()),
            ("island", T.StringType()),
            ("bill_length_mm", T.DoubleType()),
            ("bill_depth_mm", T.DoubleType()),
            ("flipper_length_mm", T.DoubleType()),
            ("body_mass_g", T.IntegerType()),
            ("sex", T.StringType()),
        )
    ]
)

# Load the CSV with the explicit schema instead of inferring one.
df_penguins = spark.read.csv(
    path="data/penguins.csv",
    header=True,
    schema=schema,
)
df_penguins.printSchema()


In [None]:
# Read the JSON file (no schema is supplied) and print the schema
# Spark derived for it.
# NOTE(review): spark.read.json defaults to one JSON object per line;
# confirm data/penguins.json is in that layout (otherwise multiLine=True
# would be needed).
json_df = spark.read.json(
    path="data/penguins.json",
)
json_df.printSchema()

In [None]:
# Read the Parquet file and print its schema
df_parquet = spark.read.parquet(
    "data/penguins.parquet",
)
df_parquet.printSchema()

In [None]:
# Construct a DataFrame manually from in-memory rows.
# Each tuple is one penguin, with values in the same order as the
# column names passed to createDataFrame below.
penguins = [
    ("Adelie", "Biscoe", 37.8, 18.3, 174.0, 3400, "FEMALE"),
    ("Adelie", "Biscoe", 37.7, 18.7, 180.0, 3600, "MALE"),
    ("Adelie", "Biscoe", 35.9, 19.2, 189.0, 3800, "FEMALE"),
]

# Pass the column names straight to createDataFrame (types are inferred
# from the Python values) instead of renaming afterwards with toDF.
# Names are all lowercase so they match the columns of the CSV schema
# defined earlier ("species"/"island", not "Species"/"Island").
df_manual = spark.createDataFrame(
    penguins,
    schema=[
        "species",
        "island",
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
        "sex",
    ],
)
df_manual.printSchema()
