In [0]:
from pyspark.sql.types import StructType, StructField, DateType, StringType, IntegerType
from pyspark.sql.functions import to_date, try_to_date

# Location of the sales CSV file on the workspace volume.
file = '/Volumes/workspace/dbo/sales/Sales.csv'

# Intended target schema, kept for reference only. It is NOT passed to the
# reader below; the columns are converted to these types after loading.
# schema = StructType([StructField("OrderDate", DateType()),
#                      StructField("StockDate", DateType()),
#                      StructField("OrderNumber", StringType()),
#                      StructField("ProductKey", IntegerType()),
#                      StructField("CustomerKey", IntegerType()),
#                      StructField("TerritoryKey", IntegerType()),
#                      StructField("OrderLineItem", IntegerType()),
#                      StructField("OrderQuantity", IntegerType())])

# Load the CSV into the Spark DataFrame df. The first line is treated as the
# header, fields are comma-separated, and schema inference is switched off,
# so every column is read as a string.
df = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="False")
    .load(file)
)

# Convert the string columns produced by the untyped CSV read to their real types.
#
# OrderDate and StockDate are parsed with the explicit pattern M/d/yyyy.
# try_to_date is used instead of to_date so that a value which does not match
# the pattern becomes NULL rather than raising and aborting the whole job
# (to_date fails on malformed input when ANSI mode is enabled). The NULLs can
# then be inspected downstream. The result is already a date column, so no
# extra cast to DateType is needed.
df = (df
      .withColumn("OrderDate", try_to_date(df["OrderDate"], "M/d/yyyy"))
      .withColumn("StockDate", try_to_date(df["StockDate"], "M/d/yyyy"))
      # Already a string after the untyped read; the cast only makes the
      # intended column type explicit.
      .withColumn("OrderNumber", df["OrderNumber"].cast(StringType()))
      # Key and quantity columns become integers.
      .withColumn("ProductKey", df["ProductKey"].cast(IntegerType()))
      .withColumn("CustomerKey", df["CustomerKey"].cast(IntegerType()))
      .withColumn("TerritoryKey", df["TerritoryKey"].cast(IntegerType()))
      .withColumn("OrderLineItem", df["OrderLineItem"].cast(IntegerType()))
      .withColumn("OrderQuantity", df["OrderQuantity"].cast(IntegerType())))


# Print the column names and types of df to confirm the conversions above.
df.printSchema()

# Display only the rows where OrderDate or StockDate is null after conversion
# (up to 1000 rows, without truncating cell contents), so that problematic
# date values can be inspected instead of scrolling through the whole table.
(df
 .filter(df["OrderDate"].isNull() | df["StockDate"].isNull())
 .show(n=1000, truncate=False))
