In [None]:
# Import Packages

from pyspark.sql import SparkSession
from pyspark.sql.functions import add_months
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

# Reuse the already-active Spark session if there is one; otherwise start a
# session named 'Invoice' on a local master.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Invoice')
    .getOrCreate()
)

In [None]:
# Load the raw invoice CSV from 'liquor-sales-data/raw_data/' in ADLS

file_path = "/mnt/liquor-sales-data/raw_data/invoice.csv"

# Explicit schema: column order follows the CSV, and every field is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("Invoice_Item Number", StringType()),
        ("Store Number", StringType()),
        ("Address Sequence", IntegerType()),
        ("Item Number", StringType()),
        ("Vendor Number", StringType()),
        ("Price Sequence", IntegerType()),
        ("Items Sold", IntegerType()),
        ("Sale Dollars", DoubleType()),
        ("Sale Baht", DoubleType()),
        ("Volume Sold Liters", DoubleType()),
        ("Date", DateType()),
    ]
])

# The first line of the file is treated as a header row, and the schema above
# is applied instead of letting Spark infer column types.
df_invoice = (
    spark.read
    .format('csv')
    .schema(schema)
    .option('header', True)
    .load(file_path)
)


+-------------------+------------+----------------+-----------+-------------+--------------+----------+------------+---------+------------------+----------+
|Invoice_Item Number|Store Number|Address Sequence|Item Number|Vendor Number|Price Sequence|Items Sold|Sale Dollars|Sale Baht|Volume Sold Liters|      Date|
+-------------------+------------+----------------+-----------+-------------+--------------+----------+------------+---------+------------------+----------+
|       S10731600154|    STO_2633|               1|  ITM_87938|      VEN_434|             6|        48|       480.0|  17280.0|              48.0|2013-02-21|
|       S10484600004|    STO_2512|               1| ITM_903202|      VEN_259|             1|        36|       286.2|  10303.2|              27.0|2013-02-07|
|       S10907100019|    STO_4095|               1|  ITM_68038|      VEN_260|            18|         1|       43.07|  1550.52|               2.4|2013-03-05|
|       S10976000109|    STO_4829|               1|  ITM_6

In [None]:
# Transform

# Move every 'Date' value back by 543 years, expressed as 543 * 12 months
# because add_months works in month units.
# Presumably this converts Buddhist Era years to Common Era -- TODO confirm
# against the raw file.
# NOTE(review): df_invoice is overwritten with its own shifted copy, so running
# this cell a second time without re-running the load cell shifts the dates by
# another 543 years.
df_invoice = df_invoice.withColumn(
    'Date',
    add_months(df_invoice.Date, -(543 * 12)),
)

# Print the schema of the transformed frame for a quick visual check.
df_invoice.printSchema()


root
 |-- Invoice_Item Number: string (nullable = true)
 |-- Store Number: string (nullable = true)
 |-- Address Sequence: integer (nullable = true)
 |-- Item Number: string (nullable = true)
 |-- Vendor Number: string (nullable = true)
 |-- Price Sequence: integer (nullable = true)
 |-- Items Sold: integer (nullable = true)
 |-- Sale Dollars: double (nullable = true)
 |-- Sale Baht: double (nullable = true)
 |-- Volume Sold Liters: double (nullable = true)
 |-- Date: date (nullable = true)



In [None]:
# Write the transformed invoices as Parquet to
# 'liquor-sales-data/transformed_data/' in ADLS

output_path = "/mnt/liquor-sales-data/transformed_data/invoice/"

# 'overwrite' replaces whatever already exists at output_path.
# NOTE(review): the column names contain spaces; some older Spark releases
# reject such names when writing Parquet -- confirm the cluster's Spark version.
df_invoice.write.mode('overwrite').parquet(output_path)