In [None]:
# Import Packages

from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Obtain the Spark session for this notebook (app name 'Item', local master).
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName('Item')
    .master('local[*]')
    .getOrCreate()
)


In [None]:
# Load the raw item file from 'liquor-sales-data/raw_data/' in ADLS

file_path = "/mnt/liquor-sales-data/raw_data/item.csv"

# (column name, Spark type) pairs in schema order; every field is nullable.
item_columns = [
    ("Item Number", StringType()),
    ("Item Description", StringType()),
    ("Category Number", StringType()),
    ("Vendor Number", StringType()),
    ("Pack", IntegerType()),
    ("Bottle Volume ml", IntegerType()),
    ("State Bottle cost", DoubleType()),
    ("State Bottle Retail", DoubleType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in item_columns]
)

# Read the CSV with a header row, applying the explicit schema instead of
# letting Spark infer the column types.
df_item = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(schema)
    .load(file_path)
)


+-----------+--------------------+---------------+-------------+----+----------------+-----------------+-------------------+
|Item Number|    Item Description|Category Number|Vendor Number|Pack|Bottle Volume ml|State Bottle cost|State Bottle Retail|
+-----------+--------------------+---------------+-------------+----+----------------+-----------------+-------------------+
| ITM_904919|          Cachaca 21|         CAT_68|      VEN_391|   1|            1000|              7.5|              11.25|
| ITM_987159|Sooh Millet Armag...|         CAT_48|      VEN_283|   1|             750|             20.0|               30.0|
| ITM_994875|Sooh Morin Calvad...|         CAT_48|      VEN_283|   1|             750|             20.0|               30.0|
| ITM_904677|Hirsch Straight B...|         CAT_89|      VEN_391|   1|             750|             22.0|               33.0|
| ITM_987162|Charles Baur Fine...|         CAT_48|      VEN_283|   1|             750|             22.0|               33.0|


In [None]:
# Transform

# The load schema names the cost column 'State Bottle cost' (lowercase "c"),
# while the transformed output uses 'State Bottle Cost'. Rename it explicitly
# so the rounding step below does not depend on Spark's case-insensitive
# column resolution (it would fail with spark.sql.caseSensitive=true).
df_item = df_item.withColumnRenamed('State Bottle cost', 'State Bottle Cost')

# Round 'State Bottle Cost' and 'State Bottle Retail' to 2 decimal places.
# NOTE: `round` here is pyspark.sql.functions.round (imported at the top of
# the notebook), not the Python builtin.
PRICE_DECIMALS = 2
for price_col in ('State Bottle Cost', 'State Bottle Retail'):
    df_item = df_item.withColumn(price_col, round(df_item[price_col], PRICE_DECIMALS))

df_item.printSchema()


root
 |-- Item Number: string (nullable = true)
 |-- Item Description: string (nullable = true)
 |-- Category Number: string (nullable = true)
 |-- Vendor Number: string (nullable = true)
 |-- Pack: integer (nullable = true)
 |-- Bottle Volume ml: integer (nullable = true)
 |-- State Bottle Cost: double (nullable = true)
 |-- State Bottle Retail: double (nullable = true)



In [None]:
# Write the transformed items as Parquet to
# 'liquor-sales-data/transformed_data/' in ADLS, replacing any previous output.
# NOTE(review): the column names contain spaces; some older Spark releases are
# believed to reject such names when writing Parquet - verify against the
# cluster's Spark version.

output_path = "/mnt/liquor-sales-data/transformed_data/item/"

item_writer = df_item.write.mode('overwrite')
item_writer.parquet(output_path)