In [None]:
# Import Packages

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Obtain the Spark session for this notebook, creating one if none exists yet.
# NOTE(review): the master setting requests a local run on all cores; if the
# environment already provides a session, getOrCreate() reuses that one —
# confirm 'local[*]' is intended where this notebook is executed.
spark = (
    SparkSession.builder
    .appName('Store Address')
    .master('local[*]')
    .getOrCreate()
)


In [None]:
# Load the raw store-address CSV from 'liquor-sales-data/raw_data/' in ADLS

file_path = "/mnt/liquor-sales-data/raw_data/store_address.csv"

# Column name -> Spark type, in the order the columns are declared for the file.
# NOTE(review): "Zip Code" is read as an integer, so leading zeros or
# non-numeric ZIP formats would not survive the read — confirm the data
# never contains them before relying on this column.
column_types = [
    ("Store Number", StringType()),
    ("Store Name", StringType()),
    ("Address", StringType()),
    ("City", StringType()),
    ("Zip Code", IntegerType()),
    ("County Number", StringType()),
    ("Latitude", DoubleType()),
    ("Longitude", DoubleType()),
]

# Every column is declared nullable.
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in column_types]
)

# The file has a header row; the explicit schema above is applied rather
# than letting Spark infer the column types.
df_address = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(schema)
    .load(file_path)
)

# Preview the loaded rows.
df_address.show()


+------------+--------------------+--------------------+----------------+--------+-------------+---------+-----------+
|Store Number|          Store Name|             Address|            City|Zip Code|County Number| Latitude|  Longitude|
+------------+--------------------+--------------------+----------------+--------+-------------+---------+-----------+
|    STO_9936|     Dash Events Llc|1685 WEST UINTAH ...|COLORADO SPRINGS|   80904|         NULL|38.848017|-104.845334|
|    STO_4422|Kum And Go #193 /...|  104 NORTH DIVISION|          STUART|   50250|        CNT_1|41.503858| -94.318376|
|    STO_4654|Kum And Go #510 /...|  629 SOUTH DIVISION|          STUART|   50250|        CNT_1|41.496773| -94.318482|
|    STO_4417|Kum And Go #76 / ...|       109 SOUTH 5TH|           ADAIR|   50002|        CNT_1|42.017403| -93.611475|
|    STO_4753|Caseys General St...|       110 SOUTH 5TH|           ADAIR|   50002|        CNT_1|41.013948| -93.300253|
|    STO_4620|Nodaway Valley Ma...|             

In [None]:
# Transform


root
 |-- Store Number: string (nullable = true)
 |-- Store Name: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: integer (nullable = true)
 |-- County Number: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [None]:
# Write the store-address DataFrame as Parquet to
# 'liquor-sales-data/transformed_data/' in ADLS.

output_path = "/mnt/liquor-sales-data/transformed_data/store_address/"

# 'overwrite' replaces whatever already exists at the output path, so
# re-running this cell does not fail on an existing directory.
df_address.write.mode('overwrite').parquet(output_path)