In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("sales-analytics").getOrCreate()
spark

24/06/09 13:50:12 WARN Utils: Your hostname, ngocthang-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.29.130 instead (on interface ens33)
24/06/09 13:50:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/09 13:50:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# EXTRACT

In [4]:
df_order = spark.read.option("header", "true").option("quote", '"').csv("file:///home/ngocthang/Documents/Code/Sales-Analytics/sales-analytics/data/raw-data/orders.csv")

                                                                                

In [6]:
df_order.show(5)

+-----------+---------------+---------------------+-------------+---------+------------+----------------+---------------------------------+-------------------+
|Customer ID|Customer Status|Date Order was placed|Delivery Date| Order ID|  Product ID|Quantity Ordered|Total Retail Price for This Order|Cost Price Per Unit|
+-----------+---------------+---------------------+-------------+---------+------------+----------------+---------------------------------+-------------------+
|        579|         Silver|            01-Jan-17|    07-Jan-17|123002578|220101400106|               2|                             92.6|               20.7|
|       7574|         SILVER|            01-Jan-17|    05-Jan-17|123004074|210201000009|               1|                             21.7|               9.95|
|      28861|           Gold|            01-Jan-17|    04-Jan-17|123000871|230100500068|               1|                              1.7|                0.8|
|      43796|           Gold|           

In [7]:
df_order.printSchema()

root
 |-- Customer ID: string (nullable = true)
 |-- Customer Status: string (nullable = true)
 |-- Date Order was placed: string (nullable = true)
 |-- Delivery Date: string (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Quantity Ordered: string (nullable = true)
 |-- Total Retail Price for This Order: string (nullable = true)
 |-- Cost Price Per Unit: string (nullable = true)



In [24]:
df_order.count()

                                                                                

185013

In [7]:
df_product = spark.read.\
    option("header", "true")\
    .option("quote", '"')\
    .csv("file:///home/ngocthang/Documents/Code/Sales-Analytics/sales-analytics/data/raw-data/product-supplier.csv")

In [9]:
df_product.count()

5504

In [8]:
df_product.show(5)

+------------+------------+-----------------+--------------------+--------------------+----------------+--------------------+-----------+
|  Product ID|Product Line| Product Category|       Product Group|        Product Name|Supplier Country|       Supplier Name|Supplier ID|
+------------+------------+-----------------+--------------------+--------------------+----------------+--------------------+-----------+
|210100100001|    Children|Children Outdoors|Outdoor things, Kids|Boy's and Girl's ...|              NO|Scandinavian Clot...|         50|
|210100100002|    Children|Children Outdoors|Outdoor things, Kids|   Children's Jacket|              ES| Luna sastreria S.A.|       4742|
|210100100003|    Children|Children Outdoors|Outdoor things, Kids|Children's Jacket...|              NO|Scandinavian Clot...|         50|
|210100100004|    Children|Children Outdoors|Outdoor things, Kids| Children's Rain Set|              NO|Scandinavian Clot...|         50|
|210100100005|    Children|Childre

# TRANSFORM

In [9]:
from pyspark.sql.functions import to_date, date_format, col, when, udf, count, isnan
from pyspark.sql.types import StringType

## STAGE 1 

### basic cleanning  & formating data 

In [10]:
df_order = df_order.dropDuplicates()

In [11]:
# check if there is any wrong cloumn
df_order.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df_order.columns]).show()



+-----------+---------------+---------------------+-------------+--------+----------+----------------+---------------------------------+-------------------+
|Customer ID|Customer Status|Date Order was placed|Delivery Date|Order ID|Product ID|Quantity Ordered|Total Retail Price for This Order|Cost Price Per Unit|
+-----------+---------------+---------------------+-------------+--------+----------+----------------+---------------------------------+-------------------+
|          0|              0|                    0|            0|       0|         0|               0|                                0|                  0|
+-----------+---------------+---------------------+-------------+--------+----------+----------------+---------------------------------+-------------------+



                                                                                

In [55]:
standardize = udf(lambda s : s.capitalize(), StringType()) # Capitalize the first letter and convert the rest to lowercase
# correct data type
df_order_transform = df_order.withColumn("Date Order was placed", to_date("Date Order was placed", "dd-MMM-yy")) \
                            .withColumn("Delivery Date", to_date("Delivery Date", "dd-MMM-yy")) \
                            .withColumn("Customer Status", standardize("Customer Status")) \
                            .withColumn("Quantity Ordered", col("Quantity Ordered").cast("integer"))\
                            .withColumn("Total Retail Price for This Order", col("Total Retail Price for This Order").cast("double")) \
                            .withColumn("Cost Price Per Unit", col("Cost Price Per Unit").cast("double"))

In [56]:
# rename columns
f_order_transform = df_order_transform.withColumnRenamed("Date Order was placed", "Order Date").\
                                        withColumnRenamed("Total Retail Price for This Order", "Order Price").\
                                        withColumnRenamed("Cost Price Per Unit", "Cost Per Product").\
                                        withColumnRenamed("Quantity Ordered", "Order Quantity")

In [16]:
df_order_transform.count()

                                                                                

185013

In [57]:
df_order_transform.show(5)

[Stage 154:>                                                        (0 + 1) / 1]

+-----------+---------------+----------+-------------+---------+------------+--------------+-----------+----------------+
|Customer ID|Customer Status|Order Date|Delivery Date| Order ID|  Product ID|Order Quantity|Order Price|Cost Per Product|
+-----------+---------------+----------+-------------+---------+------------+--------------+-----------+----------------+
|      92956|         Silver|2017-01-02|   2017-01-02|123009195|230100100013|             2|      226.2|            58.9|
|      62039|           Gold|2017-01-03|   2017-01-03|123013358|220100100036|             1|       41.2|            20.7|
|      40267|         Silver|2017-01-05|   2017-01-05|123024632|210200600084|             1|       34.8|            14.9|
|      50664|           Gold|2017-01-07|   2017-01-07|123037269|240800200043|             2|       84.6|           18.45|
|      55786|         Silver|2017-01-07|   2017-01-07|123034300|210200600013|             1|       83.7|            41.8|
+-----------+-----------

                                                                                

In [18]:
df_order_transform.filter(col("Customer ID") == "579").show()

[Stage 22:>                                                         (0 + 3) / 3]

+-----------+---------------+----------+-------------+---------+------------+--------------+-----------+----------------+
|Customer ID|Customer Status|Order Date|Delivery Date| Order ID|  Product ID|Order Quantity|Order Price|Cost Per Product|
+-----------+---------------+----------+-------------+---------+------------+--------------+-----------+----------------+
|        579|         Silver|2018-09-28|   2018-09-28|123306088|240100100235|             1|       13.2|            5.65|
|        579|         Silver|2017-01-01|   2017-01-07|123002578|220101400106|             2|       92.6|            20.7|
|        579|         Silver|2021-09-24|   2021-09-24|124350126|220101400027|             2|      139.0|            31.7|
|        579|           Gold|2021-04-29|   2021-04-29|124209728|240100100458|             1|        6.7|            2.95|
+-----------+---------------+----------+-------------+---------+------------+--------------+-----------+----------------+



                                                                                

In [19]:
df_order_transform.printSchema()

root
 |-- Customer ID: string (nullable = true)
 |-- Customer Status: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Delivery Date: date (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Order Quantity: string (nullable = true)
 |-- Order Price: double (nullable = true)
 |-- Cost Per Product: double (nullable = true)



In [14]:
df_product = df_product.dropDuplicates()

In [15]:
# check if any row contains null or empty value
df_product.select([count(when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c), c 
                           )).alias(c)
                    for c in df_product.columns]).show()

[Stage 13:>                                                         (0 + 1) / 1]

+----------+------------+----------------+-------------+------------+----------------+-------------+-----------+
|Product ID|Product Line|Product Category|Product Group|Product Name|Supplier Country|Supplier Name|Supplier ID|
+----------+------------+----------------+-------------+------------+----------------+-------------+-----------+
|         0|           0|               0|            0|           0|               0|            0|          0|
+----------+------------+----------------+-------------+------------+----------------+-------------+-----------+



                                                                                

In [16]:
import pycountry
# generate a new column contains name of country values base on Supplier Country Name  column
def get_country_name(country_code):
    try:
        country = pycountry.countries.get(alpha_2=country_code)
        if country:
            return country.name
        else:
            return None
    except:
        return None
    
get_country_name_udf = udf(get_country_name, StringType())

# Apply the UDF to the DataFrame column

df_product_transform = df_product.withColumn("Supplier Country Name", get_country_name_udf(col("Supplier Country")))

In [17]:
df_product_transform = df_product_transform.withColumnRenamed("Supplier Country", "Supplier Country Code")

In [24]:
df_product_transform.printSchema() # check data type

root
 |-- Product ID: string (nullable = true)
 |-- Product Line: string (nullable = true)
 |-- Product Category: string (nullable = true)
 |-- Product Group: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Supplier Country Code: string (nullable = true)
 |-- Supplier Name: string (nullable = true)
 |-- Supplier ID: string (nullable = true)
 |-- Supplier Country Name: string (nullable = true)



In [18]:
df_product_transform.show(5)

[Stage 19:>                                                         (0 + 1) / 1]

+------------+---------------+----------------+----------------+--------------------+---------------------+--------------------+-----------+---------------------+
|  Product ID|   Product Line|Product Category|   Product Group|        Product Name|Supplier Country Code|       Supplier Name|Supplier ID|Supplier Country Name|
+------------+---------------+----------------+----------------+--------------------+---------------------+--------------------+-----------+---------------------+
|210200900019|       Children| Children Sports|    Osprey, Kids|Osprey Cellerator...|                   US|Triple Sportswear...|       3664|        United States|
|220100100113|Clothes & Shoes|         Clothes|Eclipse Clothing|Big Guy Men's Col...|                   US|         Eclipse Inc|       1303|        United States|
|220100100252|Clothes & Shoes|         Clothes|Eclipse Clothing|Big Guy Men's Sol...|                   US|         Eclipse Inc|       1303|        United States|
|220100100264|Clothes 

                                                                                

load stage files to HDFS

In [None]:
df_order_parquet = df_order_transform.repartition(1)
df_order_parquet.write.mode('overwrite').parquet("D:\Code\DataEngieering-Projects\Projects\Sales-Analytics\data\data-transformed\order")

In [None]:
df_product_parquet = df_product_transform.repartition(1)
df_product_parquet.write.mode('overwrite').parquet("D:\Code\DataEngieering-Projects\Projects\Sales-Analytics\data\data-transformed\product")

## STAGE 2

### Format data on following schema 

dimCountry

In [19]:
dimCountry = df_product_transform.select(col("Supplier Country Code").alias("countryID"), col("Supplier Country Name").alias("countryName")).distinct()
dimCountry.show()

[Stage 20:>                                                         (0 + 1) / 1]

+---------+--------------+
|countryID|   countryName|
+---------+--------------+
|       NO|        Norway|
|       SE|        Sweden|
|       DE|       Germany|
|       FR|        France|
|       NL|   Netherlands|
|       GB|United Kingdom|
|       AU|     Australia|
|       DK|       Denmark|
|       US| United States|
|       CA|        Canada|
|       PT|      Portugal|
|       ES|         Spain|
|       BE|       Belgium|
+---------+--------------+



                                                                                

In [141]:
#check the format of datatype 
dimCountry.printSchema()

root
 |-- countryID: string (nullable = true)
 |-- countryName: string (nullable = true)



dimSupplier

In [34]:
df_product_transform.select(col("Supplier ID")).distinct().count()

64

In [137]:
dimSupplier = df_product_transform.select(col("Supplier ID").alias("supplierID"),\
                                       col("Supplier Name").alias("supplierName"),\
                                        col("Supplier Country Code").alias("supplierCountryID")).distinct()
dimSupplier.show(5)

+----------+--------------------+-----------------+
|supplierID|        supplierName|supplierCountryID|
+----------+--------------------+-----------------+
|     12869|Truls Sporting Goods|               NO|
|      5922|        Force Sports|               BE|
|      7511|  Mike Schaeffer Inc|               US|
|     13314|         Triffy B.V.|               NL|
|     18009|     SportPharma Inc|               US|
+----------+--------------------+-----------------+
only showing top 5 rows



In [142]:
#check the format of datatype 
dimSupplier.printSchema()

root
 |-- supplierID: string (nullable = true)
 |-- supplierName: string (nullable = true)
 |-- supplierCountryID: string (nullable = true)



dimProduct

In [47]:
# check distinct rows 
df_product_transform.select(col("Product ID").alias("produdctID")).count()

5504

In [144]:
#gennerate dimProdcutable
dimProduct = df_product_transform.select(col("Product ID").alias("productID"),\
                                         col("Product Name").alias("productName"),\
                                         col("Product Group").alias("productGroup"),\
                                         col("Product Line").alias("productLine"),\
                                         col("Product Category").alias("productCategory"),\
                                        col("Supplier ID").alias("supplierID")
).distinct()
                                         

In [145]:
dimProduct.show(5)

[Stage 245:>                                                        (0 + 1) / 1]

+------------+--------------------+--------------------+-----------+---------------+----------+
|   productID|         productName|        productGroup|productLine|productCategory|supplierID|
+------------+--------------------+--------------------+-----------+---------------+----------+
|210200300097|Tony's Sweatshirt...|Eclipse, Kid's Cl...|   Children|Children Sports|      1303|
|210200300109|Tony's Unbrushed ...|Eclipse, Kid's Cl...|   Children|Children Sports|      1303|
|210200600004|College Sweat Hoo...|     N.D. Gear, Kids|   Children|Children Sports|     14682|
|210200600025|Kid Tracking Pant...|     N.D. Gear, Kids|   Children|Children Sports|     14682|
|210200600094|       Junior Shorts|     N.D. Gear, Kids|   Children|Children Sports|      4742|
+------------+--------------------+--------------------+-----------+---------------+----------+
only showing top 5 rows



                                                                                

In [146]:
# check datatype 
dimProduct.printSchema()

root
 |-- productID: string (nullable = true)
 |-- productName: string (nullable = true)
 |-- productGroup: string (nullable = true)
 |-- productLine: string (nullable = true)
 |-- productCategory: string (nullable = true)
 |-- supplierID: string (nullable = true)



In [66]:
dimProduct.count()

                                                                                

5504

dimCustomer

In [23]:
df_order_transform.select(col("Customer ID").alias("customerID")).distinct().count()

                                                                                

56027

In [24]:
dimCustomer = df_order_transform.select(col("Customer ID").alias("customerID")).distinct()

In [65]:
dimCustomer.count()

                                                                                

56027

In [25]:
dimCustomer.show(5)

[Stage 35:>                                                         (0 + 3) / 3]

+----------+
|customerID|
+----------+
|     74605|
|     40740|
|     90143|
|     17427|
|     50254|
+----------+
only showing top 5 rows



                                                                                

dimCustomerStatus

In [96]:
from pyspark.sql.functions import monotonically_increasing_id
dimCustomerStatus = df_order_transform.select(col("Customer ID").alias("customerID"),\
                                        col("Customer Status").alias("customerStatus")
).distinct()

In [97]:
dimCustomerStatus = dimCustomerStatus.withColumn("customerStatusID", monotonically_increasing_id() + 100).\
                                        withColumn("customerStatusID", col("customerStatusID").cast("string"))

In [148]:
dimCustomerStatus = dimCustomerStatus.select("customerStatusID", "customerID", "customerStatus")

In [149]:
dimCustomerStatus.show(5)



+----------------+----------+--------------+
|customerStatusID|customerID|customerStatus|
+----------------+----------+--------------+
|             100|     49016|          Gold|
|             101|     59620|        Silver|
|             102|     91893|        Silver|
|             103|     88779|        Silver|
|             104|     56163|        Silver|
+----------------+----------+--------------+
only showing top 5 rows



                                                                                

In [150]:
# check data type
dimCustomerStatus.printSchema()

root
 |-- customerStatusID: string (nullable = false)
 |-- customerID: string (nullable = true)
 |-- customerStatus: string (nullable = true)



dimDate

In [41]:
# UNION order date and delivery date to generate date table
df_order_date = df_order_transform.select("Order Date").distinct()
df_delivery_date = df_order_transform.select("Delivery Date").distinct()
df_date = df_order_date.union(df_delivery_date).distinct()

In [42]:
df_date.distinct().count()

                                                                                

1840

In [47]:
df_date.printSchema()

root
 |-- Order Date: date (nullable = true)



In [44]:
df_date.show(5)

                                                                                

+----------+
|Order Date|
+----------+
|2017-08-11|
|2017-09-11|
|2018-05-28|
|2018-08-10|
|2019-05-08|
+----------+
only showing top 5 rows



In [51]:
from pyspark.sql.functions import year, month, dayofmonth, quarter
#create dimDate table
dimDate = df_date.withColumn("day", dayofmonth("Order Date")).\
                    withColumn("month", month("Order Date")).\
                    withColumn("quarter", quarter("Order Date")).\
                    withColumn("year", year("Order Date")).\
                    withColumnRenamed("Order Date", "date")

save file as parquet format

In [52]:
dimDate.show(5)





+----------+---+-----+-------+----+
|      date|day|month|quarter|year|
+----------+---+-----+-------+----+
|2017-08-11| 11|    8|      3|2017|
|2017-09-11| 11|    9|      3|2017|
|2018-05-28| 28|    5|      2|2018|
|2018-08-10| 10|    8|      3|2018|
|2019-05-08|  8|    5|      2|2019|
+----------+---+-----+-------+----+
only showing top 5 rows



                                                                                

In [152]:
dimDate.printSchema()

root
 |-- date: date (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- year: integer (nullable = true)



factOrder

In [58]:
df_order_transform.printSchema()

root
 |-- Customer ID: string (nullable = true)
 |-- Customer Status: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Delivery Date: date (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Order Quantity: integer (nullable = true)
 |-- Order Price: double (nullable = true)
 |-- Cost Per Product: double (nullable = true)



In [124]:

df_fact_order = df_order_transform.select(col("Order ID").alias("orderID"),\
                                        col("Customer ID").alias("customerID"),\
                                        col("Customer Status").alias("customerStatus"),\
                                        col("Order Date").alias("orderDate"),\
                                        col("Order Price").alias("orderPrice"),\
                                        col("Delivery Date").alias("deliveryDate"),\
                                        col("Cost Per Product").alias("costPerProduct"),\
                                        col("Order Quantity").alias("orderQuantity"),\
                                        col("Product ID").alias("productID")
)

In [125]:
df_fact_order.show(5)

[Stage 214:>                                                        (0 + 1) / 1]

+---------+----------+--------------+----------+----------+------------+--------------+-------------+------------+
|  orderID|customerID|customerStatus| orderDate|orderPrice|deliveryDate|costPerProduct|orderQuantity|   productID|
+---------+----------+--------------+----------+----------+------------+--------------+-------------+------------+
|123009195|     92956|        Silver|2017-01-02|     226.2|  2017-01-02|          58.9|            2|230100100013|
|123013358|     62039|          Gold|2017-01-03|      41.2|  2017-01-03|          20.7|            1|220100100036|
|123024632|     40267|        Silver|2017-01-05|      34.8|  2017-01-05|          14.9|            1|210200600084|
|123037269|     50664|          Gold|2017-01-07|      84.6|  2017-01-07|         18.45|            2|240800200043|
|123034300|     55786|        Silver|2017-01-07|      83.7|  2017-01-07|          41.8|            1|210200600013|
+---------+----------+--------------+----------+----------+------------+--------

                                                                                

In [153]:
dimCustomerStatus.printSchema()

root
 |-- customerStatusID: string (nullable = false)
 |-- customerID: string (nullable = true)
 |-- customerStatus: string (nullable = true)



In [127]:
df_fact_order.printSchema()

root
 |-- orderID: string (nullable = true)
 |-- customerID: string (nullable = true)
 |-- customerStatus: string (nullable = true)
 |-- orderDate: date (nullable = true)
 |-- orderPrice: double (nullable = true)
 |-- deliveryDate: date (nullable = true)
 |-- costPerProduct: double (nullable = true)
 |-- orderQuantity: integer (nullable = true)
 |-- productID: string (nullable = true)



In [130]:
# create temp view to implement sparkSQL
df_fact_order.createOrReplaceTempView("order_table")
dimCustomerStatus.createOrReplaceTempView("customerStatus_table")

In [131]:
spark.sql("select * from order_table").show(5)



[Stage 217:>                                                        (0 + 1) / 1]

+---------+----------+--------------+----------+----------+------------+--------------+-------------+------------+
|  orderID|customerID|customerStatus| orderDate|orderPrice|deliveryDate|costPerProduct|orderQuantity|   productID|
+---------+----------+--------------+----------+----------+------------+--------------+-------------+------------+
|123009195|     92956|        Silver|2017-01-02|     226.2|  2017-01-02|          58.9|            2|230100100013|
|123013358|     62039|          Gold|2017-01-03|      41.2|  2017-01-03|          20.7|            1|220100100036|
|123024632|     40267|        Silver|2017-01-05|      34.8|  2017-01-05|          14.9|            1|210200600084|
|123037269|     50664|          Gold|2017-01-07|      84.6|  2017-01-07|         18.45|            2|240800200043|
|123034300|     55786|        Silver|2017-01-07|      83.7|  2017-01-07|          41.8|            1|210200600013|
+---------+----------+--------------+----------+----------+------------+--------

                                                                                

In [111]:
spark.sql("select * from customerStatus_table").show(5)



+----------+--------------+----------------+
|customerID|customerStatus|customerStatusID|
+----------+--------------+----------------+
|     49016|          Gold|             100|
|     59620|        Silver|             101|
|     91893|        Silver|             102|
|     88779|        Silver|             103|
|     56163|        Silver|             104|
+----------+--------------+----------------+
only showing top 5 rows



                                                                                

In [155]:
# generate factOrder table by using JOiN in sparkSQL
factOrder = spark.sql("""
        select o.orderID, o.orderPrice, o.costPerProduct, o.orderQuantity, o.orderDate, o.deliveryDate, c.customerStatusID, productID
        from order_table o
        join customerStatus_table c
        on o.customerStatus = c.customerStatus and o.customerID = c.customerID
        
""")

In [156]:
factOrder.show()

[Stage 251:=====>           (1 + 2) / 3][Stage 252:>                (0 + 2) / 3]

[Stage 256:>                                                        (0 + 1) / 1]

+---------+----------+--------------+-------------+----------+------------+----------------+------------+
|  orderID|orderPrice|costPerProduct|orderQuantity| orderDate|deliveryDate|customerStatusID|   productID|
+---------+----------+--------------+-------------+----------+------------+----------------+------------+
|123009195|     226.2|          58.9|            2|2017-01-02|  2017-01-02|           31768|230100100013|
|123013358|      41.2|          20.7|            1|2017-01-03|  2017-01-03|           16661|220100100036|
|123024632|      34.8|          14.9|            1|2017-01-05|  2017-01-05|           13138|210200600084|
|123037269|      84.6|         18.45|            2|2017-01-07|  2017-01-07|           28716|240800200043|
|123034300|      83.7|          41.8|            1|2017-01-07|  2017-01-07|           23032|210200600013|
|123062708|     165.5|         82.85|            1|2017-01-12|  2017-01-12|           34157|220200100229|
|123069200|      18.2|           8.2|         

                                                                                

In [157]:
# check datatype 
factOrder.printSchema()

root
 |-- orderID: string (nullable = true)
 |-- orderPrice: double (nullable = true)
 |-- costPerProduct: double (nullable = true)
 |-- orderQuantity: integer (nullable = true)
 |-- orderDate: date (nullable = true)
 |-- deliveryDate: date (nullable = true)
 |-- customerStatusID: string (nullable = false)
 |-- productID: string (nullable = true)



In [136]:
print(df_order.count(), factOrder.count())



185013 185013


                                                                                

load stage 2 files to HDSF

In [178]:
def write_df_to_hdfs(df, hdfs_path):
    df.repartition(1).write.mode('overwrite').parquet(f"{hdfs_path}")

In [174]:
hdfs_directory = "hdfs://localhost:9000/sales-analytics-data/completely-transformed-data"
file_format = "parquet"

tables = [factOrder, dimDate, dimCustomerStatus, dimCustomer, dimProduct, dimSupplier, dimCountry]

hdfs_paths = [f"{hdfs_directory}/factOrder.{file_format}", 
             f"{hdfs_directory}/dimDate.{file_format}", 
             f"{hdfs_directory}/dimCustomerStatus.{file_format}", 
             f"{hdfs_directory}/dimCustomer.{file_format}", 
             f"{hdfs_directory}/dimProduct.{file_format}", 
             f"{hdfs_directory}/dimSupplier.{file_format}", 
             f"{hdfs_directory}/dimCountry.{file_format}"]


In [175]:
for i in hdfs_path:
    print(i)

hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/factOrder.parquet
hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/dimDate.parquet
hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/dimCustomerStatus.parquet
hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/dimCustomer.parquet
hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/dimProduct.parquet
hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/dimSupplier.parquet
hdfs://localhost:9000/sales-analytics-data/completely-transformed-data/dimCountry.parquet


In [179]:
for i, df in enumerate(tables):
    write_df_to_hdfs(df, hdfs_paths[i])
    print(f"Load succesfully no.{i} files")



                                                                                

Load succesfully no.0 files


                                                                                

Load succesfully no.1 files


                                                                                

Load succesfully no.2 files


                                                                                

Load succesfully no.3 files
Load succesfully no.4 files
Load succesfully no.5 files


[Stage 385:>                                                        (0 + 1) / 1]

Load succesfully no.6 files


                                                                                

# LOAD

load to data warehouse

In [180]:
spark.stop()