# "Product Dimension"

In [1]:
import os
from getpass import getpass

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Session settings: the Hive metastore endpoint and the JDBC driver jar that is
# shipped with the session.
spark_conf = {
    "hive.metastore.uris": "thrift://hive-metastore:9083",
    "spark.jars": "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar",
}

# Build (or reuse) the Hive-enabled Spark session named "product".
session_builder = SparkSession.builder.appName("product")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.enableHiveSupport().getOrCreate()

In [3]:
# Bracket-quoted, schema-qualified source tables read in the load step below.
Tables = [
    "[Production].[Product]",
    "[Production].[ProductDescription]",
    "[Production].[ProductModelProductDescriptionCulture]",
    "[Production].[ProductSubcategory]",
    "[Production].[ProductCategory]",
    "[Production].[ProductCostHistory]",
]
print(len(Tables), " tables")

6  tables


In [4]:
# JDBC connection settings shared by every table read.
# Credentials come from the environment so that no password is stored in the
# notebook: SQLSERVER_USER (defaults to "sa") and SQLSERVER_PASSWORD. When the
# password variable is unset or empty, the user is prompted for it instead.
jdbc_options = {
    "url": "jdbc:sqlserver://172.18.0.4:1433;databaseName=AdventureWorks2017",
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    "user": os.environ.get("SQLSERVER_USER", "sa"),
    "password": os.environ.get("SQLSERVER_PASSWORD") or getpass("SQL Server password: "),
}

# Load each source table in full; the resulting DataFrames are keyed by the
# bracket-quoted table name exactly as it appears in `Tables`.
dataFrames = {}
for table in Tables:
    query = f"select * from {table}"
    dataFrames[table] = (
        spark.read.format("jdbc")
        .options(**jdbc_options)
        # SQL Server requires an alias for a sub-query used as the JDBC "table".
        .option("dbtable", f"({query}) AS temp")
        .load()
    )

print(dataFrames.keys())

dict_keys(['[Production].[Product]', '[Production].[ProductDescription]', '[Production].[ProductModelProductDescriptionCulture]', '[Production].[ProductSubcategory]', '[Production].[ProductCategory]', '[Production].[ProductCostHistory]'])


In [5]:
# Prepare the core Product table: select the desired columns, then repartition and cache it

In [6]:
# Columns kept from [Production].[Product]; Name is exposed as productName.
product_columns = [
    'ProductID',
    expr('Name').alias("productName"),
    'ProductNumber',
    'MakeFlag',
    'FinishedGoodsFlag',
    'Color',
    'SafetyStockLevel',
    'ReorderPoint',
    'StandardCost',
    'ListPrice',
    'Size',
    'SizeUnitMeasureCode',
    'WeightUnitMeasureCode',
    'Weight',
    'DaysToManufacture',
    'ProductLine',
    'Class',
    'Style',
    'ProductSubcategoryID',
    'ProductModelID',
    'SellStartDate',
    'SellEndDate',
    'DiscontinuedDate',
]

# Project the columns, repartition by ProductID into 4 partitions, and cache.
product = (
    dataFrames["[Production].[Product]"]
    .select(*product_columns)
    .repartition(4, 'ProductID')
    .cache()
)

# Register the result for the Spark SQL join and display its row count.
product.createOrReplaceTempView("product")
product.count()

504

Prepare the category and subcategory tables so they can be joined to product.

In [7]:
# Category lookup: keep the key and Name, repartition by the key into
# 4 partitions, cache, and register it as the "category" temp view.
category_source = dataFrames['[Production].[ProductCategory]']
category = (
    category_source
    .select('ProductCategoryID', 'Name')
    .repartition(4, 'ProductCategoryID')
    .cache()
)
category.createOrReplaceTempView("category")

In [8]:
# Subcategory lookup: keep both keys and expose Name as subCategoryName.
subcategory_columns = [
    'ProductSubcategoryID',
    'ProductCategoryID',
    expr('Name').alias("subCategoryName"),
]

# Repartition by both key columns into 4 partitions and cache the result.
subCat = (
    dataFrames['[Production].[ProductSubcategory]']
    .select(*subcategory_columns)
    .repartition(4, 'ProductSubcategoryID', 'ProductCategoryID')
    .cache()
)

# Register the temp view used by the join, then preview three rows.
subCat.createOrReplaceTempView("subCat")
subCat.show(3)

+--------------------+-----------------+---------------+
|ProductSubcategoryID|ProductCategoryID|subCategoryName|
+--------------------+-----------------+---------------+
|                  19|                3|           Caps|
|                  22|                3|         Shorts|
|                  27|                4|    Bike Stands|
+--------------------+-----------------+---------------+
only showing top 3 rows



Get the newest standard cost for each product from the product cost history (the rows whose EndDate is null).

In [9]:
# Keep only the cost-history rows with no EndDate, reduced to the product key
# and its StandardCost; repartition by ProductID into 4 partitions and cache.
cost_history = dataFrames['[Production].[ProductCostHistory]']
productHist = (
    cost_history
    .filter(col("EndDate").isNull())
    .select("ProductID", "StandardCost")
    .repartition(4, 'ProductID')
    .cache()
)

# Register the temp view used by the join, then preview three rows.
productHist.createOrReplaceTempView("productHist")
productHist.show(3)

+---------+------------+
|ProductID|StandardCost|
+---------+------------+
|      707|     13.0863|
|      711|     13.0863|
|      716|     38.4923|
+---------+------------+
only showing top 3 rows



In [10]:
# Bridge between product models and descriptions, restricted to the rows whose
# CultureID is 'en'; repartitioned by both IDs into 4 partitions and cached.
model_description_culture = dataFrames['[Production].[ProductModelProductDescriptionCulture]']
culture = (
    model_description_culture
    .filter(col("CultureID") == 'en')
    .select('ProductModelID', 'ProductDescriptionID', 'CultureID')
    .repartition(4, 'ProductModelID', 'ProductDescriptionID')
    .cache()
)

# Register the temp view used by the join and display the column names.
culture.createOrReplaceTempView("culture")
culture.columns

['ProductModelID', 'ProductDescriptionID', 'CultureID']

In [11]:
# Description lookup: keep the key and the text, repartition by the key into
# 4 partitions, and cache.
description_source = dataFrames['[Production].[ProductDescription]']
description = (
    description_source
    .select('ProductDescriptionID', 'Description')
    .repartition(4, 'ProductDescriptionID')
    .cache()
)

# Register the temp view used by the join and display the column names.
description.createOrReplaceTempView("description")
description.columns

['ProductDescriptionID', 'Description']

In [12]:
# Display the tables and temporary views currently known to the session catalog.
registered_tables = spark.catalog.listTables()
registered_tables

[Table(name='dimdate', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='my_new_date_table', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='my_new_date_tablee', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='category', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='culture', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='description', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='product', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='productHist', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='subCat', cat

In [13]:
# Assemble the wide product row by left-outer-joining the product view to its
# lookup views, so a product is kept even when a lookup has no matching row:
#   product -> subCat -> category           (via the subcategory / category IDs)
#   product -> productHist                  (via ProductID)
#   product -> culture -> description       (via the model ID / description ID)
# `select *` keeps every column of every view, so the join keys (and
# StandardCost) appear more than once in the result and must be referenced
# with their table alias afterwards.
DimProduct = spark.sql("""
    select *
    from product p left outer join subCat sc
    on p.ProductSubcategoryID = sc.ProductSubcategoryID

    left outer join category c
    on sc.ProductCategoryID = c.ProductCategoryID

    left outer join productHist ph
    on p.productID = ph.productID

    left outer join culture
    on p.ProductModelID = culture.ProductModelID

    left outer join description desc
    on culture.ProductDescriptionID = desc.ProductDescriptionID
    
""")
# Display the joined column names (duplicates included).
DimProduct.columns

['ProductID',
 'productName',
 'ProductNumber',
 'MakeFlag',
 'FinishedGoodsFlag',
 'Color',
 'SafetyStockLevel',
 'ReorderPoint',
 'StandardCost',
 'ListPrice',
 'Size',
 'SizeUnitMeasureCode',
 'WeightUnitMeasureCode',
 'Weight',
 'DaysToManufacture',
 'ProductLine',
 'Class',
 'Style',
 'ProductSubcategoryID',
 'ProductModelID',
 'SellStartDate',
 'SellEndDate',
 'DiscontinuedDate',
 'ProductSubcategoryID',
 'ProductCategoryID',
 'subCategoryName',
 'ProductCategoryID',
 'Name',
 'ProductID',
 'StandardCost',
 'ProductModelID',
 'ProductDescriptionID',
 'CultureID',
 'ProductDescriptionID',
 'Description']

In [14]:
# Final column list for the dimension. ProductID and StandardCost occur in more
# than one joined view, so they are taken explicitly from the product alias `p`.
dim_product_columns = [
    'p.ProductID',
    'productName',
    'ProductNumber',
    'MakeFlag',
    'FinishedGoodsFlag',
    'Color',
    'SafetyStockLevel',
    'ReorderPoint',
    'p.StandardCost',
    'ListPrice',
    'Size',
    'SizeUnitMeasureCode',
    'WeightUnitMeasureCode',
    'Weight',
    'DaysToManufacture',
    'ProductLine',
    'Class',
    'Style',
    'SellStartDate',
    'SellEndDate',
    'DiscontinuedDate',
    'subCategoryName',
    'Description',
]

# NOTE(review): no column from the category (c) or productHist (ph) joins is
# selected here - confirm whether the category name and the current cost from
# the cost history were meant to be part of the dimension.
# Project the final columns and repartition by ProductID into 2 partitions.
DimProduct = DimProduct.select(*dim_product_columns).repartition(2, "ProductID")
        

In [15]:
# Persist the dimension as the Hive table bronze.DimProduct, replacing any
# existing contents.
dim_product_writer = DimProduct.write.format("hive").mode("overwrite")
dim_product_writer.saveAsTable("bronze.DimProduct")

In [16]:
# Print the databases visible to this Spark session.
databases = spark.sql("show databases")
databases.show()

+-----------------+
|        namespace|
+-----------------+
|           bronze|
|bronzesalesschema|
|          default|
|      my_database|
|            sales|
|             test|
+-----------------+

