# Silver Schema: Product Dimension

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
# The application name is what shows up in the Spark UI; it is named after the
# product-dimension load this notebook performs (it previously read "Employee",
# a leftover from another notebook).
spark = (
    SparkSession.builder
    .appName("Silver_DimProduct")
    # Thrift endpoint of the Hive metastore queried by the spark.sql() calls.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # SQL Server JDBC driver jar made available to the session.
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# List the schemas known to the metastore before picking one to work in.
schemas_df = spark.sql("show schemas;")
schemas_df.show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [4]:
# Switch the session's current schema to bronze so that unqualified table
# names in the following cells resolve against it.
spark.sql("use bronze")

DataFrame[]

In [5]:
# Show the tables available in the schema that is currently selected.
source_tables_df = spark.sql("show tables;")
source_tables_df.show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   bronze|dimcustomer|      false|
|   bronze|    dimdate|      false|
|   bronze|dimemployee|      false|
|   bronze| dimproduct|      false|
|   bronze|  factsales|      false|
+---------+-----------+-----------+



In [7]:
# Read every column of the bronze product dimension into a Spark DataFrame.
product_query = "select * from bronze.dimproduct"
bronze_Product = spark.sql(product_query)

# Display the column names so the schema can be inspected.
bronze_Product.columns

['ProductID',
 'productName',
 'ProductNumber',
 'MakeFlag',
 'FinishedGoodsFlag',
 'Color',
 'SafetyStockLevel',
 'ReorderPoint',
 'StandardCost',
 'ListPrice',
 'Size',
 'SizeUnitMeasureCode',
 'WeightUnitMeasureCode',
 'Weight',
 'DaysToManufacture',
 'ProductLine',
 'Class',
 'Style',
 'SellStartDate',
 'SellEndDate',
 'DiscontinuedDate',
 'subCategoryName',
 'Description']

In [8]:
# Collect the product table to the driver as a pandas DataFrame for a quick
# null audit. The variable name `sEmp` is kept as-is because the next cell
# reads it.
sEmp = bronze_Product.toPandas()

# Per column: True if at least one value is missing.
sEmp.isna().any()

ProductID                False
productName              False
ProductNumber            False
MakeFlag                 False
FinishedGoodsFlag        False
Color                     True
SafetyStockLevel         False
ReorderPoint             False
StandardCost             False
ListPrice                False
Size                      True
SizeUnitMeasureCode       True
WeightUnitMeasureCode     True
Weight                    True
DaysToManufacture        False
ProductLine               True
Class                     True
Style                     True
SellStartDate            False
SellEndDate               True
DiscontinuedDate          True
subCategoryName           True
Description               True
dtype: bool

In [9]:
# Number of non-null values per column; comparing each figure with the
# fully populated columns shows how sparse the nullable columns are.
sEmp.count()

ProductID                504
productName              504
ProductNumber            504
MakeFlag                 504
FinishedGoodsFlag        504
Color                    256
SafetyStockLevel         504
ReorderPoint             504
StandardCost             504
ListPrice                504
Size                     211
SizeUnitMeasureCode      176
WeightUnitMeasureCode    205
Weight                   205
DaysToManufacture        504
ProductLine              278
Class                    247
Style                    211
SellStartDate            504
SellEndDate               98
DiscontinuedDate           0
subCategoryName          295
Description              294
dtype: int64

In [10]:
# DiscontinuedDate had zero non-null values in the audit, so it is removed
# before the table is promoted to silver.
discontinued_col = col("DiscontinuedDate")
bronze_Product = bronze_Product.drop(discontinued_col)

In [12]:
# Persist the trimmed product dimension as a Hive table in the silver schema.
# "overwrite" mode replaces the table's existing contents on every run.
(
    bronze_Product.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable("silver.DimProduct")
)

In [13]:
# Switch the current schema to silver to verify the table that was just written.
spark.sql("use silver")

DataFrame[]

In [14]:
# List the tables in the current schema; dimproduct should appear among them.
target_tables_df = spark.sql("show tables")
target_tables_df.show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   silver|dimcustomer|      false|
|   silver|    dimdate|      false|
|   silver|dimemployee|      false|
|   silver| dimproduct|      false|
+---------+-----------+-----------+

