# Silver Schema: Fact Sales

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse, via getOrCreate) the Hive-enabled Spark session that the
# rest of the notebook accesses through the `spark` variable.
# NOTE(review): the application name is "Employee" although this notebook is
# about Fact Sales -- looks like a copy-paste leftover; confirm before renaming.
session_builder = (
    SparkSession.builder
    .appName("Employee")
    # Hive metastore the session talks to for schema/table metadata.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Extra jar put on the Spark classpath (path points at a SQL Server JDBC jar).
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
)
spark = session_builder.getOrCreate()

In [3]:
# List the schemas visible through the metastore.
schemas_df = spark.sql("show schemas;")
schemas_df.show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [17]:
# Make `bronze` the current database so that unqualified table names in the
# cells that follow (e.g. `factsales`) resolve against the bronze schema.
spark.sql("use bronze;")

DataFrame[]

In [18]:
# List the tables of the current database.
bronze_tables = spark.sql("show tables;")
bronze_tables.show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   bronze|dimcustomer|      false|
|   bronze|    dimdate|      false|
|   bronze|dimemployee|      false|
|   bronze| dimproduct|      false|
|   bronze|  factsales|      false|
+---------+-----------+-----------+



In [10]:
# Count the non-null SalesOrderID values in `factsales`; the table name is
# unqualified, so it is resolved against the current database.
order_id_count = spark.sql("select count(SalesOrderID) from factsales")
order_id_count.show()

+-------------------+
|count(SalesOrderID)|
+-------------------+
|             121317|
+-------------------+



In [19]:
# Load the whole bronze fact table into a Spark DataFrame and display its
# column names as the cell output.
bronze_fact_query = "select * from bronze.FactSales"
bronze_Fact = spark.sql(bronze_fact_query)
bronze_Fact.columns

['SalesOrderID',
 'CustomerKey',
 'EmployeeKey',
 'ProductKey',
 'RevisionNumber',
 'OrderQty',
 'UnitPrice',
 'UnitPriceDiscount',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CreditCardApprovalCode',
 'Comment',
 'CarrierTrackingNumber',
 'LineTotal']

In [20]:
# Number of rows currently held by bronze_Fact (shown as the cell output).
bronze_Fact.count()

121317

In [47]:
# Convert the Spark DataFrame to a pandas DataFrame for the null inspection
# in the next cells.
# NOTE(review): toPandas() collects every row onto the driver; acceptable for
# a table of this size, but consider a Spark-side aggregation if it grows.
sFact = bronze_Fact.toPandas()

In [48]:
# Per column: True when at least one value is missing.
sFact.isna().any()

SalesOrderID              False
CustomerKey               False
EmployeeKey                True
ProductKey                False
RevisionNumber            False
OrderQty                  False
UnitPrice                 False
UnitPriceDiscount         False
SubTotal                  False
TaxAmt                    False
Freight                   False
TotalDue                  False
OrderDate                 False
DueDate                   False
ShipDate                  False
Status                    False
OnlineOrderFlag           False
SalesOrderNumber          False
PurchaseOrderNumber        True
AccountNumber             False
CreditCardApprovalCode     True
Comment                    True
CarrierTrackingNumber      True
LineTotal                 False
dtype: bool

In [49]:
# Per-column count of non-null values (pandas DataFrame.count ignores nulls),
# which shows how many rows each nullable column actually populates.
sFact.count()

SalesOrderID              121317
CustomerKey               121317
EmployeeKey                60919
ProductKey                121317
RevisionNumber            121317
OrderQty                  121317
UnitPrice                 121317
UnitPriceDiscount         121317
SubTotal                  121317
TaxAmt                    121317
Freight                   121317
TotalDue                  121317
OrderDate                 121317
DueDate                   121317
ShipDate                  121317
Status                    121317
OnlineOrderFlag           121317
SalesOrderNumber          121317
PurchaseOrderNumber        60919
AccountNumber             121317
CreditCardApprovalCode    118744
Comment                        0
CarrierTrackingNumber      60919
LineTotal                 121317
dtype: int64

In [13]:
# Cleanup step for the silver layer: remove the Comment and
# CreditCardApprovalCode columns, then keep only rows where EmployeeKey,
# PurchaseOrderNumber and CarrierTrackingNumber are all populated.
#
# The columns are passed to drop() by name: older PySpark releases reject
# several Column objects in one drop() call ("each col in the param list
# should be a string"), whereas string names are accepted by every release.
#
# NOTE(review): per the non-null counts displayed earlier in the notebook,
# only 60,919 of 121,317 rows have these three columns populated, so roughly
# half of the fact rows are discarded here -- confirm that this is intended.
# NOTE(review): this rebinds `bronze_Fact`; re-running the cell that loads it
# from bronze.FactSales afterwards silently undoes the filter, so run the
# notebook top to bottom before writing to silver.
bronze_Fact = (
    bronze_Fact
    .drop("Comment", "CreditCardApprovalCode")
    .where("EmployeeKey is not null")
    .where("PurchaseOrderNumber is not null")
    .where("CarrierTrackingNumber is not null")
)

In [14]:
# Re-run the null check on the current bronze_Fact: bring it into pandas and
# report, per column, whether any missing value remains.
sFact = bronze_Fact.toPandas()
sFact.isna().any()

SalesOrderID             False
CustomerKey              False
EmployeeKey              False
ProductKey               False
RevisionNumber           False
OrderQty                 False
UnitPrice                False
UnitPriceDiscount        False
SubTotal                 False
TaxAmt                   False
Freight                  False
TotalDue                 False
OrderDate                False
DueDate                  False
ShipDate                 False
Status                   False
OnlineOrderFlag          False
SalesOrderNumber         False
PurchaseOrderNumber      False
AccountNumber            False
CarrierTrackingNumber    False
LineTotal                False
dtype: bool

In [60]:
# Persist the current bronze_Fact frame as the Hive table silver.FactSales;
# "overwrite" mode replaces the table if it already exists.
silver_writer = bronze_Fact.write.format("hive").mode("overwrite")
silver_writer.saveAsTable("silver.FactSales")


In [21]:
# Make `silver` the current database so that the unqualified `factsales`
# references in the following cells resolve against the silver schema.
spark.sql("use silver")

DataFrame[]

In [22]:
# Remove the silver fact table so that it can be rebuilt by a later write.
# The name is schema-qualified on purpose: an unqualified `factsales` is
# resolved against whatever database is current, and an earlier cell switches
# to `bronze`, so skipping the `use silver` cell would drop the bronze source
# table instead. IF EXISTS keeps the cell re-runnable when the table is
# already gone.
spark.sql("drop table if exists silver.factsales;")

DataFrame[]

In [25]:
# List the tables of the current database.
silver_tables = spark.sql("show tables")
silver_tables.show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   silver|dimcustomer|      false|
|   silver|    dimdate|      false|
|   silver|dimemployee|      false|
|   silver| dimproduct|      false|
|   silver|  factsales|      false|
+---------+-----------+-----------+



In [24]:
# Write bronze_Fact to the Hive table silver.FactSales in overwrite mode.
# This repeats the earlier write cell; in top-to-bottom order it recreates
# the table removed by the `drop table` cell above.
(
    bronze_Fact.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable("silver.FactSales")
)

In [26]:
# Count the distinct CustomerKey values in `factsales`; the table name is
# unqualified, so it is resolved against the current database.
distinct_customers = spark.sql("select count(distinct CustomerKey) from factsales")
distinct_customers.show()

+---------------------------+
|count(DISTINCT CustomerKey)|
+---------------------------+
|                      19119|
+---------------------------+

