# Silver Schema: Fact Sales

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
# Create (or reuse) the Hive-enabled Spark session used by every cell below.
# The application name now identifies this notebook's job; it previously read
# "Employee", a leftover from another notebook that mislabelled the run in the
# Spark UI and logs.
# NOTE(review): "SQL_Sever" in the jar path looks like a typo for "SQL_Server";
# it is left untouched because it must match the directory on disk — confirm.
spark = (
    SparkSession.builder
    .appName("Silver_FactSales")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Print the schemas (databases) visible through the metastore.
(
    spark
    .sql("show schemas;")
    .show()
)

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [5]:
# Switch the session's current database to `bronze` so the unqualified
# `show tables` in the next cell lists the bronze tables.
spark.sql("use bronze")

DataFrame[]

In [6]:
# Print the tables of the current database (bronze at this point).
(
    spark
    .sql("show tables;")
    .show()
)

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   bronze|dimcustomer|      false|
|   bronze|    dimdate|      false|
|   bronze|dimemployee|      false|
|   bronze| dimproduct|      false|
|   bronze|  factsales|      false|
+---------+-----------+-----------+



In [9]:
# Build a DataFrame over every column of bronze.FactSales; later cells
# reuse it under the name `bronze_Fact`.
bronze_Fact = spark.sql(
    "select * from bronze.FactSales"
)
# Bare last expression so the notebook renders the list of column names.
bronze_Fact.columns

['SalesOrderID',
 'CustomerKey',
 'EmployeeKey',
 'ProductKey',
 'RevisionNumber',
 'OrderQty',
 'UnitPrice',
 'UnitPriceDiscount',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CreditCardApprovalCode',
 'Comment',
 'CarrierTrackingNumber',
 'LineTotal']

In [13]:
# Materialise the fact table as a pandas DataFrame for the null checks below.
# NOTE(review): toPandas() collects every row onto the driver; fine for the
# ~61k rows recorded in this run, but it will not scale with the table.
sFact = bronze_Fact.toPandas()

In [14]:
# Per column: True when at least one value is missing.
sFact.isna().any()

SalesOrderID              False
CustomerKey               False
EmployeeKey               False
ProductKey                False
RevisionNumber            False
OrderQty                  False
UnitPrice                 False
UnitPriceDiscount         False
SubTotal                  False
TaxAmt                    False
Freight                   False
TotalDue                  False
OrderDate                 False
DueDate                   False
ShipDate                  False
Status                    False
OnlineOrderFlag           False
SalesOrderNumber          False
PurchaseOrderNumber       False
AccountNumber             False
CreditCardApprovalCode     True
Comment                    True
CarrierTrackingNumber     False
LineTotal                 False
dtype: bool

In [15]:
# Per column: number of non-missing values (compare against the row count
# to see how many values are absent).
sFact.notna().sum()

SalesOrderID              60919
CustomerKey               60919
EmployeeKey               60919
ProductKey                60919
RevisionNumber            60919
OrderQty                  60919
UnitPrice                 60919
UnitPriceDiscount         60919
SubTotal                  60919
TaxAmt                    60919
Freight                   60919
TotalDue                  60919
OrderDate                 60919
DueDate                   60919
ShipDate                  60919
Status                    60919
OnlineOrderFlag           60919
SalesOrderNumber          60919
PurchaseOrderNumber       60919
AccountNumber             60919
CreditCardApprovalCode    60791
Comment                       0
CarrierTrackingNumber     60919
LineTotal                 60919
dtype: int64

In [16]:
# Clean the fact rows before publishing them to silver:
#   * `Comment` held no values at all in the recorded run (0 non-null), so
#     the column is removed;
#   * rows without a CreditCardApprovalCode are filtered out.
# NOTE(review): in the recorded run this filter discards 128 of 60,919 fact
# rows — confirm that dropping those sales (rather than keeping the nulls)
# is intended.
bronze_Fact = (
    bronze_Fact
    .drop(col("Comment"))
    .filter("CreditCardApprovalCode is not null")
)

In [17]:
# Re-collect the cleaned data and confirm that no column still has missing
# values (every flag should be False).
sFact = bronze_Fact.toPandas()
sFact.isna().any()

SalesOrderID              False
CustomerKey               False
EmployeeKey               False
ProductKey                False
RevisionNumber            False
OrderQty                  False
UnitPrice                 False
UnitPriceDiscount         False
SubTotal                  False
TaxAmt                    False
Freight                   False
TotalDue                  False
OrderDate                 False
DueDate                   False
ShipDate                  False
Status                    False
OnlineOrderFlag           False
SalesOrderNumber          False
PurchaseOrderNumber       False
AccountNumber             False
CreditCardApprovalCode    False
CarrierTrackingNumber     False
LineTotal                 False
dtype: bool

In [18]:
# Publish the cleaned fact rows as the Hive table silver.FactSales,
# replacing any previous contents of that table.
(
    bronze_Fact.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable("silver.FactSales")
)

In [19]:
# Switch the session's current database to `silver` so the unqualified
# `show tables` in the next cell lists the silver tables.
spark.sql("use silver")

DataFrame[]

In [20]:
# Print the tables of the current database (silver) to verify that
# factsales was written.
(
    spark
    .sql("show tables")
    .show()
)

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   silver|dimcustomer|      false|
|   silver|    dimdate|      false|
|   silver|dimemployee|      false|
|   silver| dimproduct|      false|
|   silver|  factsales|      false|
+---------+-----------+-----------+

