# "Fact Table"
# Fact table built at transaction-level (order line) granularity

In [8]:
import getpass
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [9]:
# Create (or reuse) the Spark session used by every cell below.
# Hive support is enabled and pointed at the remote metastore so that
# saveAsTable() registers tables there; the SQL Server JDBC driver jar is
# attached so spark.read.format("jdbc") can reach the source database.
# NOTE(review): the jar path contains "SQL_Sever" -- confirm that this matches
# the real directory name on the cluster before correcting the spelling.
spark = SparkSession.builder \
    .appName("SampleDataToHive") \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")\
    .enableHiveSupport() \
    .getOrCreate()


In [10]:
# Source tables (schema-qualified, SQL Server bracket quoting) to pull from
# AdventureWorks; these strings are also used as the keys of `dataFrames`.
Tables = [
    "[Sales].[SalesOrderHeader]",
    "[Sales].[SalesOrderDetail]",
    "[Sales].[Customer]",
    "[Production].[Product]",
    "[HumanResources].[Employee]",
]
table_total = len(Tables)
print(table_total, " tables")

5  tables


In [11]:
# Connection settings are taken from the environment so that credentials are
# never stored in the notebook. The URL and user fall back to the values this
# notebook was developed against; the password has no default and is prompted
# for interactively when SQLSERVER_PASSWORD is not set.
jdbc_url = os.environ.get(
    "SQLSERVER_JDBC_URL",
    "jdbc:sqlserver://172.18.0.7:1433;databaseName=AdventureWorks2017",
)
jdbc_user = os.environ.get("SQLSERVER_USER", "sa")
jdbc_password = os.environ.get("SQLSERVER_PASSWORD") or getpass.getpass("SQL Server password: ")

# Load every source table into a DataFrame, keyed by its qualified name.
dataFrames = {}
for table in Tables:
    # The JDBC source accepts a parenthesised subquery with an alias as "dbtable".
    query = f"select * from {table}"
    df = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        .option("dbtable", f"({query}) as temp")
        .option("user", jdbc_user)
        .option("password", jdbc_password)
        .load()
    )
    dataFrames[table] = df
print(dataFrames.keys())

dict_keys(['[Sales].[SalesOrderHeader]', '[Sales].[SalesOrderDetail]', '[Sales].[Customer]', '[Production].[Product]', '[HumanResources].[Employee]'])


# SalesOrderHeader is a transactional table

In [12]:
# Order-header columns carried forward into the fact table build.
soh_columns = [
    'SalesOrderID',
    'RevisionNumber',
    'OrderDate',
    'DueDate',
    'ShipDate',
    'Status',
    'OnlineOrderFlag',
    'SalesOrderNumber',
    'PurchaseOrderNumber',
    'AccountNumber',
    'CustomerID',
    'SalesPersonID',
    'TerritoryID',
    'CreditCardApprovalCode',
    'SubTotal',
    'TaxAmt',
    'Freight',
    'TotalDue',
    'Comment',
]

# Project the header table, hash-partition it into 4 partitions on its key
# columns, mark it for caching, and expose it to Spark SQL as a temp view.
SalesOrderHeader = (
    dataFrames["[Sales].[SalesOrderHeader]"]
    .select(*soh_columns)
    .repartition(4, 'SalesOrderID', 'CustomerID', 'SalesPersonID')
    .cache()
)
SalesOrderHeader.createOrReplaceTempView("SalesOrderHeader")
SalesOrderHeader.columns

['SalesOrderID',
 'RevisionNumber',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CustomerID',
 'SalesPersonID',
 'TerritoryID',
 'CreditCardApprovalCode',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'Comment']

# details about transactions

In [13]:
# Order-line columns carried forward into the fact table build.
sod_columns = [
    'SalesOrderID',
    'SalesOrderDetailID',
    'CarrierTrackingNumber',
    'OrderQty',
    'ProductID',
    'UnitPrice',
    'UnitPriceDiscount',
    'LineTotal',
]

# Project the detail table, hash-partition it into 4 partitions on its key
# columns, mark it for caching, and expose it to Spark SQL as a temp view.
SalesOrderDetail = (
    dataFrames["[Sales].[SalesOrderDetail]"]
    .select(*sod_columns)
    .repartition(4, 'SalesOrderID', 'SalesOrderDetailID', 'ProductID')
    .cache()
)
SalesOrderDetail.createOrReplaceTempView("SalesOrderDetail")
SalesOrderDetail.columns

['SalesOrderID',
 'SalesOrderDetailID',
 'CarrierTrackingNumber',
 'OrderQty',
 'ProductID',
 'UnitPrice',
 'UnitPriceDiscount',
 'LineTotal']

In [14]:
# Customer columns needed for the join; partitioned on the join key, cached,
# and registered as the "customer" temp view.
customer_columns = ['CustomerID', 'PersonID', 'StoreID', 'TerritoryID', 'AccountNumber']
customer = (
    dataFrames['[Sales].[Customer]']
    .select(*customer_columns)
    .repartition(4, "CustomerID")
    .cache()
)
customer.createOrReplaceTempView("customer")
# count() is an action, so it also materialises the cache.
customer.count()

19820

In [15]:
# Employee columns needed for the join. repartition() returns a NEW DataFrame,
# so it has to be part of the assignment chain: calling it on its own line and
# discarding the result leaves `emp` (and the temp view built from it)
# neither repartitioned nor cached.
emp = dataFrames["[HumanResources].[Employee]"]\
    .select('BusinessEntityID',
             'NationalIDNumber',
             'LoginID',
             'OrganizationNode',
             'OrganizationLevel',
             'JobTitle',
             'BirthDate',
             'MaritalStatus',
             'Gender',
             'HireDate',
             'SalariedFlag',
             'VacationHours',
             'SickLeaveHours',
             'CurrentFlag')\
    .repartition(4, "BusinessEntityID")\
    .cache()
emp.createOrReplaceTempView("emp")
# count() is an action, so it also materialises the cache.
emp.count()

290

In [16]:
# Product columns needed for the join; `Name` is exposed as `productName`.
product_columns = [
    'ProductID',
    expr('Name').alias("productName"),
    'ProductNumber',
    'MakeFlag',
    'FinishedGoodsFlag',
    'Color',
    'SafetyStockLevel',
    'ReorderPoint',
    'StandardCost',
    'ListPrice',
    'Size',
    'SizeUnitMeasureCode',
    'WeightUnitMeasureCode',
    'Weight',
    'DaysToManufacture',
    'ProductLine',
    'Class',
    'Style',
    'ProductSubcategoryID',
    'ProductModelID',
    'SellStartDate',
    'SellEndDate',
    'DiscontinuedDate',
]

# Partition on the join key, cache, and register as the "product" temp view.
product = (
    dataFrames["[Production].[Product]"]
    .select(*product_columns)
    .repartition(4, 'ProductID')
    .cache()
)
product.createOrReplaceTempView("product")
# count() is an action, so it also materialises the cache.
product.count()

504

In [17]:
# List the tables and temp views visible in the session's current database,
# to confirm the temp views registered above exist before joining them.
spark.catalog.listTables()

[Table(name='dimcustomer', catalog='spark_catalog', namespace=['bronze'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='dimdate', catalog='spark_catalog', namespace=['bronze'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='dimemployee', catalog='spark_catalog', namespace=['bronze'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='dimproduct', catalog='spark_catalog', namespace=['bronze'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='customer', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='emp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='product', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='SalesOrderDetail', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='SalesOrder

In [18]:
# Build the raw fact rows: one row per order line (header joined to detail).
# Only soh.* and sod.* are projected, so:
#   - the inner joins to product and customer add no columns; they only drop
#     lines whose ProductID / CustomerID has no match in those views;
#   - the left outer join to emp adds no columns and drops no rows.
# The result contains SalesOrderID twice (once from soh, once from sod), which
# is why the following cell selects it with the `soh.` qualifier.
FactSales= spark.sql("""
    select soh.*, sod.*
    from SalesOrderHeader soh inner join SalesOrderDetail sod
    on soh.SalesOrderID = sod.SalesOrderID
    
    inner join product p
    on sod.ProductID = p.ProductID
    
    inner join customer c
    on soh.CustomerID = c.CustomerID
    
    left outer join emp 
   on soh.SalesPersonID = emp.BusinessEntityID

""")
FactSales.columns

['SalesOrderID',
 'RevisionNumber',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CustomerID',
 'SalesPersonID',
 'TerritoryID',
 'CreditCardApprovalCode',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'Comment',
 'SalesOrderID',
 'SalesOrderDetailID',
 'CarrierTrackingNumber',
 'OrderQty',
 'ProductID',
 'UnitPrice',
 'UnitPriceDiscount',
 'LineTotal']

In [19]:
# Final fact-table layout: the order id, the three dimension keys (source ids
# renamed to *Key), then the measures and the degenerate order attributes.
fact_columns = [
    'soh.SalesOrderID',
    expr('CustomerID').alias("CustomerKey"),
    expr('SalesPersonID').alias("EmployeeKey"),
    expr('ProductID').alias("ProductKey"),
    'RevisionNumber',
    'OrderQty',
    'UnitPrice',
    'UnitPriceDiscount',
    'SubTotal',
    'TaxAmt',
    'Freight',
    'TotalDue',
    'OrderDate',
    'DueDate',
    'ShipDate',
    'Status',
    'OnlineOrderFlag',
    'SalesOrderNumber',
    'PurchaseOrderNumber',
    'AccountNumber',
    'CreditCardApprovalCode',
    'Comment',
    'CarrierTrackingNumber',
    'LineTotal',
]

# NOTE: this rebinds FactSales to the projected frame, so the cell expects the
# joined frame from the previous cell as its input.
FactSales = (
    FactSales
    .select(*fact_columns)
    .repartition(4, 'SalesOrderID', 'CustomerKey', 'EmployeeKey', 'ProductKey')
    .cache()
)
FactSales.createOrReplaceTempView("FactSales")
FactSales.columns

['SalesOrderID',
 'CustomerKey',
 'EmployeeKey',
 'ProductKey',
 'RevisionNumber',
 'OrderQty',
 'UnitPrice',
 'UnitPriceDiscount',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CreditCardApprovalCode',
 'Comment',
 'CarrierTrackingNumber',
 'LineTotal']

In [20]:
# Sanity check: number of distinct CustomerKey values present in factsales.
spark.sql("select count(distinct CustomerKey) from factsales").show()

+---------------------------+
|count(DISTINCT CustomerKey)|
+---------------------------+
|                      19119|
+---------------------------+



In [21]:
# Sanity check: number of distinct ProductKey values present in factsales.
spark.sql("select count(distinct productkey) from factsales").show()

+--------------------------+
|count(DISTINCT productkey)|
+--------------------------+
|                       266|
+--------------------------+



In [22]:
# NOTE: an earlier version of this cell wrote the fact table with
# partitionBy('SalesOrderID', 'CustomerKey', 'EmployeeKey', 'ProductKey') into
# "SalesSchema.FactSales". It had been disabled by wrapping it in a string
# literal, which did nothing except echo the text back as cell output.
# partitionBy() creates one output directory per distinct key combination, and
# these keys are very fine-grained (CustomerKey alone has 19119 distinct
# values), so the table is written unpartitioned in the next cell instead.

In [23]:
# Persist the fact table to the Hive metastore as bronze.FactSales,
# replacing any existing table of that name.
(
    FactSales.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable("bronze.FactSales")
)

In [24]:
# List the schemas (databases) available in the catalog.
spark.sql("show schemas").show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [25]:
# Make bronze the current database for the unqualified table names below.
spark.sql("use bronze;")

DataFrame[]

In [26]:
# List the tables in the current database together with the session temp views.
spark.sql("show tables").show()

+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|   bronze|     dimcustomer|      false|
|   bronze|         dimdate|      false|
|   bronze|     dimemployee|      false|
|   bronze|      dimproduct|      false|
|   bronze|       factsales|      false|
|         |        customer|       true|
|         |             emp|       true|
|         |       factsales|       true|
|         |         product|       true|
|         |salesorderdetail|       true|
|         |salesorderheader|       true|
+---------+----------------+-----------+



In [27]:
# Row count of the PERSISTED table. The name is schema-qualified on purpose:
# a session temp view called FactSales also exists, and an unqualified
# `factSales` resolves to that temp view rather than to bronze.factsales, so the
# unqualified query would not verify what was actually written to Hive.
spark.sql("select SalesOrderID from bronze.factSales").count()

121317