# "Fact Table"
# Fact table built on the transactional level granularity

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("SampleDataToHive") \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")\
    .enableHiveSupport() \
    .getOrCreate()


In [3]:
Tables = ["[Sales].[SalesOrderHeader]", "[Sales].[SalesOrderDetail]", "[Sales].[Customer]", "[Production].[Product]", "[HumanResources].[Employee]"] 
print(len(Tables), " tables")

5  tables


In [4]:
dataFrames= {}
for table in Tables:
    query = f"select * from {table}"
    df =spark.read.format("jdbc")\
        .option("url", "jdbc:sqlserver://172.18.0.4:1433;databaseName=AdventureWorks2017")\
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")\
        .option("dbtable", f"({query}) as temp")\
        .option("user","sa")\
        .option("password", "Mo*012105")\
        .load()
    dataFrames[table] = df
print(dataFrames.keys())

dict_keys(['[Sales].[SalesOrderHeader]', '[Sales].[SalesOrderDetail]', '[Sales].[Customer]', '[Production].[Product]', '[HumanResources].[Employee]'])


# SalesOrderHeader is a transactional table

In [5]:
SalesOrderHeader = dataFrames["[Sales].[SalesOrderHeader]"]\
    .select(
        'SalesOrderID',
         'RevisionNumber',
         'OrderDate',
         'DueDate',
         'ShipDate',
         'Status',
         'OnlineOrderFlag',
         'SalesOrderNumber',
         'PurchaseOrderNumber',
         'AccountNumber',
         'CustomerID',
         'SalesPersonID',
         'TerritoryID',
         'CreditCardApprovalCode',
         'SubTotal',
         'TaxAmt',
         'Freight',
         'TotalDue',
         'Comment')\
    .repartition(4, 'SalesOrderID', 'CustomerID', 'SalesPersonID')\
    .cache()
SalesOrderHeader.createOrReplaceTempView("SalesOrderHeader")
SalesOrderHeader.columns

['SalesOrderID',
 'RevisionNumber',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CustomerID',
 'SalesPersonID',
 'TerritoryID',
 'CreditCardApprovalCode',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'Comment']

# details about transactions

In [6]:
SalesOrderDetail = dataFrames["[Sales].[SalesOrderDetail]"]\
    .select(
        'SalesOrderID',
         'SalesOrderDetailID',
         'CarrierTrackingNumber',
         'OrderQty',
         'ProductID',
         'UnitPrice',
         'UnitPriceDiscount',
         'LineTotal')\
    .repartition(4,'SalesOrderID', 'SalesOrderDetailID','ProductID')\
    .cache()
SalesOrderDetail.createOrReplaceTempView("SalesOrderDetail")
SalesOrderDetail.columns

['SalesOrderID',
 'SalesOrderDetailID',
 'CarrierTrackingNumber',
 'OrderQty',
 'ProductID',
 'UnitPrice',
 'UnitPriceDiscount',
 'LineTotal']

In [7]:
customer = dataFrames['[Sales].[Customer]']\
    .select('CustomerID',
             'PersonID',
             'StoreID',
             'TerritoryID',
             'AccountNumber')\
    .repartition(4, "CustomerID")\
    .cache()
customer.createOrReplaceTempView("customer")
customer.count()

19820

In [8]:
emp = dataFrames["[HumanResources].[Employee]"]\
    .select('BusinessEntityID',
             'NationalIDNumber',
             'LoginID',
             'OrganizationNode',
             'OrganizationLevel',
             'JobTitle',
             'BirthDate',
             'MaritalStatus',
             'Gender',
             'HireDate',
             'SalariedFlag',
             'VacationHours',
             'SickLeaveHours',
             'CurrentFlag')
emp.repartition(4, "BusinessEntityID").cache()
emp.createOrReplaceTempView("emp")
emp.count()

290

In [9]:
product = dataFrames["[Production].[Product]"]\
    .select('ProductID',
         expr('Name').alias("productName"),
         'ProductNumber',
         'MakeFlag',
         'FinishedGoodsFlag',
         'Color',
         'SafetyStockLevel',
         'ReorderPoint',
         'StandardCost',
         'ListPrice',
         'Size',
         'SizeUnitMeasureCode',
         'WeightUnitMeasureCode',
         'Weight',
         'DaysToManufacture',
         'ProductLine',
         'Class',
         'Style',
         'ProductSubcategoryID',
         'ProductModelID',
         'SellStartDate',
         'SellEndDate',
         'DiscontinuedDate')\
    .repartition(4,'ProductID')\
    .cache()
product.createOrReplaceTempView("product")
product.count()

504

In [10]:
spark.catalog.listTables()

[Table(name='dimdate', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='my_new_date_table', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='my_new_date_tablee', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='customer', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='emp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='product', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='SalesOrderDetail', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='SalesOrderHeader', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [11]:
FactSales= spark.sql("""
    select soh.*, sod.*
    from SalesOrderHeader soh inner join SalesOrderDetail sod
    on soh.SalesOrderID = sod.SalesOrderID
    
    inner join product p
    on sod.ProductID = p.ProductID
    
    inner join customer c
    on soh.CustomerID = c.CustomerID
    
    inner join emp 
    on soh.SalesPersonID = emp.BusinessEntityID

""")

In [12]:
FactSales = FactSales.select(
             'soh.SalesOrderID',
             expr('CustomerID').alias("CustomerKey"),
             expr('SalesPersonID').alias("EmployeeKey"),
             expr('ProductID').alias("ProductKey"),
             'RevisionNumber',
             'OrderQty',
             'UnitPrice',
             'UnitPriceDiscount',
             'SubTotal',
             'TaxAmt',
             'Freight',
             'TotalDue',
             'OrderDate',
             'DueDate',
             'ShipDate',
             'Status',
             'OnlineOrderFlag',
             'SalesOrderNumber',
             'PurchaseOrderNumber',
             'AccountNumber',
             'CreditCardApprovalCode',
             'Comment',
             'CarrierTrackingNumber',
             'LineTotal')\
            .repartition(4,'SalesOrderID', 'CustomerKey', 'EmployeeKey', 'ProductKey')\
            .cache()
FactSales.createOrReplaceTempView("FactSales")
FactSales.columns

['SalesOrderID',
 'CustomerKey',
 'EmployeeKey',
 'ProductKey',
 'RevisionNumber',
 'OrderQty',
 'UnitPrice',
 'UnitPriceDiscount',
 'SubTotal',
 'TaxAmt',
 'Freight',
 'TotalDue',
 'OrderDate',
 'DueDate',
 'ShipDate',
 'Status',
 'OnlineOrderFlag',
 'SalesOrderNumber',
 'PurchaseOrderNumber',
 'AccountNumber',
 'CreditCardApprovalCode',
 'Comment',
 'CarrierTrackingNumber',
 'LineTotal']

In [13]:
'''FactSales.write\
        .mode("overwrite")\
        .partitionBy('SalesOrderID',
                     'CustomerKey',
                     'EmployeeKey',
                     'ProductKey')\
        .saveAsTable("SalesSchema.FactSales")'''

'FactSales.write        .mode("overwrite")        .partitionBy(\'SalesOrderID\',\n                     \'CustomerKey\',\n                     \'EmployeeKey\',\n                     \'ProductKey\')        .saveAsTable("SalesSchema.FactSales")'

In [14]:
FactSales.write\
        .format("hive")\
        .mode("overwrite")\
        .saveAsTable("bronze.FactSales")

In [15]:
spark.sql("select SalesOrderID from factSales").count()

60919

In [16]:
spark.sql("show databases;").show()

+-----------------+
|        namespace|
+-----------------+
|           bronze|
|bronzesalesschema|
|          default|
|      my_database|
|            sales|
|             test|
+-----------------+



In [17]:
spark.sql("show tables;").show()

+---------+------------------+-----------+
|namespace|         tableName|isTemporary|
+---------+------------------+-----------+
|  default|           dimdate|      false|
|  default| my_new_date_table|      false|
|  default|my_new_date_tablee|      false|
|         |          customer|       true|
|         |               emp|       true|
|         |         factsales|       true|
|         |           product|       true|
|         |  salesorderdetail|       true|
|         |  salesorderheader|       true|
+---------+------------------+-----------+



In [18]:
spark.sql("use bronzesalesschema;").show()

++
||
++
++



In [19]:
spark.sql("show tables;").show()

+-----------------+----------------+-----------+
|        namespace|       tableName|isTemporary|
+-----------------+----------------+-----------+
|bronzesalesschema|     dimcustomer|      false|
|bronzesalesschema|         dimdate|      false|
|                 |        customer|       true|
|                 |             emp|       true|
|                 |       factsales|       true|
|                 |         product|       true|
|                 |salesorderdetail|       true|
|                 |salesorderheader|       true|
+-----------------+----------------+-----------+



In [20]:
spark.sql('''
select dimemployee.employeeid, sum(factsales.UnitPrice) as totalsales 
from dimemployee 
inner join factsales on dimemployee.employeeid = factsales.employeekey
group by dimemployee.employeeid;
''').show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `dimemployee` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 3 pos 5;
'Aggregate ['dimemployee.employeeid], ['dimemployee.employeeid, 'sum('factsales.UnitPrice) AS totalsales#4157]
+- 'Join Inner, ('dimemployee.employeeid = 'factsales.employeekey)
   :- 'UnresolvedRelation [dimemployee], [], false
   +- SubqueryAlias factsales
      +- View (`FactSales`, [SalesOrderID#0,CustomerKey#1406,EmployeeKey#1407,ProductKey#1408,RevisionNumber#1,OrderQty#56,UnitPrice#59,UnitPriceDiscount#60,SubTotal#19,TaxAmt#20,Freight#21,TotalDue#22,OrderDate#2,DueDate#3,ShipDate#4,Status#5,OnlineOrderFlag#6,SalesOrderNumber#7,PurchaseOrderNumber#8,AccountNumber#9,CreditCardApprovalCode#17,Comment#23,CarrierTrackingNumber#55,LineTotal#61])
         +- RepartitionByExpression [SalesOrderID#0, CustomerKey#1406, EmployeeKey#1407, ProductKey#1408], 4
            +- Project [SalesOrderID#0, CustomerID#10 AS CustomerKey#1406, SalesPersonID#11 AS EmployeeKey#1407, ProductID#57 AS ProductKey#1408, RevisionNumber#1, OrderQty#56, UnitPrice#59, UnitPriceDiscount#60, SubTotal#19, TaxAmt#20, Freight#21, TotalDue#22, OrderDate#2, DueDate#3, ShipDate#4, Status#5, OnlineOrderFlag#6, SalesOrderNumber#7, PurchaseOrderNumber#8, AccountNumber#9, CreditCardApprovalCode#17, Comment#23, CarrierTrackingNumber#55, LineTotal#61]
               +- Project [SalesOrderID#0, RevisionNumber#1, OrderDate#2, DueDate#3, ShipDate#4, Status#5, OnlineOrderFlag#6, SalesOrderNumber#7, PurchaseOrderNumber#8, AccountNumber#9, CustomerID#10, SalesPersonID#11, TerritoryID#12, CreditCardApprovalCode#17, SubTotal#19, TaxAmt#20, Freight#21, TotalDue#22, Comment#23, SalesOrderID#53, SalesOrderDetailID#54, CarrierTrackingNumber#55, OrderQty#56, ProductID#57, ... 3 more fields]
                  +- Join Inner, (SalesPersonID#11 = BusinessEntityID#142)
                     :- Join Inner, (CustomerID#10 = CustomerID#76)
                     :  :- Join Inner, (ProductID#57 = ProductID#91)
                     :  :  :- Join Inner, (SalesOrderID#0 = SalesOrderID#53)
                     :  :  :  :- SubqueryAlias soh
                     :  :  :  :  +- SubqueryAlias salesorderheader
                     :  :  :  :     +- View (`SalesOrderHeader`, [SalesOrderID#0,RevisionNumber#1,OrderDate#2,DueDate#3,ShipDate#4,Status#5,OnlineOrderFlag#6,SalesOrderNumber#7,PurchaseOrderNumber#8,AccountNumber#9,CustomerID#10,SalesPersonID#11,TerritoryID#12,CreditCardApprovalCode#17,SubTotal#19,TaxAmt#20,Freight#21,TotalDue#22,Comment#23])
                     :  :  :  :        +- RepartitionByExpression [SalesOrderID#0, CustomerID#10, SalesPersonID#11], 4
                     :  :  :  :           +- Project [SalesOrderID#0, RevisionNumber#1, OrderDate#2, DueDate#3, ShipDate#4, Status#5, OnlineOrderFlag#6, SalesOrderNumber#7, PurchaseOrderNumber#8, AccountNumber#9, CustomerID#10, SalesPersonID#11, TerritoryID#12, CreditCardApprovalCode#17, SubTotal#19, TaxAmt#20, Freight#21, TotalDue#22, Comment#23]
                     :  :  :  :              +- Project [SalesOrderID#0, RevisionNumber#1, OrderDate#2, DueDate#3, ShipDate#4, Status#5, OnlineOrderFlag#6, SalesOrderNumber#7, PurchaseOrderNumber#8, AccountNumber#9, CustomerID#10, SalesPersonID#11, TerritoryID#12, BillToAddressID#13, ShipToAddressID#14, ShipMethodID#15, CreditCardID#16, CreditCardApprovalCode#17, CurrencyRateID#18, SubTotal#19, TaxAmt#20, Freight#21, TotalDue#22, Comment#23, ... 2 more fields]
                     :  :  :  :                 +- Relation [SalesOrderID#0,RevisionNumber#1,OrderDate#2,DueDate#3,ShipDate#4,Status#5,OnlineOrderFlag#6,SalesOrderNumber#7,PurchaseOrderNumber#8,AccountNumber#9,CustomerID#10,SalesPersonID#11,TerritoryID#12,BillToAddressID#13,ShipToAddressID#14,ShipMethodID#15,CreditCardID#16,CreditCardApprovalCode#17,CurrencyRateID#18,SubTotal#19,TaxAmt#20,Freight#21,TotalDue#22,Comment#23,... 2 more fields] JDBCRelation((select * from [Sales].[SalesOrderHeader]) as temp) [numPartitions=1]
                     :  :  :  +- SubqueryAlias sod
                     :  :  :     +- SubqueryAlias salesorderdetail
                     :  :  :        +- View (`SalesOrderDetail`, [SalesOrderID#53,SalesOrderDetailID#54,CarrierTrackingNumber#55,OrderQty#56,ProductID#57,UnitPrice#59,UnitPriceDiscount#60,LineTotal#61])
                     :  :  :           +- RepartitionByExpression [SalesOrderID#53, SalesOrderDetailID#54, ProductID#57], 4
                     :  :  :              +- Project [SalesOrderID#53, SalesOrderDetailID#54, CarrierTrackingNumber#55, OrderQty#56, ProductID#57, UnitPrice#59, UnitPriceDiscount#60, LineTotal#61]
                     :  :  :                 +- Project [SalesOrderID#53, SalesOrderDetailID#54, CarrierTrackingNumber#55, OrderQty#56, ProductID#57, SpecialOfferID#58, UnitPrice#59, UnitPriceDiscount#60, LineTotal#61, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, rowguid#62, 36, true, false, true) AS rowguid#64, ModifiedDate#63]
                     :  :  :                    +- Relation [SalesOrderID#53,SalesOrderDetailID#54,CarrierTrackingNumber#55,OrderQty#56,ProductID#57,SpecialOfferID#58,UnitPrice#59,UnitPriceDiscount#60,LineTotal#61,rowguid#62,ModifiedDate#63] JDBCRelation((select * from [Sales].[SalesOrderDetail]) as temp) [numPartitions=1]
                     :  :  +- SubqueryAlias p
                     :  :     +- SubqueryAlias product
                     :  :        +- View (`product`, [ProductID#91,productName#605,ProductNumber#93,MakeFlag#94,FinishedGoodsFlag#95,Color#96,SafetyStockLevel#97,ReorderPoint#98,StandardCost#99,ListPrice#100,Size#101,SizeUnitMeasureCode#102,WeightUnitMeasureCode#103,Weight#104,DaysToManufacture#105,ProductLine#106,Class#107,Style#108,ProductSubcategoryID#109,ProductModelID#110,SellStartDate#111,SellEndDate#112,DiscontinuedDate#113])
                     :  :           +- RepartitionByExpression [ProductID#91], 4
                     :  :              +- Project [ProductID#91, Name#92 AS productName#605, ProductNumber#93, MakeFlag#94, FinishedGoodsFlag#95, Color#96, SafetyStockLevel#97, ReorderPoint#98, StandardCost#99, ListPrice#100, Size#101, SizeUnitMeasureCode#102, WeightUnitMeasureCode#103, Weight#104, DaysToManufacture#105, ProductLine#106, Class#107, Style#108, ProductSubcategoryID#109, ProductModelID#110, SellStartDate#111, SellEndDate#112, DiscontinuedDate#113]
                     :  :                 +- Project [ProductID#91, Name#92, ProductNumber#93, MakeFlag#94, FinishedGoodsFlag#95, Color#96, SafetyStockLevel#97, ReorderPoint#98, StandardCost#99, ListPrice#100, Size#101, SizeUnitMeasureCode#102, WeightUnitMeasureCode#103, Weight#104, DaysToManufacture#105, ProductLine#106, Class#107, Style#108, ProductSubcategoryID#109, ProductModelID#110, SellStartDate#111, SellEndDate#112, DiscontinuedDate#113, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, rowguid#114, 36, true, false, true) AS rowguid#116, ModifiedDate#115]
                     :  :                    +- Relation [ProductID#91,Name#92,ProductNumber#93,MakeFlag#94,FinishedGoodsFlag#95,Color#96,SafetyStockLevel#97,ReorderPoint#98,StandardCost#99,ListPrice#100,Size#101,SizeUnitMeasureCode#102,WeightUnitMeasureCode#103,Weight#104,DaysToManufacture#105,ProductLine#106,Class#107,Style#108,ProductSubcategoryID#109,ProductModelID#110,SellStartDate#111,SellEndDate#112,DiscontinuedDate#113,rowguid#114,ModifiedDate#115] JDBCRelation((select * from [Production].[Product]) as temp) [numPartitions=1]
                     :  +- SubqueryAlias c
                     :     +- SubqueryAlias customer
                     :        +- View (`customer`, [CustomerID#76,PersonID#77,StoreID#78,TerritoryID#79,AccountNumber#80])
                     :           +- RepartitionByExpression [CustomerID#76], 4
                     :              +- Project [CustomerID#76, PersonID#77, StoreID#78, TerritoryID#79, AccountNumber#80]
                     :                 +- Project [CustomerID#76, PersonID#77, StoreID#78, TerritoryID#79, AccountNumber#80, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, rowguid#81, 36, true, false, true) AS rowguid#83, ModifiedDate#82]
                     :                    +- Relation [CustomerID#76,PersonID#77,StoreID#78,TerritoryID#79,AccountNumber#80,rowguid#81,ModifiedDate#82] JDBCRelation((select * from [Sales].[Customer]) as temp) [numPartitions=1]
                     +- SubqueryAlias emp
                        +- View (`emp`, [BusinessEntityID#142,NationalIDNumber#143,LoginID#144,OrganizationNode#145,OrganizationLevel#146,JobTitle#147,BirthDate#148,MaritalStatus#149,Gender#150,HireDate#151,SalariedFlag#152,VacationHours#153,SickLeaveHours#154,CurrentFlag#155])
                           +- Project [BusinessEntityID#142, NationalIDNumber#143, LoginID#144, OrganizationNode#145, OrganizationLevel#146, JobTitle#147, BirthDate#148, MaritalStatus#149, Gender#150, HireDate#151, SalariedFlag#152, VacationHours#153, SickLeaveHours#154, CurrentFlag#155]
                              +- Project [BusinessEntityID#142, NationalIDNumber#143, LoginID#144, OrganizationNode#145, OrganizationLevel#146, JobTitle#147, BirthDate#148, MaritalStatus#149, Gender#150, HireDate#151, SalariedFlag#152, VacationHours#153, SickLeaveHours#154, CurrentFlag#155, staticinvoke(class org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils, StringType, readSidePadding, rowguid#156, 36, true, false, true) AS rowguid#158, ModifiedDate#157]
                                 +- Relation [BusinessEntityID#142,NationalIDNumber#143,LoginID#144,OrganizationNode#145,OrganizationLevel#146,JobTitle#147,BirthDate#148,MaritalStatus#149,Gender#150,HireDate#151,SalariedFlag#152,VacationHours#153,SickLeaveHours#154,CurrentFlag#155,rowguid#156,ModifiedDate#157] JDBCRelation((select * from [HumanResources].[Employee]) as temp) [numPartitions=1]
