# "Employee Dimension"

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
# NOTE(review): the app name is "Customer" although this notebook is titled
# "Employee Dimension" — confirm whether that is intentional.
spark = (
    SparkSession.builder
    .appName("Customer")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Jar registered here so the JDBC reads further down can find the driver.
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Schema-qualified source tables to pull over JDBC, one entry per line.
# The loader cell iterates this list and keys its result dict by these names.
Tables = [
    "[Person].[StateProvince]",
    "[Person].[BusinessEntityAddress]",
    "[Person].[Address]",
    "[Person].[Person]",
    "[HumanResources].[Department]",
    "[HumanResources].[EmployeeDepartmentHistory]",
    "[HumanResources].[Employee]",
]
print(len(Tables), " tables")

7  tables


In [4]:
# Load every table listed in `Tables` over JDBC into `dataFrames`, keyed by the
# schema-qualified table name.
#
# Connection settings are read from environment variables so that the database
# password is not stored in the notebook:
#   SQLSERVER_JDBC_URL  optional, defaults to the previous hardcoded URL
#   SQLSERVER_USER      optional, defaults to "sa"
#   SQLSERVER_PASSWORD  required; a KeyError is raised when it is not set
jdbc_url = os.environ.get(
    "SQLSERVER_JDBC_URL",
    "jdbc:sqlserver://172.18.0.4:1433;databaseName=AdventureWorks2017",
)
jdbc_user = os.environ.get("SQLSERVER_USER", "sa")
jdbc_password = os.environ["SQLSERVER_PASSWORD"]

dataFrames = {}
for table in Tables:
    query = f"select * from {table}"
    df = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        # The JDBC source accepts a parenthesised subquery with an alias as "dbtable".
        .option("dbtable", f"({query}) as temp")
        .option("user", jdbc_user)
        .option("password", jdbc_password)
        .load()
    )
    dataFrames[table] = df
print(dataFrames.keys())

dict_keys(['[Person].[StateProvince]', '[Person].[BusinessEntityAddress]', '[Person].[Address]', '[Person].[Person]', '[HumanResources].[Department]', '[HumanResources].[EmployeeDepartmentHistory]', '[HumanResources].[Employee]'])


In [5]:
# Employee attributes taken from [HumanResources].[Employee].
#
# DataFrames are immutable: repartition() and cache() return a new DataFrame,
# so they must be part of the expression assigned to `emp`. Calling them as a
# separate statement and discarding the result leaves `emp` neither
# repartitioned nor cached.
emp = (
    dataFrames["[HumanResources].[Employee]"]
    .select(
        'BusinessEntityID',
        'NationalIDNumber',
        'LoginID',
        'OrganizationNode',
        'OrganizationLevel',
        'JobTitle',
        'BirthDate',
        'MaritalStatus',
        'Gender',
        'HireDate',
        'SalariedFlag',
        'VacationHours',
        'SickLeaveHours',
        'CurrentFlag',
    )
    .repartition(4, "BusinessEntityID")
    .cache()
)
emp.createOrReplaceTempView("emp")
# count() is an action, so it also populates the cache; its value is the cell output.
emp.count()

290

In [6]:
# Are there duplicated BusinessEntityID values in emp?
# Rows are sorted by occurrence count in DESCENDING order so that any ID that
# appears more than once is listed first; if the top rows all show a count of 1,
# the key is unique. (Ascending order would only ever show the smallest counts.)
spark.sql("""
select * from(
    select BusinessEntityID, count(BusinessEntityID) as count
    from emp
    group by BusinessEntityID) as nt
    order by count desc
""").show(3)

+----------------+-----+
|BusinessEntityID|count|
+----------------+-----+
|             148|    1|
|             243|    1|
|              31|    1|
+----------------+-----+
only showing top 3 rows



In [7]:
# Employee -> department mapping from [HumanResources].[EmployeeDepartmentHistory].
# Only rows whose EndDate is null are kept (the original author's note: "select
# the last department for each employee").
#
# The partition count is given explicitly (4, matching the other source cells);
# without it repartition() falls back to spark.sql.shuffle.partitions.
# The view "depthist" is registered once, on the filtered frame, instead of
# registering the raw table and then replacing it under the same name.
depthist = (
    dataFrames["[HumanResources].[EmployeeDepartmentHistory]"]
    .where("EndDate is null")
    .select("BusinessEntityID", "DepartmentID")
    .repartition(4, "BusinessEntityID", "DepartmentID")
    .cache()
)
depthist.createOrReplaceTempView("depthist")
depthist.columns

['BusinessEntityID', 'DepartmentID']

In [8]:
# Department lookup: id, name and group, hashed into 4 partitions on the id and cached.
dept_source = dataFrames["[HumanResources].[Department]"]
dept = (
    dept_source
    .select('DepartmentID', 'Name', 'GroupName')
    .repartition(4, "DepartmentID")
    .cache()
)
dept.createOrReplaceTempView("dept")
dept.columns

['DepartmentID', 'Name', 'GroupName']

In [9]:
# Person attributes plus a derived EmployeeName ("FirstName LastName" joined
# with a single space). Built as one chained expression.
person_columns = [
    'BusinessEntityID',
    'EmployeeName',
    'PersonType',
    'NameStyle',
    'Title',
    'Suffix',
    'EmailPromotion',
    'AdditionalContactInfo',
    'Demographics',
]
person = (
    dataFrames['[Person].[Person]']
    .withColumn("EmployeeName", concat_ws(" ", "FirstName", "LastName"))
    .select(*person_columns)
    .repartition(4, "BusinessEntityID")
    .cache()
)

person.createOrReplaceTempView("person")
person.columns

['BusinessEntityID',
 'EmployeeName',
 'PersonType',
 'NameStyle',
 'Title',
 'Suffix',
 'EmailPromotion',
 'AdditionalContactInfo',
 'Demographics']

In [10]:
# Address rows: the columns needed for the dimension plus the StateProvinceID
# used to join to the state view. Partitioned on AddressID and cached.
address_columns = [
    'AddressID',
    'AddressLine1',
    'AddressLine2',
    'City',
    'StateProvinceID',
    'PostalCode',
    'SpatialLocation',
]
address = (
    dataFrames["[Person].[Address]"]
    .select(*address_columns)
    .repartition(4, "AddressID")
    .cache()
)
address.createOrReplaceTempView("address")
address.columns

['AddressID',
 'AddressLine1',
 'AddressLine2',
 'City',
 'StateProvinceID',
 'PostalCode',
 'SpatialLocation']

In [11]:
# Bridge between business entities and addresses: only the two key columns are kept.
entity_address_source = dataFrames["[Person].[BusinessEntityAddress]"]
entityAdd = (
    entity_address_source
    .select('BusinessEntityID', 'AddressID')
    .repartition(4, 'BusinessEntityID', 'AddressID')
    .cache()
)
entityAdd.createOrReplaceTempView("entityAdd")
entityAdd.columns

['BusinessEntityID', 'AddressID']

In [12]:
# State/province lookup. `Name` is exposed as `stateName` so it does not share
# a column name with the department's `Name` once the views are joined.
state_columns = [
    'StateProvinceID',
    'StateProvinceCode',
    'CountryRegionCode',
    'IsOnlyStateProvinceFlag',
    expr('Name').alias("stateName"),
]
state = (
    dataFrames["[Person].[StateProvince]"]
    .select(*state_columns)
    .repartition(4, 'StateProvinceID')
    .cache()
)
state.createOrReplaceTempView("state")
state.columns

['StateProvinceID',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName']

In [13]:
# Sanity check: list the tables and temp views visible to this session, to
# confirm the views registered by the cells above exist before joining them.
spark.catalog.listTables()

[Table(name='dimdate', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='my_new_date_table', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='my_new_date_tablee', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='address', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='dept', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='depthist', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='emp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='entityAdd', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='person', catalog=None, na

In [14]:
# Assemble the employee dimension by inner-joining the seven temp views:
#   emp -> depthist -> dept        (department of the employee)
#   emp -> person                  (name and personal attributes)
#   emp -> entityadd -> address -> state   (address and state/province)
# `select *` keeps every column of every view, so the join keys
# (BusinessEntityID, DepartmentID, AddressID, StateProvinceID) appear more than
# once in the result; the next cell projects the final column set.
# NOTE(review): every join is an inner join, so an employee without a matching
# department-history, person or address row would be dropped, and an employee
# with several matching rows would be duplicated — confirm against the counts.
DimEmployee = spark.sql("""
    select *
    from emp e inner join depthist dh
    on e.BusinessEntityID = dh.BusinessEntityID

    inner join dept d
    on dh.DepartmentID = d.DepartmentID

    inner join person p
    on e.BusinessEntityID =p.BusinessEntityID

    inner join entityadd ea
    on e.BusinessEntityID = ea.BusinessEntityID

    inner join address add
    on ea.AddressID = add.AddressID

    inner join state s
    on add.StateProvinceID = s.StateProvinceID
""")
DimEmployee.columns

['BusinessEntityID',
 'NationalIDNumber',
 'LoginID',
 'OrganizationNode',
 'OrganizationLevel',
 'JobTitle',
 'BirthDate',
 'MaritalStatus',
 'Gender',
 'HireDate',
 'SalariedFlag',
 'VacationHours',
 'SickLeaveHours',
 'CurrentFlag',
 'BusinessEntityID',
 'DepartmentID',
 'DepartmentID',
 'Name',
 'GroupName',
 'BusinessEntityID',
 'EmployeeName',
 'PersonType',
 'NameStyle',
 'Title',
 'Suffix',
 'EmailPromotion',
 'AdditionalContactInfo',
 'Demographics',
 'BusinessEntityID',
 'AddressID',
 'AddressID',
 'AddressLine1',
 'AddressLine2',
 'City',
 'StateProvinceID',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceID',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName']

In [15]:
# Project the joined result down to the dimension's final column set.
# The employee key is taken from the `e` (emp) side of the join and renamed to
# EmployeeID; the other duplicated join keys are dropped. `Name` is the
# department name (the state's Name was already aliased to stateName).
# NOTE(review): this cell rebinds `DimEmployee`, so it can only be run once
# after the join cell; re-running it alone will fail to resolve `e.BusinessEntityID`.
DimEmployee = DimEmployee.select(
    expr('e.BusinessEntityID').alias("EmployeeID"),
    'NationalIDNumber',
    'LoginID',
    'OrganizationNode',
    'OrganizationLevel',
    'JobTitle',
    'BirthDate',
    'MaritalStatus',
    'Gender',
    'HireDate',
    'SalariedFlag',
    'VacationHours',
    'SickLeaveHours',
    'CurrentFlag',
    'Name',
    'GroupName',
    'EmployeeName',
    'PersonType',
    'NameStyle',
    'Title',
    'Suffix',
    'EmailPromotion',
    'AdditionalContactInfo',
    'Demographics',
    'AddressLine1',
    'AddressLine2',
    'City',
    'PostalCode',
    'SpatialLocation',
    'StateProvinceCode',
    'CountryRegionCode',
    'IsOnlyStateProvinceFlag',
    'stateName',
)
DimEmployee = DimEmployee.repartition(2, "EmployeeID")
DimEmployee.createOrReplaceTempView("DimEmployee")
# The row count is the cell's final expression so it is displayed. Previously
# count() ran in the middle of the cell and its result was thrown away, which
# executed the whole join for no visible output.
DimEmployee.count()

In [16]:
# Count the distinct BusinessEntityID values in the emp view.
spark.sql("select count(distinct BusinessEntityID) from emp").show()

+--------------------------------+
|count(DISTINCT BusinessEntityID)|
+--------------------------------+
|                             290|
+--------------------------------+



In [17]:
# Column names of the DimEmployee temp view, to confirm the final schema.
spark.sql("select * from DimEmployee").columns

['EmployeeID',
 'NationalIDNumber',
 'LoginID',
 'OrganizationNode',
 'OrganizationLevel',
 'JobTitle',
 'BirthDate',
 'MaritalStatus',
 'Gender',
 'HireDate',
 'SalariedFlag',
 'VacationHours',
 'SickLeaveHours',
 'CurrentFlag',
 'Name',
 'GroupName',
 'EmployeeName',
 'PersonType',
 'NameStyle',
 'Title',
 'Suffix',
 'EmailPromotion',
 'AdditionalContactInfo',
 'Demographics',
 'AddressLine1',
 'AddressLine2',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName']

In [18]:
# Persist the dimension as table DimEmployee in the `bronze` database;
# "overwrite" mode replaces any existing contents of that table.
# NOTE(review): the `show tables` output further down lists dimcustomer and
# dimdate under `bronzesalesschema`, not `bronze` — confirm the target database.
DimEmployee.write.mode("overwrite").saveAsTable("bronze.DimEmployee")

In [19]:
# List the databases (namespaces) known to the metastore.
spark.sql("show databases;").show()

+-----------------+
|        namespace|
+-----------------+
|           bronze|
|bronzesalesschema|
|          default|
|      my_database|
|            sales|
|             test|
+-----------------+



In [20]:
# Make bronzesalesschema the session's current database. The statement returns
# no rows, so show() prints an empty frame.
spark.sql("use bronzesalesschema").show()

++
||
++
++



In [21]:
# List tables in the current database together with the session's temp views.
spark.sql("show tables;").show()

+-----------------+-----------+-----------+
|        namespace|  tableName|isTemporary|
+-----------------+-----------+-----------+
|bronzesalesschema|dimcustomer|      false|
|bronzesalesschema|    dimdate|      false|
|                 |    address|       true|
|                 |       dept|       true|
|                 |   depthist|       true|
|                 |dimemployee|       true|
|                 |        emp|       true|
|                 |  entityadd|       true|
|                 |     person|       true|
|                 |      state|       true|
+-----------------+-----------+-----------+



In [22]:
# Row count of `dimemployee`.
# NOTE(review): the `show tables` output above lists dimemployee only as a
# temporary view, so this presumably counts the in-session view rather than the
# table written to `bronze` — query bronze.DimEmployee to verify the saved data.
spark.sql("select * from dimemployee").count()

290