# "Silver Schema : Employee Dimension"

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) a Hive-enabled Spark session for the Employee dimension job.
spark = (
    SparkSession.builder
    .appName("Employee")
    # Thrift endpoint of the Hive metastore this session talks to.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Extra jar put on the session's classpath (a JDBC driver, judging by the file name).
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# List the schemas (databases) visible to this Spark session.
spark.sql("show schemas;").show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [4]:
# Make `bronze` the current schema so unqualified table names resolve against it.
spark.sql("use bronze")

DataFrame[]

In [5]:
# List the tables available in the current (bronze) schema.
spark.sql("show tables;").show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   bronze|dimcustomer|      false|
|   bronze|    dimdate|      false|
|   bronze|dimemployee|      false|
|   bronze| dimproduct|      false|
|   bronze|  factsales|      false|
+---------+-----------+-----------+



In [6]:
# Load the bronze employee dimension and list its columns.
bronze_Employee = spark.sql("select * from bronze.dimemployee")
bronze_Employee.columns

['EmployeeID',
 'EmployeeName',
 'NationalIDNumber',
 'LoginID',
 'OrganizationNode',
 'OrganizationLevel',
 'JobTitle',
 'BirthDate',
 'MaritalStatus',
 'Gender',
 'HireDate',
 'SalariedFlag',
 'VacationHours',
 'SickLeaveHours',
 'CurrentFlag',
 'GroupName',
 'PersonType',
 'NameStyle',
 'Title',
 'Suffix',
 'EmailPromotion',
 'AdditionalContactInfo',
 'Demographics',
 'AddressLine1',
 'AddressLine2',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName']

In [7]:
# Convert to pandas and report, per column, whether it contains any null value.
sEmp = bronze_Employee.toPandas()
sEmp.isnull().any()

EmployeeID                 False
EmployeeName               False
NationalIDNumber           False
LoginID                    False
OrganizationNode            True
OrganizationLevel           True
JobTitle                   False
BirthDate                  False
MaritalStatus              False
Gender                     False
HireDate                   False
SalariedFlag               False
VacationHours              False
SickLeaveHours             False
CurrentFlag                False
GroupName                  False
PersonType                 False
NameStyle                  False
Title                       True
Suffix                      True
EmailPromotion             False
AdditionalContactInfo       True
Demographics               False
AddressLine1               False
AddressLine2                True
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStat

In [8]:
# Non-null value count per column; columns below the row total contain nulls.
sEmp.count()

EmployeeID                 290
EmployeeName               290
NationalIDNumber           290
LoginID                    290
OrganizationNode           289
OrganizationLevel          289
JobTitle                   290
BirthDate                  290
MaritalStatus              290
Gender                     290
HireDate                   290
SalariedFlag               290
VacationHours              290
SickLeaveHours             290
CurrentFlag                290
GroupName                  290
PersonType                 290
NameStyle                  290
Title                        8
Suffix                       2
EmailPromotion             290
AdditionalContactInfo        0
Demographics               290
AddressLine1               290
AddressLine2                 8
City                       290
PostalCode                 290
SpatialLocation            290
StateProvinceCode          290
CountryRegionCode          290
IsOnlyStateProvinceFlag    290
stateName                  290
dtype: i


# Based on the non-null counts in each column, we make the following decisions:
- AdditionalContactInfo, Suffix, and Title are almost entirely null (0, 2, and 8 non-null values out of 290 rows), so they will be dropped.
- AddressLine1 has no nulls but AddressLine2 does, so the two are combined into a single array column called Address.

In [9]:
# Combine the two address lines into one array column, then drop the sparse
# columns and the now-redundant address lines.
# Address itself is never null, but its second element is null wherever
# AddressLine2 is null.
# Columns are dropped by name: passing several Column objects to drop() is only
# supported from PySpark 3.4 on, whereas string names work on every version.
bronze_Employee = (
    bronze_Employee
    .withColumn("Address", array(col("AddressLine1"), col("AddressLine2")))
    .drop("AdditionalContactInfo", "Suffix", "Title", "AddressLine1", "AddressLine2")
)

bronze_Employee.columns

['EmployeeID',
 'EmployeeName',
 'NationalIDNumber',
 'LoginID',
 'OrganizationNode',
 'OrganizationLevel',
 'JobTitle',
 'BirthDate',
 'MaritalStatus',
 'Gender',
 'HireDate',
 'SalariedFlag',
 'VacationHours',
 'SickLeaveHours',
 'CurrentFlag',
 'GroupName',
 'PersonType',
 'NameStyle',
 'EmailPromotion',
 'Demographics',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName',
 'Address']

In [10]:
# Re-run the per-column null check on the reshaped frame.
# NOTE(review): this variable is spelled `Semp`, while the other cells use `sEmp`.
Semp = bronze_Employee.toPandas()
Semp.isnull().any()

EmployeeID                 False
EmployeeName               False
NationalIDNumber           False
LoginID                    False
OrganizationNode            True
OrganizationLevel           True
JobTitle                   False
BirthDate                  False
MaritalStatus              False
Gender                     False
HireDate                   False
SalariedFlag               False
VacationHours              False
SickLeaveHours             False
CurrentFlag                False
GroupName                  False
PersonType                 False
NameStyle                  False
EmailPromotion             False
Demographics               False
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
Address                    False
dtype: bool

# OrganizationNode and OrganizationLevel each have one missing value (289 of 290 rows are populated), and both occur in the same row, so we drop that single row.

In [11]:
# Register the current frame as a temp view so it can be filtered with SQL.
bronze_Employee.createOrReplaceTempView("sEmp")

# Keep only rows where BOTH organization columns are populated.
# The predicates must be joined with AND: with OR, a row that has just one of
# the two columns null would pass the filter and keep its null.
bronze_Employee = spark.sql("""
    select *
    from sEmp
    where OrganizationNode is not null
      and OrganizationLevel is not null
""")

In [12]:
# Confirm that no column contains nulls after the organization filter.
sEmp = bronze_Employee.toPandas()
sEmp.isnull().any()

EmployeeID                 False
EmployeeName               False
NationalIDNumber           False
LoginID                    False
OrganizationNode           False
OrganizationLevel          False
JobTitle                   False
BirthDate                  False
MaritalStatus              False
Gender                     False
HireDate                   False
SalariedFlag               False
VacationHours              False
SickLeaveHours             False
CurrentFlag                False
GroupName                  False
PersonType                 False
NameStyle                  False
EmailPromotion             False
Demographics               False
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
Address                    False
dtype: bool

In [13]:
# Non-null value count per column after the filter; every column should show the same total.
sEmp.count()

EmployeeID                 289
EmployeeName               289
NationalIDNumber           289
LoginID                    289
OrganizationNode           289
OrganizationLevel          289
JobTitle                   289
BirthDate                  289
MaritalStatus              289
Gender                     289
HireDate                   289
SalariedFlag               289
VacationHours              289
SickLeaveHours             289
CurrentFlag                289
GroupName                  289
PersonType                 289
NameStyle                  289
EmailPromotion             289
Demographics               289
City                       289
PostalCode                 289
SpatialLocation            289
StateProvinceCode          289
CountryRegionCode          289
IsOnlyStateProvinceFlag    289
stateName                  289
Address                    289
dtype: int64

In [14]:
# Persist the cleaned employee dimension as a Hive table in the silver schema,
# replacing whatever the table held before.
(
    bronze_Employee.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable("silver.DimEmployee")
)

In [15]:
# Switch the current schema to `silver` to inspect the table just written.
spark.sql("use silver")

DataFrame[]

In [16]:
# List the tables in the current (silver) schema; session temp views are listed as well.
spark.sql("show tables").show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   silver|dimcustomer|      false|
|   silver|    dimdate|      false|
|   silver|dimemployee|      false|
|         |       semp|       true|
+---------+-----------+-----------+



# Output From Hive DWH:

In [None]:
"""
0: jdbc:hive2://localhost:10000/default> show schemas;
+----------------+
| database_name  |
+----------------+
| bronze         |
| default        |
| sales          |
| silver         |
+----------------+
4 rows selected (1.624 seconds)
0: jdbc:hive2://localhost:10000/default> use silver;
No rows affected (0.111 seconds)
0: jdbc:hive2://localhost:10000/default> show tables;
+--------------+
|   tab_name   |
+--------------+
| dimcustomer  |
| dimdate      |
| dimemployee  |
+--------------+
3 rows selected (0.09 seconds)
0: jdbc:hive2://localhost:10000/default> select EmployeeID, EmployeeName, NationalIDNumber, LoginID, OrganizationNode
. . . . . . . . . . . . . . . . . . . .> from dimemployee
. . . . . . . . . . . . . . . . . . . .> limit 5;
+-------------+-------------------+-------------------+--------------------------+-------------------+
| employeeid  |   employeename    | nationalidnumber  |         loginid          | organizationnode  |
+-------------+-------------------+-------------------+--------------------------+-------------------+
| 6           | Jossef Goldberg   | 998320692         | adventure-works\jossef0  | Z?                |
| 9           | Gigi Matthew      | 658797903         | adventure-works\gigi0    | Z?h               |
| 15          | Sharon Salavaria  | 56920285          | adventure-works\sharon0  | Z?                |
| 16          | David Bradley     | 24756624          | adventure-works\david0   | h                 |
| 17          | Kevin Brown       | 253022876         | adventure-works\kevin0   | j?                |
+-------------+-------------------+-------------------+--------------------------+-------------------+
5 rows selected (2.48 seconds)
"""