# "Silver Schema : Employee Dimension"

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Create (or reuse) the notebook's Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Employee")
    # Address of the Hive metastore service.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Extra jar passed to Spark via `spark.jars`.
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# List every schema visible through the session's metastore.
schemas = spark.sql("show schemas;")
schemas.show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [4]:
# Switch the session's current schema to `bronze`.
spark.sql("use bronze")

DataFrame[]

In [5]:
# List the tables in the current schema.
current_tables = spark.sql("show tables;")
current_tables.show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   bronze|dimcustomer|      false|
|   bronze|    dimdate|      false|
|   bronze|dimemployee|      false|
|   bronze| dimproduct|      false|
|   bronze|  factsales|      false|
+---------+-----------+-----------+



In [6]:
# Read the full bronze employee dimension and display its column names.
employee_query = "select * from bronze.dimemployee"
bronze_Employee = spark.sql(employee_query)
bronze_Employee.columns

['EmployeeID',
 'NationalIDNumber',
 'LoginID',
 'OrganizationNode',
 'OrganizationLevel',
 'JobTitle',
 'BirthDate',
 'MaritalStatus',
 'Gender',
 'HireDate',
 'SalariedFlag',
 'VacationHours',
 'SickLeaveHours',
 'CurrentFlag',
 'Name',
 'GroupName',
 'EmployeeName',
 'PersonType',
 'NameStyle',
 'Title',
 'Suffix',
 'EmailPromotion',
 'AdditionalContactInfo',
 'Demographics',
 'AddressLine1',
 'AddressLine2',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName']

In [23]:
# Collect the employee frame into pandas and report, per column, whether it
# contains any missing value.
# NOTE(review): this cell carries execution count 23 and its saved output
# already lists the `Address` column, so it was last run after the
# transformation cells that follow it; re-run the notebook top to bottom
# to refresh the output.
sEmp = bronze_Employee.toPandas()
columns_with_missing = sEmp.isna().any()
columns_with_missing

EmployeeID                 False
NationalIDNumber           False
LoginID                    False
OrganizationNode           False
OrganizationLevel          False
JobTitle                   False
BirthDate                  False
MaritalStatus              False
Gender                     False
HireDate                   False
SalariedFlag               False
VacationHours              False
SickLeaveHours             False
CurrentFlag                False
Name                       False
GroupName                  False
EmployeeName               False
PersonType                 False
NameStyle                  False
EmailPromotion             False
Demographics               False
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
Address                    False
dtype: bool

In [24]:
# Number of non-missing values in each column of the pandas copy.
non_null_counts = sEmp.count()
non_null_counts

EmployeeID                 289
NationalIDNumber           289
LoginID                    289
OrganizationNode           289
OrganizationLevel          289
JobTitle                   289
BirthDate                  289
MaritalStatus              289
Gender                     289
HireDate                   289
SalariedFlag               289
VacationHours              289
SickLeaveHours             289
CurrentFlag                289
Name                       289
GroupName                  289
EmployeeName               289
PersonType                 289
NameStyle                  289
EmailPromotion             289
Demographics               289
City                       289
PostalCode                 289
SpatialLocation            289
StateProvinceCode          289
CountryRegionCode          289
IsOnlyStateProvinceFlag    289
stateName                  289
Address                    289
dtype: int64


# Column-level decisions, based on the data available in each column:
- AdditionalContactInfo, Suffix, and Title will be dropped.
- AddressLine1 has no nulls but AddressLine2 does, so I decided to combine AddressLine1 and AddressLine2 into a list stored in a single column called Address.

In [14]:
# Combine the two address lines into a single array column named `Address`,
# then remove the source address columns together with the columns that were
# chosen to be dropped.
#
# The columns are dropped by name: `DataFrame.drop` accepts plain column names
# on every PySpark release, whereas passing several `Column` objects in one
# call is only supported by newer releases.
#
# NOTE: this cell reassigns `bronze_Employee`, so it is not re-runnable without
# first re-running the cell that loads the bronze table (AddressLine1 and
# AddressLine2 no longer exist after the first run).
columns_to_drop = [
    "AdditionalContactInfo",
    "Suffix",
    "Title",
    "AddressLine1",
    "AddressLine2",
]

bronze_Employee = (
    bronze_Employee
    .withColumn("Address", array(col("AddressLine1"), col("AddressLine2")))
    .drop(*columns_to_drop)
)

bronze_Employee.columns

['EmployeeID',
 'NationalIDNumber',
 'LoginID',
 'OrganizationNode',
 'OrganizationLevel',
 'JobTitle',
 'BirthDate',
 'MaritalStatus',
 'Gender',
 'HireDate',
 'SalariedFlag',
 'VacationHours',
 'SickLeaveHours',
 'CurrentFlag',
 'Name',
 'GroupName',
 'EmployeeName',
 'PersonType',
 'NameStyle',
 'EmailPromotion',
 'Demographics',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName',
 'Address']

In [16]:
# Re-check for missing values after the column changes: collect to pandas and
# flag every column that holds at least one missing value.
Semp = bronze_Employee.toPandas()
missing_by_column = Semp.isna().any()
missing_by_column

EmployeeID                 False
NationalIDNumber           False
LoginID                    False
OrganizationNode            True
OrganizationLevel           True
JobTitle                   False
BirthDate                  False
MaritalStatus              False
Gender                     False
HireDate                   False
SalariedFlag               False
VacationHours              False
SickLeaveHours             False
CurrentFlag                False
Name                       False
GroupName                  False
EmployeeName               False
PersonType                 False
NameStyle                  False
EmailPromotion             False
Demographics               False
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
Address                    False
dtype: bool

# The OrganizationNode and OrganizationLevel columns have only two missing values, so we decided to drop the rows that contain them

In [21]:
# Remove employees that are missing either organisation attribute.
#
# A row is kept only when BOTH OrganizationNode and OrganizationLevel are
# populated. The earlier predicate joined the two checks with `or`, which
# keeps any row where just one of the two columns is null and therefore does
# not guarantee that both columns end up null-free.
#
# The frame is registered as the temp view `sEmp` so it can be queried with
# SQL; the view stays registered in the session afterwards.
bronze_Employee.createOrReplaceTempView("sEmp")
bronze_Employee = spark.sql("""
    select *
    from semp
    where OrganizationNode is not null and OrganizationLevel is not null
""")

In [22]:
# Verify the filter: collect the filtered frame to pandas and flag any column
# that still contains a missing value.
sEmp = bronze_Employee.toPandas()
remaining_missing = sEmp.isna().any()
remaining_missing

EmployeeID                 False
NationalIDNumber           False
LoginID                    False
OrganizationNode           False
OrganizationLevel          False
JobTitle                   False
BirthDate                  False
MaritalStatus              False
Gender                     False
HireDate                   False
SalariedFlag               False
VacationHours              False
SickLeaveHours             False
CurrentFlag                False
Name                       False
GroupName                  False
EmployeeName               False
PersonType                 False
NameStyle                  False
EmailPromotion             False
Demographics               False
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
Address                    False
dtype: bool

In [25]:
# Non-missing value count per column after filtering.
filtered_counts = sEmp.count()
filtered_counts

EmployeeID                 289
NationalIDNumber           289
LoginID                    289
OrganizationNode           289
OrganizationLevel          289
JobTitle                   289
BirthDate                  289
MaritalStatus              289
Gender                     289
HireDate                   289
SalariedFlag               289
VacationHours              289
SickLeaveHours             289
CurrentFlag                289
Name                       289
GroupName                  289
EmployeeName               289
PersonType                 289
NameStyle                  289
EmailPromotion             289
Demographics               289
City                       289
PostalCode                 289
SpatialLocation            289
StateProvinceCode          289
CountryRegionCode          289
IsOnlyStateProvinceFlag    289
stateName                  289
Address                    289
dtype: int64

In [26]:
# Save the cleaned employee frame as the Hive table `silver.DimEmployee`,
# overwriting the table if it already exists.
employee_writer = bronze_Employee.write.format("hive").mode("overwrite")
employee_writer.saveAsTable("silver.DimEmployee")

In [27]:
# Switch the session's current schema to `silver`.
spark.sql("use silver")

DataFrame[]

In [30]:
# List the tables in the current schema to confirm the write.
tables_after_write = spark.sql("show tables")
tables_after_write.show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   silver|dimcustomer|      false|
|   silver|    dimdate|      false|
|   silver|dimemployee|      false|
|         |       semp|       true|
+---------+-----------+-----------+

