# "Silver Schema : Customer Dimension"

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [25]:
# Build (or reuse, via getOrCreate) the Hive-enabled Spark session used by
# every cell below.
# The application name used to be "Employee", which did not match this
# notebook's subject (the customer dimension); it now names the table built here.
spark = (
    SparkSession.builder
    .appName("Silver_DimCustomer")
    # Thrift endpoint of the Hive metastore that holds the bronze/silver schemas.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Extra jar made available to the session.
    .config("spark.jars", "/Drivers/SQL_Sever/jdbc/sqljdbc42.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [26]:
# List the schemas (namespaces) visible to this Spark session.
spark.sql("show schemas;").show()

+---------+
|namespace|
+---------+
|   bronze|
|  default|
|    sales|
|   silver|
+---------+



In [27]:
# Make `bronze` the current schema so unqualified table names resolve against it.
spark.sql("use bronze")

DataFrame[]

In [28]:
# List the tables in the current schema (bronze, selected in the previous cell).
spark.sql("show tables;").show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   bronze|dimcustomer|      false|
|   bronze|    dimdate|      false|
|   bronze|dimemployee|      false|
|   bronze| dimproduct|      false|
|   bronze|  factsales|      false|
+---------+-----------+-----------+



In [38]:
# Read the full bronze customer dimension into a Spark DataFrame and
# display its column names.
customer_query = "select * from bronze.dimcustomer"
bronze_customer = spark.sql(customer_query)
bronze_customer.columns

['CustomerID',
 'AccountNumber',
 'CustomerName',
 'PersonType',
 'NameStyle',
 'Title',
 'Suffix',
 'EmailPromotion',
 'AdditionalContactInfo',
 'Demographics',
 'AddressLine1',
 'AddressLine2',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName']

In [30]:
# Collect the bronze table to the driver as a pandas DataFrame, then report
# per column whether it contains at least one missing value.
Scust = bronze_customer.toPandas()
Scust.isna().any()

CustomerID                 False
AccountNumber              False
CustomerName               False
PersonType                 False
NameStyle                  False
Title                       True
Suffix                      True
EmailPromotion             False
AdditionalContactInfo       True
Demographics               False
AddressLine1               False
AddressLine2                True
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
dtype: bool

In [31]:
# Number of non-null values per column; compare against the row count to see
# how sparsely each nullable column is populated.
Scust.count()

CustomerID                 18508
AccountNumber              18508
CustomerName               18508
PersonType                 18508
NameStyle                  18508
Title                        101
Suffix                         3
EmailPromotion             18508
AdditionalContactInfo          0
Demographics               18508
AddressLine1               18508
AddressLine2                 314
City                       18508
PostalCode                 18508
SpatialLocation            18508
StateProvinceCode          18508
CountryRegionCode          18508
IsOnlyStateProvinceFlag    18508
stateName                  18508
dtype: int64


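# Decisions based on how much data is available in each column:
- `AdditionalContactInfo`, `Suffix`, and `Title` are almost entirely null (0, 3, and 101 non-null values out of 18,508 rows), so they will be dropped.
- `AddressLine1` has no nulls but `AddressLine2` does (only 314 non-null values), so I decided to combine `AddressLine1` and `AddressLine2` into a list in a single column called `Address`.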

In [39]:
# Combine the two address lines into one array column, then drop the sparse
# columns together with the now-redundant address lines.
#
# NOTE: array() keeps NULL elements, so a row without an AddressLine2 becomes
# [AddressLine1, NULL].
#
# Column names are passed to drop() as plain strings: DataFrame.drop only
# accepts several Column objects from PySpark 3.4 on (older versions raise
# TypeError), whereas string names work on every version.
#
# NOTE: this rebinds `bronze_customer`, so the cell can only run once per load
# of the bronze table; a second run fails because AddressLine1/AddressLine2
# no longer exist.
bronze_customer = (
    bronze_customer
    .withColumn("Address", array(col("AddressLine1"), col("AddressLine2")))
    .drop("AdditionalContactInfo", "Suffix", "Title", "AddressLine1", "AddressLine2")
)

bronze_customer.columns

['CustomerID',
 'AccountNumber',
 'CustomerName',
 'PersonType',
 'NameStyle',
 'EmailPromotion',
 'Demographics',
 'City',
 'PostalCode',
 'SpatialLocation',
 'StateProvinceCode',
 'CountryRegionCode',
 'IsOnlyStateProvinceFlag',
 'stateName',
 'Address']

In [40]:
# Re-collect the transformed frame to pandas and repeat the per-column
# missing-value check.
# NOTE: this check looks at each cell as a whole, so an Address array counts
# as present even when one of its elements is NULL.
Scust = bronze_customer.toPandas()
Scust.isna().any()

CustomerID                 False
AccountNumber              False
CustomerName               False
PersonType                 False
NameStyle                  False
EmailPromotion             False
Demographics               False
City                       False
PostalCode                 False
SpatialLocation            False
StateProvinceCode          False
CountryRegionCode          False
IsOnlyStateProvinceFlag    False
stateName                  False
Address                    False
dtype: bool

In [44]:
# Preview the first five values of the combined address column without
# truncating long strings.
address_preview = bronze_customer.select("address")
address_preview.show(n=5, truncate=False)

+--------------------------------+
|address                         |
+--------------------------------+
|[Roßstr 6642, NULL]             |
|[8531 Bayter Court, NULL]       |
|[2368 Olivera Rd, NULL]         |
|[5690 Morgan Territory Rd, NULL]|
|[4932 La Jolla, NULL]           |
+--------------------------------+
only showing top 5 rows



In [45]:
# Persist the transformed frame as the Hive table silver.DimCustomer,
# overwriting the table if it already exists.
(
    bronze_customer.write
    .format("hive")
    .mode("overwrite")
    .saveAsTable("silver.DimCustomer")
)

In [46]:
# Switch the current schema to `silver` to verify the table that was just written.
spark.sql("use silver")

DataFrame[]

In [47]:
# List the tables in the current schema (silver) to confirm dimcustomer exists.
spark.sql("show tables").show()

+---------+-----------+-----------+
|namespace|  tableName|isTemporary|
+---------+-----------+-----------+
|   silver|dimcustomer|      false|
+---------+-----------+-----------+



# Output from HiveQL

In [None]:
"""
0: jdbc:hive2://localhost:10000/default> show schemas;
+----------------+
| database_name  |
+----------------+
| bronze         |
| default        |
| sales          |
| silver         |
+----------------+
4 rows selected (0.054 seconds)

0: jdbc:hive2://localhost:10000/default> use silver;
No rows affected (0.025 seconds)

0: jdbc:hive2://localhost:10000/default> select count(*) as cnt
. . . . . . . . . . . . . . . . . . . .> from dimcustomer;
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+--------+
|  cnt   |
+--------+
| 18508  |
+--------+
"""