In [7]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import warnings

In [8]:
spark = SparkSession.builder.config("spark.jars","/home/jovyan/drivers/postgresql-42.2.18.jar") \
        .master("local[*]").appName("Engine").getOrCreate().newSession()

In [9]:
df_dwh = (spark
        .read.parquet("/home/jovyan/datalake/silver_layer/data_clean_maintenance/*")
)

In [10]:
df_dwh.printSchema()

root
 |-- type_id: integer (nullable = true)
 |-- package_id: integer (nullable = true)
 |-- status_id: integer (nullable = true)
 |-- maintenance_id: integer (nullable = true)
 |-- stock_id: integer (nullable = true)
 |-- estimate_id: integer (nullable = true)
 |-- price_maintenance: double (nullable = true)
 |-- mtn_creation_date_id: string (nullable = true)
 |-- mtn_start_date_id: string (nullable = true)
 |-- mtn_end_date_id: string (nullable = true)
 |-- mtn_delivered_date_id: string (nullable = true)
 |-- mtn_creation_date: string (nullable = true)
 |-- mtn_start_date: string (nullable = true)
 |-- mtn_end_date: string (nullable = true)
 |-- mtn_delivered_date: string (nullable = true)
 |-- status_name: string (nullable = true)
 |-- package_name: string (nullable = true)
 |-- type_name: string (nullable = true)



# Dim

## Dim Package

In [18]:
df_dwh.select("package_id","package_name")

DataFrame[package_id: int, package_name: string]

In [20]:
df_dim_package = df_dwh.select("package_id","package_name")

In [21]:
df_dim_package.printSchema()

root
 |-- package_id: integer (nullable = true)
 |-- package_name: string (nullable = true)



In [22]:
df_dim_package = df_dim_package.withColumnRenamed("package_id","bk_package")
df_dim_package = df_dim_package.withColumnRenamed("package_name","name_package")

In [24]:
df_dim_package.printSchema()

root
 |-- bk_package: integer (nullable = true)
 |-- name_package: string (nullable = true)



In [46]:
df_dim_package.show(3)

+----------+---------------+
|bk_package|   name_package|
+----------+---------------+
|       200|Paying Customer|
|       200|Paying Customer|
|       200|Paying Customer|
+----------+---------------+
only showing top 3 rows



In [49]:
properties = {"user":"airflow","password":"airflow","driver":"org.postgresql.Driver"}
(
    df_dim_package
    .write
    .jdbc(url="jdbc:postgresql://postgres:5432/postgres",
         table="dim_package",
         properties=properties)
)

# Dim status

In [33]:
df_dwh.select("status_id","status_name")

DataFrame[status_id: int, status_name: string]

In [34]:
df_dim_status = df_dwh.select("status_id","status_name")

In [35]:
df_dim_status = df_dim_status.withColumnRenamed("status_id","bk_status")
df_dim_status = df_dim_status.withColumnRenamed("status_name","name_status")

In [36]:
df_dim_status.printSchema()

root
 |-- bk_status: integer (nullable = true)
 |-- name_status: string (nullable = true)



In [50]:
df_dim_status.show(3)

+---------+--------------+
|bk_status|   name_status|
+---------+--------------+
|        1|In_Maintenance|
|        1|In_Maintenance|
|        1|In_Maintenance|
+---------+--------------+
only showing top 3 rows



In [51]:
properties = {"user":"airflow","password":"airflow","driver":"org.postgresql.Driver"}
(
    df_dim_status
    .write
    .jdbc(url="jdbc:postgresql://postgres:5432/postgres",
         table="dim_status",
         properties=properties)
)

# Fact

In [40]:
df_fact_maintenance = (df_dwh
 .select("status_id"
         ,"package_id"
         ,"maintenance_id",
         "price_maintenance")
)

In [41]:
df_fact_maintenance = df_fact_maintenance.withColumnRenamed("status_id","bk_status")
df_fact_maintenance = df_fact_maintenance.withColumnRenamed("package_id","bk_package")
df_fact_maintenance = df_fact_maintenance.withColumnRenamed("maintenance_id","bk_maintenance")

In [42]:
df_fact_maintenance.printSchema()

root
 |-- bk_status: integer (nullable = true)
 |-- bk_package: integer (nullable = true)
 |-- bk_maintenance: integer (nullable = true)
 |-- price_maintenance: double (nullable = true)



In [52]:
df_fact_maintenance.show(4)

+---------+----------+--------------+-----------------+
|bk_status|bk_package|bk_maintenance|price_maintenance|
+---------+----------+--------------+-----------------+
|        1|       200|         36722|            650.0|
|        1|       200|         38642|            650.0|
|        1|       200|         38470|           1000.0|
|        1|       200|         38856|           1100.0|
+---------+----------+--------------+-----------------+
only showing top 4 rows



In [53]:
properties = {"user":"airflow","password":"airflow","driver":"org.postgresql.Driver"}
(
    df_fact_maintenance
    .write
    .jdbc(url="jdbc:postgresql://postgres:5432/postgres",
         table="fact_maintenance",
         properties=properties)
)