# Transformation Pipeline - Gold [![259302-pipeline-management-aws-deployment-copy-icon.png](https://i.postimg.cc/3w6CMc6p/259302-pipeline-management-aws-deployment-copy-icon.png)](https://postimg.cc/zLCRK0qX)

##### Load Data

In [24]:
# Locations of the Silver-layer tables read by this notebook
placements_path, interviews_path = (
    "Tables/Silver/tblPlacements",
    "Tables/Silver/tblInterviews",
)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 26, Finished, Available, Finished)

In [25]:
# Load the Silver tables into DataFrames.
# No reader options are set: "header" is a CSV reader option and has no effect
# when reading the Delta format, so it was dropped to avoid implying otherwise.
placements_df = spark.read.format("delta").load(placements_path)
interviews_df = spark.read.format("delta").load(interviews_path)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 27, Finished, Available, Finished)

##### EDA

In [26]:
# Print the schema and a sample of rows for each input DataFrame,
# placements first, then interviews
inspection_targets = (
    ("Placements Schema:", "Sample Placements Data:", placements_df),
    ("Interviews Schema:", "Sample Interviews Data:", interviews_df),
)
for schema_heading, sample_heading, frame in inspection_targets:
    print(schema_heading)
    frame.printSchema()
    print(sample_heading)
    frame.show()

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 28, Finished, Available, Finished)

Placements Schema:
root
 |-- PlacementId: integer (nullable = true)
 |-- Candidate_email: string (nullable = true)
 |-- Start_Date: date (nullable = true)
 |-- Status: string (nullable = true)
 |-- Marketing_Opt_Out: string (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Last_Name: string (nullable = true)
 |-- Full_Name: string (nullable = true)

Sample Placements Data:
+-----------+-----------------+----------+--------+-----------------+----------+---------+---------+
|PlacementId|  Candidate_email|Start_Date|  Status|Marketing_Opt_Out|First_Name|Last_Name|Full_Name|
+-----------+-----------------+----------+--------+-----------------+----------+---------+---------+
|          1|Francis@gmail.com|2024-04-11|  Active|            FALSE|   Francis|         | Francis |
|          2|Jessica@gmail.com|2024-05-10|  Active|             TRUE|   Jessica|         | Jessica |
|          3|Michael@gmail.com|2024-06-12|  Active|            FALSE|   Michael|         | Michael |
|  

##### Gold Layer Transformations

In [27]:
# Import dependencies
from pyspark.sql.functions import col, lit, when, year, row_number
from pyspark.sql.window import Window
from datetime import datetime

# Derive the previous calendar year from the clock at the time this cell runs
current_year = datetime.now().year
last_year = current_year - 1

# Keep only placements that started, and interviews that took place,
# in the previous calendar year
started_last_year = year(col("Start_Date")) == last_year
interviewed_last_year = year(col("Interview_Date")) == last_year
filtered_placements = placements_df.where(started_last_year)
filtered_interviews = interviews_df.where(interviewed_last_year)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 29, Finished, Available, Finished)

In [28]:
# Project both DataFrames onto the same three columns, in the same order:
# Candidate_email, PlacementId, Interviewed_Date.
# Whichever column a source does not have is filled with a typed NULL.
null_interviewed_date = lit(None).cast("date").alias("Interviewed_Date")
null_placement_id = lit(None).cast("int").alias("PlacementId")

# Placements carry no interview date
filtered_placements = filtered_placements.select(
    "Candidate_email",
    "PlacementId",
    null_interviewed_date,
)

# Interviews carry no placement id; Interview_Date is renamed to Interviewed_Date
filtered_interviews = filtered_interviews.select(
    "Candidate_email",
    null_placement_id,
    col("Interview_Date").alias("Interviewed_Date"),
)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 30, Finished, Available, Finished)

In [29]:
# Combine placements and interviews into a single DataFrame.
# unionByName matches columns by name instead of by position, so a future
# reordering of either select raises an error rather than silently putting
# values in the wrong column. Both inputs are expected to expose
# Candidate_email, PlacementId and Interviewed_Date.
combined_df = filtered_placements.unionByName(filtered_interviews)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 31, Finished, Available, Finished)

In [30]:
# Tag every row with a priority so placements win over interviews:
# rows without a PlacementId (interviews) get Rank 2, all others get Rank 1
priority = when(col("PlacementId").isNull(), 2).otherwise(1)
combined_df = combined_df.withColumn("Rank", priority)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 32, Finished, Available, Finished)

In [31]:
# Select the highest priority record for each Candidate_email.
# Ordering by Rank alone leaves ties whenever a candidate has several
# placements or several interviews, and row_number() would then pick an
# arbitrary row that can differ between runs. The extra sort keys make the
# pick repeatable.
# NOTE(review): preferring the highest PlacementId / latest Interviewed_Date
# is an assumed tie-break rule — confirm with the data owner.
window_spec = Window.partitionBy("Candidate_email").orderBy(
    col("Rank"),
    col("PlacementId").desc_nulls_last(),
    col("Interviewed_Date").desc_nulls_last(),
)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 33, Finished, Available, Finished)

In [32]:
# Number the rows inside each window partition; the row numbered 1 is the
# one that sorts first under window_spec
position_in_partition = row_number().over(window_spec)
deduplicated_df = combined_df.withColumn("row_number", position_in_partition)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 34, Finished, Available, Finished)

In [33]:
# Keep the first-numbered row of every partition, then discard the two
# helper columns that were only needed for the selection
is_top_row = col("row_number") == 1
final_df = deduplicated_df.where(is_top_row).drop("Rank", "row_number")

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 35, Finished, Available, Finished)

In [34]:
# Fix the column set and order of the output explicitly
output_columns = ["Candidate_email", "PlacementId", "Interviewed_Date"]
final_df = final_df.select(*output_columns)

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 36, Finished, Available, Finished)

##### Save Data to Gold Layer

In [35]:
# Write the result to the Gold layer as a Delta table, replacing any
# existing contents at the target path
final_df.write.save(
    "Tables/Gold/placementsPrevYear",
    format="delta",
    mode="overwrite",
)

print("Tables loaded into Gold Layer Successfully!")

StatementMeta(, 49b9c6cb-8e8e-4d47-94a2-97411ef91cc7, 37, Finished, Available, Finished)

Tables loaded into Gold Layer Successfully!
