# Transformation Pipeline - Silver [![259302-pipeline-management-aws-deployment-copy-icon.png](https://i.postimg.cc/3w6CMc6p/259302-pipeline-management-aws-deployment-copy-icon.png)](https://postimg.cc/zLCRK0qX)

##### Load Data

In [50]:
# Bronze-layer Delta table locations for the two ingested datasets
placements_path, interviews_path = (
    "Tables/Bronze/tblPlacements",
    "Tables/Bronze/tblInterviews",
)

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 52, Finished, Available, Finished)

In [51]:
# Load the Bronze Delta tables into DataFrames.
# No "header" option is passed: that is a CSV reader option, and a Delta table
# carries its own schema, so the option had no meaning for this source.
placements_df = spark.read.format("delta").load(placements_path)
interviews_df = spark.read.format("delta").load(interviews_path)

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 53, Finished, Available, Finished)

##### Exploratory Data Analysis (EDA)

In [52]:
# Print the placements schema, then a sample of rows, each under its banner
for banner, render in (
    ("Placements Schema:", placements_df.printSchema),
    ("Sample Placements Data:", placements_df.show),
):
    print(banner)
    render()

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 54, Finished, Available, Finished)

Placements Schema:
root
 |-- PlacementId: string (nullable = true)
 |-- Candidate_email: string (nullable = true)
 |-- Start_Date: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Marketing_Opt_Out: string (nullable = true)

Sample Placements Data:
+-----------+-----------------+----------+--------+-----------------+
|PlacementId|  Candidate_email|Start_Date|  Status|Marketing_Opt_Out|
+-----------+-----------------+----------+--------+-----------------+
|          1|Francis@gmail.com| 4/11/2024|  Active|            FALSE|
|          2|Jessica@gmail.com| 5/10/2024|  Active|             TRUE|
|          3|Michael@gmail.com| 6/12/2024|  Active|            FALSE|
|          4|  Sarah@gmail.com|  7/1/2024|Inactive|             TRUE|
|          5| Thomas@gmail.com| 8/15/2024|  Active|            FALSE|
|          6|  Emily@gmail.com| 9/20/2024|  Active|             TRUE|
|          7|Francis@gmail.com|10/11/2024|  Active|            FALSE|
|          8|  Alice@gmail.com|1

In [53]:
# Print the interviews schema, then a sample of rows, each under its banner
for banner, render in (
    ("Interviews Schema:", interviews_df.printSchema),
    ("Sample Interviews Data:", interviews_df.show),
):
    print(banner)
    render()

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 55, Finished, Available, Finished)

Interviews Schema:
root
 |-- InterviewId: string (nullable = true)
 |-- Candidate_email: string (nullable = true)
 |-- Interview_Date: string (nullable = true)

Sample Interviews Data:
+-----------+--------------------+--------------+
|InterviewId|     Candidate_email|Interview_Date|
+-----------+--------------------+--------------+
|          1|     Emily@gmail.com|      4/1/2024|
|          2|lisa.white@gmail.com|      4/5/2024|
|          3|eric.smith@gmail.com|     4/12/2024|
|          4|megan.jones@gmail...|     4/15/2024|
|          5|kevin.hill@gmail.com|     4/18/2024|
|          6|sophie.king@gmail...|     4/20/2024|
|          7|ryan.moore@gmail.com|     4/25/2024|
|          8|chloe.green@gmail...|     4/27/2024|
|          9| jacob.lee@gmail.com|     4/30/2024|
|         10|emma.scott@gmail.com|      5/2/2024|
|         11|daniel.wright@gma...|      5/5/2024|
|         12|olivia.adams@gmai...|      5/7/2024|
+-----------+--------------------+--------------+



##### Silver Layer Transformations

In [54]:
# Import dependencies
from pyspark.sql.functions import to_date, col, split, expr, when, to_timestamp, concat_ws, initcap

# The Bronze date strings are month/day/year without zero padding
# (e.g. "4/11/2024", "7/1/2024", "10/11/2024"), so the pattern uses
# single-letter month and day fields. This matches the data directly, so the
# session-wide "spark.sql.legacy.timeParserPolicy" = "LEGACY" override that the
# zero-padded "MM/dd/yyyy" pattern depended on is no longer needed.
# NOTE(review): assumes every row follows this M/d/yyyy shape - confirm against
# the full Bronze tables, since only a sample is visible here.
DATE_PATTERN = "M/d/yyyy"

# Change the date columns to data type date
placements_df = placements_df.withColumn("Start_Date", to_date(col("Start_Date"), DATE_PATTERN))
interviews_df = interviews_df.withColumn("Interview_Date", to_date(col("Interview_Date"), DATE_PATTERN))

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 56, Finished, Available, Finished)

In [55]:
# Convert the string identifier columns to integers
placements_df = placements_df.withColumn(
    "PlacementId",
    placements_df["PlacementId"].cast("int"),
)
interviews_df = interviews_df.withColumn(
    "InterviewId",
    interviews_df["InterviewId"].cast("int"),
)

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 57, Finished, Available, Finished)

In [57]:
# Extract first name, last name, and full name from the email field
def extract_names(df, email_col="Candidate_email"):
    """Derive First_Name, Last_Name and Full_Name columns from an email column.

    The text before the first "@" is split on "." or "_". The first token
    becomes First_Name and the second token, when present, becomes Last_Name;
    both are capitalised with ``initcap``. Tokens after the second are ignored.

    Args:
        df: Spark DataFrame that contains ``email_col``.
        email_col: Name of the column holding the email address.

    Returns:
        A new DataFrame with First_Name, Last_Name and Full_Name added.
        Last_Name is "" when the local part has only one token; in that case
        Full_Name is just First_Name, with no trailing separator.
    """
    # Temporary working column; the underscore-prefixed name is chosen to make
    # a clash with a real input column unlikely, since it is dropped at the end.
    local_part = "_email_local_part"
    tokens_sql = f"split({local_part}, '[._]')"

    # Extract the local part of the email (before @)
    df = df.withColumn(local_part, split(col(email_col), "@")[0])

    # Split the local part into first and last names based on "." or "_"
    df = df.withColumn("First_Name", initcap(expr(f"{tokens_sql}[0]")))
    df = df.withColumn(
        "Last_Name",
        when(expr(f"size({tokens_sql}) > 1"), initcap(expr(f"{tokens_sql}[1]"))).otherwise(""),
    )

    # Create Full Name. concat_ws skips NULL inputs, so an empty Last_Name is
    # mapped to NULL here (when() without otherwise()) to avoid producing
    # "First " with a trailing space.
    df = df.withColumn(
        "Full_Name",
        concat_ws(
            " ",
            col("First_Name"),
            when(col("Last_Name") != "", col("Last_Name")),
        ),
    )

    # Drop the temporary working column
    return df.drop(local_part)

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 59, Finished, Available, Finished)

In [58]:
# Apply name extraction on both placements and interviews DataFrames.
# The column name is spelled exactly as in the schema ("Candidate_email") so
# the call does not depend on Spark's case-insensitive column resolution.
placements_df = extract_names(placements_df, "Candidate_email")
interviews_df = extract_names(interviews_df, "Candidate_email")

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 60, Finished, Available, Finished)

In [59]:
# Show schema and sample rows for each transformed DataFrame in turn
inspection_targets = (
    ("Placements Schema:", "Sample Placements Data:", placements_df),
    ("Interviews Schema:", "Sample Interviews Data:", interviews_df),
)
for schema_banner, sample_banner, frame in inspection_targets:
    print(schema_banner)
    frame.printSchema()
    print(sample_banner)
    frame.show()

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 61, Finished, Available, Finished)

Placements Schema:
root
 |-- PlacementId: integer (nullable = true)
 |-- Candidate_email: string (nullable = true)
 |-- Start_Date: date (nullable = true)
 |-- Status: string (nullable = true)
 |-- Marketing_Opt_Out: string (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Last_Name: string (nullable = true)
 |-- Full_Name: string (nullable = false)

Sample Placements Data:
+-----------+-----------------+----------+--------+-----------------+----------+---------+---------+
|PlacementId|  Candidate_email|Start_Date|  Status|Marketing_Opt_Out|First_Name|Last_Name|Full_Name|
+-----------+-----------------+----------+--------+-----------------+----------+---------+---------+
|          1|Francis@gmail.com|2024-04-11|  Active|            FALSE|   Francis|         | Francis |
|          2|Jessica@gmail.com|2024-05-10|  Active|             TRUE|   Jessica|         | Jessica |
|          3|Michael@gmail.com|2024-06-12|  Active|            FALSE|   Michael|         | Michael |
| 

##### Save Data to Silver Layer

In [61]:
# Persist each transformed DataFrame to its Silver-layer Delta location,
# replacing whatever was stored there before
silver_targets = (
    (placements_df, "Tables/Silver/tblPlacements"),
    (interviews_df, "Tables/Silver/tblInterviews"),
)
for frame, destination in silver_targets:
    frame.write.format("delta").mode("overwrite").save(destination)

print("Tables loaded into Silver Layer Successfully!")

StatementMeta(, 1a9cf707-a417-4e14-baa9-5d5b852a65b9, 63, Finished, Available, Finished)

Tables loaded into Silver Layer Successfully!
