### Creating the Catalog & Schema

In [0]:
%sql
-- Provision the landing area top-down: catalog, then schema, then volume.
-- Each statement uses IF NOT EXISTS, so re-running the cell is harmless.
CREATE CATALOG IF NOT EXISTS logistics_catalog_assign;
CREATE SCHEMA IF NOT EXISTS logistics_catalog_assign.landing_zone;
CREATE VOLUME IF NOT EXISTS logistics_catalog_assign.landing_zone.landing_vol;

### Creating the Directories

In [0]:
# Create one landing directory per incoming feed inside the landing volume.
for landing_dir in (
    "/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_source1/",
    "/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_source2/",
    "/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_shipment_detail/",
):
    dbutils.fs.mkdirs(landing_dir)

### Programmatically find a couple of data patterns by applying the EDA steps below (File: logistics_source1)

- Apply inferSchema and toDF to create a DF and analyse the actual data.

In [0]:
# Exploratory read of source 1: first row is the header, column types are inferred.
log_src1_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_source1/logistics_source1")
)

# Print up to 100 rows without truncating cell values.
log_src1_df.show(100, truncate=False)

- Analyse the schema, datatypes, columns etc.,

In [0]:
# Tree view of the inferred schema.
log_src1_df.printSchema()
# Raw StructType followed by the plain list of column names, one per line.
print(log_src1_df.schema, log_src1_df.columns, sep="\n")

- Analyse the duplicate records count and summary of the dataframe.

In [0]:
# A gap between the two counts means the file contains fully duplicated rows.
src1_total_rows = log_src1_df.count()
src1_distinct_rows = log_src1_df.distinct().count()
print(src1_total_rows)
print(src1_distinct_rows)

# Summary statistics for the DataFrame.
display(log_src1_df.summary())

- ###  a. Passive Data Munging - (File: logistics_source1 and logistics_source2)

In [0]:
# Exploratory read of source 2, with the same options used for source 1.
log_src2_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_source2/logistics_source2")
)
display(log_src2_df)
from pyspark.sql.functions import col

# For shipment_id and then age, list the values in each source that are not
# made up purely of digits. NULLs are not listed because the predicate
# evaluates to NULL for them and filter() keeps only TRUE rows.
for checked_column in ("shipment_id", "age"):
    for source_df in (log_src1_df, log_src2_df):
        non_numeric_rows = source_df.filter(~col(checked_column).rlike("^[0-9]+$"))
        non_numeric_rows.select(checked_column).show(truncate=False)

### b. Active Data Munging File: logistics_source1 and logistics_source2

- Read both files without enforcing schema
- Align them into a single canonical schema: shipment_id, first_name, last_name, age, role, hub_location, vehicle_type, data_source
- Add data_source column with values as: system1, system2 in the respective dataframes

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql.functions import lit

# Canonical layout shared by both sources. 'corruptedrows' is the column
# configured below (columnNameOfCorruptRecord) to receive malformed records.
# NOTE(review): the task text says "without enforcing schema", yet a schema is
# applied here and columns are matched by position — confirm both files have
# the same column order.
schema = StructType([
    StructField('shipment_id', IntegerType(), True),
    StructField('first_name', StringType(), True),
    StructField('last_name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('role', StringType(), True),
    StructField('hub_location', StringType(), True),
    StructField('vehicle_type', StringType(), True),
    StructField('data_source', StringType(), True),
    StructField('corruptedrows', StringType(), True),
])


def _read_logistics_source(csv_path, system_label):
    """Read one source CSV with the canonical schema and tag its origin.

    Args:
        csv_path: Location of the CSV file inside the landing volume.
        system_label: Value stored in the ``data_source`` column.

    Returns:
        DataFrame in the canonical schema with ``data_source`` populated.
    """
    raw_df = spark.read.schema(schema).csv(
        path=csv_path,
        mode='permissive',
        columnNameOfCorruptRecord="corruptedrows",
        header=True,
    )
    return raw_df.withColumn("data_source", lit(system_label))


source1_df = _read_logistics_source(
    "/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_source1/logistics_source1",
    "system1",
)
source2_df = _read_logistics_source(
    "/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_source2/logistics_source2",
    "system2",
)

# Stack both sources (union is positional; the schemas are identical) and keep
# only the canonical business columns, dropping 'corruptedrows'.
single_canonical_df = source1_df.union(source2_df)
single_canonical_df = single_canonical_df.select(
    'shipment_id',
    'first_name',
    'last_name',
    'age',
    'role',
    'hub_location',
    'vehicle_type',
    'data_source',
)


#####Cleansing, Scrubbing:
#####Cleansing (removal of unwanted datasets)

- Mandatory Column Check - Drop any record where any of the following columns is NULL:shipment_id, role
- Name Completeness Rule - Drop records where both of the following columns are NULL: first_name, last_name
- Join Readiness Rule - Drop records where the join key is null: shipment_id

In [0]:
# Mandatory column check: drop a row when shipment_id OR role is NULL.
# Row counts are printed before and after to show how many rows were removed.
print(single_canonical_df.count())
cleanseddf = single_canonical_df.dropna(how="any", subset=["shipment_id", "role"])
print(cleanseddf.count())
display(cleanseddf)

In [0]:
# Name completeness rule: drop a row only when BOTH first_name and last_name are NULL.
cleanseddf2 = cleanseddf.dropna(how="all", subset=["first_name", "last_name"])
print(cleanseddf2.count())

In [0]:
# Join readiness rule: drop rows whose join key (shipment_id) is NULL.
cleanseddf3 = cleanseddf2.na.drop(how='all', subset=["shipment_id"])
# Report the row count of the DataFrame produced by this step (the original
# cell printed cleanseddf2's count, so the effect of this rule was never shown).
print(cleanseddf3.count())

Scrubbing (convert raw to tidy)
- Age Defaulting Rule - Fill NULL values in the age column with: -1
- Vehicle Type Default Rule - Fill NULL values in the vehicle_type column with: UNKNOWN
- Invalid Age Replacement - Replace the following values in age: "ten" to -1 "" to -1
- Vehicle Type Normalization - Replace inconsistent vehicle types: truck to LMV bike to TwoWheeler

In [0]:
# Age defaulting rule: NULL ages become the sentinel value -1.
cleanseddf4 = cleanseddf3.fillna(-1, subset=['age'])
# Show the rows that now carry the sentinel.
cleanseddf4.filter('age=-1').show()

In [0]:
# Vehicle type default rule: NULL vehicle types become 'UNKNOWN'.
cleanseddf5 = cleanseddf4.fillna('UNKNOWN', subset=['vehicle_type'])
# Show the rows that received the default.
cleanseddf5.filter('vehicle_type=="UNKNOWN"').show()

In [0]:
# Vehicle type normalization: map raw labels to the standard vocabulary.
# NOTE(review): replacement matches the keys exactly; the rule text spells them
# in lowercase ("truck", "bike") — confirm the casing present in the data.
find_and_replace = {
    'Truck': 'LMV',
    'Bike': 'TwoWheeler',
}
cleanseddf6 = cleanseddf5.replace(find_and_replace, subset=['vehicle_type'])
# Show rows whose vehicle_type is now 'LMV'.
cleanseddf6.filter('vehicle_type=="LMV"').show()

##### 3. Standardization, De-Duplication and Replacement / Deletion of Data to make it in a usable format

Creating shipments Details data Dataframe creation

Create a DF by Reading Data from logistics_shipment_detail.json
As this data is a clean json data, it doesn't require any cleansing or scrubbing.

In [0]:
# Load the shipment-detail JSON with the multiline option enabled, in PERMISSIVE mode.
shipment_json_reader = spark.read.options(multiline="true", mode="PERMISSIVE")
logistics_shipment_df = shipment_json_reader.json(
    "/Volumes/logistics_catalog_assign/landing_zone/landing_vol/logistics_shipment_detail/logistics_shipment_detail_3000.json"
)

# Print up to 100 rows without truncating cell values.
logistics_shipment_df.show(100, truncate=False)

Add a column
Source File: DF of logistics_shipment_detail_3000.json
- domain as 'Logistics', current timestamp 'ingestion_timestamp' and 'False' as 'is_expedited'

In [0]:
from pyspark.sql.functions import current_timestamp,lit

# Add three constant/audit columns to every shipment row.
# 'is_expedited' is stored as the string "False" here; a later cell casts it to boolean.
logistics_shipment_df2 = (
    logistics_shipment_df
    .withColumn("domain", lit("Logistics"))
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("is_expedited", lit("False"))
)
logistics_shipment_df2.show(100, truncate=False)

Column Uniformity:
- role - Convert values to lowercase
  - Source File: DF of merged(logistics_source1 & logistics_source2)
- vehicle_type - Convert values to UPPERCASE
  - Source File: DF of logistics_shipment_detail_3000.json
- hub_location - Convert values to initcap case
  - Source File: DF of merged(logistics_source1 & logistics_source2)

In [0]:
from pyspark.sql.functions import upper,initcap,lower

# Column uniformity on the merged staff DataFrame:
#   role         -> lowercase (listed in the rules for this step but previously not applied)
#   vehicle_type -> UPPERCASE
#   hub_location -> Initcap (first letter of each word capitalised)
cleanseddf7 = (
    cleanseddf6
    .withColumn("role", lower(col("role")))
    .withColumn("vehicle_type", upper(col("vehicle_type")))
    .withColumn("hub_location", initcap(col("hub_location")))
)
# Spot-check the rows that came from the second source system.
cleanseddf7.where('data_source == "system2"').show()

Format Standardization:
- Source Files: DF of logistics_shipment_detail_3000.json
  - Convert shipment_date to yyyy-MM-dd
  - Ensure shipment_cost has 2 decimal precision

In [0]:
from pyspark.sql.functions import to_date,date_format

# shipment_date: parse the incoming text as 'yy-MM-dd', then render it as an
# 'MM-dd-yyyy' string.
# NOTE(review): the rule for this step asks for 'yyyy-MM-dd'. The later
# shipment_year / shipment_month cells parse this column with 'MM-dd-yyyy',
# so the output format must be changed together with those cells.
shipment_date_text = date_format(to_date(col("shipment_date"), "yy-MM-dd"), "MM-dd-yyyy")
# shipment_cost: fixed-point decimal with two digits after the decimal point.
shipment_cost_2dp = col("shipment_cost").cast("decimal(18,2)")

logistics_shipment_df3 = (
    logistics_shipment_df2
    .withColumn("shipment_date", shipment_date_text)
    .withColumn("shipment_cost", shipment_cost_2dp)
)
logistics_shipment_df3.show()

Data Type Standardization
Standardizing column data types to fix schema drift and enable mathematical operations.
 - Source File: DF of merged(logistics_source1 & logistics_source2)
   - age: Cast String to Integer
 - Source File: DF of logistics_shipment_detail_3000.json
   - shipment_weight_kg: Cast to Double
- Source File: DF of logistics_shipment_detail_3000.json
   - is_expedited: Cast to Boolean

In [0]:
# Staff data: make sure age is an integer column.
cleanseddf8 = cleanseddf7.withColumn("age", col('age').cast("int"))

# Shipment data: add 'shipment_weight' as a double copy of shipment_weight_kg
# (the original column is left in place) and cast is_expedited to boolean.
logistics_shipment_df4 = (
    logistics_shipment_df3
    .withColumn("shipment_weight", col('shipment_weight_kg').cast("double"))
    .withColumn("is_expedited", col('is_expedited').cast("boolean"))
)
logistics_shipment_df4.show(10000, truncate=False)

Naming Standardization
Source File: DF of merged(logistics_source1 & logistics_source2)
 - Rename: first_name to staff_first_name
 - Rename: last_name to staff_last_name
 - Rename: hub_location to origin_hub_city

In [0]:
# Naming standardization: prefix the staff name columns and rename the hub column.
cleanseddf9 = (
    cleanseddf8
    .withColumnRenamed("first_name", "staff_first_name")
    .withColumnRenamed("last_name", "staff_last_name")
    .withColumnRenamed("hub_location", "origin_hub_city")
)

Reordering columns logically in a better standard format:
Source File: DF of Data from all 3 files
shipment_id (Identifier), staff_first_name (Dimension), staff_last_name (Dimension), role (Dimension), origin_hub_city (Location), shipment_cost (Metric), ingestion_timestamp (Audit)

In [0]:
# Final column order for the staff DataFrame: identifier first, then the
# descriptive columns, with the source tag last.
staff_column_order = [
    'shipment_id',
    'staff_first_name',
    'staff_last_name',
    'age',
    'role',
    'origin_hub_city',
    'vehicle_type',
    'data_source',
]
cleanseddf10 = cleanseddf9.selectExpr(*staff_column_order)
cleanseddf10.show(100, truncate=False)

In [0]:
# Final column order for the shipment DataFrame; columns not listed here
# (for example shipment_weight_kg) are dropped by this projection.
shipment_column_order = [
    'shipment_id',
    'domain',
    'cargo_type',
    'source_city',
    'destination_city',
    'order_id',
    'shipment_date',
    'shipment_cost',
    'shipment_weight',
    'is_expedited',
    'ingestion_timestamp',
]
logistics_shipment_df5 = logistics_shipment_df4.selectExpr(*shipment_column_order)
logistics_shipment_df5.show(100, truncate=False)


Deduplication:

 - Apply Record Level De-Duplication
 - Apply Column Level De-Duplication (Primary Key Enforcement)

In [0]:
# Record-level de-duplication: collapse rows that are identical in every column.
cleanseddf11 = cleanseddf10.distinct()
# Column-level de-duplication: keep a single row per shipment_id.
# Which of the competing rows survives is not controlled here.
cleanseddf12 = cleanseddf11.dropDuplicates(subset=['shipment_id'])
display(cleanseddf12)

In [0]:
# Record-level de-duplication: collapse rows that are identical in every column.
logistics_shipment_df6 = logistics_shipment_df5.distinct()
# Column-level de-duplication: keep a single row per shipment_id.
# Which of the competing rows survives is not controlled here.
logistics_shipment_df7 = logistics_shipment_df6.dropDuplicates(subset=['shipment_id'])
display(logistics_shipment_df7)

### 2. Data Enrichment - Detailing of data

1. Add Audit Timestamp (load_dt) Source File: DF of logistics_source1 and logistics_source2

 - Scenario: We need to track exactly when this record was ingested into our Data Lakehouse for auditing purposes.
 - Action: Add a column load_dt using the function current_timestamp().

In [0]:
# Audit column: stamp every staff row with the timestamp of this load.
load_timestamp = current_timestamp()
cleanseddf13 = cleanseddf12.withColumn("load_dt", load_timestamp)
cleanseddf13.printSchema()

2. Create Full Name (full_name) Source File: DF of logistics_source1 and logistics_source2

 - Scenario: The reporting dashboard requires a single field for the driver's name instead of separate columns.
 - Action: Create full_name by concatenating first_name and last_name with a space separator.
Result: "Rajesh" + " " + "Kumar" -> "Rajesh Kumar"

In [0]:
# NOTE(review): the wildcard import is kept because later cells call concat,
# year and month unqualified; prefer explicit imports once those cells are updated.
from pyspark.sql.functions import *

# Build full_name as "<first> <last>".
# concat_ws skips NULL inputs, so a row with only one of the two names still
# gets a usable full_name. Plain concat() would return NULL for such rows, and
# the cleansing step only removes rows where BOTH names are NULL.
cleanseddf14 = cleanseddf13.withColumn(
    "full_name",
    concat_ws(" ", col("staff_first_name"), col("staff_last_name")),
)
display(cleanseddf14)

3. Define Route Segment (route_segment) Source File: DF of logistics_shipment_detail_3000.json

 - Scenario: The logistics team wants to analyze performance based on specific transport lanes (Source to Destination).
 - Action: Combine source_city and destination_city with a hyphen.
 - Result: "Chennai" + "-" + "Pune" -> "Chennai-Pune"

In [0]:
# Route segment: "<source_city>-<destination_city>", e.g. "Chennai-Pune".
# The separator is a single hyphen as the requirement specifies (the previous
# version used "->").
logistics_shipment_df8 = logistics_shipment_df7.withColumn(
    "route_segment",
    concat(col("source_city"), lit("-"), col("destination_city")),
)
display(logistics_shipment_df8)

4. Generate Vehicle Identifier (vehicle_identifier) Source File: DF of logistics_shipment_detail_3000.json

 - Scenario: We need a unique tracking code that immediately tells us the vehicle type and the shipment ID.
 - Action: Combine vehicle_type and shipment_id to create a composite key.
 - Result: "Truck" + "_" + "500001" -> "Truck_500001"

In [0]:
# Composite tracking code: "<vehicle_type>_<shipment_id>".
vehicle_identifier_expr = concat(col('vehicle_type'), lit("_"), col('shipment_id'))
cleanseddf15 = cleanseddf14.withColumn("vehicle_identifier", vehicle_identifier_expr)
display(cleanseddf15)

1. Derive Shipment Year (shipment_year)

 - Scenario: Management needs an annual performance report to compare growth year-over-year.
 - Action: Extract the year component from shipment_date.
 - Result: "2024-04-23" -> 2024


In [0]:
# shipment_date is an 'MM-dd-yyyy' string at this point, so parse it back to a
# date before extracting the year.
shipment_date_parsed = to_date(col('shipment_date'), 'MM-dd-yyyy')
logistics_shipment_df9 = logistics_shipment_df8.withColumn("shipment_year", year(shipment_date_parsed))
logistics_shipment_df9.show()

2. Derive Shipment Month (shipment_month)

 - Scenario: Analysts want to identify seasonal peaks (e.g., increased volume in December).
 - Action: Extract the month component from shipment_date.
 - Result: "2024-04-23" -> 4 (April)

In [0]:
# shipment_date is an 'MM-dd-yyyy' string at this point, so parse it back to a
# date before extracting the month number.
shipment_date_as_date = to_date(col('shipment_date'), 'MM-dd-yyyy')
logistics_shipment_df10 = logistics_shipment_df9.withColumn("shipment_month", month(shipment_date_as_date))
logistics_shipment_df10.show()