In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr,to_date,col

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Reader options shared by both CSV extracts: first row is the header and
# column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}

df_src1_raw = (
    spark.read
    .format("csv")
    .options(**csv_options)
    .load("/Volumes/workspace/logistics_data/logistics_volume/logistics_source1")
)

df_src2_raw = (
    spark.read
    .format("csv")
    .options(**csv_options)
    .load("/Volumes/workspace/logistics_data/logistics_volume/logistics_source2")
)

# The shipment-detail file is read with the same options plus `multiline`,
# exactly as the reader was configured before.
json_options = dict(csv_options, multiline=True)

df_logistics_ship_raw = (
    spark.read
    .format("json")
    .options(**json_options)
    .load("/Volumes/workspace/logistics_data/logistics_volume/logistics_shipment_detail_3000.json")
)

# Report the row count of every raw input (each count triggers a Spark job).
for label, frame in (
    ('count of df_src1_raw:-', df_src1_raw),
    ('count of df_src2_raw:-', df_src2_raw),
    ('count of df_json_raw:-', df_logistics_ship_raw),
):
    print(label, frame.count())


In [0]:
# Task 1 and 2

from pyspark.sql.functions import lit,to_date,col,expr,when,lower,upper,cast,current_timestamp, initcap,count
from pyspark.sql.types import DecimalType

# Tag every row with the extract it came from so lineage survives the union.
df_src1_clned = df_src1_raw.withColumn("source", lit("source1"))
df_src2_clned = df_src2_raw.withColumn("source", lit("source2"))

# Align the two extracts by column name (a column missing on one side is
# filled with nulls) and keep only the attributes used below.
df_src_combined = df_src1_clned.unionByName(
    df_src2_clned, allowMissingColumns=True
).select(
    "shipment_id",
    "first_name",
    "last_name",
    "age",
    "role",
    "hub_location",
    "vehicle_type",
    "source",
)

# Cleaning pipeline producing one row per shipment_id:
#   1. drop rows with a null shipment_id or role
#   2. keep rows that have at least one of first_name / last_name
#   3. keep rows whose shipment_id consists of digits only
#   4. keep a single row per shipment_id
#   5. age -> int when it is all digits, otherwise the sentinel -1
#   6. vehicle_type: null -> "UNKNOWN", truck -> "LMV", bike -> "TwoWheeler"
#      (case-insensitive match), everything else unchanged; then upper-cased
#   7. role lower-cased, hub_location in initcap form
#   8. rename hub/name columns to their target names
df_user = (
    df_src_combined
    .dropna(subset=["shipment_id", "role"])
    .filter(col("first_name").isNotNull() | col("last_name").isNotNull())
    .filter(col("shipment_id").rlike("^[0-9]+$"))
    # NOTE(review): when several rows share a shipment_id, which one survives
    # is not controlled here - confirm that is acceptable.
    .dropDuplicates(["shipment_id"])
    .withColumn(
        "age",
        when(col("age").rlike("^[0-9]+$"), col("age").cast("int")).otherwise(-1),
    )
    .withColumn(
        "vehicle_type",
        when(col("vehicle_type").isNull(), "UNKNOWN")
        .when(lower(col("vehicle_type")) == "truck", "LMV")
        .when(lower(col("vehicle_type")) == "bike", "TwoWheeler")
        .otherwise(col("vehicle_type")),
    )
    .withColumn("role", lower(col("role")))
    # Fix: this step previously upper-cased `role` into `vehicle_type`, which
    # threw away the normalisation above. It now upper-cases vehicle_type
    # itself, so the mapped labels end up as "UNKNOWN", "LMV", "TWOWHEELER".
    .withColumn("vehicle_type", upper(col("vehicle_type")))
    .withColumn("hub_location", initcap(col("hub_location")))
    .withColumnsRenamed(
        {
            "hub_location": "origin_hub_city",
            "first_name": "staff_first_name",
            "last_name": "staff_last_name",
        }
    )
)

# No trailing full-row dropDuplicates(): shipment_id is already unique after
# step 4, so no two rows can be identical and the extra shuffle did nothing.


In [0]:
 
# Standardise the raw shipment-detail records.
# The three casts replace columns in place; the three metadata columns are
# added afterwards in the order domain, ingestion_timestamp, is_expedited.
# is_expedited is deliberately still the string "False" at this stage.
df_log_data_stand = (
    df_logistics_ship_raw
    .withColumn("shipment_date", to_date(col("shipment_date"), 'yyyy-MM-dd'))
    .withColumn("shipment_cost", col("shipment_cost").cast(DecimalType(15, 2)))
    .withColumn("shipment_weight_kg", col("shipment_weight_kg").cast("double"))
    .withColumn("domain", lit("logistics"))
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("is_expedited", lit("False"))
)

# Turn the is_expedited flag into a real boolean, then remove rows that are
# exact duplicates across all columns.
is_expedited_flag = col("is_expedited").cast("boolean")
df_log_standard = (
    df_log_data_stand
    .withColumn("is_expedited", is_expedited_flag)
    .dropDuplicates()
)
  