In [0]:
from pyspark.sql import SparkSession
# Reuse the already-active Spark session when there is one; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

from pyspark.sql.functions import expr,to_date,col



# Load the three raw inputs and let Spark infer the column types, so the data
# can be profiled before an explicit schema is enforced.
df_src1_raw = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/workspace/logistics_data/logistics_volume/logistics_source1")
)

df_src2_raw = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/workspace/logistics_data/logistics_volume/logistics_source2")
)

# `header` and `inferSchema` are CSV reader options that the JSON reader ignores
# (it infers the schema whenever none is supplied), so only `multiLine` is set.
df_json_raw = (
    spark.read.format("json")
    .option("multiLine", True)
    .load("/Volumes/workspace/logistics_data/logistics_volume/logistics_shipment_detail_3000.json")
)

# Row count of each raw input; every count() is a separate Spark action.
print('count of df_src1_raw:-',df_src1_raw.count())
print('count of df_src2_raw:-',df_src2_raw.count())
print('count of df_json_raw:-',df_json_raw.count())


In [0]:
# Tasks 1 and 2: combine the two CSV sources, then cleanse the combined data

from pyspark.sql.functions import lit,to_date,col,expr,when,lower

# Tag each row with the source it was read from before the two sets are stacked.
df_src1_clned = df_src1_raw.withColumn("source", lit("source1"))
df_src2_clned = df_src2_raw.withColumn("source", lit("source2"))
# df_json_clned = df_json_raw.withColumn("source",lit("source3"))

# Union by column name (columns missing on one side are allowed) and keep
# only the columns used downstream, in a fixed order.
output_columns = [
    "shipment_id",
    "first_name",
    "last_name",
    "age",
    "role",
    "hub_location",
    "vehicle_type",
    "source",
]
df_src_combined = df_src1_clned.unionByName(
    df_src2_clned, allowMissingColumns=True
).select(*output_columns)

# Reusable column expressions for the cleansing steps below.
digits_only = "^[0-9]+$"
has_any_name = ~(col("first_name").isNull() & col("last_name").isNull())
vehicle_lower = lower(col("vehicle_type"))

# Non-numeric ages become -1; numeric ones are cast to int.
age_as_int = when(col("age").rlike(digits_only), col("age").cast("int")).otherwise(-1)

# Null vehicle types become "UNKNOWN"; "truck"/"bike" (any casing) are recoded,
# everything else is passed through unchanged.
vehicle_recoded = (
    when(col("vehicle_type").isNull(), "UNKNOWN")
    .when(vehicle_lower == "truck", "LMV")
    .when(vehicle_lower == "bike", "TwoWheeler")
    .otherwise(col("vehicle_type"))
)

df_cleansed = (
    df_src_combined
    .dropna(subset=["shipment_id", "role"])          # both keys must be present
    .where(has_any_name)                              # at least one name part present
    .where(col("shipment_id").rlike(digits_only))     # shipment_id must be all digits
    .dropDuplicates(["shipment_id"])                  # one row per shipment_id
    .withColumn("age", age_as_int)
    .withColumn("vehicle_type", vehicle_recoded)
)

# df_cleansed.groupBy("shipment_id").count().filter(col("count") > 1).display()

df_cleansed.display()



In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType,FloatType

# Explicit schemas for the three inputs. Every field is declared nullable.

# CSV source 1.
src1_schema = (
    StructType()
    .add("shipment_id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("role", StringType(), True)
)

# CSV source 2: the same leading columns as source 1 plus hub and vehicle columns.
src2_schema = (
    StructType()
    .add("shipment_id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("role", StringType(), True)
    .add("hub_location", StringType(), True)
    .add("vehicle_type", StringType(), True)
)

# JSON shipment-detail file.
srcjson_schema = (
    StructType()
    .add("cargo_type", StringType(), True)
    .add("destination_city", StringType(), True)
    .add("order_id", StringType(), True)
    .add("payment_mode", StringType(), True)
    .add("shipment_cost", FloatType(), True)
    .add("shipment_date", StringType(), True)
    .add("shipment_id", IntegerType(), True)
    .add("shipment_status", StringType(), True)
    .add("shipment_weight_kg", FloatType(), True)
    .add("source_city", StringType(), True)
    .add("vehicle_type", StringType(), True)
)


# Re-read the three inputs with the explicit schemas defined above.
# `inferSchema` is not passed for source 2: a schema is supplied explicitly,
# so the option was redundant there, and dropping it matches the source 1 read.
df_src1 = spark.read.schema(src1_schema).options(header=True).format("csv").load("/Volumes/workspace/logistics_data/logistics_volume/logistics_source1")
df_src2 = spark.read.schema(src2_schema).options(header=True).format("csv").load("/Volumes/workspace/logistics_data/logistics_volume/logistics_source2")
df_json = spark.read.schema(srcjson_schema).option("multiline", True).format("json").load("/Volumes/workspace/logistics_data/logistics_volume/logistics_shipment_detail_3000.json")


# Total vs. distinct row count per input; a gap between the two means the input
# contains fully duplicated rows. Each distinct count is taken from its own
# DataFrame (the src2 and json lines previously re-counted df_src1).
print("df_src1 count",df_src1.count())
print("df_src1 distinct count",df_src1.distinct().count())
print("df_src2 count",df_src2.count())
print("df_src2 distinct count",df_src2.distinct().count())
print("df_json count",df_json.count())
print("df_json distinct count",df_json.distinct().count())

# df_src1.printSchema()
# df_src2.printSchema()
# df_json.printSchema()

# display(df_json)

In [0]:
# Show, for each typed CSV source in turn, the rows whose shipment_id is null.
for typed_source in (df_src1, df_src2):
    display(typed_source.filter(typed_source["shipment_id"].isNull()))