In [0]:
%sql
-- Make `agriculture` the current catalog for this session, so names that omit the catalog resolve against it.
USE CATALOG agriculture

In [0]:
# Master lookup of Mandi data strings -> standard soil labels.
# Each entry pairs one standard soil label with every Mandi spelling that
# normalises to it; several spellings may share a single label.
_MANDI_NAMES_BY_SOIL_LABEL = (
    ("rice", ("Rice",)),
    ("maize", ("Maize",)),
    ("chickpea", ("Chana", "Chickpeas")),
    ("lentil", ("Lentil", "Masur")),
    ("pomegranate", ("Pomegranate",)),
    ("banana", ("Banana",)),
    ("mango", ("Mango",)),
    ("grapes", ("Grapes",)),
    ("apple", ("Apple",)),
    ("orange", ("Orange",)),
    ("papaya", ("Papaya",)),
    ("coconut", ("Coconut",)),
    ("cotton", ("Cotton",)),
    ("jute", ("Jute",)),
    ("coffee", ("Coffee",)),
)

# Flatten to { "Mandi Data String": "Standard Soil Label" }, keeping the
# declaration order above as the dict's insertion order.
crop_mapping = {
    mandi_name: soil_label
    for soil_label, mandi_names in _MANDI_NAMES_BY_SOIL_LABEL
    for mandi_name in mandi_names
}

print("Mapping Dictionary Defined")

Mapping Dictionary Defined


In [0]:
from pyspark.sql.functions import col, when, regexp_extract, to_date, lit

# 2. Turn the mapping into a single Spark CASE WHEN expression.
# Every (Mandi name -> label) pair contributes one branch of the form
#   WHEN extracted_filename contains '<Mandi name>' THEN '<label>'
# and the branches are added in the dictionary's iteration order.
filename_col = col("extracted_filename")
crop_normalization_expr = None
for mandi_name, soil_label in crop_mapping.items():
    name_in_filename = filename_col.contains(mandi_name)
    # The first pair opens the chain with when(); every later pair extends it.
    crop_normalization_expr = (
        when(name_in_filename, soil_label)
        if crop_normalization_expr is None
        else crop_normalization_expr.when(name_in_filename, soil_label)
    )

# Filenames that hit no branch are labelled 'unknown'.
crop_normalization_expr = crop_normalization_expr.otherwise("unknown")

In [0]:
# 3. Load the Bronze market-price table and preview it
bronze_table = "agriculture.bronze.market_prices"
df_bronze = spark.table(bronze_table)
display(df_bronze)

state_name,district_name,market_name,variety,group,arrivals_tonnes,min_price_rs_quintal,max_price_rs_quintal,modal_price_rs_quintal,reported_date,_metadata,ingestion_ts,source_file
Arunachal Pradesh,Papum Pore,Naharlagun,Green,Fruits,2.0,15500.0,19000.0,17250.0,30 Jul 2004,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Black,Fruits,0.5,6000.0,7000.0,6500.0,08 May 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Green,Fruits,0.5,5700.0,5900.0,5800.0,08 May 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Black,Fruits,0.5,7000.0,8000.0,7500.0,04 May 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Green,Fruits,0.5,6000.0,7000.0,6500.0,04 May 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Black,Fruits,0.6,6000.0,7000.0,6500.0,26 Apr 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Green,Fruits,0.6,5000.0,6000.0,5500.0,26 Apr 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Black,Fruits,0.9,6000.0,8000.0,7000.0,25 Apr 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Green,Fruits,0.9,5500.0,5700.0,5600.0,25 Apr 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Black,Fruits,0.75,5000.0,6000.0,5500.0,22 Apr 2019,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Grapes.csv, Grapes.csv, 25385025, 0, 25385025, 2026-01-26T15:06:43.000Z)",2026-01-27T15:20:07.306Z,Grapes.csv


In [0]:
from pyspark.sql.functions import col, try_to_date

# Date patterns the pipeline already knows how to parse
format_1 = "dd MMM yyyy"
format_2 = "yyyy-MM-dd"

# A value is "unknown" when parsing it yields NULL under BOTH patterns.
reported_date_col = col("reported_date")
parses_with_neither = (
    try_to_date(reported_date_col, format_1).isNull()
    & try_to_date(reported_date_col, format_2).isNull()
)

# Keep only the distinct raw strings that fell through both patterns.
unknown_formats = (
    df_bronze
    .filter(parses_with_neither)
    .select("reported_date")
    .distinct()
)

print("Any dates remaining below are unknown formats:")
display(unknown_formats)

Any dates remaining below are unknown formats:


reported_date


In [0]:
from pyspark.sql.functions import col, try_to_date, coalesce, regexp_extract

# Date parsing, kept apart from the main chain for readability:
# try 'dd MMM yyyy' first, then fall back to 'yyyy-MM-dd'.
# NOTE(review): a reported_date matching neither pattern yields a NULL
# market_date and the row is not filtered out below - confirm that is intended.
reported_date_col = col("reported_date")
date_parsing_logic = coalesce(
    try_to_date(reported_date_col, "dd MMM yyyy"),
    try_to_date(reported_date_col, "yyyy-MM-dd"),
)

# File name without directory or '.csv' suffix; the crop expression matches on it.
filename_stem = regexp_extract(col("source_file"), r"([^/]+)\.csv$", 1)

# Columns carried through unchanged, in output order.
dimension_columns = [
    "market_date",
    "crop_name",
    "state_name",
    "district_name",
    "market_name",
    "variety",
    "group",
]
# Numeric measures, each cast to double.
measure_columns = [
    "arrivals_tonnes",
    "min_price_rs_quintal",
    "max_price_rs_quintal",
    "modal_price_rs_quintal",
]

df_silver = (
    df_bronze
    .withColumn("extracted_filename", filename_stem)
    .withColumn("crop_name", crop_normalization_expr)
    .withColumn("market_date", date_parsing_logic)
    .select(
        *[col(name) for name in dimension_columns],
        *[col(name).cast("double") for name in measure_columns],
    )
    # Drop rows whose source file matched no known crop, then exact duplicates.
    .filter(col("crop_name") != "unknown")
    .dropDuplicates()
)

display(df_silver)

market_date,crop_name,state_name,district_name,market_name,variety,group,arrivals_tonnes,min_price_rs_quintal,max_price_rs_quintal,modal_price_rs_quintal
2018-12-18,maize,Andhra Pradesh,Kurnool,Atmakur,Hybrid/Local,Cereals,0.01,1700.0,1700.0,1700.0
2019-05-19,maize,Andhra Pradesh,Kurnool,Alur,Local,Cereals,0.01,1720.0,1800.0,1760.0
2010-09-09,maize,Andhra Pradesh,Kurnool,Nandikotkur,Hybrid,Cereals,1.0,700.0,1000.0,800.0
2011-12-02,maize,Andhra Pradesh,Kurnool,Nandikotkur,Hybrid,Cereals,0.1,980.0,1080.0,1060.0
2012-10-13,maize,Andhra Pradesh,Kurnool,Nandikotkur,Hybrid,Cereals,0.1,1200.0,1300.0,1250.0
2012-10-06,maize,Andhra Pradesh,Kurnool,Nandikotkur,Hybrid,Cereals,0.1,1200.0,1350.0,1250.0
2015-08-18,maize,Andhra Pradesh,Kurnool,Nandikotkur,Hybrid,Cereals,0.1,1310.0,1350.0,1330.0
2015-01-21,maize,Andhra Pradesh,Kurnool,Nandikotkur,Hybrid,Cereals,0.1,1320.0,1360.0,1340.0
2018-03-01,maize,Andhra Pradesh,Kurnool,Kurnool,Local,Cereals,8.5,1128.0,1148.0,1148.0
2011-07-28,maize,Andhra Pradesh,Vijayanagaram,Pusapatirega,Local,Cereals,0.1,900.0,950.0,900.0


In [0]:
# Sanity check: list the distinct crop labels that reached Silver
# (should match your list of 15 crops).
distinct_crops = df_silver.select("crop_name").distinct()
display(distinct_crops)

crop_name
maize
cotton
apple
mango
orange
papaya
grapes
coffee
chickpea
jute


In [0]:
# 6. Persist the Silver table
# Delta 'overwrite' mode replaces the table contents on every run, so running
# the notebook twice does not duplicate data; 'overwriteSchema' allows the
# stored schema to be replaced as well.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("agriculture.silver.market_prices")

print("Silver Market Prices Table Created Successfully")

Silver Market Prices Table Created Successfully
