In [0]:
%sql
-- Make `agriculture` the current catalog, so that unqualified or partially
-- qualified schema/table names in later SQL resolve against it.
USE CATALOG agriculture

In [0]:
# --- SILVER LAYER TRANSFORMATION ---
from pyspark.sql.functions import col, when, lower, trim, lit

# 1. Load the Bronze table (its column names are already lowercase).
df_bronze = spark.read.table("agriculture.bronze.crop_production")

# 2. Normalise crop names.
# The raw label is trimmed and lower-cased once; known aliases and
# misspellings are then folded onto a single canonical crop name.
crop_key = lower(trim(col("crop")))

crop_aliases = {
    "gram": "chickpea",
    "pome granet": "pomegranate",
    "kapas": "cotton",
    "cotton(lint)": "cotton",
    "masoor": "lentil",
    "paddy": "rice",
}

# Build the CASE WHEN chain branch by branch, in the order listed above.
crop_normalized_expr = None
for raw_label, canonical_name in crop_aliases.items():
    if crop_normalized_expr is None:
        crop_normalized_expr = when(crop_key == raw_label, canonical_name)
    else:
        crop_normalized_expr = crop_normalized_expr.when(
            crop_key == raw_label, canonical_name
        )

# Labels without an alias pass through in their cleaned (trimmed, lower-case) form.
df_cleaned_names = df_bronze.withColumn(
    "crop_normalized",
    crop_normalized_expr.otherwise(crop_key),
)

# 3. Restrict the data to the 15 crops in scope.
target_crops = [
    'rice', 'maize', 'chickpea',
    'lentil', 'pomegranate', 'banana',
    'mango', 'grapes', 'apple',
    'orange', 'papaya', 'coconut',
    'cotton', 'jute', 'coffee',
]

# Keep a row only when its normalised crop name is one of the targets.
is_target_crop = col("crop_normalized").isin(*target_crops)
df_filtered = df_cleaned_names.where(is_target_crop)

# 4. Clean Data Types & Calculate Yield
# 'production' and 'area' (lowercase column names) are cast to double.
#
# Yield = production / area is only defined for a positive area. When the
# area is missing, zero or negative the yield is left as NULL instead of 0:
# a 0 would be indistinguishable from a real harvest that produced nothing
# and would bias any average computed downstream. This also matches the NULL
# that the division already yields when 'production' is NULL.
df_calculated = (
    df_filtered
    .withColumn("production", col("production").cast("double"))
    .withColumn("area", col("area").cast("double"))
    .withColumn(
        "calculated_yield",
        # No .otherwise(): rows that fail the condition evaluate to NULL.
        when(col("area") > 0, col("production") / col("area")),
    )
    # Final Silver schema: friendlier column names, fixed column order.
    .select(
        col("state_name").alias("state"),
        col("district_name").alias("district"),
        col("crop_normalized").alias("crop"),
        col("crop_year").alias("year"),
        col("area"),
        col("production"),
        col("calculated_yield"),
    )
)

# Preview a small sample only; never render the full frame.
display(df_calculated.limit(10))

state,district,crop,year,area,production,calculated_yield
Andaman and Nicobar Islands,NICOBARS,rice,2000,102.0,321.0,3.147058823529412
Andaman and Nicobar Islands,NICOBARS,banana,2000,176.0,641.0,3.642045454545455
Andaman and Nicobar Islands,NICOBARS,coconut,2000,18168.0,65100000.0,3583.223249669749
Andaman and Nicobar Islands,NICOBARS,rice,2001,83.0,300.0,3.614457831325301
Andaman and Nicobar Islands,NICOBARS,coconut,2001,18190.0,64430000.0,3542.056074766356
Andaman and Nicobar Islands,NICOBARS,rice,2002,189.2,510.84,2.7
Andaman and Nicobar Islands,NICOBARS,banana,2002,213.0,1278.0,6.0
Andaman and Nicobar Islands,NICOBARS,coconut,2002,18240.0,67490000.0,3700.1096491228072
Andaman and Nicobar Islands,NICOBARS,rice,2003,52.0,90.17,1.7340384615384616
Andaman and Nicobar Islands,NICOBARS,banana,2003,266.0,1763.0,6.62781954887218


In [0]:
# 5. Write to Silver Delta Table
silver_table = "agriculture.silver.crop_production"

# Replace the table contents with the freshly transformed data.
df_calculated.write.format("delta").mode("overwrite").saveAsTable(silver_table)

# Verify against what was actually persisted. df_calculated is lazy and not
# cached, so calling count()/display() on it would re-run the whole
# Bronze -> Silver transformation and would describe the recomputed frame
# rather than the saved table.
df_silver = spark.read.table(silver_table)

print(f"Silver Table Created. Row Count: {df_silver.count()}")
display(df_silver.limit(10))

Silver Table Created. Row Count: 53731


state,district,crop,year,area,production,calculated_yield
Andaman and Nicobar Islands,NICOBARS,rice,2000,102.0,321.0,3.147058823529412
Andaman and Nicobar Islands,NICOBARS,banana,2000,176.0,641.0,3.642045454545455
Andaman and Nicobar Islands,NICOBARS,coconut,2000,18168.0,65100000.0,3583.223249669749
Andaman and Nicobar Islands,NICOBARS,rice,2001,83.0,300.0,3.614457831325301
Andaman and Nicobar Islands,NICOBARS,coconut,2001,18190.0,64430000.0,3542.056074766356
Andaman and Nicobar Islands,NICOBARS,rice,2002,189.2,510.84,2.7
Andaman and Nicobar Islands,NICOBARS,banana,2002,213.0,1278.0,6.0
Andaman and Nicobar Islands,NICOBARS,coconut,2002,18240.0,67490000.0,3700.1096491228072
Andaman and Nicobar Islands,NICOBARS,rice,2003,52.0,90.17,1.7340384615384616
Andaman and Nicobar Islands,NICOBARS,banana,2003,266.0,1763.0,6.62781954887218
