In [0]:
%sql
USE CATALOG agriculture

In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name, lit

# Define Base Path
base_path = "/Volumes/agriculture/bronze/raw_uploads"

# --- 1. Crop Recommendation (The "Agronomist" Model Data) ---
df_rec = spark.read.format("csv").option("header", "true").option("inferSchema", "true") \
    .load(f"{base_path}/Crop_recommendation.csv")

df_rec.withColumn("ingestion_ts", current_timestamp()) \
      .write.format("delta").mode("overwrite").saveAsTable("agriculture.bronze.crop_recommendation")
print(f"Crop Recommendation Data: {df_rec.count()} rows")

Crop Recommendation Data: 2200 rows


In [0]:
# --- 2. State Weather History (Environmental Data) ---
df_weather = spark.read.format("csv").option("header", "true").option("inferSchema", "true") \
    .load(f"{base_path}/state_weather_data_1997_2020.csv")

df_weather.withColumn("ingestion_ts", current_timestamp()) \
          .write.format("delta").mode("overwrite").saveAsTable("agriculture.bronze.weather_history")
print(f" Weather History: {df_weather.count()} rows")

 Weather History: 720 rows


In [0]:
# --- 3. State Soil Defaults (Regional Fallback Data) ---
df_state_soil = spark.read.format("csv").option("header", "true").option("inferSchema", "true") \
    .load(f"{base_path}/state_soil_data.csv")

df_state_soil.withColumn("ingestion_ts", current_timestamp()) \
             .write.format("delta").mode("overwrite").saveAsTable("agriculture.bronze.state_soil_profiles")
print(f" State Soil Profiles: {df_state_soil.count()} rows")

 State Soil Profiles: 30 rows


In [0]:
#spark.sql("DROP TABLE IF EXISTS agriculture.bronze.market_prices")

DataFrame[]

In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name, col

# --- Mandi Market Prices (Economic Data) ---
mandi_path = "/Volumes/agriculture/bronze/raw_uploads/mandi_data/*.csv"

# 1. Read the CSVs
df_market = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(mandi_path).select("*", "_metadata")

# 2. Define a cleanup mapping (Old Name -> New Explicit Name)
# We keep '_rs_quintal' and '_tonnes' to preserve unit info
column_mapping = {
    "State Name": "state_name",
    "District Name": "district_name",
    "Market Name": "market_name",
    "Variety": "variety",
    "Group": "group",
    "Arrivals (Tonnes)": "arrivals_tonnes",
    "Min Price (Rs./Quintal)": "min_price_rs_quintal",
    "Max Price (Rs./Quintal)": "max_price_rs_quintal",
    "Modal Price (Rs./Quintal)": "modal_price_rs_quintal",
    "Reported Date": "reported_date"
}

# 3. Apply the renaming loop
df_market_clean = df_market
for old_col, new_col in column_mapping.items():
    df_market_clean = df_market_clean.withColumnRenamed(old_col, new_col)



In [0]:
df_market_clean.display()

state_name,district_name,market_name,variety,group,Arrivals,Min Price,Max Price,Modal Price,reported_date,_metadata
Arunachal Pradesh,Tawang,Tawang,Other,Fruits,0.12,4000.0,4000.0,4000.0,2005-08-24,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,Tawang,Tawang,Other,Fruits,0.15,4000.0,4000.0,4000.0,2005-08-25,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,Tawang,Tawang,Other,Fruits,0.15,4000.0,4000.0,4000.0,2005-08-27,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,East Siang,Pasighat,Delicious,Fruits,0.02,9000.0,10000.0,9500.0,2003-08-05,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,East Siang,Pasighat,Other,Fruits,0.02,8000.0,8500.0,8300.0,2003-08-05,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,East Siang,Pasighat,Apple,Fruits,1.0,5000.0,6000.0,5500.0,2010-07-06,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,2.0,1300.0,1400.0,1350.0,2004-06-28,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,7.0,1200.0,1300.0,1250.0,2004-07-03,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,12.0,1200.0,1300.0,1250.0,2004-07-12,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,15.0,1100.0,1200.0,1150.0,2004-07-20,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"


In [0]:
# 1. Define your mapping
rename_mapping = {
    "Min Price": "min_price_rs_quintal",
    "Max Price": "max_price_rs_quintal",
    "Modal Price": "modal_price_rs_quintal",
    "Arrivals": "arrivals_tonnes"
}

# 2. Loop through and apply the changes
for old_name, new_name in rename_mapping.items():
    df_market_clean = df_market_clean.withColumnRenamed(old_name, new_name)

# 3. View the result
df_market_clean.display()

state_name,district_name,market_name,variety,group,arrivals_tonnes,min_price_rs_quintal,max_price_rs_quintal,modal_price_rs_quintal,reported_date,_metadata
Arunachal Pradesh,Tawang,Tawang,Other,Fruits,0.12,4000.0,4000.0,4000.0,2005-08-24,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,Tawang,Tawang,Other,Fruits,0.15,4000.0,4000.0,4000.0,2005-08-25,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,Tawang,Tawang,Other,Fruits,0.15,4000.0,4000.0,4000.0,2005-08-27,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,East Siang,Pasighat,Delicious,Fruits,0.02,9000.0,10000.0,9500.0,2003-08-05,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,East Siang,Pasighat,Other,Fruits,0.02,8000.0,8500.0,8300.0,2003-08-05,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Arunachal Pradesh,East Siang,Pasighat,Apple,Fruits,1.0,5000.0,6000.0,5500.0,2010-07-06,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,2.0,1300.0,1400.0,1350.0,2004-06-28,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,7.0,1200.0,1300.0,1250.0,2004-07-03,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,12.0,1200.0,1300.0,1250.0,2004-07-12,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"
Jammu and Kashmir,Badgam,Zaloosa-Chararishrief (F&V),Hajratbali,Fruits,15.0,1100.0,1200.0,1150.0,2004-07-20,"List(dbfs:/Volumes/agriculture/bronze/raw_uploads/mandi_data/Apple.csv, Apple.csv, 83863970, 0, 59976470, 2026-01-26T20:10:45.000Z)"


In [0]:
from pyspark.sql.functions import col, current_timestamp

# 4. Add Metadata
# We access the hidden "_metadata" column provided by the file source
df_market_clean = df_market_clean \
    .withColumn("ingestion_ts", current_timestamp()) \
    .withColumn("source_file", col("_metadata.file_name")) 



In [0]:
# 5. Write to Bronze Table (Unity Catalog)
df_market_clean.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable("agriculture.bronze.market_prices")

print(f" Market Prices Ingested. Total Rows: {df_market_clean.count()}")
print("Schema with units preserved:")
df_market_clean.printSchema()

 Market Prices Ingested. Total Rows: 5577805
Schema with units preserved:
root
 |-- state_name: string (nullable = true)
 |-- district_name: string (nullable = true)
 |-- market_name: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- group: string (nullable = true)
 |-- arrivals_tonnes: string (nullable = true)
 |-- min_price_rs_quintal: string (nullable = true)
 |-- max_price_rs_quintal: string (nullable = true)
 |-- modal_price_rs_quintal: string (nullable = true)
 |-- reported_date: string (nullable = true)
 |-- _metadata: struct (nullable = false)
 |    |-- file_path: string (nullable = false)
 |    |-- file_name: string (nullable = false)
 |    |-- file_size: long (nullable = false)
 |    |-- file_block_start: long (nullable = false)
 |    |-- file_block_length: long (nullable = false)
 |    |-- file_modification_time: timestamp (nullable = false)
 |-- ingestion_ts: timestamp (nullable = false)
 |-- source_file: string (nullable = false)



In [0]:
# 1. Define source path (Adjust filename if needed)
source_path = "/Volumes/agriculture/bronze/raw_uploads/crop_production.csv"

In [0]:
# 2. Read the Raw CSV
df_raw = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(source_path)

# 3. Write to Bronze Delta Table
df_raw.write.format("delta").mode("overwrite").saveAsTable("agriculture.bronze.crop_production")

print(f"Bronze Table Created. Row Count: {df_raw.count()}")
display(df_raw.limit(5))

Bronze Table Created. Row Count: 246091


State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
