In [5]:
import os

import pandas as pd

# ================================
# 📂 Load datasets
# ================================
# Both raw CSV files are read in full; no column filtering happens here.
df_agri = pd.read_csv(filepath_or_buffer="../data/raw/agriculture_dataset.csv")
df_crop = pd.read_csv(filepath_or_buffer="../data/raw/crop_yield_prediction.csv")

# Report (rows, columns) for each frame.
for _label, _frame in (
    ("✅ Agriculture dataset shape:", df_agri),
    ("✅ Crop yield dataset shape:", df_crop),
):
    print(_label, _frame.shape)

# List the raw column names of each frame.
print("\nAgriculture dataset columns:", list(df_agri.columns))
print("Crop yield dataset columns:", list(df_crop.columns))

# ================================
# 🔎 Select relevant columns
# ================================
# Keep five agriculture columns and drop the unit suffixes from their names.
# "Crop_Type" is renamed to "Crop" so it shares a column name with the crop
# yield dataset.
df_agri = df_agri[[
    "Crop_Type", "Fertilizer_Used(tons)", "Yield(tons)",
    "Water_Usage(cubic meters)", "Irrigation_Type"
]].rename(columns={
    "Crop_Type": "Crop",
    "Fertilizer_Used(tons)": "Fertilizer_Used",
    "Yield(tons)": "Yield_tons",
    "Water_Usage(cubic meters)": "Water_Usage",
    # "Irrigation_Type" keeps its original name, so it needs no entry here.
})

# Only Soil_Type, Crop and Region are taken from the crop yield dataset.
df_crop = df_crop[["Soil_Type", "Crop", "Region"]]

# sep="\n" starts each preview on its own line. When the frame is printed on
# the label's line, its header row is pushed to the right and no longer lines
# up with the data rows below it.
print("\n✅ Agriculture dataset (selected):", df_agri.head(), sep="\n")
print("\n✅ Crop yield dataset (selected):", df_crop.head(), sep="\n")

# ================================
# 🔗 Merge on "Crop"
# ================================
# Inner join: only rows whose "Crop" value occurs in both frames are kept.
# NOTE(review): "Crop" does not look unique on either side, so each crop
# yield row is paired with every agriculture row of the same crop and the
# result can contain far more rows than the inputs — confirm this fan-out
# is intended.
df_merged = df_crop.merge(df_agri, how="inner", on="Crop")

print("\n✅ Merged dataset shape:", df_merged.shape)
print(df_merged.head(5))

# ================================
# 🧹 Clean column names
# ================================
# Normalise every label: trim surrounding whitespace, lowercase it, and
# replace spaces with underscores.
df_merged.columns = list(
    map(
        lambda label: label.strip().lower().replace(" ", "_"),
        df_merged.columns,
    )
)

print("\n✅ Final Columns:", list(df_merged.columns))

# ================================
# 💾 Save processed dataset
# ================================
# Create the output folder first: to_csv does not create missing parent
# directories and raises if "../data/processed" does not exist yet.
os.makedirs("../data/processed", exist_ok=True)

# index=False keeps the DataFrame's row index out of the written file.
df_merged.to_csv("../data/processed/final_training_dataset.csv", index=False)
print("\n💾 Saved to '../data/processed/final_training_dataset.csv'")


✅ Agriculture dataset shape: (50, 10)
✅ Crop yield dataset shape: (1000000, 10)

Agriculture dataset columns: ['  ', 'Crop_Type', 'Farm_Area(acres)', 'Irrigation_Type', 'Fertilizer_Used(tons)', 'Pesticide_Used(kg)', 'Yield(tons)', 'Soil_Type', 'Season', 'Water_Usage(cubic meters)']
Crop yield dataset columns: ['Region', 'Soil_Type', 'Crop', 'Rainfall_mm', 'Temperature_Celsius', 'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition', 'Days_to_Harvest', 'Yield_tons_per_hectare']

✅ Agriculture dataset (selected):         Crop  Fertilizer_Used  Yield_tons  Water_Usage Irrigation_Type
0     Cotton             8.14       14.44     76648.20       Sprinkler
1     Carrot             4.77       42.91     68725.54          Manual
2  Sugarcane             2.91       33.44     75538.56           Flood
3     Tomato             3.32       34.08     45401.23        Rain-fed
4     Tomato             8.33       43.28     93718.69       Sprinkler

✅ Crop yield dataset (selected):   Soil_Type     Crop