
# 🌾 Notebook 1: Data Preprocessing
# ==========================================

In [2]:
import pandas as pd
import numpy as np

### 1. Load dataset

In [3]:
# Read the yield CSV into a DataFrame; the path is named so it is easy to spot.
source_csv = "/content/yield_df.csv"
raw_df = pd.read_csv(source_csv)

### 2. Inspect data

In [4]:
# Report the frame's dimensions and its column index, one labelled line each.
for label, value in (
    ("🔍 Shape:", raw_df.shape),
    ("📑 Columns:", raw_df.columns),
):
    print(label, value)

🔍 Shape: (28242, 8)
📑 Columns: Index(['Unnamed: 0', 'Area', 'Item', 'Year', 'hg/ha_yield',
       'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp'],
      dtype='object')


### 3. Rename columns

In [5]:
# Give the crop and yield columns shorter names. "Area" and "Year" already
# carry their final names, so they need no entry in the mapping.
rename_map = {
    "Item": "Crop",
    "hg/ha_yield": "Yield",
}
# Reassign rather than mutate with inplace=True: the result is the same, but
# the cell no longer modifies an existing object behind the reader's back.
raw_df = raw_df.rename(columns=rename_map)


### 4. Handle missing values

In [6]:
# Keep only fully populated rows: a row is discarded as soon as any one of
# its values is missing (these are dropna's defaults, spelled out).
raw_df = raw_df.dropna(axis=0, how="any")

### 5. Normalize satellite data if present

In [7]:
# Min-max scale each optional feature column to the [0, 1] range.
# Columns that are absent from the frame are skipped.
# NOTE(review): the columns listed at load time are named
# 'average_rain_fall_mm_per_year' and 'avg_temp', and nothing renames them to
# "Rainfall"/"Temperature", so this loop may never match — confirm intent.
for col in ["NDVI", "Rainfall", "Temperature"]:
    if col not in raw_df.columns:
        continue
    col_min = raw_df[col].min()
    col_range = raw_df[col].max() - col_min
    if col_range == 0:
        # Constant column: the plain formula would divide by zero and fill
        # the column with NaN, so map every value to 0.0 instead.
        raw_df[col] = 0.0
    else:
        raw_df[col] = (raw_df[col] - col_min) / col_range


### 6. Save cleaned data

In [8]:
# Write the cleaned frame to CSV without the index column, then report the
# path that was actually written (the old message named a different location).
output_path = "/content/processed_data.csv"
raw_df.to_csv(output_path, index=False)
print(f"✅ Cleaned dataset saved to {output_path}")

✅ Cleaned dataset saved to data/processed/processed_data.csv
