In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Relative path of the raw crop dataset.
DATA_PATH = "../data/raw/crop_data.csv"

# Read the CSV and strip surrounding whitespace from every header name so
# later lookups by exact column name are reliable.
df = pd.read_csv(DATA_PATH).rename(columns=str.strip)

print("Shape:", df.shape)
df.head()


Shape: (50000, 24)


Unnamed: 0,State,District,Region,Soil_Type,Temperature (°C),Humidity (%),Rainfall (mm),pH,Nitrogen (N),Phosphorus (P),...,Crop_Yield (kg/ha),Irrigation_Type,Fertilizer_Used,Pesticide_Used,Year,Soil_Moisture (%),Altitude (m),Sunshine_Hours,Wind_Speed (km/h),Evapotranspiration (mm/day)
0,Karnataka,District_0,South,Alluvial,22.7,87.7,229.9,8.26,212,20,...,2970.1,Drip,NPK,Yes,2020,16.4,636,4.5,6.6,5.07
1,Uttar Pradesh,District_1,South,Red,24.8,87.7,97.9,4.72,371,60,...,2091.7,,,No,2018,40.3,1449,9.4,11.4,4.2
2,Gujarat,District_2,North,Peaty,30.0,51.4,269.1,6.72,187,89,...,2258.3,Canal,DAP,Yes,2017,23.9,847,4.3,6.3,2.6
3,Karnataka,District_3,East,Laterite,26.5,32.2,33.6,5.29,193,11,...,2319.6,Sprinkler,NPK,No,2020,32.1,871,9.0,25.4,2.13
4,Tamil Nadu,District_4,West,Peaty,25.5,47.1,7.0,7.76,160,51,...,2757.5,,NPK,No,2014,39.2,1180,8.0,23.4,4.98


In [3]:
# Column the regressor is trained to predict.
TARGET = "Crop_Yield (kg/ha)"

# Columns removed before modelling. "Year" is excluded for now because it was
# flagged as a possible source of time-based leakage.
drop_cols = ["Year"]

# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df = df.drop(labels=drop_cols, axis="columns", errors="ignore")


In [4]:
# Separate the target vector from the feature matrix (everything except TARGET).
y = df[TARGET]
X = df.drop(labels=[TARGET], axis="columns")

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (50000, 22)
y shape: (50000,)


In [5]:
# TEMPORARY FEATURE SELECTION (for speed)
# Only the columns named in these two lists are handed to the preprocessor.

# Columns that get one-hot encoded.
categorical_cols = [
    "Soil_Type", "Irrigation_Type", "Fertilizer_Used", "Pesticide_Used", "Crop",
]

# Columns that are passed through to the model unchanged.
numerical_cols = [
    "Temperature (°C)", "Humidity (%)", "Rainfall (mm)",
    "pH", "Nitrogen (N)", "Phosphorus (P)", "Potassium (K)",
    "Organic_Carbon (%)", "Electrical_Conductivity (dS/m)",
    "Soil_Moisture (%)", "Altitude (m)",
    "Sunshine_Hours", "Wind_Speed (km/h)", "Evapotranspiration (mm/day)",
]

print("Categorical:", categorical_cols)
print("Numerical:", numerical_cols)


Categorical: ['Soil_Type', 'Irrigation_Type', 'Fertilizer_Used', 'Pesticide_Used', 'Crop']
Numerical: ['Temperature (°C)', 'Humidity (%)', 'Rainfall (mm)', 'pH', 'Nitrogen (N)', 'Phosphorus (P)', 'Potassium (K)', 'Organic_Carbon (%)', 'Electrical_Conductivity (dS/m)', 'Soil_Moisture (%)', 'Altitude (m)', 'Sunshine_Hours', 'Wind_Speed (km/h)', 'Evapotranspiration (mm/day)']


In [6]:
# Give missing categorical values an explicit "Unknown" label so the one-hot
# encoder receives plain strings instead of NaN.
categorical_filled = X[categorical_cols].fillna("Unknown")
X[categorical_cols] = categorical_filled


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# One-hot encode the categorical columns (handle_unknown="ignore" lets the
# encoder accept categories it did not see during fit) and pass the numeric
# columns through untouched. Any column not listed in either group is dropped,
# which is ColumnTransformer's default and is spelled out here.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numerical_cols),
    ],
    remainder="drop",
)


In [9]:
# Deliberately small forest so the notebook trains very fast.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=15,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=42,
)

# Chain preprocessing and the regressor so raw feature frames can be passed
# straight to fit/predict.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])


In [10]:
# Fit the preprocessor and the forest on the training split only; fit returns
# the same pipeline object.
pipeline = pipeline.fit(X_train, y_train)
print("✅ Regression model trained")


✅ Regression model trained


In [11]:
# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# Take the square root of the MSE explicitly. The `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and later removed, so relying on it makes this
# cell fail on current releases; np.sqrt works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R² Score:", r2)

# NOTE(review): the recorded run of this cell reports R² of about -0.005, i.e.
# no better than always predicting the mean yield. Treat any downstream crop
# ranking built on this model as uninformative until the fit improves.


MAE: 394.1006861170914
RMSE: 492.49802922530915
R² Score: -0.004548457446498677


In [12]:
def recommend_crops(input_features, model, top_n=3, crops=None):
    """
    Rank candidate crops by predicted yield for one set of growing conditions.

    Parameters
    ----------
    input_features : dict
        Feature name -> value for a single field, without 'Crop' and
        'Crop_Yield'. It is not modified.
    model : object with ``predict(DataFrame)``
        Fitted estimator, e.g. the trained pipeline.
    top_n : int, default 3
        Maximum number of (crop, yield) pairs to return.
    crops : iterable, optional
        Candidate crops to score. When omitted, the distinct non-missing
        values of ``df["Crop"]`` are used, so the notebook-level ``df`` must
        be defined in that case.

    Returns
    -------
    list of (crop, predicted_yield) tuples
        Sorted by predicted yield, highest first, truncated to ``top_n``.
        Empty when there are no candidate crops.
    """
    if crops is None:
        # A missing crop label is not a real candidate; without dropna it
        # would be scored as "Unknown" and could be returned as `nan`.
        crops = df["Crop"].dropna().unique()
    crops = list(crops)
    if not crops:
        return []

    # Build one row per candidate crop and score them all in a single
    # predict call instead of invoking the model once per crop.
    candidates = pd.DataFrame(
        [{**input_features, "Crop": crop} for crop in crops]
    )
    candidates[categorical_cols] = candidates[categorical_cols].fillna("Unknown")

    predictions = model.predict(candidates)

    # sorted() is stable, so ties keep the original crop order.
    ranked = sorted(zip(crops, predictions), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


In [13]:
# Example conditions for a single field. "Crop" is deliberately absent:
# recommend_crops fills it in for each candidate. State, District and Region
# are not among the selected feature columns, so the preprocessor drops them.
sample_input = {
    "State": "Karnataka",
    "District": "Mysuru",
    "Region": "South",
    "Soil_Type": "Loamy",
    "Temperature (°C)": 28,
    "Humidity (%)": 65,
    "Rainfall (mm)": 180,
    "pH": 6.5,
    "Nitrogen (N)": 90,
    "Phosphorus (P)": 40,
    "Potassium (K)": 45,
    "Organic_Carbon (%)": 0.8,
    "Electrical_Conductivity (dS/m)": 1.2,
    "Irrigation_Type": "Canal",
    "Fertilizer_Used": "Urea",
    "Pesticide_Used": "Yes",
    "Soil_Moisture (%)": 22,
    "Altitude (m)": 750,
    "Sunshine_Hours": 7.5,
    "Wind_Speed (km/h)": 12,
    "Evapotranspiration (mm/day)": 4.5,
}

# Three crops with the highest predicted yield for these conditions.
top_crops = recommend_crops(sample_input, pipeline, top_n=3)
top_crops


[('Maize', 2454.058265394956),
 ('Cotton', 2441.9524790200653),
 ('Pulses', 2440.385277531307)]

In [14]:
# Persist the whole fitted pipeline (preprocessing + forest) so it can be
# reloaded later and applied to raw feature frames.
# joblib is imported in the notebook's top import cell.
MODEL_PATH = "model.joblib"

joblib.dump(pipeline, MODEL_PATH)
print("✅ Model saved correctly")

✅ Model saved correctly
