In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import joblib

In [3]:
# Load datasets
crop_df = pd.read_csv("../datasets/crop_production.csv")
rainfall_df = pd.read_csv("../datasets/district wise rainfall normal.csv")
price_df = pd.read_csv("../datasets/CMO_MSP_Mandi.csv")

In [4]:
# Filter only Maharashtra
crop_df = crop_df[crop_df['State_Name'].str.lower().str.contains("maharashtra")]
rainfall_df = rainfall_df[rainfall_df['STATE_UT_NAME'].str.lower().str.contains("maharashtra")]

In [5]:
# Rename columns for merging
rainfall_df.rename(columns={"DISTRICT": "District_Name"}, inplace=True)

In [6]:
# Calculate annual rainfall
rainfall_df["Annual_Rainfall"] = rainfall_df["ANNUAL"]

In [7]:
# Merge crop & rainfall
merged = pd.merge(crop_df, rainfall_df[["District_Name", "Annual_Rainfall"]], on="District_Name", how="left")

In [8]:
# Merge price (MSP)
price_df.rename(columns={"commodity": "Crop", "year": "Crop_Year", "msprice": "Price"}, inplace=True)
price_df = price_df[["Crop", "Crop_Year", "Price"]]
price_df["Crop"] = price_df["Crop"].str.upper()

merged["Crop"] = merged["Crop"].str.upper()
merged = pd.merge(merged, price_df, on=["Crop", "Crop_Year"], how="inner")

In [9]:
# Drop missing values
merged.dropna(subset=["Production", "Area", "Annual_Rainfall", "Price"], inplace=True)

In [41]:
# Calculate yield (tons/ha)
merged["Yield"] = merged["Production"] / merged["Area"]

In [42]:
# Select features
features = merged[["District_Name", "Crop", "Crop_Year", "Annual_Rainfall", "Yield", "Production"]]
target = merged["Price"]

In [43]:
# Encode categorical features
le_district = LabelEncoder()
le_crop = LabelEncoder()
features.loc[:, "District_Name"] = le_district.fit_transform(features["District_Name"])
features.loc[:, "Crop"] = le_crop.fit_transform(features["Crop"])

In [44]:
# Save encoders for later
joblib.dump(le_district, "district_encoder.pkl")
joblib.dump(le_crop, "crop_encoder.pkl")

['crop_encoder.pkl']

In [45]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [46]:
# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [47]:
# Predict on test set
preds = model.predict(X_test)

# Metrics
preds = model.predict(X_test)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, preds)

print(f"Model Evaluation:")
print(f"-----------------")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.4f} ({r2 * 100:.2f}%)")

Model Evaluation:
-----------------
Mean Squared Error: 772.35
Root Mean Squared Error: 27.79
R² Score: 0.9995 (99.95%)


In [48]:
# Save model
joblib.dump(model, "crop_model.pkl")

['crop_model.pkl']

In [49]:
# Predict prices for all entries
features_all = features.copy()
predicted_prices = model.predict(features_all)

# Add predicted prices to DataFrame
merged["Predicted_Price"] = predicted_prices

# Calculate Predicted Agroeconomy = Production * Predicted_Price
merged["Predicted_Agroeconomy"] = merged["Production"] * merged["Predicted_Price"]

# Optional: Save the prediction results
merged[[
    "District_Name", "Crop", "Crop_Year",
    "Area", "Production", "Annual_Rainfall",
    "Yield", "Predicted_Price", "Predicted_Agroeconomy"
]].to_csv("predicted_agroeconomy.csv", index=False)

print("✅ Agroeconomy predictions saved to 'predicted_agroeconomy.csv'")

✅ Agroeconomy predictions saved to 'predicted_agroeconomy.csv'
