In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

# Rows whose Yield is at or above this cutoff are treated as outliers and
# dropped before modelling. Rows with a missing Yield also fail the `<`
# comparison and are therefore dropped as well.
YIELD_OUTLIER_CUTOFF = 10

# Keep the unfiltered data under its own name so re-running this cell always
# starts from the same input.
raw_df = pd.read_csv("merged_crop_yield_dataset.csv")
print("Dataset loaded successfully with shape:", raw_df.shape)

# Take an explicit copy of the filtered rows: the following cells assign new
# column values on `df`, and doing that on a boolean-mask slice makes pandas
# emit SettingWithCopyWarning.
# (The former mid-cell `df.head()` was dropped: only the last expression of a
# cell is rendered, so it never displayed anything.)
df = raw_df.loc[raw_df['Yield'] < YIELD_OUTLIER_CUTOFF].copy()
print("Outliers removed — new shape:", df.shape)

Dataset loaded successfully with shape: (28373, 15)
Outliers removed — new shape: (26001, 15)


In [2]:
# One encoder per categorical column; each is kept under its own name because
# the fitted encoders are saved to disk further down.
le_crop, le_state, le_season = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each text column with its integer codes, in place on `df`.
# Caution: this overwrites the original labels, so running the cell a second
# time would refit the encoders on the integer codes instead of the text.
for column, encoder in (('Crop', le_crop), ('State', le_state), ('Season', le_season)):
    df[column] = encoder.fit_transform(df[column])

print("Encoding done. Sample data:")
df.head()

Encoding done. Sample data:


Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield,N,P,K,Temperature,Humidity
0,0,1997,4,2,73814.0,56708,2051.4,7024878.38,22882.34,0.796087,101,53,34,28.87,73.57
1,1,1997,1,2,6637.0,4685,2051.4,631643.29,2057.47,0.710435,142,57,49,20.62,69.1
2,8,1997,1,2,796.0,22,2051.4,75755.32,246.76,0.238333,64,42,67,26.91,78.26
4,11,1997,1,2,1739.0,794,2051.4,165500.63,539.09,0.420909,110,20,63,28.2,57.82
5,13,1997,4,2,13587.0,9073,2051.4,1293074.79,4211.97,0.643636,70,22,65,31.07,66.28


In [3]:
# Separate the regression target from the predictors.
# NOTE(review): X keeps every other column, including 'Production' and 'Area'.
# If Yield is derived from those two, the model is being handed its own
# target — confirm this is intended before trusting the scores.
target_column = 'Yield'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("Data split complete — Training:", X_train.shape, "| Testing:", X_test.shape)


Data split complete — Training: (20800, 14) | Testing: (5201, 14)


In [4]:
# Standardise the features.
# The scaler is fitted on the training rows only and then applied to both
# splits, so the test rows do not influence the fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling complete.")

Feature scaling complete.


In [5]:
# Random-forest settings gathered in one place: 100 trees, a fixed seed for
# repeatable results, and n_jobs=-1 to train on all available cores.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "n_jobs": -1,
}

# `fit` returns the estimator itself, so construction and training can be chained.
rf = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)
print("Model training complete.")


Model training complete.


In [6]:
# Score the trained forest on the held-out, scaled test features.
y_pred = rf.predict(X_test_scaled)

# RMSE is the square root of the mean squared error; R² and MAE come straight
# from their scikit-learn helpers.
mse = mean_squared_error(y_test, y_pred)
rmse, r2, mae = (
    np.sqrt(mse),
    r2_score(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
)

print(f"Model Performance:\nRMSE: {rmse:.3f}\nR²: {r2:.3f}\nMAE: {mae:.3f}")


Model Performance:
RMSE: 0.259
R²: 0.976
MAE: 0.114


In [7]:
# Persist the trained model together with the fitted scaler and label
# encoders, each to its own file in the working directory.
artifacts = {
    "best_model.joblib": rf,
    "scaler.joblib": scaler,
    "le_crop.joblib": le_crop,
    "le_state.joblib": le_state,
    "le_season.joblib": le_season,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("All files saved successfully")


All files saved successfully


In [8]:
import joblib
import lzma

# Read the uncompressed model back from disk rather than reusing the
# in-memory object, so this cell only depends on the saved file.
uncompressed_path = "best_model.joblib"
compressed_path = "best_model_compressed.joblib"
model = joblib.load(uncompressed_path)

# Write a second copy using xz (LZMA) compression at level 9.
joblib.dump(model, compressed_path, compress=('xz', 9))

print("✅ Compressed successfully using LZMA (.xz).")


✅ Compressed successfully using LZMA (.xz).
