In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
#read cvs file
df = pd.read_csv('../dataset/processed/crop_yield_cleansing_data.csv')

print(f"✅ Data loaded successfully! Shape: {df.shape}")

# Preview the dataset
display(df.head())

✅ Data loaded successfully! Shape: (36211, 19)


Unnamed: 0,Date,Crop_Type,Soil_Type,Soil_pH,Temperature,Humidity,Wind_Speed,N,P,K,Crop_Yield,Soil_Quality,Year,Month,Day,Temperature_orig,Humidity_orig,Wind_Speed_orig,Soil_Quality_orig
0,2014-01-01,Wheat,Peaty,5.5,9,80,11,60.5,45.0,31.5,0.0,23,2014,1,1,9.440599,80.0,10.956707,22.833333
1,2014-01-01,Corn,Loamy,6.5,20,80,9,84.0,66.0,50.0,104.87131,67,2014,1,1,20.052576,79.947424,8.591577,66.666667
2,2014-01-01,Rice,Peaty,5.5,12,80,7,71.5,54.0,38.5,0.0,27,2014,1,1,12.143099,80.0,7.227751,27.333333
3,2014-01-01,Barley,Sandy,6.75,20,80,3,50.0,40.0,30.0,58.939796,35,2014,1,1,19.751848,80.0,2.682683,35.0
4,2014-01-01,Soybean,Peaty,5.5,16,80,8,49.5,45.0,38.5,32.970413,22,2014,1,1,16.110395,80.0,7.69607,22.166667


In [5]:
print("🧹 Cleaning and encoding data...")

# 1. Drop rows with missing values
df = df.dropna()

# 2. Drop columns that are not useful for prediction (like Date)
#    Adjust this list based on your dataset columns
drop_cols = ["Date"]  # add others like ['ID', 'Sample_Name'] if needed
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

# 3. Convert categorical (object) columns to numeric via one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"✅ Preprocessing complete! Shape: {df.shape}")
display(df.head())

🧹 Cleaning and encoding data...
✅ Preprocessing complete! Shape: (36211, 29)


Unnamed: 0,Soil_pH,Temperature,Humidity,Wind_Speed,N,P,K,Crop_Yield,Soil_Quality,Year,...,Crop_Type_Rice,Crop_Type_Soybean,Crop_Type_Sugarcane,Crop_Type_Sunflower,Crop_Type_Tomato,Crop_Type_Wheat,Soil_Type_Loamy,Soil_Type_Peaty,Soil_Type_Saline,Soil_Type_Sandy
0,5.5,9,80,11,60.5,45.0,31.5,0.0,23,2014,...,False,False,False,False,False,True,False,True,False,False
1,6.5,20,80,9,84.0,66.0,50.0,104.87131,67,2014,...,False,False,False,False,False,False,True,False,False,False
2,5.5,12,80,7,71.5,54.0,38.5,0.0,27,2014,...,True,False,False,False,False,False,False,True,False,False
3,6.75,20,80,3,50.0,40.0,30.0,58.939796,35,2014,...,False,False,False,False,False,False,False,False,False,True
4,5.5,16,80,8,49.5,45.0,38.5,32.970413,22,2014,...,False,True,False,False,False,False,False,True,False,False


In [6]:
# Split Data into Training and Testing Sets

target_column = "Crop_Yield"

X = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"✅ Data split complete! Train: {X_train.shape}, Test: {X_test.shape}")

✅ Data split complete! Train: (28968, 28), Test: (7243, 28)


In [7]:
# Train the Random Forest Model

print("🧠 Training RandomForestRegressor model...")

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

print("✅ Model training complete!")

🧠 Training RandomForestRegressor model...
✅ Model training complete!


In [9]:
# Evaluate Model Performance

print("📊 Evaluating model performance...")

y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")

📊 Evaluating model performance...


TypeError: got an unexpected keyword argument 'squared'

In [None]:

# =====================================================
# 💾 7. Save the Trained Model
# =====================================================
model_output_path = "models/random_forest_model.pkl"

joblib.dump(model, model_output_path)
print(f"✅ Model saved successfully at: {model_output_path}")


# =====================================================
# 📈 8. Plot Feature Importance
# =====================================================
importance_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df)
plt.title("Feature Importance (Random Forest Regressor)")
plt.tight_layout()
plt.show()