# **Import**

In [12]:
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# **Load Dataset**

In [13]:
# Load the modelling CSV from the Kaggle input directory into a DataFrame.
dataset_csv = "/kaggle/input/crop-production-prediction/Model_dataset.csv"
df = pd.read_csv(dataset_csv)

# **Feature Engineering**

In [14]:
# Add an area x yield interaction column, then drop the three confidence-score
# columns so they never reach the feature matrix.
# The misspelled column name is kept byte-for-byte: it becomes a feature name
# that the fitted (and later pickled) model sees.
# NOTE(review): if production_tons is itself derived from area x yield, this
# interaction is a rescaled copy of the target -- confirm that is intended.
confidence_cols = [
    'area_harvested_confidence_score',
    'production_confidence_score',
    'yield_confidence_score',
]
df = (
    df
    .assign(
        areaa_yeild_interaction=lambda frame: (
            frame['area_harvested_ha'] * frame['yield_kg_per_ha']
        )
    )
    .drop(columns=confidence_cols)
)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
categorical_cols = ['area', 'item']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Separate the target from the remaining feature columns.
target_col = "production_tons"
y = df[target_col]
x = df.drop(columns=[target_col])

# **Train test split**

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

# **Model Training**

In [16]:
# Hyperparameters for the gradient-boosted regressor, gathered in one place
# so they are easy to scan and tune.
xgb_params = {
    "n_estimators": 300,        # number of boosting rounds
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,           # fraction of rows sampled per tree
    "colsample_bytree": 0.8,    # fraction of columns sampled per tree
    "random_state": 42,         # fixed seed for reproducible sampling
    "n_jobs": -1,               # use all available cores
}
xgb_model = XGBRegressor(**xgb_params)

# fit() returns the estimator, so leaving it as the last expression also
# displays the fitted model in the notebook.
xgb_model.fit(x_train, y_train)

# **Prediction**

In [17]:
# Predict the target for the held-out rows; the bare expression on the last
# line makes the notebook display the resulting array.
# NOTE(review): the recorded output of this cell contains a negative value;
# if production_tons cannot be negative, consider clipping predictions at 0
# or modelling a transformed target -- confirm with the data owner.
y_pred = xgb_model.predict(x_test)
y_pred

array([ 83339.516 ,  10199.694 , -23334.059 , ...,   6024.707 ,
         2656.2502,  95796.914 ], dtype=float32)

# **Evaluation**

In [18]:
# Score the held-out predictions: R^2, mean absolute error, and RMSE
# (computed as the square root of the mean squared error).
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Assemble the report first, then emit it with a single print call.
report_lines = [
    f"R2 Score : {r2:.4f}",
    f"RMSE     : {rmse:.2f}",
    f"MAE      : {mae:.2f}",
]
print("\n".join(report_lines))

📊 XGBoost Model Evaluation
----------------------------
R2 Score : 0.9843
RMSE     : 1517869.69
MAE      : 206699.56


In [19]:
# Persist the fitted regressor to disk so it can be reloaded without
# retraining. joblib is imported in the notebook's top import cell rather
# than here, so every dependency is visible in one place.
# joblib.dump returns the list of written file names, which the notebook
# displays as this cell's output.
# NOTE(review): pickle-based files are tied to the library versions that
# wrote them and must only be loaded from trusted sources; XGBoost's own
# save_model() is an alternative if cross-version loading is needed.
joblib.dump(xgb_model, "crop_production_xgb.pkl")

['crop_production_xgb.pkl']