In [8]:
import pandas as pd

df = pd.read_csv('yield_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.0,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.0,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.0,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.0,16.37
4,4,Albania,Soybeans,1990,7000,1485.0,121.0,16.37


In [13]:
df.drop(columns='Unnamed: 0', inplace=True)
df

KeyError: "['Unnamed: 0'] not found in axis"

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28242 entries, 0 to 28241
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Area                           28242 non-null  object 
 1   Item                           28242 non-null  object 
 2   Year                           28242 non-null  int64  
 3   hg/ha_yield                    28242 non-null  int64  
 4   average_rain_fall_mm_per_year  28242 non-null  float64
 5   pesticides_tonnes              28242 non-null  float64
 6   avg_temp                       28242 non-null  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.5+ MB


In [14]:
df['Item'].unique()

array(['Maize', 'Potatoes', 'Rice, paddy', 'Sorghum', 'Soybeans', 'Wheat',
       'Cassava', 'Sweet potatoes', 'Plantains and others', 'Yams'],
      dtype=object)

In [15]:
# One-Hot Encoding
df = pd.get_dummies(df, columns=["Item"])  # crop names one-hot encoded
# X = df.drop(columns=["hg/ha_yield", "Area"])  # Drop target and unnecessary columns

df

Unnamed: 0,Area,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp,Crop,Item_Cassava,Item_Maize,Item_Plantains and others,Item_Potatoes,"Item_Rice, paddy",Item_Sorghum,Item_Soybeans,Item_Sweet potatoes,Item_Wheat,Item_Yams
0,Albania,1990,36613,1485.0,121.00,16.37,1,False,True,False,False,False,False,False,False,False,False
1,Albania,1990,66667,1485.0,121.00,16.37,3,False,False,False,True,False,False,False,False,False,False
2,Albania,1990,23333,1485.0,121.00,16.37,4,False,False,False,False,True,False,False,False,False,False
3,Albania,1990,12500,1485.0,121.00,16.37,5,False,False,False,False,False,True,False,False,False,False
4,Albania,1990,7000,1485.0,121.00,16.37,6,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28237,Zimbabwe,2013,22581,657.0,2550.07,19.76,4,False,False,False,False,True,False,False,False,False,False
28238,Zimbabwe,2013,3066,657.0,2550.07,19.76,5,False,False,False,False,False,True,False,False,False,False
28239,Zimbabwe,2013,13142,657.0,2550.07,19.76,6,False,False,False,False,False,False,True,False,False,False
28240,Zimbabwe,2013,22222,657.0,2550.07,19.76,7,False,False,False,False,False,False,False,True,False,False


In [16]:
# Step 4: Define features and target
X = df.drop(columns=["hg/ha_yield", "Area"])
y = df["hg/ha_yield"]


In [17]:
from sklearn.model_selection import train_test_split

# Step 5: Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [18]:
from sklearn.ensemble import RandomForestRegressor

# Step 6: Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
# Define parameter grid
from sklearn.model_selection import GridSearchCV


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None]
}

# Create base model
rf = RandomForestRegressor(random_state=42)

# GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='r2'
)

# Fit the model
grid_search.fit(X_train, y_train)

# Best estimator
model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [27]:
from xgboost import XGBRegressor

# Define the model
xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

# Define parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.05, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring="r2",
    verbose=2
)

# Fit
grid_search.fit(X_train, y_train)

# Best model
model = grid_search.best_estimator_
print("✅ Best Params:", grid_search.best_params_)


Fitting 3 folds for each of 144 candidates, totalling 432 fits
✅ Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'subsample': 0.8}


In [28]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Step 7: Evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# acc = accuracy_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.6f}")
# print(f"Accuracy Score: {acc:.2f}")


Mean Squared Error: 102592984.00
R^2 Score: 0.985856
