In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [53]:
data = pd.read_csv("data/crop_yield.csv")

In [54]:
num_rows, num_columns = data.shape
print(f"Rows: {num_rows}, Columns: {num_columns}, Columns: {data.columns}")

Rows: 1000000, Columns: 10, Columns: Index(['Region', 'Soil_Type', 'Crop', 'Rainfall_mm', 'Temperature_Celsius',
       'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition',
       'Days_to_Harvest', 'Yield_tons_per_hectare'],
      dtype='object')


In [55]:
print(data.isnull().sum())


Region                    0
Soil_Type                 0
Crop                      0
Rainfall_mm               0
Temperature_Celsius       0
Fertilizer_Used           0
Irrigation_Used           0
Weather_Condition         0
Days_to_Harvest           0
Yield_tons_per_hectare    0
dtype: int64


In [56]:
for col in ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']:
    print(f"{col}: {data[col].unique()}\n")



Region: ['West' 'South' 'North' 'East']

Soil_Type: ['Sandy' 'Clay' 'Loam' 'Silt' 'Peaty' 'Chalky']

Crop: ['Cotton' 'Rice' 'Barley' 'Soybean' 'Wheat' 'Maize']

Weather_Condition: ['Cloudy' 'Rainy' 'Sunny']



In [57]:
# Check for boolean columns
bool_columns = data.select_dtypes(include=['bool']).columns
print("Boolean Columns:", bool_columns)


Boolean Columns: Index(['Fertilizer_Used', 'Irrigation_Used'], dtype='object')


In [58]:
# Convert boolean columns to integers
data["Fertilizer_Used"] = data["Fertilizer_Used"].astype(int)
data["Irrigation_Used"] = data["Irrigation_Used"].astype(int)

In [None]:
# Apply label encoding to categorical columns
categorical_cols = ["Region", "Soil_Type", "Crop", "Weather_Condition"]
label_encoders = {}



for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Store encoders for future reference
    
    
    
joblib.dump(label_encoders, "label_encoders.pkl")
print("✅ Label encoders saved as 'label_encoders.pkl'")

✅ Label encoders saved as 'label_encoders.pkl'


In [60]:
print(data.head())

   Region  Soil_Type  Crop  Rainfall_mm  Temperature_Celsius  Fertilizer_Used  \
0       3          4     1   897.077239            27.676966                0   
1       2          1     3   992.673282            18.026142                1   
2       1          2     0   147.998025            29.794042                0   
3       1          4     4   986.866331            16.644190                0   
4       2          5     5   730.379174            31.620687                1   

   Irrigation_Used  Weather_Condition  Days_to_Harvest  Yield_tons_per_hectare  
0                1                  0              122                6.555816  
1                1                  1              140                8.527341  
2                0                  2              106                1.127443  
3                1                  1              146                6.517573  
4                1                  0              110                7.248251  


In [61]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop(columns=["Yield_tons_per_hectare"]))  # Excluding target column


In [None]:
from sklearn.model_selection import train_test_split

# Defining features (X) and target variable (y)
X = data.drop(columns=["Yield_tons_per_hectare"])
y = data["Yield_tons_per_hectare"]



# Splitting the dataset (80% Training, 20% Testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [68]:
print("Features used in training:", X.columns.tolist())
print("Number of features:", len(X.columns))


Features used in training: ['Region', 'Soil_Type', 'Crop', 'Rainfall_mm', 'Temperature_Celsius', 'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition', 'Days_to_Harvest']
Number of features: 9


In [63]:
from xgboost import XGBRegressor

# Define XGBoost model
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate model
print("📌 XGBoost Performance")
print(f"MAE: {mean_absolute_error(y_test, y_pred_xgb)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_xgb)}")
print(f"R² Score: {r2_score(y_test, y_pred_xgb)}")


📌 XGBoost Performance
MAE: 0.4002273553954539
MSE: 0.25158681884202466
R² Score: 0.9127327562670771


In [None]:
import lightgbm as lgb

# Define LightGBM model
lgb_model = lgb.LGBMRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)

# Train the model
lgb_model.fit(X_train, y_train)

# Make predictions
y_pred_lgb = lgb_model.predict(X_test)

# Evaluate model
print("📌 LightGBM Performance")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lgb)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_lgb)}")
print(f"R² Score: {r2_score(y_test, y_pred_lgb)}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025331 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 624
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 9
[LightGBM] [Info] Start training from score 4.649019
📌 LightGBM Performance
MAE: 0.4000343164391992
MSE: 0.25133715381298605
R² Score: 0.9128193569047455


In [66]:
import joblib
joblib.dump(lgb_model, "lightgbm_crop_yield_model.pkl")

print("✅ Model saved successfully as lightgbm_crop_yield_model.pkl")

✅ Model saved successfully as lightgbm_crop_yield_model.pkl


In [73]:
import joblib

# Load the saved label encoders
label_encoders = joblib.load("label_encoders.pkl")

# Print the mapping of categories to encoded values
for col, encoder in label_encoders.items():
    print(f"🔹 {col} Encoding:")
    for idx, class_name in enumerate(encoder.classes_):
        print(f"  {class_name} → {idx}")
    print("-" * 30)


🔹 Region Encoding:
  0 → 0
  1 → 1
  2 → 2
  3 → 3
------------------------------
🔹 Soil_Type Encoding:
  0 → 0
  1 → 1
  2 → 2
  3 → 3
  4 → 4
  5 → 5
------------------------------
🔹 Crop Encoding:
  0 → 0
  1 → 1
  2 → 2
  3 → 3
  4 → 4
  5 → 5
------------------------------
🔹 Weather_Condition Encoding:
  0 → 0
  1 → 1
  2 → 2
------------------------------
