## Setup and Imports

## Initialize Data Populator

## Load Original Data

## Generate Populated Data

## Basic Statistics and Generation Summary

## Enhanced Comparison Plotting Function

## Analyze Key Agricultural Variables

## Comprehensive Crop Analysis

## Correlation and Relationship Analysis

## Environmental Factor Analysis

## Nutrient Analysis (N, P, K)

## Save Populated Data

## Comprehensive Summary

In [1]:
# ==========================================================
# Import Required Libraries
# ==========================================================

import sys
import os
import pandas as pd

# Add Scripts folder to path
sys.path.append(os.path.abspath("../Scripts"))

from data_populator import RangeDataPopulator



In [2]:
# Read the cleaned crop table; dtype=str keeps every column as text.
raw_csv_path = "../data/processed/crop_clean.csv"
df = pd.read_csv(raw_csv_path, dtype=str)
print("Original rows: ", df.shape[0])



In [3]:
# ----------------------------------------------------------
# Build the populator and expand the original table
# ----------------------------------------------------------

# Settings forwarded to RangeDataPopulator as n_samples / decimal_places.
n_samples_per_row, decimal_precision = 100, 3

populator = RangeDataPopulator(n_samples=n_samples_per_row,
                               decimal_places=decimal_precision)
populated_df = populator.populate(df)

# Row counts before and after population.
print(f"Original rows: {len(df)}")
print(f"Populated rows: {len(populated_df)}")


In [4]:


# Row counts before and after population.
print(f"Original rows: {len(df)}")
print(f"Populated rows: {len(populated_df)}")

# Print the frame summary, then leave the per-column missing-value counts
# as the final expression of the cell.
populated_df.info()
populated_df.isnull().sum()


In [5]:
# Write the populated table to disk without the index column.
populated_csv_path = "../data/processed/cereal_populated_data.csv"
populated_df.to_csv(populated_csv_path, index=False)

In [28]:
# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Models
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

# Reproducibility: one fixed seed, passed as `random_state` to the
# train/test split and to the random forest further down.
RANDOM_STATE = 42


In [30]:
# Define data path
DATA_PATH = "../data/processed/cereal_populated_data.csv"

# Load data
df = pd.read_csv(DATA_PATH)

# Print the column/dtype summary first and end the cell on the preview:
# a notebook only renders the value of a cell's final expression, so a
# `df.head()` placed before `df.info()` is evaluated and then discarded.
df.info()
df.head()


In [32]:
# Predictor columns, one per line. "Crop Type" is the only one that is
# one-hot encoded below.
FEATURES = [
    "N (kg/ha)",
    "P (kg/ha)",
    "K (kg/ha)",
    "T (°C)",
    "PH",
    "RF (mm)",
    "LGP",
    "Altitude (m)",
    "Crop Type",
]

# Column to predict
TARGET = "Crop Species"

# Select the predictors and expand "Crop Type" into dummy columns
X = pd.get_dummies(df[FEATURES], columns=["Crop Type"])
y = df[TARGET]

(X.shape, y.shape)


In [33]:
# Integer-encode the target labels held in `y`
label_encoder_crop = LabelEncoder()
y_encoded = label_encoder_crop.fit_transform(y)

# Label -> integer code lookup, kept for interpreting encoded values
encoded_values = label_encoder_crop.transform(label_encoder_crop.classes_)
crop_type_mapping = {
    label: code
    for label, code in zip(label_encoder_crop.classes_, encoded_values)
}

crop_type_mapping


In [34]:
# Hold out 20% of the rows for testing.
# The split uses the integer-encoded target (`y_encoded`) produced by the
# label encoder and is stratified on it, so class proportions are kept
# (approximately) equal in the train and test parts. This mirrors the
# stratified split used by the model-comparison section of this notebook
# and keeps the test labels aligned with `label_encoder_crop.classes_`,
# which the report and confusion matrix use as class names.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=RANDOM_STATE
)

X_train.shape, X_test.shape


In [35]:
# Learn the scaling statistics from the training split only, then apply
# the fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [36]:
# Random forest settings: 200 trees, unlimited depth, fixed seed,
# n_jobs=-1 for parallel fitting.
rf_params = {
    "n_estimators": 200,
    "max_depth": None,
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}
crop_type_model = RandomForestClassifier(**rf_params)

# Train on the scaled training split
crop_type_model.fit(X_train_scaled, y_train)


In [37]:
# Predict the held-out rows
y_pred = crop_type_model.predict(X_test_scaled)

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the encoder's class names
report_text = classification_report(
    y_test, y_pred, target_names=label_encoder_crop.classes_
)
print("\nClassification Report:\n")
print(report_text)


In [40]:
# Confusion matrix of the test-set predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder_crop.classes_,
    yticklabels=label_encoder_crop.classes_
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
# The model is trained on the "Crop Species" target ("Crop Type" is only an
# input feature), so the title names the species task.
plt.title("Confusion Matrix – Crop Species Prediction")
plt.show()


In [6]:
# ============================================
# 1. IMPORT LIBRARIES
# ============================================

import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Metrics
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Reproducibility: one fixed seed, passed as `random_state` to the
# train/test splits and to the seeded estimators defined below.
RANDOM_STATE = 42


In [7]:
# --------------------------------------------
# 2. Load the populated dataset
# --------------------------------------------

data_path = "../data/processed/cereal_populated_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
df.head(n=5)


In [13]:
# --------------------------------------------
# 3. Features and target for classification
# --------------------------------------------

# Column to predict
target = "Crop Species"

# Predictor columns; "Crop Type" is the categorical one
features = [
    "N (kg/ha)", "P (kg/ha)", "K (kg/ha)",
    "T (°C)", "PH", "RF (mm)", "LGP", "Altitude (m)",
    "Crop Type",
]

X, y = df[features], df[target]


In [14]:
# --------------------------------------------
# 4. Integer-encode the target labels
# --------------------------------------------

# Fit the encoder on the target, then map every label to its integer code
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Original label strings, indexed by their integer code
class_names = label_encoder.classes_


In [15]:
# --------------------------------------------
# 5. Stratified 80/20 train-test split
# --------------------------------------------

# Stratifying on the encoded target keeps class proportions similar in
# both parts.
split_parts = train_test_split(
    X, y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")


In [16]:
# --------------------------------------------
# 6. Column-wise preprocessing
# --------------------------------------------

# Columns that get one-hot encoded
categorical_features = ["Crop Type"]

# Columns that get standardised
numeric_features = [
    "N (kg/ha)", "P (kg/ha)", "K (kg/ha)",
    "T (°C)", "PH", "RF (mm)", "LGP", "Altitude (m)",
]

numeric_transformer = StandardScaler()
# handle_unknown="ignore": categories unseen during fit do not raise at transform time
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [17]:
# ============================================
# 7. EVALUATION FUNCTION
# ============================================

def evaluate_model(model_name, pipeline, X_train, X_test, y_train, y_test, results_dict,
                   *, plot_confusion_matrix=False):
    """Fit a classification pipeline, print its test metrics and record them.

    Args:
        model_name: Label used in the printed header, the optional plot
            title, and as the key written into ``results_dict``.
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Data the pipeline is fitted on.
        X_test, y_test: Data the metrics are computed on.
        results_dict: Mutable mapping updated in place with
            ``{"Accuracy", "Macro F1", "Weighted F1"}`` under ``model_name``.
        plot_confusion_matrix: When True, also draw the confusion matrix as
            a heatmap. Off by default, so the matrix is only computed when
            it is actually shown.

    Returns:
        None. Results are reported through stdout and ``results_dict``.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1_macro = f1_score(y_test, y_pred, average="macro")
    f1_weighted = f1_score(y_test, y_pred, average="weighted")

    print(f"\n===== {model_name} =====")
    print(f"Accuracy      : {acc:.4f}")
    print(f"Macro F1      : {f1_macro:.4f}")
    print(f"Weighted F1   : {f1_weighted:.4f}")

    if plot_confusion_matrix:
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=False, cmap="Blues")
        plt.title(f"{model_name} - Confusion Matrix")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

    results_dict[model_name] = {
        "Accuracy": acc,
        "Macro F1": f1_macro,
        "Weighted F1": f1_weighted
    }


In [18]:
# --------------------------------------------
# 8. Logistic regression
# --------------------------------------------

# Metrics store that is handed to each evaluate_model call
results = dict()

log_reg_classifier = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
log_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", log_reg_classifier),
])

evaluate_model(
    model_name="Logistic Regression",
    pipeline=log_reg_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results,
)


In [24]:
# --------------------------------------------
# 9. Support vector machine (RBF kernel)
# --------------------------------------------

svm_classifier = SVC(kernel="rbf", random_state=RANDOM_STATE)
svm_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", svm_classifier),
])

evaluate_model(
    model_name="SVM",
    pipeline=svm_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results,
)


In [19]:
# --------------------------------------------
# 10. Random forest (200 trees)
# --------------------------------------------

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", rf_classifier),
])

evaluate_model(
    model_name="Random Forest",
    pipeline=rf_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results,
)


In [20]:
# ============================================
# 11. XGBOOST
# ============================================

# `use_label_encoder` is deliberately not passed. The target is already
# integer-encoded with scikit-learn's LabelEncoder before the split, and the
# flag is deprecated in XGBoost; recent releases no longer use it and only
# emit a warning when it is supplied.
xgb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(
        eval_metric="mlogloss",
        random_state=RANDOM_STATE
    ))
])

evaluate_model("XGBoost", xgb_pipeline,
               X_train, X_test, y_train, y_test, results)


In [21]:
# --------------------------------------------
# 12. LightGBM
# --------------------------------------------

lgb_classifier = LGBMClassifier(random_state=RANDOM_STATE)
lgb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", lgb_classifier),
])

evaluate_model(
    model_name="LightGBM",
    pipeline=lgb_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results,
)


In [25]:
# --------------------------------------------
# 13. Classifier comparison table
# --------------------------------------------

# One row per model, one column per metric
metrics_by_model = pd.DataFrame(results).transpose()

# Highest accuracy first
comparison_df = metrics_by_model.sort_values(by="Accuracy", ascending=False)

comparison_df


In [23]:
# Dump the raw metrics dictionary collected by the evaluate_model calls
print(f"results_dict: {results}")

## Yield Prediction

In [26]:
# --------------------------------------------
# 1. Features and target for yield regression
# --------------------------------------------

# Column to predict
target = "Yield (q/ha)"

# Predictor columns; the last two are the categorical ones
features = [
    "N (kg/ha)", "P (kg/ha)", "K (kg/ha)",
    "T (°C)", "PH", "RF (mm)", "LGP", "Altitude (m)",
    "Crop Type", "Crop Species",
]

X, y = df[features], df[target]


In [27]:
# --------------------------------------------
# 2. 80/20 train-test split
# --------------------------------------------

split_parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")


In [28]:
# --------------------------------------------
# 3. Column-wise preprocessing
# --------------------------------------------

# Columns that get one-hot encoded
categorical_features = ["Crop Type", "Crop Species"]

# Columns that get standardised
numeric_features = [
    "N (kg/ha)", "P (kg/ha)", "K (kg/ha)",
    "T (°C)", "PH", "RF (mm)", "LGP", "Altitude (m)",
]

numeric_transformer = StandardScaler()
# handle_unknown="ignore": categories unseen during fit do not raise at transform time
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [29]:
# ============================================
# 4. REGRESSION EVALUATION FUNCTION
# ============================================

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_regression_model(model_name, pipeline, X_train, X_test, y_train, y_test, results_dict):
    """Fit a regression pipeline, print its test errors and record them.

    The pipeline is fitted on the training data and scored on the test data
    with MAE, RMSE (square root of the mean squared error) and R². The three
    values are printed and stored in ``results_dict[model_name]`` under the
    keys ``"MAE"``, ``"RMSE"`` and ``"R2"``. Nothing is returned.
    """
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    scores = {
        "MAE": mean_absolute_error(y_test, predictions),
        "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
        "R2": r2_score(y_test, predictions),
    }

    header = f"\n===== {model_name} ====="
    print(header)
    print(f"MAE  : {scores['MAE']:.4f}")
    print(f"RMSE : {scores['RMSE']:.4f}")
    print(f"R²   : {scores['R2']:.4f}")

    results_dict[model_name] = scores


In [30]:
# ============================================
# 5. LINEAR REGRESSION
# ============================================

from sklearn.linear_model import LinearRegression

# Metrics store that is handed to each evaluate_regression_model call
results_reg = dict()

linreg_regressor = LinearRegression()
linreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", linreg_regressor),
])

evaluate_regression_model(
    model_name="Linear Regression",
    pipeline=linreg_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results_reg,
)


In [31]:
# ============================================
# 6. SUPPORT VECTOR REGRESSION
# ============================================

from sklearn.svm import SVR

# Support vector regressor with an RBF kernel behind the shared preprocessor
svr_regressor = SVR(kernel="rbf")
svr_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", svr_regressor),
])

evaluate_regression_model(
    model_name="SVR",
    pipeline=svr_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results_reg,
)


In [32]:
# ============================================
# 7. RANDOM FOREST REGRESSOR
# ============================================

from sklearn.ensemble import RandomForestRegressor

# 200-tree random forest regressor behind the shared preprocessor
rf_regressor = RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE)
rf_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", rf_regressor),
])

evaluate_regression_model(
    model_name="Random Forest Regressor",
    pipeline=rf_reg_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results_reg,
)


In [33]:
# ============================================
# 8. XGBOOST REGRESSOR
# ============================================

from xgboost import XGBRegressor

# XGBoost regressor (squared-error objective) behind the shared preprocessor
xgb_regressor = XGBRegressor(
    objective="reg:squarederror",
    random_state=RANDOM_STATE,
)
xgb_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", xgb_regressor),
])

evaluate_regression_model(
    model_name="XGBoost Regressor",
    pipeline=xgb_reg_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results_reg,
)


In [34]:
# ============================================
# 9. LIGHTGBM REGRESSOR
# ============================================

from lightgbm import LGBMRegressor

# LightGBM regressor behind the shared preprocessor
lgb_regressor = LGBMRegressor(random_state=RANDOM_STATE)
lgb_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", lgb_regressor),
])

evaluate_regression_model(
    model_name="LightGBM Regressor",
    pipeline=lgb_reg_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results_reg,
)


In [36]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# K-nearest-neighbours regressor (k = 5, a tunable choice) behind the
# shared preprocessor
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", knn_regressor),
])

# Score it with the same helper as the other regressors
evaluate_regression_model(
    model_name="KNN Regressor",
    pipeline=knn_reg_pipeline,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    results_dict=results_reg,
)


In [None]:
# --------------------------------------------
# 10. Regressor comparison table
# --------------------------------------------

# One row per model, one column per metric
reg_metrics_by_model = pd.DataFrame(results_reg).transpose()

# Highest R² first
comparison_reg_df = reg_metrics_by_model.sort_values(by="R2", ascending=False)

comparison_reg_df
