In [1]:

import os, json, numpy as np, pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, accuracy_score

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import joblib

# Single seed for the notebook; later cells pass it as `random_state` to the
# train/test splits and to both classifiers.
RANDOM_SEED = 42
# Also seed NumPy's legacy global RNG for anything that draws from it implicitly.
np.random.seed(RANDOM_SEED)


In [2]:
## Load data & normalize column names

CSV_PATH = "Crop_recommendation.csv"   # adjust if needed

# Read the table and trim stray whitespace around the header names.
df = pd.read_csv(CSV_PATH)
df.columns = df.columns.str.strip()

# 1) Discard columns whose name starts with "Unnamed" (any letter case),
#    then any column that contains no values at all.
is_unnamed = df.columns.str.match(r"^Unnamed", case=False)
df = df.loc[:, ~is_unnamed].dropna(axis=1, how="all")

# 2) Map long-form nutrient headers onto the short N/P/K names; only the
#    aliases that actually occur in the frame are renamed.
COL_MAP = {'Nitrogen':'N','phosphorus':'P','potassium':'K'}
aliases_present = {old: new for old, new in COL_MAP.items() if old in df.columns}
df = df.rename(columns=aliases_present)

# 3) Stop here if any column the rest of the notebook reads is absent.
required = {'N','P','K','temperature','humidity','ph','rainfall','label'}
missing = required.difference(df.columns)
assert not missing, f"Missing columns: {missing}"

df.head()


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [3]:
## Preprocessing: target encoding, split, scaling
# Encode the crop label as integer category codes. `classes` is the matching
# lookup: the name at position i belongs to code i.
y_cats = df['label'].astype('category')
classes = y_cats.cat.categories.tolist()
y = y_cats.cat.codes.to_numpy()

# Feature matrix as float32, columns in a fixed order.
feature_cols = ['N','P','K','temperature','humidity','ph','rainfall']
X = df[feature_cols].astype('float32').to_numpy()

# Stratified 70/15/15, done in two stages: hold out 15% for test first...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=RANDOM_SEED
)
# ...then take 0.1765 of the remaining 85%, which is roughly 15% of the
# full dataset, as the validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1765, stratify=y_trainval, random_state=RANDOM_SEED
)

# Scale (trees don't need it, but it keeps a later swap to an MLP easy).
# The scaler's statistics come from the training split only and are then
# applied unchanged to validation and test.
scaler = StandardScaler()
Xs_train = scaler.fit_transform(X_train)
Xs_val, Xs_test = scaler.transform(X_val), scaler.transform(X_test)

len(classes), classes[:5]


(22, ['apple', 'banana', 'blackgram', 'chickpea', 'coconut'])

In [4]:
## Helper: train/eval wrapper

def evaluate_model(model, X_tr, y_tr, X_va, y_va, X_te, y_te, name="model",
                   class_names=None, **fit_kwargs):
    """
    Fit `model` on the training split, report validation/test metrics, and
    return them in a results dict.

    Parameters
    ----------
    model : estimator exposing fit(X, y, **kwargs) and predict(X).
    X_tr, y_tr : training features / labels passed to model.fit.
    X_va, y_va : validation features / labels (scored after fitting).
    X_te, y_te : test features / labels (scored and used for the printed report).
    name : label used in the printed header and stored in the result.
    class_names : optional sequence of display names for the classification
        report. When omitted, the notebook-level `classes` list is used, so
        existing calls keep working; pass it explicitly to avoid depending on
        state from another cell.
    **fit_kwargs : forwarded unchanged to model.fit (e.g. eval_set / verbose
        for XGBoost).

    Returns
    -------
    dict with keys "name", "model" (the fitted estimator), "val_f1", "val_acc",
    "test_f1", "test_acc". F1 values are macro-averaged.
    """
    # Resolve the report names before the (possibly slow) fit so that a missing
    # `classes` global surfaces immediately instead of after training.
    target_names = classes if class_names is None else list(class_names)

    model.fit(X_tr, y_tr, **fit_kwargs)

    # Validation metrics
    va_pred = model.predict(X_va)
    va_f1   = f1_score(y_va, va_pred, average="macro")
    va_acc  = accuracy_score(y_va, va_pred)

    # Test metrics
    te_pred = model.predict(X_te)
    te_f1   = f1_score(y_te, te_pred, average="macro")
    te_acc  = accuracy_score(y_te, te_pred)

    print(f"\n=== {name} ===")
    print("Validation -> macro-F1:", round(va_f1,4), "  acc:", round(va_acc,4))
    print("Test       -> macro-F1:", round(te_f1,4), "  acc:", round(te_acc,4))
    print("\nClassification report (test):")
    print(classification_report(y_te, te_pred, target_names=target_names, digits=3))

    return {"name":name, "model":model,
            "val_f1":va_f1, "val_acc":va_acc,
            "test_f1":te_f1, "test_acc":te_acc}



In [5]:
# Random-forest baseline: 400 unrestricted-depth trees, sqrt feature sampling,
# class weights rebalanced per bootstrap sample, all cores.
rf_params = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "class_weight": "balanced_subsample",
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)

# Fit on the scaled training split and report validation/test metrics.
res_rf = evaluate_model(
    rf,
    Xs_train, y_train,
    Xs_val, y_val,
    Xs_test, y_test,
    name="RandomForest",
)



=== RandomForest ===
Validation -> macro-F1: 1.0   acc: 1.0
Test       -> macro-F1: 0.9939   acc: 0.9939

Classification report (test):
              precision    recall  f1-score   support

       apple      1.000     1.000     1.000        15
      banana      1.000     1.000     1.000        15
   blackgram      1.000     0.933     0.966        15
    chickpea      1.000     1.000     1.000        15
     coconut      1.000     1.000     1.000        15
      coffee      1.000     1.000     1.000        15
      cotton      1.000     1.000     1.000        15
      grapes      1.000     1.000     1.000        15
        jute      0.938     1.000     0.968        15
 kidneybeans      1.000     1.000     1.000        15
      lentil      1.000     1.000     1.000        15
       maize      0.938     1.000     0.968        15
       mango      1.000     1.000     1.000        15
   mothbeans      1.000     1.000     1.000        15
    mungbean      1.000     1.000     1.000        1

In [6]:
# Gradient-boosted trees. Early stopping is configured on the estimator itself
# and monitors multi-class log-loss on the eval_set supplied at fit time.
xgb_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "early_stopping_rounds": 30,
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
    "num_class": len(classes),
}
xgb_clf = xgb.XGBClassifier(**xgb_params)

# The validation split doubles as the early-stopping monitor, so the
# validation score reported for this model was also used to pick its
# stopping point.
xgb_fit_kwargs = {"eval_set": [(Xs_val, y_val)], "verbose": False}

res_xgb = evaluate_model(
    xgb_clf,
    Xs_train, y_train,
    Xs_val, y_val,
    Xs_test, y_test,
    name="XGBoost (>=3.x)",
    **xgb_fit_kwargs,
)



=== XGBoost (>=3.x) ===
Validation -> macro-F1: 0.994   acc: 0.994
Test       -> macro-F1: 0.9909   acc: 0.9909

Classification report (test):
              precision    recall  f1-score   support

       apple      1.000     1.000     1.000        15
      banana      1.000     1.000     1.000        15
   blackgram      0.933     0.933     0.933        15
    chickpea      1.000     1.000     1.000        15
     coconut      1.000     1.000     1.000        15
      coffee      1.000     1.000     1.000        15
      cotton      1.000     1.000     1.000        15
      grapes      1.000     1.000     1.000        15
        jute      1.000     1.000     1.000        15
 kidneybeans      1.000     1.000     1.000        15
      lentil      1.000     0.933     0.966        15
       maize      1.000     1.000     1.000        15
       mango      1.000     1.000     1.000        15
   mothbeans      0.938     1.000     0.968        15
    mungbean      0.938     1.000     0.968  

In [7]:
def _val_f1(result):
    """Sort key: validation macro-F1 of one evaluate_model result."""
    return result["val_f1"]

# Pick the candidate with the highest validation macro-F1 (the first one
# listed wins a tie).
results = [res_rf, res_xgb]
best = max(results, key=_val_f1)
print("\nBest model by validation F1:", best["name"], "->", round(best["val_f1"],4))

# Persist the winning model together with the scaler and the class-name
# lookup that inference needs alongside it.
Path("artifacts").mkdir(exist_ok=True)
for obj, target in (
    (best["model"], "artifacts/model_best.joblib"),
    (scaler,        "artifacts/scaler.joblib"),
    (classes,       "artifacts/classes.joblib"),
):
    joblib.dump(obj, target)

best



Best model by validation F1: RandomForest -> 1.0


{'name': 'RandomForest',
 'model': RandomForestClassifier(class_weight='balanced_subsample', n_estimators=400,
                        n_jobs=-1, random_state=42),
 'val_f1': 1.0,
 'val_acc': 1.0,
 'test_f1': 0.9939326524421074,
 'test_acc': 0.9939393939393939}

In [8]:
def load_artifacts(artifacts_dir="artifacts"):
    """
    Load the persisted model, scaler and class-name list.

    Parameters
    ----------
    artifacts_dir : directory holding model_best.joblib, scaler.joblib and
        classes.joblib. Defaults to "artifacts", the location the training
        cell writes to.

    Returns
    -------
    (model, scaler, classes) tuple, in that order.

    Notes
    -----
    joblib files are pickle-based, so loading one can execute arbitrary code.
    Only point this at artifacts produced by this notebook or another
    trusted source.
    """
    base = Path(artifacts_dir)
    model   = joblib.load(base / "model_best.joblib")
    scaler  = joblib.load(base / "scaler.joblib")
    classes = joblib.load(base / "classes.joblib")
    return model, scaler, classes

def predict_one(sample_dict, artifacts=None):
    """
    Predict the most likely crop for a single sample.

    Parameters
    ----------
    sample_dict : mapping with the keys N, P, K, temperature, humidity, ph,
        rainfall (extra keys are ignored).
    artifacts : optional (model, scaler, classes) tuple as returned by
        load_artifacts(). When omitted, the artifacts are loaded from disk on
        every call; pass a preloaded tuple when predicting repeatedly.

    Returns
    -------
    dict with "crop" (class name) and "confidence" (probability of that class).

    Raises
    ------
    KeyError
        If one or more required feature keys are absent; the message lists
        all of them.
    """
    order = ['N','P','K','temperature','humidity','ph','rainfall']

    # Report every missing feature at once instead of failing on the first.
    absent = [k for k in order if k not in sample_dict]
    if absent:
        raise KeyError(f"sample_dict is missing feature(s): {absent}")

    model, sc, cls = load_artifacts() if artifacts is None else artifacts

    x = np.array([[sample_dict[k] for k in order]], dtype=np.float32)
    x = sc.transform(x)
    proba = model.predict_proba(x)[0]
    top = int(np.argmax(proba))

    # predict_proba columns follow model.classes_ (the encoded labels seen in
    # training), which need not equal 0..n-1 if a class was absent from the
    # training data. Translate the column index to the label code before
    # looking up the name; fall back to the index if the model has no classes_.
    label_codes = getattr(model, "classes_", None)
    code = top if label_codes is None else int(label_codes[top])
    return {"crop": cls[code], "confidence": float(proba[top])}

# Example: one hand-written sample run through the saved artifacts.
example_sample = {
    "N": 90,
    "P": 42,
    "K": 43,
    "temperature": 21.5,
    "humidity": 80,
    "ph": 6.5,
    "rainfall": 200,
}
predict_one(example_sample)


{'crop': 'rice', 'confidence': 0.7875}

In [10]:
pip install streamlit streamlit-webrtc faster-whisper edge-tts av joblib scikit-learn 


Collecting streamlitNote: you may need to restart the kernel to use updated packages.

  Using cached streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting streamlit-webrtc
  Using cached streamlit_webrtc-0.63.11-py3-none-any.whl.metadata (18 kB)
Collecting faster-whisper
  Using cached faster_whisper-1.2.0-py3-none-any.whl.metadata (16 kB)
Collecting edge-tts
  Using cached edge_tts-7.2.3-py3-none-any.whl.metadata (5.5 kB)
Collecting av
  Using cached av-16.0.1-cp312-cp312-win_amd64.whl.metadata (4.7 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Using cached protobuf-6.33.0-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting aioice>=0.10.1 (from streamlit-webrtc)
  Using cached aioice-0.10.1-py3-none-any.whl.metadata (4.1 kB)
Collecting aiortc>=1.11.0 (from streamlit-webrtc)
  Using cached aiortc-1.14.0-py3-none-any.whl.metadata (4.9 kB)
Collecting ctransla

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
googleapis-common-protos 1.62.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5, but you have protobuf 6.33.0 which is incompatible.
google-ai-generativelanguage 0.4.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 6.33.0 which is incompatible.
google-api-core 2.17.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5, but you have protobuf 6.33.0 which is incompatible.
proto-plus 1.23.0 requires protobuf<5.0.0dev,>=3.19.0, but you have protobuf 6.33.0 which is incompatible.
