In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
# Read the crop-disease table from the CSV next to the notebook and preview
# the first rows to confirm the columns were parsed as expected.
dataset_path = "multi_crop_disease_dataset_20000_rows.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Crop,Disease,Temperature(C),Humidity(%),Rainfall(mm),Soil_pH,Soil_Moisture(%),Leaf_Spots,Wilting
0,Apple,Apple Scab,27.5,47,239,7.5,17,1,1
1,Groundnut,Tikka Disease,38.4,48,275,8.1,66,1,1
2,Maize,Downy Mildew,40.7,47,164,6.6,41,1,0
3,Mango,Anthracnose,16.0,80,295,7.1,87,1,1
4,Potato,Early Blight,24.7,56,146,7.0,45,1,1


In [3]:
# Derive three interaction features from the raw measurements:
#   - Temp_Humidity_Index: temperature x humidity
#   - Rain_Moisture_Index: rainfall x soil moisture
#   - Stress_Index: temperature / (humidity + 1); the +1 avoids division by zero
df = df.assign(
    Temp_Humidity_Index=df["Temperature(C)"] * df["Humidity(%)"],
    Rain_Moisture_Index=df["Rainfall(mm)"] * df["Soil_Moisture(%)"],
    Stress_Index=df["Temperature(C)"] / (df["Humidity(%)"] + 1),
)


In [4]:
# Features are every column except the label.
X = df.drop(columns=["Disease"])

# Encode the disease names as integer class ids; `le` is kept so the ids can
# be mapped back to names later (le.classes_).
le = LabelEncoder()
y = le.fit_transform(df["Disease"])


In [5]:
# One-hot encode the crop name, dropping the first level to avoid a redundant
# column. The guard makes the cell safe to re-run: once "Crop" has been
# expanded it no longer exists in X, and calling get_dummies again with
# columns=["Crop"] would raise a KeyError.
if "Crop" in X.columns:
    X = pd.get_dummies(X, columns=["Crop"], drop_first=True)


In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [7]:
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions so no test-set information is
# used when fitting the scaler.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Candidate classifiers, grouped by the feature matrix they are trained on.

# Fitted on the standardized features (X_train_scaled / X_test_scaled).
scaled_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "SVM": SVC(class_weight="balanced"),
    "KNN": KNeighborsClassifier(n_neighbors=9, weights="distance"),
}

# Fitted on the unscaled features (X_train / X_test).
non_scaled_models = {
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(
        max_depth=20, min_samples_leaf=3, class_weight="balanced", random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=400, max_depth=25,
        min_samples_split=5, min_samples_leaf=2,
        class_weight="balanced", random_state=42, n_jobs=-1,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, random_state=42,
    ),
}


In [9]:
def evaluate_model(name, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` and score it on the evaluation split.

    Parameters
    ----------
    name : str
        Label stored in the "Model" field of the returned record.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted in place.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features and labels used for scoring.

    Returns
    -------
    dict
        Keys "Model", "Accuracy" and "F1 Score" (weighted F1), with both
        metrics rounded to 4 decimals.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    return {
        "Model": name,
        "Accuracy": round(accuracy_score(y_eval, y_pred), 4),
        "F1 Score": round(f1_score(y_eval, y_pred, average="weighted"), 4),
    }


# Each model group is paired with the feature matrices it must be trained and
# evaluated on: scaled models first, then non-scaled ones.
model_groups = [
    (scaled_models, X_train_scaled, X_test_scaled),
    (non_scaled_models, X_train, X_test),
]

results = [
    evaluate_model(name, model, X_fit, y_train, X_eval, y_test)
    for models, X_fit, X_eval in model_groups
    for name, model in models.items()
]

# Rank models by weighted F1, best first.
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
results_df


Unnamed: 0,Model,Accuracy,F1 Score
5,Random Forest,0.5188,0.5192
6,Gradient Boosting,0.5258,0.5188
2,KNN,0.519,0.5121
4,Decision Tree,0.4015,0.4027
0,Logistic Regression,0.355,0.3065
1,SVM,0.326,0.2507
3,Naive Bayes,0.3135,0.2188


In [10]:
# results_df is sorted by weighted F1 (descending), so row 0 is the winner.
best_model_name = results_df.iloc[0]["Model"]

# Pick the estimator together with the test matrix that matches the feature
# representation it was trained on.
if best_model_name in scaled_models:
    best_model, X_eval_best = scaled_models[best_model_name], X_test_scaled
else:
    best_model, X_eval_best = non_scaled_models[best_model_name], X_test

# The estimator was already fitted on the training split in the evaluation
# cell that produced results_df, so it is reused here rather than refitted
# on the same data.
best_pred = best_model.predict(X_eval_best)

print("Best Model:", best_model_name)
print(classification_report(y_test, best_pred, target_names=le.classes_))


Best Model: Random Forest
                       precision    recall  f1-score   support

          Anthracnose       0.44      0.54      0.49       190
           Apple Scab       0.43      0.41      0.42        88
     Bacterial Blight       0.45      0.69      0.55        94
Bacterial Leaf Blight       0.28      0.37      0.32        62
                Blast       0.25      0.31      0.28        67
             Boll Rot       0.47      0.53      0.50        88
           Brown Spot       0.20      0.21      0.21        58
         Downy Mildew       0.47      0.37      0.42       270
         Early Blight       0.40      0.48      0.44       161
          Fire Blight       0.43      0.48      0.45        94
              Healthy       0.82      0.71      0.76      1271
          Late Blight       0.34      0.21      0.26       151
          Leaf Blight       0.49      0.76      0.59        90
            Leaf Curl       0.41      0.49      0.45        84
      Leaf Curl Virus       

In [None]:
# 5-fold stratified cross-validation of the Random Forest, scored with
# weighted F1 over the full feature matrix.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Reuse the estimator configured in non_scaled_models so the cross-validated
# model has exactly the hyperparameters of the model reported above (a
# hand-copied constructor here had drifted and omitted min_samples_split=5).
# cross_val_score fits a fresh clone of the estimator on every fold, so the
# already-fitted instance in the dictionary is not modified.
cv_f1 = cross_val_score(
    non_scaled_models["Random Forest"],
    X, y,
    cv=skf,
    scoring="f1_weighted"
)

print("Random Forest CV F1:", cv_f1.mean())
