In [1]:
import pandas as pd
import numpy as np


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv("Data/Wheat/Wheat_Crop_Disease_Environment_Cures_Maharashtra.csv")

print("Shape:", data.shape)
print("\nColumns:\n", data.columns)
print("\nSample Data:\n", data.head())
print("\nMissing Values:\n", data.isnull().sum())


Shape: (140, 11)

Columns:
 Index(['Crop Stage', 'Region', 'Crop Disease', 'Temp (°C)', 'Humidity (%)',
       'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH', 'Cause',
       'Organic Cure', 'Inorganic Cure'],
      dtype='object')

Sample Data:
    Crop Stage  Region Crop Disease  Temp (°C)  Humidity (%)  \
0  Pre-Sowing    Pune   Loose Smut       33.7            74   
1  Pre-Sowing    Pune  Common Bunt       26.3            85   
2  Pre-Sowing    Pune  Karnal Bunt       30.5            88   
3  Pre-Sowing  Nashik   Loose Smut       29.1            73   
4  Pre-Sowing  Nashik  Common Bunt       32.6            80   

   Soil Moisture (%)  Rainfall (mm)  Soil pH  \
0                 54              6      6.9   
1                 52             18      7.2   
2                 68             23      7.3   
3                 41             12      6.7   
4                 40             10      7.0   

                                               Cause  \
0  Seed-borne fungus Ustilag

In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in ['Crop Stage', 'Region']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [4]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier

stage_disease_map = data.groupby('Crop Stage')['Crop Disease'].apply(list).to_dict()

unique_stages = data[['Crop Stage', 'Region', 'Temp (°C)', 'Humidity (%)',
                      'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']].drop_duplicates()

unique_stages['Diseases'] = unique_stages['Crop Stage'].map(stage_disease_map)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(unique_stages['Diseases'])
label_encoders = {}
X_multi = unique_stages[['Crop Stage', 'Region', 'Temp (°C)', 'Humidity (%)',
                         'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']].copy()

for col in ['Crop Stage', 'Region']:
    le = LabelEncoder()
    X_multi[col] = le.fit_transform(X_multi[col])
    label_encoders[col] = le

X_train_m, X_test_m, Y_train_m, Y_test_m = train_test_split(
    X_multi, Y, test_size=0.2, random_state=42
)
multi_rf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced_subsample', n_jobs=-1)
)
multi_rf.fit(X_train_m, Y_train_m)

Y_pred_m = multi_rf.predict(X_test_m)
subset_acc = accuracy_score(Y_test_m, Y_pred_m)
hamming = hamming_loss(Y_test_m, Y_pred_m)
macro_f1 = f1_score(Y_test_m, Y_pred_m, average='macro')
micro_f1 = f1_score(Y_test_m, Y_pred_m, average='micro')
print(f"Subset accuracy: {subset_acc:.4f}")
print(f"Hamming loss: {hamming:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}")
decoded_results = []
for i in range(min(5, len(Y_test_m))):
    true_diseases = mlb.inverse_transform(Y_test_m[i].reshape(1, -1))[0]
    pred_diseases = mlb.inverse_transform(Y_pred_m[i].reshape(1, -1))[0]
    decoded_results.append({
        "Crop Stage": label_encoders['Crop Stage'].inverse_transform([X_test_m.iloc[i, 0]])[0],
        "Region": label_encoders['Region'].inverse_transform([X_test_m.iloc[i, 1]])[0],
        "True Diseases": list(true_diseases),
        "Predicted Diseases": list(pred_diseases)
    })

pd.DataFrame(decoded_results)


Subset accuracy: 0.3571
Hamming loss: 0.0545
Macro F1: 0.5846
Micro F1: 0.7238


Unnamed: 0,Crop Stage,Region,True Diseases,Predicted Diseases
0,6,1,"[Fusarium Head Blight, Leaf Rust]",[Leaf Rust]
1,12,3,"[Barley Yellow Dwarf Virus (BYDV), Root Rot (F...","[Barley Yellow Dwarf Virus (BYDV), Root Rot (F..."
2,10,0,"[Black Point, Loose Smut, Seed Rot]",[]
3,4,1,"[Leaf Rust, Septoria Leaf Blotch]",[Leaf Rust]
4,3,3,"[Damping-off (Pythium), Root Rot (Fusarium), S...","[Damping-off (Pythium), Seedling Blight (Fusar..."


In [5]:
import numpy as np

Y_proba = np.array([estimator.predict_proba(X_test_m)[:, 1] for estimator in multi_rf.estimators_]).T

threshold = 0.25
Y_pred_thresholded = (Y_proba >= threshold).astype(int)

subset_acc_t = accuracy_score(Y_test_m, Y_pred_thresholded)
hamming_t = hamming_loss(Y_test_m, Y_pred_thresholded)
macro_f1_t = f1_score(Y_test_m, Y_pred_thresholded, average='macro')
micro_f1_t = f1_score(Y_test_m, Y_pred_thresholded, average='micro')

print(f"After threshold tuning (0.25):")
print(f"Subset accuracy: {subset_acc_t:.4f}")
print(f"Hamming loss: {hamming_t:.4f}")
print(f"Macro F1: {macro_f1_t:.4f}")
print(f"Micro F1: {micro_f1_t:.4f}\n")

decoded_thresholded = []
for i in range(min(5, len(Y_test_m))):
    true_diseases = mlb.inverse_transform(Y_test_m[i].reshape(1, -1))[0]
    pred_diseases = mlb.inverse_transform(Y_pred_thresholded[i].reshape(1, -1))[0]
    decoded_thresholded.append({
        "Crop Stage": label_encoders['Crop Stage'].inverse_transform([X_test_m.iloc[i, 0]])[0],
        "Region": label_encoders['Region'].inverse_transform([X_test_m.iloc[i, 1]])[0],
        "True Diseases": list(true_diseases),
        "Predicted Diseases": list(pred_diseases)
    })

pd.DataFrame(decoded_thresholded)

After threshold tuning (0.25):
Subset accuracy: 0.6429
Hamming loss: 0.0282
Macro F1: 0.8961
Micro F1: 0.8921



Unnamed: 0,Crop Stage,Region,True Diseases,Predicted Diseases
0,6,1,"[Fusarium Head Blight, Leaf Rust]","[Fusarium Head Blight, Leaf Rust]"
1,12,3,"[Barley Yellow Dwarf Virus (BYDV), Root Rot (F...","[Barley Yellow Dwarf Virus (BYDV), Root Rot (F..."
2,10,0,"[Black Point, Loose Smut, Seed Rot]","[Black Point, Loose Smut, Root Rot (Fusarium),..."
3,4,1,"[Leaf Rust, Septoria Leaf Blotch]",[Leaf Rust]
4,3,3,"[Damping-off (Pythium), Root Rot (Fusarium), S...","[Damping-off (Pythium), Root Rot (Fusarium), S..."


In [6]:
#  for now we Trained multi-label Random Forest on all stages no region, no split

import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

stage_disease_map_no_region = data.groupby('Crop Stage')['Crop Disease'].apply(list).to_dict()

stage_env_all = data.groupby('Crop Stage')[['Temp (°C)', 'Humidity (%)', 'Soil Moisture (%)',
                                            'Rainfall (mm)', 'Soil pH']].mean().reset_index()
stage_env_all['Diseases'] = stage_env_all['Crop Stage'].map(stage_disease_map_no_region)
X_all = stage_env_all[['Crop Stage', 'Temp (°C)', 'Humidity (%)',
                       'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']].copy()
X_all['Crop Stage'] = label_encoders['Crop Stage'].transform(X_all['Crop Stage'])


Y_all = mlb.fit_transform(stage_env_all['Diseases'])
multi_rf_all = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced_subsample', n_jobs=-1)
)
multi_rf_all.fit(X_all, Y_all)
Y_pred_all = multi_rf_all.predict(X_all)

subset_acc = accuracy_score(Y_all, Y_pred_all)
hamming = hamming_loss(Y_all, Y_pred_all)
macro_f1 = f1_score(Y_all, Y_pred_all, average='macro')
micro_f1 = f1_score(Y_all, Y_pred_all, average='micro')

print(f"Subset accuracy: {subset_acc:.4f}")
print(f"Hamming loss: {hamming:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}\n")

decoded_all = []
for i in range(len(Y_all)):
    true_diseases = mlb.inverse_transform(np.array(Y_all[i]).reshape(1, -1))[0]
    pred_diseases = mlb.inverse_transform(np.array(Y_pred_all[i]).reshape(1, -1))[0]
    decoded_all.append({
        "Crop Stage": label_encoders['Crop Stage'].inverse_transform([X_all.iloc[i, 0]])[0],
        "True Diseases": list(true_diseases),
        "Predicted Diseases": list(pred_diseases)
    })

pd.DataFrame(decoded_all)



Subset accuracy: 1.0000
Hamming loss: 0.0000
Macro F1: 1.0000
Micro F1: 1.0000



Unnamed: 0,Crop Stage,True Diseases,Predicted Diseases
0,0,"[Leaf Blight, Stripe Rust]","[Leaf Blight, Stripe Rust]"
1,1,"[Damping-off (Pythium), Seedling Blight (Fusar...","[Damping-off (Pythium), Seedling Blight (Fusar..."
2,2,"[Root Rot (Fusarium), Soilborne Pathogens]","[Root Rot (Fusarium), Soilborne Pathogens]"
3,3,"[Damping-off (Pythium), Root Rot (Fusarium), S...","[Damping-off (Pythium), Root Rot (Fusarium), S..."
4,4,"[Leaf Rust, Septoria Leaf Blotch]","[Leaf Rust, Septoria Leaf Blotch]"
5,5,[Storage Fungi (Aspergillus)],[Storage Fungi (Aspergillus)]
6,6,"[Fusarium Head Blight, Leaf Rust]","[Fusarium Head Blight, Leaf Rust]"
7,7,[Leaf Rust],[Leaf Rust]
8,8,"[Storage Fungi (Aspergillus), Weevil Infestation]","[Storage Fungi (Aspergillus), Weevil Infestation]"
9,9,"[Common Bunt, Karnal Bunt, Loose Smut]","[Common Bunt, Karnal Bunt, Loose Smut]"


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

stage_disease_map_final = data.groupby('Crop Stage')['Crop Disease'].apply(list).to_dict()


stage_df = pd.DataFrame({
    'Crop Stage': list(stage_disease_map_final.keys()),
    'Diseases': list(stage_disease_map_final.values())
})


le_stage = LabelEncoder()
X_stage = le_stage.fit_transform(stage_df['Crop Stage']).reshape(-1, 1)


mlb_final = MultiLabelBinarizer()
Y_stage = mlb_final.fit_transform(stage_df['Diseases'])


rf_stage = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=300, random_state=42)
)
rf_stage.fit(X_stage, Y_stage)


preds = rf_stage.predict(X_stage)
subset_acc = (preds == Y_stage).all(axis=1).mean()
print(f"Subset Accuracy (Stage→Diseases): {subset_acc:.4f}\n")


def predict_diseases(crop_stage):
    encoded_stage = le_stage.transform([crop_stage])
    pred = rf_stage.predict(np.array(encoded_stage).reshape(-1, 1))
    predicted_labels = mlb_final.inverse_transform(pred)
    return list(predicted_labels[0])

for stage in le_stage.classes_:
    print(f"{stage}: {predict_diseases(stage)}")


Subset Accuracy (Stage→Diseases): 1.0000

0: ['Leaf Blight', 'Stripe Rust']
1: ['Damping-off (Pythium)', 'Seedling Blight (Fusarium)']
2: ['Root Rot (Fusarium)', 'Soilborne Pathogens']
3: ['Damping-off (Pythium)', 'Root Rot (Fusarium)', 'Seedling Blight (Fusarium)']
4: ['Leaf Rust', 'Septoria Leaf Blotch']
5: ['Storage Fungi (Aspergillus)']
6: ['Fusarium Head Blight', 'Leaf Rust']
7: ['Leaf Rust']
8: ['Storage Fungi (Aspergillus)', 'Weevil Infestation']
9: ['Common Bunt', 'Karnal Bunt', 'Loose Smut']
10: ['Black Point', 'Loose Smut', 'Seed Rot']
11: ['Crown Rot', 'Root Rot (Fusarium)']
12: ['Barley Yellow Dwarf Virus (BYDV)', 'Root Rot (Fusarium)', 'Yellow Rust']


In [8]:
print("Sanity Re-Test: Stage → Predicted Diseases\n" + "-"*60)


for stage in le_stage.classes_:
    predicted = predict_diseases(stage)
    print(f"{stage}: {predicted}")

print("\n Sanity test complete — verify predictions match earlier outputs.")


Sanity Re-Test: Stage → Predicted Diseases
------------------------------------------------------------
0: ['Leaf Blight', 'Stripe Rust']
1: ['Damping-off (Pythium)', 'Seedling Blight (Fusarium)']
2: ['Root Rot (Fusarium)', 'Soilborne Pathogens']
3: ['Damping-off (Pythium)', 'Root Rot (Fusarium)', 'Seedling Blight (Fusarium)']
4: ['Leaf Rust', 'Septoria Leaf Blotch']
5: ['Storage Fungi (Aspergillus)']
6: ['Fusarium Head Blight', 'Leaf Rust']
7: ['Leaf Rust']
8: ['Storage Fungi (Aspergillus)', 'Weevil Infestation']
9: ['Common Bunt', 'Karnal Bunt', 'Loose Smut']
10: ['Black Point', 'Loose Smut', 'Seed Rot']
11: ['Crown Rot', 'Root Rot (Fusarium)']
12: ['Barley Yellow Dwarf Virus (BYDV)', 'Root Rot (Fusarium)', 'Yellow Rust']

 Sanity test complete — verify predictions match earlier outputs.


In [10]:

import joblib
import pickle

model_artifacts = {
    'model': rf_stage,
    'mlb': mlb_final,
    'label_encoder': le_stage,
    'feature_names': ['Crop Stage']
}


joblib.dump(model_artifacts, 'wheat_disease_model.pkl')

print("Model and encoders saved successfully!")


multi_model_artifacts = {
    'multi_model': multi_rf,
    'multi_mlb': mlb,
    'label_encoders': label_encoders,
    'feature_names': ['Crop Stage', 'Region', 'Temp (°C)', 'Humidity (%)', 
                     'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']
}

joblib.dump(multi_model_artifacts, 'wheat_disease_multi_model.pkl')
print("Multi-output model saved successfully!")

Model and encoders saved successfully!
Multi-output model saved successfully!
