In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data=pd.read_csv("Data/Maize/Maize_Data.csv")

In [3]:
data.head()

Unnamed: 0,Crop Stage,Region,Crop Disease,Temp (°C),Humidity (%),Soil Moisture (%),Rainfall (mm),Soil pH,Cause,Organic Cure,Inorganic Cure
0,Silking,Gondia,Leaf Blight,26.0,66.7,30.8,141.6,5.43,Fungal infection due to high humidity,Use neem oil spray weekly,Apply Mancozeb fungicide
1,Grain Filling,Nagpur,Downy Mildew,30.6,41.4,36.1,98.4,7.48,Moist soil and cool nights,Ensure proper field drainage,Spray Metalaxyl-based fungicide
2,Vegetative Growth,Buldhana,Downy Mildew,30.9,75.0,30.2,98.9,7.23,Moist soil and cool nights,Ensure proper field drainage,Spray Metalaxyl-based fungicide
3,Vegetative Growth,Latur,Smut,35.1,50.2,21.8,72.1,5.93,Soil-borne fungal pathogen,Remove infected ears early,Apply Carbendazim treatment
4,Vegetative Growth,Jalna,Rust,31.4,60.8,37.0,114.9,5.56,Fungal spores spread by wind,Use resistant hybrid varieties,Apply Propiconazole fungicide


In [4]:
data.dtypes

Crop Stage            object
Region                object
Crop Disease          object
Temp (°C)            float64
Humidity (%)         float64
Soil Moisture (%)    float64
Rainfall (mm)        float64
Soil pH              float64
Cause                 object
Organic Cure          object
Inorganic Cure        object
dtype: object

In [5]:
data.shape

(200, 11)

In [6]:
data['Crop Stage'].unique()

array(['Silking', 'Grain Filling', 'Vegetative Growth',
       'Maturation & Harvest', 'Germination & Emergence',
       'Seedling Stage', 'Tasseling'], dtype=object)

In [7]:
print("\nMissing Values:\n", data.isnull().sum())


Missing Values:
 Crop Stage           0
Region               0
Crop Disease         0
Temp (°C)            0
Humidity (%)         0
Soil Moisture (%)    0
Rainfall (mm)        0
Soil pH              0
Cause                0
Organic Cure         0
Inorganic Cure       0
dtype: int64


In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

for col in ['Crop Stage', 'Region']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [9]:
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier

stage_disease_map = data.groupby('Crop Stage')['Crop Disease'].apply(list).to_dict()

unique_stages = data[['Crop Stage', 'Region', 'Temp (°C)', 'Humidity (%)',
                      'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']].drop_duplicates()

unique_stages['Diseases'] = unique_stages['Crop Stage'].map(stage_disease_map)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(unique_stages['Diseases'])
label_encoders = {}
X_multi = unique_stages[['Crop Stage', 'Region', 'Temp (°C)', 'Humidity (%)',
                         'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']].copy()

for col in ['Crop Stage', 'Region']:
    le = LabelEncoder()
    X_multi[col] = le.fit_transform(X_multi[col])
    label_encoders[col] = le

X_train_m, X_test_m, Y_train_m, Y_test_m = train_test_split(
    X_multi, Y, test_size=0.2, random_state=42
)
multi_rf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced_subsample', n_jobs=-1)
)
multi_rf.fit(X_train_m, Y_train_m)

Y_pred_m = multi_rf.predict(X_test_m)
subset_acc = accuracy_score(Y_test_m, Y_pred_m)
hamming = hamming_loss(Y_test_m, Y_pred_m)
macro_f1 = f1_score(Y_test_m, Y_pred_m, average='macro')
micro_f1 = f1_score(Y_test_m, Y_pred_m, average='micro')
print(f"Subset accuracy: {subset_acc:.4f}")
print(f"Hamming loss: {hamming:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}")
decoded_results = []
for i in range(min(5, len(Y_test_m))):
    true_diseases = mlb.inverse_transform(Y_test_m[i].reshape(1, -1))[0]
    pred_diseases = mlb.inverse_transform(Y_pred_m[i].reshape(1, -1))[0]
    decoded_results.append({
        "Crop Stage": label_encoders['Crop Stage'].inverse_transform([X_test_m.iloc[i, 0]])[0],
        "Region": label_encoders['Region'].inverse_transform([X_test_m.iloc[i, 1]])[0],
        "True Diseases": list(true_diseases),
        "Predicted Diseases": list(pred_diseases)
    })

pd.DataFrame(decoded_results)


Subset accuracy: 0.9750
Hamming loss: 0.0036
Macro F1: 0.9979
Micro F1: 0.9982


Unnamed: 0,Crop Stage,Region,True Diseases,Predicted Diseases
0,4,6,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
1,0,1,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
2,6,0,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
3,3,29,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
4,2,0,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."


In [10]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

stage_disease_map_no_region = data.groupby('Crop Stage')['Crop Disease'].apply(list).to_dict()

stage_env_all = data.groupby('Crop Stage')[['Temp (°C)', 'Humidity (%)', 'Soil Moisture (%)',
                                            'Rainfall (mm)', 'Soil pH']].mean().reset_index()
stage_env_all['Diseases'] = stage_env_all['Crop Stage'].map(stage_disease_map_no_region)
X_all = stage_env_all[['Crop Stage', 'Temp (°C)', 'Humidity (%)',
                       'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']].copy()
X_all['Crop Stage'] = label_encoders['Crop Stage'].transform(X_all['Crop Stage'])


Y_all = mlb.fit_transform(stage_env_all['Diseases'])
multi_rf_all = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced_subsample', n_jobs=-1)
)
multi_rf_all.fit(X_all, Y_all)
Y_pred_all = multi_rf_all.predict(X_all)

subset_acc = accuracy_score(Y_all, Y_pred_all)
hamming = hamming_loss(Y_all, Y_pred_all)
macro_f1 = f1_score(Y_all, Y_pred_all, average='macro')
micro_f1 = f1_score(Y_all, Y_pred_all, average='micro')

print(f"Subset accuracy: {subset_acc:.4f}")
print(f"Hamming loss: {hamming:.4f}")
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}\n")

decoded_all = []
for i in range(len(Y_all)):
    true_diseases = mlb.inverse_transform(np.array(Y_all[i]).reshape(1, -1))[0]
    pred_diseases = mlb.inverse_transform(np.array(Y_pred_all[i]).reshape(1, -1))[0]
    decoded_all.append({
        "Crop Stage": label_encoders['Crop Stage'].inverse_transform([X_all.iloc[i, 0]])[0],
        "True Diseases": list(true_diseases),
        "Predicted Diseases": list(pred_diseases)
    })

pd.DataFrame(decoded_all)


Subset accuracy: 1.0000
Hamming loss: 0.0000
Macro F1: 1.0000
Micro F1: 1.0000



Unnamed: 0,Crop Stage,True Diseases,Predicted Diseases
0,0,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
1,1,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
2,2,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
3,3,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
4,4,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
5,5,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."
6,6,"[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B...","[Downy Mildew, Gray Leaf Spot, Healthy, Leaf B..."


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

stage_disease_map_final = data.groupby('Crop Stage')['Crop Disease'].apply(list).to_dict()


stage_df = pd.DataFrame({
    'Crop Stage': list(stage_disease_map_final.keys()),
    'Diseases': list(stage_disease_map_final.values())
})


le_stage = LabelEncoder()
X_stage = le_stage.fit_transform(stage_df['Crop Stage']).reshape(-1, 1)


mlb_final = MultiLabelBinarizer()
Y_stage = mlb_final.fit_transform(stage_df['Diseases'])


rf_stage = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=300, random_state=42)
)
rf_stage.fit(X_stage, Y_stage)


preds = rf_stage.predict(X_stage)
subset_acc = (preds == Y_stage).all(axis=1).mean()
print(f"Subset Accuracy (Stage→Diseases): {subset_acc:.4f}\n")


def predict_diseases(crop_stage):
    encoded_stage = le_stage.transform([crop_stage])
    pred = rf_stage.predict(np.array(encoded_stage).reshape(-1, 1))
    predicted_labels = mlb_final.inverse_transform(pred)
    return list(predicted_labels[0])

for stage in le_stage.classes_:
    print(f"{stage}: {predict_diseases(stage)}")


Subset Accuracy (Stage→Diseases): 1.0000

0: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Leaf Spot', 'Rust', 'Smut']
1: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Leaf Spot', 'Rust', 'Smut']
2: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Leaf Spot', 'Rust', 'Smut']
3: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Leaf Spot', 'Rust', 'Smut']
4: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Rust', 'Smut']
5: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Leaf Spot', 'Rust', 'Smut']
6: ['Downy Mildew', 'Gray Leaf Spot', 'Healthy', 'Leaf Blight', 'Leaf Spot', 'Rust', 'Smut']


In [12]:

import joblib
import pickle

model_artifacts = {
    'model': rf_stage,
    'mlb': mlb_final,
    'label_encoder': le_stage,
    'feature_names': ['Crop Stage']
}


joblib.dump(model_artifacts, 'maize_disease_model.pkl')

print("Model and encoders saved successfully!")


multi_model_artifacts = {
    'multi_model': multi_rf,
    'multi_mlb': mlb,
    'label_encoders': label_encoders,
    'feature_names': ['Crop Stage', 'Region', 'Temp (°C)', 'Humidity (%)', 
                     'Soil Moisture (%)', 'Rainfall (mm)', 'Soil pH']
}

joblib.dump(multi_model_artifacts, 'maize_disease_multi_model.pkl')
print("Multi-output model saved successfully!")

Model and encoders saved successfully!
Multi-output model saved successfully!
