In [1]:
import pandas as pd
import numpy as np

# === 1. Define All 30 Insect Symptom Questions (in the correct order) ===
# Each string becomes one Yes/No feature column of the generated dataset, in
# exactly this order. The leading "Is the pest in the image an armyworm?"
# question is part of the list.
# NOTE: the name `questions` is rebound to the disease questions by the
# disease-dataset cell further down this notebook, so run cells top to bottom.
questions = [
    "Is the pest in the image an armyworm?",
    "Is the armyworm green in color?",
    "Is the armyworm brown in color?",
    "Is the armyworm found on the leaf top?",
    "Is the armyworm found on the underside of the leaf?",
    "Is the armyworm present on the stem?",
    "Is the armyworm feeding on the crop?",
    "Are visible bite marks present on the leaf?",
    "Are there multiple armyworms in the image?",
    "Is any frass (armyworm waste) visible near the pest?",
    "Are eggs visible near the armyworm?",
    "Are larvae of the armyworm visible?",
    "Has the crop been attacked by armyworm in previous seasons?",
    "Was pesticide recently applied to this crop area?",
    "Is the armyworm population increasing?",
    "Is the armyworm active during daylight hours?",
    "Is the armyworm mostly active during night?",
    "Is the leaf portion of the plant affected?",
    "Is the stem portion of the plant affected?",
    "Is the damage restricted to a small part of the crop?",
    "Are nearby plants also showing signs of armyworm infestation?",
    "Is the armyworm moving actively?",
    "Are there signs of curled leaves due to feeding?",
    "Has the armyworm damaged more than one section of the same plant?",
    "Is there visible discoloration of the crop due to pest feeding?",
    "Does the armyworm show striping or lines on its body?",
    "Is the length of the armyworm greater than 20 mm?",
    "Are any dead armyworms seen in the area (possibly due to pesticide)?",
    "Is any chewing sound audible during the inspection?",
    "Has any farmer nearby reported armyworm infestation in the last week?",
]

# Fail fast if the list is edited incorrectly: the output file name advertises
# 30 questions, and each question is used as a dict key / column name, so a
# duplicated entry would silently collapse two features into one column.
assert len(questions) == 30, f"expected 30 insect questions, found {len(questions)}"
assert len(set(questions)) == len(questions), "insect questions must be unique"

# === 2. Generate the Synthetic Data ===
# Builds `num_samples` rows: one Yes/No answer per entry in `questions`, plus an
# 'Insect_Present' label column placed last.
#
# A dedicated, seeded generator makes "Restart Kernel -> Run All" reproduce the
# exact same dataset, and keeps this cell's draws independent of whatever other
# cells have consumed from NumPy's global random state.
INSECT_SEED = 42
insect_rng = np.random.default_rng(INSECT_SEED)

num_samples = 150  # Increased sample size for better training
presence_labels = ['Present', 'Not Present']
answer_options = ['Yes', 'No']
# When an insect is not present, answers are heavily biased towards 'No'.
# The probabilities line up positionally with `answer_options`.
no_symptoms_bias = [0.05, 0.95]  # 5% 'Yes', 95% 'No'

insect_label_column = 'Insect_Present'

insect_presence_data = []
for _ in range(num_samples):
    # The label itself is drawn uniformly from `presence_labels`.
    presence = insect_rng.choice(presence_labels)

    # p=None makes Generator.choice sample uniformly, i.e. fully random answers
    # when the insect is present; otherwise answers are biased towards 'No'.
    answer_probs = None if presence == 'Present' else no_symptoms_bias
    answers = insect_rng.choice(answer_options, size=len(questions), p=answer_probs)

    row = dict(zip(questions, answers))
    row[insect_label_column] = presence
    insect_presence_data.append(row)

# Passing `columns=` fixes the column order (label last) and also yields an
# empty but correctly shaped frame if num_samples is set to 0.
synthetic_insect_presence_df = pd.DataFrame(
    insect_presence_data,
    columns=questions + [insect_label_column],
)


# === 3. Save the Corrected Dataset ===
# Write the frame to CSV (no index column), then echo a short report:
# question count, output path, frame shape and a preview of the first rows.
file_path = 'synthetic_insect_presence_30_questions.csv'
synthetic_insect_presence_df.to_csv(file_path, index=False)

for report_line in (
    f"✅ New dataset with {len(questions)} questions created successfully!",
    f"File saved as: {file_path}",
    f"Dataset shape: {synthetic_insect_presence_df.shape}",
    "\nFirst 5 rows of the new dataset:",
):
    print(report_line)

preview = synthetic_insect_presence_df.head()
print(preview)

✅ New dataset with 30 questions created successfully!
File saved as: synthetic_insect_presence_30_questions.csv
Dataset shape: (150, 31)

First 5 rows of the new dataset:
  Is the pest in the image an armyworm? Is the armyworm green in color?  \
0                                   Yes                              No   
1                                    No                              No   
2                                    No                              No   
3                                    No                              No   
4                                   Yes                             Yes   

  Is the armyworm brown in color? Is the armyworm found on the leaf top?  \
0                             Yes                                     No   
1                              No                                    Yes   
2                              No                                     No   
3                             Yes                                    Yes  

In [2]:
import pandas as pd
import numpy as np

# === 1. All 30 Disease Symptom Questions ===
# Each string becomes one Yes/No feature column of the generated dataset, in
# exactly this order.
# NOTE: this rebinds the name `questions`, which the insect-dataset cell
# earlier in this notebook also assigns, so run cells top to bottom.
questions = [
    "Is there a yellow halo around the spots?",
    "Are the leaf spots circular with concentric rings?",
    "Does the disease begin on the lower leaves?",
    "Are the lesions expanding over time?",
    "Is the center of the spot dry and brown?",
    "Are multiple spots merging to form large blotches?",
    "Does the leaf show signs of early yellowing?",
    "Are stems or fruits also affected?",
    "Are the affected leaves wilting?",
    "Is the infection spreading upward on the plant?",
    "Are concentric rings visible clearly on the leaves?",
    "Is there any rotting seen on fruit?",
    "Are the leaf margins turning brown?",
    "Is the plant under moisture stress?",
    "Is the disease more active during rainy days?",
    "Are nearby tomato plants also showing similar symptoms?",
    "Is there any black moldy growth on the lesion?",
    "Does the disease affect the whole plant?",
    "Is the spot size more than 5mm in diameter?",
    "Are the lesions visible on both sides of the leaf?",
    "Is the infection found only on mature leaves?",
    "Are the leaf veins visible through the lesion?",
    "Is the damage uniform across the field?",
    "Was there previous history of Early Blight in this field?",
    "Is the farmer using resistant tomato varieties?",
    "Was any fungicide recently applied?",
    "Was there poor air circulation in the field?",
    "Was the field irrigated from overhead sprinklers?",
    "Are pruning and sanitation practices followed?",
    "Is there any other crop in the field showing similar spots?"
]

# Fail fast if the list is edited incorrectly: the output file name advertises
# 30 questions, and each question is used as a dict key / column name, so a
# duplicated entry would silently collapse two features into one column.
assert len(questions) == 30, f"expected 30 disease questions, found {len(questions)}"
assert len(set(questions)) == len(questions), "disease questions must be unique"

# === 2. Generate the Synthetic Data ===
# Builds `num_samples` rows: one Yes/No answer per entry in `questions`, plus a
# 'Disease_Present' label column placed last.
#
# A dedicated, seeded generator makes "Restart Kernel -> Run All" reproduce the
# exact same dataset, and keeps this cell's draws independent of whatever other
# cells have consumed from NumPy's global random state.
DISEASE_SEED = 43
disease_rng = np.random.default_rng(DISEASE_SEED)

num_samples = 150  # Number of data rows to generate
presence_labels = ['Present', 'Not Present']
answer_options = ['Yes', 'No']
# When a disease is not present, answers are heavily biased towards 'No'.
# The probabilities line up positionally with `answer_options`.
no_symptoms_bias = [0.05, 0.95]  # 5% 'Yes', 95% 'No'

disease_label_column = 'Disease_Present'

disease_presence_data = []
for _ in range(num_samples):
    # The label itself is drawn uniformly from `presence_labels`.
    presence = disease_rng.choice(presence_labels)

    # p=None makes Generator.choice sample uniformly, i.e. fully random answers
    # when the disease is present; otherwise answers are biased towards 'No'.
    answer_probs = None if presence == 'Present' else no_symptoms_bias
    answers = disease_rng.choice(answer_options, size=len(questions), p=answer_probs)

    row = dict(zip(questions, answers))
    row[disease_label_column] = presence
    disease_presence_data.append(row)

# Passing `columns=` fixes the column order (label last) and also yields an
# empty but correctly shaped frame if num_samples is set to 0.
synthetic_disease_presence_df = pd.DataFrame(
    disease_presence_data,
    columns=questions + [disease_label_column],
)


# === 3. Save the Corrected Dataset ===
# Write the frame to CSV (no index column), then echo a short report:
# question count, output path, frame shape and a preview of the first rows.
file_path = 'synthetic_disease_presence_30_questions.csv'
synthetic_disease_presence_df.to_csv(file_path, index=False)

for report_line in (
    f"✅ New dataset with {len(questions)} questions created successfully!",
    f"File saved as: {file_path}",
    f"Dataset shape: {synthetic_disease_presence_df.shape}",
    "\nFirst 5 rows of the new dataset:",
):
    print(report_line)

preview = synthetic_disease_presence_df.head()
print(preview)

✅ New dataset with 30 questions created successfully!
File saved as: synthetic_disease_presence_30_questions.csv
Dataset shape: (150, 31)

First 5 rows of the new dataset:
  Is there a yellow halo around the spots?  \
0                                       No   
1                                       No   
2                                       No   
3                                       No   
4                                       No   

  Are the leaf spots circular with concentric rings?  \
0                                                 No   
1                                                 No   
2                                                 No   
3                                                 No   
4                                                Yes   

  Does the disease begin on the lower leaves?  \
0                                          No   
1                                          No   
2                                         Yes   
3                 