In [None]:
import json

# Read the full CDDM training conversations (UTF-8 encoded JSON)
with open("Crop_Disease_train_qwenvl.json", mode="r", encoding="utf-8") as f:
    cddm_data = json.load(f)

# Keep only the records whose opening message contains the substring "Wheat"
wheat_data = list(
    filter(
        lambda record: "Wheat" in record["conversations"][0]["value"],
        cddm_data,
    )
)

print(f"Found {len(wheat_data)} wheat-related entries.")

# Write the wheat-only subset to disk
with open("wheat_cddm_data.json", mode="w", encoding="utf-8") as f:
    json.dump(wheat_data, f, indent=2)

In [None]:
# Read the wheat-only subset back from disk
with open("wheat_cddm_data.json", mode="r", encoding="utf-8") as f:
    wheat_data = json.load(f)

# Function to fix the image path
def fix_image_path(entry):
    """Rewrite the image path in an entry's first message to the local layout.

    The first conversation turn is expected to embed the image as
    ``<img>.../<class folder>/<file name></img>``. Spaces are removed from the
    class folder (e.g. "Wheat,Root Rot" -> "Wheat,RootRot") and the path is
    re-rooted under ``../data/CDDM-images/images``.

    Args:
        entry: A record dict whose ``"conversations"`` list has at least one
            item carrying a ``"value"`` string.

    Returns:
        The same ``entry`` object, modified in place. Entries whose first
        message has no complete ``<img>...</img>`` pair, or whose image path
        has no parent folder, are returned unchanged instead of raising
        ``IndexError``.
    """
    first_message = entry["conversations"][0]["value"]

    # Split the message around the first <img>...</img> pair; an empty
    # separator from partition() means the corresponding tag is missing.
    before, open_tag, remainder = first_message.partition("<img>")
    image_path, close_tag, after = remainder.partition("</img>")
    if not open_tag or not close_tag:
        return entry

    # The class folder is the second-to-last component, so a bare file name
    # (no "/") cannot be mapped to a class directory.
    path_parts = image_path.split("/")
    if len(path_parts) < 2:
        return entry

    class_name_no_space = path_parts[-2].replace(" ", "")  # e.g., "Wheat,RootRot"
    image_filename = path_parts[-1]  # e.g., "plant_130966.jpg"

    # Construct the new path to match the local directory structure
    new_image_path = f"../data/CDDM-images/images/{class_name_no_space}/{image_filename}"

    # Rebuild the message around the tag so that only the tagged path is
    # rewritten, not every occurrence of the same text elsewhere in the message.
    entry["conversations"][0]["value"] = f"{before}<img>{new_image_path}</img>{after}"

    return entry

# Run every wheat record through fix_image_path
updated_wheat_data = list(map(fix_image_path, wheat_data))

# Write the path-corrected records to disk
with open("wheat_cddm_data_fixed.json", mode="w", encoding="utf-8") as f:
    json.dump(updated_wheat_data, f, indent=2)

print("Fixed image paths and saved to wheat_cddm_data_fixed.json")

In [None]:
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

def convert_cddm_to_qwen_format(cddm_entry):
    """Convert one CDDM record into a dict with ``image`` and ``conversations``.

    Args:
        cddm_entry: A record dict whose ``"conversations"`` list holds turns
            with ``"from"`` and ``"value"`` keys. The first turn is expected
            to embed the image as ``<img>path</img>``.

    Returns:
        A dict ``{"image": "images/<file name>", "conversations": [...]}``.
        Turns from ``"user"`` are relabelled ``"human"``; every other speaker
        becomes ``"gpt"``. In a turn containing ``<img>``, everything up to
        and including ``</img>`` is replaced by ``<image>``, and any text
        after the closing tag is kept on the following line. When the first
        message has no ``<img>`` tag a warning is printed and ``"image"`` is
        ``"images/"``.
    """
    # Extract image path and filename
    first_message = cddm_entry["conversations"][0]["value"]
    try:
        image_path = first_message.split("<img>")[1].split("</img>")[0]
        image_filename = Path(image_path).name  # e.g., "plant_134859.jpg"
    except IndexError:
        print(f"Warning: Invalid image path in entry: {cddm_entry}")
        image_filename = ""

    # Convert conversations to Qwen-VL format
    conversations = []
    for turn in cddm_entry["conversations"]:
        from_field = "human" if turn["from"] == "user" else "gpt"
        value = turn["value"]
        if "<img>" in value:
            # Replace <img>...</img> with <image>. Text before the tag is
            # discarded; text after the closing tag is preserved.
            _, closing_tag, trailing_text = value.partition("</img>")
            if not closing_tag:
                value = "<image>"
            elif trailing_text.startswith("\n"):
                # Usual layout: the tag is followed by a newline and the prompt.
                value = "<image>\n" + trailing_text[1:]
            elif trailing_text.strip():
                # The prompt follows the tag on the same line (e.g. after a
                # space); keep it rather than dropping the question.
                value = "<image>\n" + trailing_text.lstrip()
            else:
                value = "<image>"
        conversations.append({
            "from": from_field,
            "value": value
        })

    return {
        "image": f"images/{image_filename}",  # Match Qwen-VL format
        "conversations": conversations
    }

# Read the path-corrected wheat records
with open("wheat_cddm_data_fixed.json", mode="r", encoding="utf-8") as f:
    wheat_data_fixed = json.load(f)

# Run every record through the converter
qwen_format_data = list(map(convert_cddm_to_qwen_format, wheat_data_fixed))

# 80% train, then the remaining 20% halved into validation and test
train_data, temp_data = train_test_split(qwen_format_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Write one JSON file per split
splits = {"train": train_data, "val": val_data, "test": test_data}
for name, data in splits.items():
    with open(f"wheat_dataset_{name}.json", mode="w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

print("Converted datasets saved: wheat_dataset_train.json, wheat_dataset_val.json, wheat_dataset_test.json")

In [None]:
import json
import os

# Current base directory where images are located
current_base_dir = "../data/CDDM-images/images"

# Walk the image tree once and remember, for every file name, the first path
# at which it was seen. Looking names up in this index replaces a full
# os.walk of the tree for every dataset item.
image_index = {}
for root, _, files in os.walk(current_base_dir):
    for filename in files:
        image_index.setdefault(filename, os.path.join(root, filename))

# Process train, val, and test sets
for split in ["train", "val", "test"]:
    json_path = f"wheat_dataset_{split}.json"
    # The split files are written with encoding="utf-8"; read them the same way.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    missing = 0
    found = 0
    for item in data:
        image_filename = item["image"].split("/")[-1]  # e.g., "plant_136080.jpg"
        # Look the file name up in the pre-built index
        image_path = image_index.get(image_filename)
        if image_path:
            found += 1
            print(f"Found image: {image_filename} at {image_path}")
        else:
            missing += 1
            print(f"Missing image: {image_filename}")

    print(f"{split} set: Found {found} images, Missing {missing} images")

In [3]:
import os
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

def convert_cddm_to_qwen_format(cddm_entry, base_image_dir, removed_counter):
    """Convert one CDDM record, dropping it when its image file is not on disk.

    Args:
        cddm_entry: A record dict whose ``"conversations"`` list holds turns
            with ``"from"`` and ``"value"`` keys. The first turn is expected
            to embed the image as ``<img>path</img>``.
        base_image_dir: Directory whose sub-directories are searched for the
            image file name.
        removed_counter: Dict with a ``"count"`` key; incremented by one each
            time an entry is rejected.

    Returns:
        ``{"image": "images/<class dir>/<file name>", "conversations": [...]}``
        or ``None`` when the first message has no ``<img>`` tag or no
        sub-directory of ``base_image_dir`` contains the file. Turns from
        ``"user"`` are relabelled ``"human"``; every other speaker becomes
        ``"gpt"``. In a turn containing ``<img>``, everything up to and
        including ``</img>`` is replaced by ``<image>``, and any text after
        the closing tag is kept on the following line.
    """
    # Extract image path and filename
    first_message = cddm_entry["conversations"][0]["value"]
    try:
        image_path = first_message.split("<img>")[1].split("</img>")[0]
    except IndexError:
        print(f"Warning: Invalid image path in entry: {cddm_entry}")
        removed_counter["count"] += 1  # Increment counter
        return None
    image_filename = Path(image_path).name  # e.g., "plant_134859.jpg"

    # Determine the class (category) of the image. Directories are visited in
    # sorted order so the result is reproducible when a file name exists in
    # more than one class. isfile() is used instead of exists() so that an
    # empty file name cannot match the class directory itself.
    class_name = None
    for class_dir in sorted(os.listdir(base_image_dir)):
        if os.path.isfile(os.path.join(base_image_dir, class_dir, image_filename)):
            class_name = class_dir
            break

    if not class_name:
        print(f"Skipping entry: Image not found for {image_filename}")
        removed_counter["count"] += 1  # Increment counter
        return None

    # Convert conversations to Qwen-VL format
    conversations = []
    for turn in cddm_entry["conversations"]:
        from_field = "human" if turn["from"] == "user" else "gpt"
        value = turn["value"]
        if "<img>" in value:
            # Replace <img>...</img> with <image>. Text before the tag is
            # discarded; text after the closing tag is preserved.
            _, closing_tag, trailing_text = value.partition("</img>")
            if not closing_tag:
                value = "<image>"
            elif trailing_text.startswith("\n"):
                # Usual layout: the tag is followed by a newline and the prompt.
                value = "<image>\n" + trailing_text[1:]
            elif trailing_text.strip():
                # The prompt follows the tag on the same line; keep it rather
                # than dropping the question.
                value = "<image>\n" + trailing_text.lstrip()
            else:
                value = "<image>"
        conversations.append({
            "from": from_field,
            "value": value
        })

    return {
        "image": f"images/{class_name}/{image_filename}",  # Match Qwen-VL format with class directory
        "conversations": conversations
    }

# Root folder that holds one sub-folder per class
base_image_dir = "../data/CDDM-images/images"

# Read the path-corrected wheat records
with open("wheat_cddm_data_fixed.json", mode="r", encoding="utf-8") as f:
    wheat_data_fixed = json.load(f)

# Shared tally of entries the converter rejects
removed_counter = {"count": 0}

# Convert each record and keep only those the converter did not reject (None)
qwen_format_data = []
for entry in wheat_data_fixed:
    converted = convert_cddm_to_qwen_format(entry, base_image_dir, removed_counter)
    if converted is not None:
        qwen_format_data.append(converted)

# 80% train, then the remaining 20% halved into validation and test
train_data, temp_data = train_test_split(qwen_format_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Write one JSON file per split
splits = {"train": train_data, "val": val_data, "test": test_data}
for name, data in splits.items():
    with open(f"wheat_dataset_{name}.json", mode="w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

# Report how many entries were dropped, then confirm the output files
print(f"Total entries removed: {removed_counter['count']}")
print("Converted datasets saved: wheat_dataset_train.json, wheat_dataset_val.json, wheat_dataset_test.json")

Skipping entry: Image not found for plant_130475.jpg
Skipping entry: Image not found for plant_131326.jpg
Skipping entry: Image not found for plant_136018.jpg
Skipping entry: Image not found for plant_134673.jpg
Skipping entry: Image not found for plant_135927.jpg
Skipping entry: Image not found for plant_136008.jpg
Skipping entry: Image not found for plant_132031.jpg
Skipping entry: Image not found for plant_136211.jpg
Skipping entry: Image not found for plant_135364.jpg
Skipping entry: Image not found for plant_135050.jpg
Skipping entry: Image not found for plant_135169.jpg
Skipping entry: Image not found for plant_134435.jpg
Skipping entry: Image not found for plant_130285.jpg
Skipping entry: Image not found for plant_134527.jpg
Skipping entry: Image not found for plant_130575.jpg
Skipping entry: Image not found for plant_132455.jpg
Skipping entry: Image not found for plant_135856.jpg
Skipping entry: Image not found for plant_135550.jpg
Skipping entry: Image not found for plant_1355

In [None]:
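# # NOTE: superseded by the next cell, which defines the same function.
# # This version fails with "TypeError: unsupported operand type(s) for /:
# # 'str' and 'str'" because base_image_dir is a plain string; the next cell
# # wraps it in Path() before joining.
# # List of JSON files to verify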
# json_files = ["wheat_dataset_train.json", "wheat_dataset_val.json", "wheat_dataset_test.json"]

# def verify_json_and_images(json_file, base_image_dir):
#     print(f"\nVerifying {json_file}...")
    
#     # Load the JSON file
#     try:
#         with open(json_file, "r", encoding="utf-8") as f:
#             data = json.load(f)
#     except Exception as e:
#         print(f"Error loading {json_file}: {e}")
#         return False

#     print(f"Total entries in {json_file}: {len(data)}")
    
#     # Counter for invalid entries
#     invalid_entries = 0
#     sample_entries = []

#     for idx, entry in enumerate(data):
#         # Check if the entry has the required fields
#         if "image" not in entry or "conversations" not in entry:
#             print(f"Entry {idx} is missing 'image' or 'conversations' fields.")
#             invalid_entries += 1
#             continue

#         # Verify the image path
#         image_path = entry["image"]  # e.g., "images/WHEAT_Healthy/plant_134859.jpg"
#         try:
#             relative_image_path = image_path.split("images/")[1]  # e.g., "WHEAT_Healthy/plant_134859.jpg"
#             full_image_path = base_image_dir / relative_image_path  # Construct full path using Path
#         except IndexError:
#             print(f"Entry {idx}: Invalid image path format: {image_path}")
#             invalid_entries += 1
#             continue

#         if not full_image_path.exists():
#             print(f"Entry {idx}: Image not found at {full_image_path}")
#             invalid_entries += 1
#             continue

#         # Verify the conversation format
#         conversations = entry["conversations"]
#         if not conversations or not isinstance(conversations, list):
#             print(f"Entry {idx}: Invalid 'conversations' format.")
#             invalid_entries += 1
#             continue

#         for conv in conversations:
#             if "from" not in conv or "value" not in conv:
#                 print(f"Entry {idx}: Conversation missing 'from' or 'value'.")
#                 invalid_entries += 1
#                 break

#         # Collect a few sample entries for inspection
#         if idx < 3:  # Collect first 3 entries as samples
#             sample_entries.append(entry)

#     print(f"Total invalid entries in {json_file}: {invalid_entries}")
#     print(f"Sample entries from {json_file}:")
#     for idx, sample in enumerate(sample_entries):
#         print(f"Sample {idx + 1}: {sample}\n")

#     return invalid_entries == 0

# # Verify all JSON files
# all_valid = True
# for json_file in json_files:
#     if not verify_json_and_images(json_file, base_image_dir):
#         all_valid = False

# if all_valid:
#     print("All JSON files and images are valid. Ready to proceed with fine-tuning!")
# else:
#     print("There are issues with the JSON files or images. Please fix them before proceeding.")


Verifying wheat_dataset_train.json...
Total entries in wheat_dataset_train.json: 4835


TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [12]:
from pathlib import Path
import json

# List of JSON files to verify
json_files = [f"wheat_dataset_{split_name}.json" for split_name in ("train", "val", "test")]

def verify_json_and_images(json_file, base_image_dir):
    """Check that a dataset JSON file is well-formed and its images exist.

    Each entry must have ``"image"`` and ``"conversations"`` keys, its image
    path must contain ``images/`` and resolve to an existing path under
    ``base_image_dir``, and every conversation turn must be a dict with
    ``"from"`` and ``"value"`` keys. Problems are printed as they are found,
    followed by the invalid-entry count and up to three sample entries.

    Args:
        json_file: Path of the JSON file to load.
        base_image_dir: Directory (str or Path) that the part of each image
            path after the first ``images/`` is resolved against.

    Returns:
        ``True`` when the file loads and no entry is invalid, ``False``
        otherwise (including when the file cannot be loaded).
    """
    print(f"\nVerifying {json_file}...")

    # Ensure base_image_dir is a Path object so "/" joins work for str input
    base_image_dir = Path(base_image_dir)

    # Load the JSON file
    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error loading {json_file}: {e}")
        return False

    print(f"Total entries in {json_file}: {len(data)}")

    # Counter for invalid entries
    invalid_entries = 0
    sample_entries = []

    for idx, entry in enumerate(data):
        # Check if the entry has the required fields
        if "image" not in entry or "conversations" not in entry:
            print(f"Entry {idx} is missing 'image' or 'conversations' fields.")
            invalid_entries += 1
            continue

        # Verify the image path
        image_path = entry["image"]  # e.g., "images/WHEAT_Healthy/plant_134859.jpg"
        try:
            # Split on the FIRST "images/" only: a class folder whose own name
            # ends in "images" (e.g. "leaf_images/") would otherwise cut the
            # relative path short and report a valid image as missing.
            relative_image_path = image_path.split("images/", 1)[1]  # e.g., "WHEAT_Healthy/plant_134859.jpg"
            full_image_path = base_image_dir / relative_image_path  # Construct full path using Path
        except IndexError:
            print(f"Entry {idx}: Invalid image path format: {image_path}")
            invalid_entries += 1
            continue

        if not full_image_path.exists():
            print(f"Entry {idx}: Image not found at {full_image_path}")
            invalid_entries += 1
            continue

        # Verify the conversation format
        conversations = entry["conversations"]
        if not conversations or not isinstance(conversations, list):
            print(f"Entry {idx}: Invalid 'conversations' format.")
            invalid_entries += 1
            continue

        for conv in conversations:
            # A turn that is not a dict cannot carry the two keys; without
            # this guard a string turn would be checked by substring instead.
            if not isinstance(conv, dict) or "from" not in conv or "value" not in conv:
                print(f"Entry {idx}: Conversation missing 'from' or 'value'.")
                invalid_entries += 1
                break

        # Collect a few sample entries for inspection
        if idx < 3:  # Collect first 3 entries as samples
            sample_entries.append(entry)

    print(f"Total invalid entries in {json_file}: {invalid_entries}")
    print(f"Sample entries from {json_file}:")
    for idx, sample in enumerate(sample_entries):
        print(f"Sample {idx + 1}: {sample}\n")

    return invalid_entries == 0

# Check every split file; a single failing file marks the whole run invalid
all_valid = True

for json_file in json_files:
    file_ok = verify_json_and_images(json_file, base_image_dir)
    all_valid = all_valid and file_ok

if not all_valid:
    print("There are issues with the JSON files or images. Please fix them before proceeding.")
else:
    print("All JSON files and images are valid. Ready to proceed with fine-tuning!")


Verifying wheat_dataset_train.json...
Total entries in wheat_dataset_train.json: 4835
Total invalid entries in wheat_dataset_train.json: 0
Sample entries from wheat_dataset_train.json:
Sample 1: {'image': 'images/Wheat,LeafRust/plant_133831.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDescribe the content of this picture.'}, {'from': 'gpt', 'value': 'This image shows a wheat leaf exhibiting symptoms of Leaf Rust, characterized by small, round, reddish-orange pustules on the leaf surface.'}, {'from': 'human', 'value': 'Is this tomato leaf healthy? '}, {'from': 'gpt', 'value': "This is not a tomato leaf; it's a wheat leaf afflicted with Leaf Rust."}, {'from': 'human', 'value': 'Is this a wheat leaf or a corn leaf? '}, {'from': 'gpt', 'value': 'This is a wheat leaf.'}, {'from': 'human', 'value': 'Is this crop diseased? '}, {'from': 'gpt', 'value': 'Yes, this wheat leaf is afflicted with Leaf Rust.'}, {'from': 'human', 'value': "What plant's leaf is this? "}, {'from': 'gpt

### Inspect the Reformatted JSON

In [2]:
import json

# Inspect the reformatted JSON. The path is held in one variable so the
# printed label always names the file that was actually read.
train_json_path = "wheat_dataset_train.json"
with open(train_json_path, "r", encoding="utf-8") as f:
    reformatted_data = json.load(f)

# Print the first entry
print(f"First entry in {train_json_path}:")
print(json.dumps(reformatted_data[0], indent=2))

First entry in wheat_dataset_train_qwen_format.json:
{
  "image": "images\\Wheat,RootRot\\plant_130334.jpg",
  "conversations": [
    {
      "from": "user",
      "value": "<image> Classify this wheat image."
    },
    {
      "from": "assistant",
      "value": "The wheat image belongs to class: Wheat,RootRot"
    }
  ]
}
