In [1]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [2]:
# ✅ Step 1: Install dependencies
!pip install -q transformers accelerate torch torchvision sentencepiece bitsandbytes

# ✅ Step 2: Import modules
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import re
from google.colab import files
from io import BytesIO
from transformers import BitsAndBytesConfig

# ✅ Step 3: Upload image
uploaded = files.upload()
image_path = list(uploaded.keys())[0]
image = Image.open(image_path).convert("RGB")

# ✅ Step 4: Load LLaVA model
# Using a smaller model and quantization to avoid OutOfMemoryError
model_id = "llava-hf/llava-1.5-7b-hf"

# Configure quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

# ✅ Step 5: Define prompt and inference
prompt = "How many pests are visible in this image? Please provide the number as a single digit or number."

# Format the prompt with the image token
chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ message['content'] | join }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|endoftext|>' }}{% endif %}{% endfor %}"

messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
]

text_input = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

inputs = processor(text=text_input, images=image, return_tensors="pt").to(model.device)
generate_ids = model.generate(**inputs, max_new_tokens=256)
output_text = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]

# ✅ Step 6: Extract count
match = re.search(r'\b(\d{1,3})\b', output_text)
count = int(match.group(1)) if match else None
leaf_area_cm2 = 20
density = count / leaf_area_cm2 if count else "N/A"

# ✅ Step 7: Final Output
print("\n📷 Pest Image Analysis (LLaVA 1.5)")
print(f"🧾 Model Output: {output_text}")
if count:
    print(f"✅ Pests Detected: Yes ({count})")
    print(f"🌿 Estimated Density: {density:.2f} pests/cm²")
else:
    print("⚠️ Pest count not clearly extracted.")

Saving P (1).jpg to P (1).jpg


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]


📷 Pest Image Analysis (LLaVA 1.5)
🧾 Model Output: USER:  
How many pests are visible in this image? Please provide the number as a single digit or number. ASSISTANT: 10
✅ Pests Detected: Yes (10)
🌿 Estimated Density: 0.50 pests/cm²


In [1]:
# ✅ Step 1: Install dependencies
!pip install -q transformers accelerate torch torchvision sentencepiece bitsandbytes

# ✅ Step 2: Import modules
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from PIL import Image, ImageDraw, ImageFont
import torch
import re
from google.colab import files
import pandas as pd
import os

# ✅ Step 3: Upload multiple images
uploaded = files.upload()
image_paths = list(uploaded.keys())

# ✅ Step 4: Load quantized LLaVA model
model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

# ✅ Step 5: Prompt Template
prompt = (
    "Please examine the image and provide pest details in the format:\n"
    "Aphids: [number], Thrips: [number], Spider mites: [number], Others: [number].\n"
    "Only give numeric answers."
)

# ✅ Step 6: Analyze images
results = []
for image_path in image_paths:
    image = Image.open(image_path).convert("RGB")
    width, height = image.size
    dpi = 300
    pixels_per_cm = dpi / 2.54
    area_cm2 = (width / pixels_per_cm) * (height / pixels_per_cm)

    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=text_input, images=image, return_tensors="pt").to(model.device)
    generate_ids = model.generate(**inputs, max_new_tokens=256)
    output_text = processor.batch_decode(generate_ids, skip_special_tokens=True)[0]

    # ✅ Extract counts for each pest type
    def extract_count(keyword):
        match = re.search(rf"{keyword}:\s*(\d+)", output_text, re.IGNORECASE)
        return int(match.group(1)) if match else 0

    aphids = extract_count("Aphids")
    thrips = extract_count("Thrips")
    mites = extract_count("Spider mites")
    others = extract_count("Others")
    total = aphids + thrips + mites + others
    density = total / area_cm2 if total else None

    # ✅ Annotate image
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = ImageFont.load_default()
    annotation_msg = f"Aphids: {aphids}, Thrips: {thrips}, Mites: {mites}, Others: {others}"
    draw.text((10, 10), annotation_msg, fill="red", font=font)
    annotated.save(f"annotated_{os.path.basename(image_path)}")

    results.append({
        "Image": image_path,
        "Leaf Area (cm²)": round(area_cm2, 2),
        "Aphids": aphids,
        "Thrips": thrips,
        "Spider Mites": mites,
        "Others": others,
        "Total Pests": total,
        "Density (pests/cm²)": f"{density:.2f}" if density else "N/A",
        "Model Output": output_text
    })

    print(f"\n📷 Analyzed: {image_path}")
    print(f"🧾 Model Output:\n{output_text}")
    print(f"✅ Total: {total}, Density: {f'{density:.2f}' if density else 'N/A'}")

# ✅ Step 7: Save results to CSV
df = pd.DataFrame(results)
df.to_csv("structured_pest_results.csv", index=False)
print("\n📊 Structured results saved to structured_pest_results.csv")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saving 1 Oak Aphid Myzocalliss - Burr Oak  2023 1B.jpeg to 1 Oak Aphid Myzocalliss - Burr Oak  2023 1B.jpeg


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]


📷 Analyzed: 1 Oak Aphid Myzocalliss - Burr Oak  2023 1B.jpeg
🧾 Model Output:
USER:  
Please examine the image and provide pest details in the format:
Aphids: [number], Thrips: [number], Spider mites: [number], Others: [number].
Only give numeric answers. ASSISTANT: Aphids: 100, Thrips: 0, Spider mites: 0, Others: 0.
✅ Total: 100, Density: 4.04

📊 Structured results saved to structured_pest_results.csv
