<a href="https://colab.research.google.com/github/Leorasaharia/agriverse/blob/main/AgriVerse_AllInOne_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# AgriVerse — One-Notebook Trainer (Vision + NER + Q&A)

This notebook trains small baselines for:
- Tomato leaf diseases (ViT; optional MobileNetV3 via `timm`)
- Paddy diseases (ViT)
- Hindi NER on **Naamapadam** (and optional WikiANN)
- Agro Q&A on **AgroQA** (mt5-small)

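**Toggles** are at the top—set each one to `True` or `False`, then run the cells **top → bottom**. Note that section C reinstalls `datasets` and hard-restarts the runtime, which clears every variable defined before it (the toggles, `SEED`, `EVAL_KWARG`); re-run the cells that define them before continuing with the toggle-guarded sections that follow.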

_Built 2025-08-09._


In [None]:
# =======================
# RUN TOGGLES (edit here)
# =======================
# Each flag is meant to switch one training section of this notebook on or off.
# Sections are written as `if RUN_...:` blocks that print "Skipping ..." when
# the flag is False.
RUN_TOMATO_VIT = True       # Section A: tomato leaf ViT
RUN_PADDY_VIT  = True       # Section B: paddy ViT
RUN_NAAMAPADAM_NER = True   # Section C: Hindi NER on Naamapadam
RUN_WIKIANN_NER    = True  # Optional Hindi NER on WikiANN
RUN_AGROQA_QA      = True   # Section D: AgroQA question answering (mt5-small)
RUN_TOMATO_MBV3    = False  # Optional MobileNetV3

In [None]:

# Minimal installs
# NOTE(review): `-U` upgrades every listed package to its latest release and
# nothing is pinned, so results depend on the day the notebook is run.
!pip -q install -U datasets transformers accelerate evaluate timm torchvision sentencepiece seqeval

import os, random, numpy as np, torch, inspect
import transformers
# Record the library versions and GPU availability alongside the results.
print("Transformers:", transformers.__version__)
print("Torch:", torch.__version__, "| CUDA available?", torch.cuda.is_available())

# One seed shared by Python, NumPy and PyTorch (CPU and, when present, all GPUs).
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

# Version-safe keyword for eval strategy
# The name is chosen by inspecting the installed TrainingArguments signature:
# "eval_strategy" when that parameter exists, otherwise "evaluation_strategy".
# Later cells pass it as **{EVAL_KWARG: "epoch"}.
from transformers import TrainingArguments
EVAL_KWARG = "eval_strategy" if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters else "evaluation_strategy"
print("Using TrainingArguments kwarg:", EVAL_KWARG)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/494.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/494.8 kB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hTransformers: 4.55.0
Torch: 2.8.0+cu128 | CUDA available? True
Using TrainingArguments kwarg: eval_strategy


## A) Tomato — ViT (`wellCh4n/tomato-leaf-disease-image`)

In [None]:

if RUN_TOMATO_VIT:
    # Section A: fine-tune a ViT classifier on a subset of the tomato-leaf dataset.
    from datasets import load_dataset, Image as HFImage
    from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
    import numpy as np, evaluate, torch

    ds = load_dataset("wellCh4n/tomato-leaf-disease-image")
    IMG_COL, LAB_COL = "image", "label"
    # Make sure the image column is decoded to PIL images on access.
    if not isinstance(ds["train"].features[IMG_COL], HFImage):
        ds = ds.cast_column(IMG_COL, HFImage())

    labels = ds["train"].features[LAB_COL].names
    id2label = {i:l for i,l in enumerate(labels)}
    label2id = {l:i for i,l in enumerate(labels)}

    ckpt = "google/vit-base-patch16-224-in21k"
    processor = AutoImageProcessor.from_pretrained(ckpt, use_fast=True)

    def transform(batch):
        """Turn a batch of PIL images into model inputs and attach the labels.

        Applied lazily through `with_transform`, so images are processed only
        when a batch is actually fetched.
        """
        # Force 3-channel input so grayscale / RGBA / palette files cannot
        # break the processor or the model.
        images = [img.convert("RGB") for img in batch[IMG_COL]]
        out = processor(images=images, return_tensors="pt")
        out["labels"] = batch[LAB_COL]
        return out

    # Choose the evaluation split. When the dataset ships neither "validation"
    # nor "test", carve a held-out 10% out of "train" instead of evaluating on
    # the rows the model is trained on (which would overstate accuracy).
    if "validation" in ds:
        train_split, val_split = ds["train"], ds["validation"]
    elif "test" in ds:
        train_split, val_split = ds["train"], ds["test"]
    else:
        holdout = ds["train"].train_test_split(test_size=0.1, seed=SEED)
        train_split, val_split = holdout["train"], holdout["test"]

    # Train on at most 3000 shuffled examples to keep the baseline quick.
    train_ds = train_split.shuffle(seed=SEED).select(range(min(3000, len(train_split)))).with_transform(transform)
    val_ds   = val_split.with_transform(transform)

    model = AutoModelForImageClassification.from_pretrained(
        ckpt,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    args = TrainingArguments(
        output_dir="/content/tomato_vit",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=5e-5,
        num_train_epochs=2,
        save_strategy="no",
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
        # Required: `transform` needs the raw image column, which the Trainer
        # would otherwise drop because the model's forward() does not take it.
        remove_unused_columns=False,
        dataloader_pin_memory=torch.cuda.is_available(),
        **{EVAL_KWARG: "epoch"}
    )

    acc = evaluate.load("accuracy")
    def compute_metrics(eval_pred):
        """Compute top-1 accuracy from the (logits, labels) pair given by the Trainer."""
        logits, labels_np = eval_pred
        preds = np.argmax(logits, axis=-1)
        return acc.compute(predictions=preds, references=labels_np)

    trainer = Trainer(model=model, args=args,
                      train_dataset=train_ds, eval_dataset=val_ds,
                      compute_metrics=compute_metrics)
    trainer.train()
    # Inside an `if` block the notebook does not echo the last expression, so
    # print the final metrics explicitly.
    final_metrics = trainer.evaluate()
    print("Final eval metrics:", final_metrics)
    model.save_pretrained("/content/tomato_vit/model")
    processor.save_pretrained("/content/tomato_vit/processor")
    print("Saved ViT tomato model to /content/tomato_vit")
else:
    print("Skipping Tomato ViT")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/56.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14218 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3569 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

{'loss': 1.6243, 'grad_norm': 1.7924984693527222, 'learning_rate': 4.348404255319149e-05, 'epoch': 0.26595744680851063}
{'loss': 0.9265, 'grad_norm': 1.5625306367874146, 'learning_rate': 3.683510638297872e-05, 'epoch': 0.5319148936170213}
{'loss': 0.5916, 'grad_norm': 1.9606589078903198, 'learning_rate': 3.0186170212765956e-05, 'epoch': 0.7978723404255319}
{'eval_loss': 0.38592058420181274, 'eval_accuracy': 0.9784253292238723, 'eval_runtime': 22.9439, 'eval_samples_per_second': 155.553, 'eval_steps_per_second': 4.881, 'epoch': 1.0}
{'loss': 0.4168, 'grad_norm': 0.8876131772994995, 'learning_rate': 2.3537234042553192e-05, 'epoch': 1.0638297872340425}
{'loss': 0.2899, 'grad_norm': 0.8225058913230896, 'learning_rate': 1.6888297872340426e-05, 'epoch': 1.3297872340425532}
{'loss': 0.2524, 'grad_norm': 2.6041345596313477, 'learning_rate': 1.023936170212766e-05, 'epoch': 1.5957446808510638}
{'loss': 0.2226, 'grad_norm': 0.6838308572769165, 'learning_rate': 3.590425531914894e-06, 'epoch': 1.86

### (Optional) Tomato — MobileNetV3 (timm)

In [None]:

if RUN_TOMATO_MBV3:
    # Optional section: fine-tune a timm MobileNetV3-small on the tomato dataset
    # with a plain PyTorch training loop.
    import os
    import torch, torchvision
    from torchvision import transforms
    from torch.utils.data import DataLoader
    from datasets import load_dataset
    import timm

    ds = load_dataset("wellCh4n/tomato-leaf-disease-image")
    num_classes = ds["train"].features["label"].num_classes

    # NOTE(review): no mean/std normalisation is applied although the backbone is
    # loaded with pretrained weights; consider adding the normalisation the
    # checkpoint was trained with.
    tfms_train = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    tfms_val = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()])

    def to_torch(ex, train=True):
        """Convert one example dict ({"image": PIL image, "label": int}) to tensors."""
        x = ex["image"].convert("RGB")
        x = tfms_train(x) if train else tfms_val(x)
        return {"pixel_values": x, "labels": ex["label"]}

    def batch_to_torch(batch, train):
        """Convert a column-oriented batch (dict of lists) into stacked tensors.

        `with_transform` hands the callback a dict mapping column name -> list
        of values. Iterating over that dict directly yields the column *names*,
        so the images must be taken from batch["image"] explicitly.
        """
        pixels = [
            to_torch({"image": img, "label": lab}, train)["pixel_values"]
            for img, lab in zip(batch["image"], batch["label"])
        ]
        return {"pixel_values": torch.stack(pixels), "labels": torch.tensor(batch["label"])}

    # Train on at most 3000 shuffled examples; transforms are applied lazily.
    train_torch = (
        ds["train"].shuffle(seed=SEED).select(range(min(3000, len(ds["train"]))))
        .with_transform(lambda b: batch_to_torch(b, True))
    )
    val_split = "validation" if "validation" in ds else "test"
    val_torch = ds[val_split].with_transform(lambda b: batch_to_torch(b, False))

    train_loader = DataLoader(train_torch, batch_size=32, shuffle=True)
    val_loader   = DataLoader(val_torch, batch_size=64, shuffle=False)

    model = timm.create_model("mobilenetv3_small_100", pretrained=True, num_classes=num_classes)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=5e-4)
    loss_fn = torch.nn.CrossEntropyLoss()

    def epoch(dl, train=True):
        """Run one pass over `dl`; return (mean loss, accuracy).

        With train=True the model is updated after every batch; with
        train=False the model is put in eval mode and gradients are disabled.
        """
        model.train() if train else model.eval()
        total=0; correct=0; loss_sum=0.0
        for batch in dl:
            x = batch["pixel_values"].to(device); y = batch["labels"].to(device)
            with torch.set_grad_enabled(train):
                logits = model(x); loss = loss_fn(logits, y)
                if train: opt.zero_grad(); loss.backward(); opt.step()
            preds = logits.argmax(1)
            # Weight the batch loss by batch size so the mean is per-example.
            total += y.size(0); correct += (preds==y).sum().item(); loss_sum += loss.item()*y.size(0)
        return loss_sum/total, correct/total

    for ep in range(2):
        tr_loss, tr_acc = epoch(train_loader, True)
        va_loss, va_acc = epoch(val_loader, False)
        print(f"[EP{ep+1}] train acc={tr_acc:.3f} val acc={va_acc:.3f}")

    os.makedirs("/content/tomato_mbv3", exist_ok=True)
    torch.save(model.state_dict(), "/content/tomato_mbv3/mobilenetv3_small_100.pth")
    print("Saved MobileNetV3 weights to /content/tomato_mbv3")
else:
    print("Skipping Tomato MobileNetV3")


Skipping Tomato MobileNetV3


## B) Paddy — ViT (`anthony2261/paddy-disease-classification`)

In [None]:

if RUN_PADDY_VIT:
    # Section B: fine-tune a ViT classifier on a subset of the paddy-disease dataset.
    from datasets import load_dataset, Image as HFImage
    from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
    import numpy as np, evaluate, torch

    ds = load_dataset("anthony2261/paddy-disease-classification")
    IMG_COL, LAB_COL = "image", "label"
    # Make sure the image column is decoded to PIL images on access.
    if not isinstance(ds["train"].features[IMG_COL], HFImage):
        ds = ds.cast_column(IMG_COL, HFImage())

    labels = ds["train"].features[LAB_COL].names
    id2label = {i:l for i,l in enumerate(labels)}
    label2id = {l:i for i,l in enumerate(labels)}

    ckpt = "google/vit-base-patch16-224-in21k"
    processor = AutoImageProcessor.from_pretrained(ckpt, use_fast=True)

    def transform(batch):
        """Turn a batch of PIL images into model inputs and attach the labels.

        Applied lazily through `with_transform`, so images are processed only
        when a batch is actually fetched.
        """
        # Force 3-channel input so grayscale / RGBA / palette files cannot
        # break the processor or the model.
        images = [img.convert("RGB") for img in batch[IMG_COL]]
        out = processor(images=images, return_tensors="pt")
        out["labels"] = batch[LAB_COL]
        return out

    # Choose the evaluation split. When the dataset ships neither "validation"
    # nor "test", carve a held-out 10% out of "train" instead of evaluating on
    # the rows the model is trained on (which would overstate accuracy).
    if "validation" in ds:
        train_split, val_split = ds["train"], ds["validation"]
    elif "test" in ds:
        train_split, val_split = ds["train"], ds["test"]
    else:
        holdout = ds["train"].train_test_split(test_size=0.1, seed=SEED)
        train_split, val_split = holdout["train"], holdout["test"]

    # Train on at most 4000 shuffled examples to keep the baseline quick.
    train_ds = train_split.shuffle(seed=SEED).select(range(min(4000, len(train_split)))).with_transform(transform)
    val_ds   = val_split.with_transform(transform)

    model = AutoModelForImageClassification.from_pretrained(
        ckpt, num_labels=len(labels), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
    )

    args = TrainingArguments(
        output_dir="/content/paddy_vit",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=5e-5,
        num_train_epochs=2,
        save_strategy="no",
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
        # Required: `transform` needs the raw image column, which the Trainer
        # would otherwise drop because the model's forward() does not take it.
        remove_unused_columns=False,
        dataloader_pin_memory=torch.cuda.is_available(),
        **{EVAL_KWARG: "epoch"}
    )

    acc = evaluate.load("accuracy")
    def compute_metrics(eval_pred):
        """Compute top-1 accuracy from the (logits, labels) pair given by the Trainer."""
        logits, labels_np = eval_pred
        preds = np.argmax(logits, axis=-1)
        return acc.compute(predictions=preds, references=labels_np)

    trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds, compute_metrics=compute_metrics)
    trainer.train()
    # Inside an `if` block the notebook does not echo the last expression, so
    # print the final metrics explicitly.
    final_metrics = trainer.evaluate()
    print("Final eval metrics:", final_metrics)

    model.save_pretrained("/content/paddy_vit/model")
    processor.save_pretrained("/content/paddy_vit/processor")
    print("Saved ViT paddy model to /content/paddy_vit")
else:
    print("Skipping Paddy ViT")


NameError: name 'RUN_PADDY_VIT' is not defined

## C) NER — Naamapadam (Hindi)

In [None]:
# Roll back to datasets 2.x (supports loading scripts)
# This replaces whatever `datasets` / `evaluate` versions were installed earlier
# in the session with datasets 2.15 and an evaluate release below 0.5.
!pip -q install "datasets==2.15" "evaluate<0.5"

import datasets, evaluate, os, sys
# NOTE(review): if these modules were already imported in this session, the
# versions printed here are presumably the previously loaded ones, not the ones
# just installed - verify after the restart.
print("datasets:", datasets.__version__, "| evaluate:", evaluate.__version__)

# hard-restart runtime so the older version is actually used
# Sends signal 9 to the notebook's own process. Every variable defined so far
# (run toggles, SEED, EVAL_KWARG, trained models held in memory) is lost, so
# cells after this one must re-create whatever they need.
os.kill(os.getpid(), 9)

In [None]:
# Re-declared here because the runtime restart in the previous cell drops every
# global defined earlier in the notebook.
SEED = 42

# Honour the RUN_NAAMAPADAM_NER toggle. After the hard restart the toggle may no
# longer be defined, in which case the section runs (the toggle's default).
if globals().get("RUN_NAAMAPADAM_NER", True):
    import inspect
    import numpy as np, evaluate, torch
    from datasets import load_dataset
    from transformers import (
        AutoTokenizer, AutoModelForTokenClassification,
        DataCollatorForTokenClassification, TrainingArguments, Trainer
    )

    # Load dataset (v2.x supports dataset scripts)
    ds = load_dataset("ai4bharat/naamapadam", "hi")
    TOK_COL, LAB_COL = "tokens", "ner_tags"

    label_list = ds["train"].features[LAB_COL].feature.names
    id2label = {i:l for i,l in enumerate(label_list)}
    label2id = {l:i for i,l in enumerate(label_list)}

    # For every label id, the id to use on the 2nd+ sub-token of a word:
    # "B-X" becomes "I-X" (when that tag exists); everything else is unchanged.
    # Repeating "B-X" on each sub-token would make seqeval count one entity per
    # sub-token and would teach the model to open a new entity mid-word.
    continuation_label = [
        label2id.get("I-" + name[2:], idx) if name.startswith("B-") else idx
        for idx, name in enumerate(label_list)
    ]

    base = "xlm-roberta-base"
    tok = AutoTokenizer.from_pretrained(base)

    def tokenize_and_align_labels(batch):
        """Tokenize pre-split words and expand word-level tags to sub-tokens.

        Special tokens get -100 (ignored by the loss and by compute_metrics).
        The first sub-token of a word keeps the word's tag; later sub-tokens of
        the same word get the continuation form of that tag.
        """
        tokenized = tok(batch[TOK_COL], truncation=True, is_split_into_words=True)
        new_labels = []
        for i, word_labels in enumerate(batch[LAB_COL]):
            prev = None; ids = []
            for wid in tokenized.word_ids(batch_index=i):
                if wid is None: ids.append(-100)
                elif wid != prev: ids.append(word_labels[wid])
                else: ids.append(continuation_label[word_labels[wid]])
                prev = wid
            new_labels.append(ids)
        tokenized["labels"] = new_labels
        return tokenized

    # Train on at most 6000 shuffled sentences; evaluate on the full dev split.
    train_raw = ds["train"].shuffle(seed=SEED).select(range(min(6000, len(ds["train"]))))
    val_raw   = ds["validation"] if "validation" in ds else ds["test"]

    cols = ds["train"].column_names
    train_ds = train_raw.map(tokenize_and_align_labels, batched=True, remove_columns=cols)
    val_ds   = val_raw.map(tokenize_and_align_labels, batched=True, remove_columns=cols)

    model = AutoModelForTokenClassification.from_pretrained(
        base, num_labels=len(label_list), id2label=id2label, label2id=label2id
    )
    collator = DataCollatorForTokenClassification(tok)

    # transformers v4/v5 eval kwarg
    eval_kw = "eval_strategy" if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters else "evaluation_strategy"
    # Likewise, newer Trainer versions take the tokenizer as `processing_class`;
    # `tokenizer=` is deprecated there and only kept for older releases.
    tok_kw = "processing_class" if "processing_class" in inspect.signature(Trainer.__init__).parameters else "tokenizer"

    args = TrainingArguments(
        output_dir="/content/naamapadam_hi_ner",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=3e-5,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="no",
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
        **{eval_kw: "epoch"}
    )

    seqeval = evaluate.load("seqeval")
    def compute_metrics(eval_pred):
        """Entity-level precision/recall/F1 and token accuracy via seqeval.

        Positions whose gold label is -100 are dropped before scoring.
        """
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        true_preds, true_labels = [], []
        for p, l in zip(preds, labels):
            p_tags, l_tags = [], []
            for pi, li in zip(p, l):
                if li != -100:
                    p_tags.append(label_list[pi]); l_tags.append(label_list[li])
            true_preds.append(p_tags); true_labels.append(l_tags)
        res = seqeval.compute(predictions=true_preds, references=true_labels)
        return {"precision": res.get("overall_precision", 0.0),
                "recall":    res.get("overall_recall", 0.0),
                "f1":        res.get("overall_f1", 0.0),
                "accuracy":  res.get("overall_accuracy", 0.0)}

    trainer = Trainer(model=model, args=args,
                      train_dataset=train_ds, eval_dataset=val_ds,
                      data_collator=collator,
                      compute_metrics=compute_metrics,
                      **{tok_kw: tok})
    trainer.train()
    final_metrics = trainer.evaluate()
    print("Final eval metrics:", final_metrics)

    model.save_pretrained("/content/naamapadam_hi_ner/model")
    tok.save_pretrained("/content/naamapadam_hi_ner/tokenizer")
    print("Saved to /content/naamapadam_hi_ner")
else:
    print("Skipping Naamapadam NER")


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args,


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3536,0.290639,0.720449,0.797826,0.757166,0.912574
2,0.2598,0.276674,0.75217,0.795093,0.773036,0.917734


Saved to /content/naamapadam_hi_ner


### (Optional) NER — WikiANN (Hindi)

In [None]:

if RUN_WIKIANN_NER:
    # Optional section: Hindi NER fine-tuning on WikiANN with XLM-RoBERTa.
    import inspect
    from datasets import load_dataset
    from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
    import numpy as np, evaluate, torch

    ds = load_dataset("unimelb-nlp/wikiann", "hi")
    TOK_COL, LAB_COL = "tokens", "ner_tags"
    label_list = ds["train"].features[LAB_COL].feature.names
    id2label = {i:l for i,l in enumerate(label_list)}
    label2id = {l:i for i,l in enumerate(label_list)}

    # For every label id, the id to use on the 2nd+ sub-token of a word:
    # "B-X" becomes "I-X" (when that tag exists); everything else is unchanged.
    # Repeating "B-X" on each sub-token would make seqeval count one entity per
    # sub-token and would teach the model to open a new entity mid-word.
    continuation_label = [
        label2id.get("I-" + name[2:], idx) if name.startswith("B-") else idx
        for idx, name in enumerate(label_list)
    ]

    base = "xlm-roberta-base"
    tok = AutoTokenizer.from_pretrained(base)

    def tokenize_and_align_labels(batch):
        """Tokenize pre-split words and expand word-level tags to sub-tokens.

        Special tokens get -100 (ignored by the loss and by compute_metrics).
        The first sub-token of a word keeps the word's tag; later sub-tokens of
        the same word get the continuation form of that tag.
        """
        tokenized = tok(batch[TOK_COL], truncation=True, is_split_into_words=True)
        new_labels = []
        for i, word_labels in enumerate(batch[LAB_COL]):
            prev = None; ids = []
            for wid in tokenized.word_ids(batch_index=i):
                if wid is None: ids.append(-100)
                elif wid != prev: ids.append(word_labels[wid])
                else: ids.append(continuation_label[word_labels[wid]])
                prev = wid
            new_labels.append(ids)
        tokenized["labels"] = new_labels
        return tokenized

    # Train on at most 6000 shuffled sentences; evaluate on the full dev split.
    train_raw = ds["train"].shuffle(seed=SEED).select(range(min(6000, len(ds["train"]))))
    val_raw   = ds["validation"] if "validation" in ds else ds["test"]

    cols = ds["train"].column_names
    train_ds = train_raw.map(tokenize_and_align_labels, batched=True, remove_columns=cols)
    val_ds   = val_raw.map(tokenize_and_align_labels, batched=True, remove_columns=cols)

    model = AutoModelForTokenClassification.from_pretrained(base, num_labels=len(label_list), id2label=id2label, label2id=label2id)
    collator = DataCollatorForTokenClassification(tok)

    # Newer Trainer versions take the tokenizer as `processing_class`;
    # `tokenizer=` is deprecated there and only kept for older releases.
    tok_kw = "processing_class" if "processing_class" in inspect.signature(Trainer.__init__).parameters else "tokenizer"

    args = TrainingArguments(
        output_dir="/content/wikiann_hi_ner",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=3e-5,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="no",
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
        **{EVAL_KWARG: "epoch"}
    )

    seqeval = evaluate.load("seqeval")
    def compute_metrics(eval_pred):
        """Entity-level precision/recall/F1 and token accuracy via seqeval.

        Positions whose gold label is -100 are dropped before scoring.
        """
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        true_preds, true_labels = [], []
        for p, l in zip(preds, labels):
            p_tags, l_tags = [], []
            for pi, li in zip(p, l):
                if li != -100:
                    p_tags.append(label_list[pi]); l_tags.append(label_list[li])
            true_preds.append(p_tags); true_labels.append(l_tags)
        res = seqeval.compute(predictions=true_preds, references=true_labels)
        return {"precision": res.get("overall_precision", 0.0),
                "recall": res.get("overall_recall", 0.0),
                "f1": res.get("overall_f1", 0.0),
                "accuracy": res.get("overall_accuracy", 0.0)}

    trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
                      data_collator=collator, compute_metrics=compute_metrics,
                      **{tok_kw: tok})
    trainer.train()
    # Inside an `if` block the notebook does not echo the last expression, so
    # print the final metrics explicitly.
    final_metrics = trainer.evaluate()
    print("Final eval metrics:", final_metrics)

    model.save_pretrained("/content/wikiann_hi_ner/model")
    tok.save_pretrained("/content/wikiann_hi_ner/tokenizer")
    print("Saved WikiANN NER to /content/wikiann_hi_ner")
else:
    print("Skipping WikiANN NER")


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3787,0.339326,0.811703,0.824695,0.818147,0.895762
2,0.223,0.285798,0.848702,0.859375,0.854005,0.915903


Saved WikiANN NER to /content/wikiann_hi_ner


## D) AgroQA — mt5-small (Q&A)

In [None]:
if RUN_AGROQA_QA:
    # Section D: fine-tune mt5-small to generate answers for AgroQA questions.
    import inspect
    from datasets import load_dataset
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
    import numpy as np, torch

    ds = load_dataset("Rahulrayudu/AgroQA")
    # The dataset is used through its "train" split only; hold out 10% for eval.
    ds = ds["train"].train_test_split(test_size=0.1, seed=SEED)

    model_ckpt = "google/mt5-small"
    tok = AutoTokenizer.from_pretrained(model_ckpt)

    src_field = "Question"; tgt_field = "Answer"
    assert src_field in ds["train"].column_names and tgt_field in ds["train"].column_names, ds["train"].column_names

    # The mT5 tokenizer reports no usable maximum length, so `truncation=True`
    # alone does nothing (see the "Default to no truncation" warning). Give an
    # explicit cap; raise it if long questions/answers get cut off.
    MAX_LEN = 256

    def to_text(batch):
        """Tokenize questions as inputs and answers as labels for seq2seq training."""
        src = ["question: " + q for q in batch[src_field]]
        # Ensure tgt is a list of strings
        tgt = [str(a) for a in batch[tgt_field]]
        # `text_target` tokenizes the answers into "labels" in the same call and
        # replaces the deprecated `as_target_tokenizer()` context manager.
        return tok(src, text_target=tgt, max_length=MAX_LEN, truncation=True)

    tok_train = (
        ds["train"].select(range(min(2000, len(ds["train"]))))
        .map(to_text, batched=True, remove_columns=ds["train"].column_names)
    )
    tok_val   = ds["test"].map(to_text, batched=True, remove_columns=ds["test"].column_names)

    qa_model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
    collator = DataCollatorForSeq2Seq(tok, model=qa_model)

    # Do NOT train mT5 in fp16: it produces the degenerate "0.0 training loss /
    # empty validation loss" run. Use bf16 on GPUs with native support
    # (compute capability >= 8) and plain fp32 everywhere else.
    use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

    # Newer Trainer versions take the tokenizer as `processing_class`;
    # `tokenizer=` is deprecated there and only kept for older releases.
    tok_kw = "processing_class" if "processing_class" in inspect.signature(Trainer.__init__).parameters else "tokenizer"

    args = TrainingArguments(
        output_dir="/content/agroqa_mt5",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        learning_rate=3e-4,
        num_train_epochs=2,
        save_strategy="no",
        bf16=use_bf16,
        report_to="none",
        logging_steps=50,
        **{EVAL_KWARG: "epoch"}
    )

    trainer = Trainer(model=qa_model, args=args, data_collator=collator,
                      train_dataset=tok_train, eval_dataset=tok_val,
                      **{tok_kw: tok})
    trainer.train()
    # Inside an `if` block the notebook does not echo the last expression, so
    # print the final metrics explicitly.
    final_metrics = trainer.evaluate()
    print("Final eval metrics:", final_metrics)

    qa_model.save_pretrained("/content/agroqa_mt5/model")
    tok.save_pretrained("/content/agroqa_mt5/tokenizer")
    print("Saved AgroQA model to /content/agroqa_mt5")
else:
    print("Skipping AgroQA")

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(model=qa_model, args=args, tokenizer=tok, data_collator=collator,


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,


Saved AgroQA model to /content/agroqa_mt5


In [None]:
# plotting + metrics
!pip -q install matplotlib scikit-learn

import os, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from datasets import load_dataset, Image as HFImage
from sklearn.metrics import confusion_matrix, classification_report
import evaluate


In [None]:
import os, glob

# Output folders written by the training sections above.
paths = [
    "/content/tomato_vit",
    "/content/paddy_vit",
    "/content/naamapadam_hi_ner",
    "/content/wikiann_hi_ner",
    "/content/agroqa_mt5",
]

# For each folder report whether it exists and, if so, show up to ten of its
# top-level entries (names only).
for p in paths:
    print("\n", p, "exists?", os.path.exists(p))
    if not os.path.exists(p):
        continue
    first_entries = glob.glob(p+"/*")[:10]
    print(" files:", list(map(os.path.basename, first_entries)))



 /content/tomato_vit exists? True
 files: ['processor', 'model']

 /content/paddy_vit exists? True
 files: ['processor', 'model']

 /content/naamapadam_hi_ner exists? True
 files: ['tokenizer', 'model']

 /content/wikiann_hi_ner exists? True
 files: ['tokenizer', 'model']

 /content/agroqa_mt5 exists? True
 files: ['tokenizer', 'model']


In [None]:
from datasets import load_dataset, Image as HFImage
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch

# Smoke test: reload the saved tomato classifier and label one validation image.
proc = AutoImageProcessor.from_pretrained("/content/tomato_vit/processor")
model = AutoModelForImageClassification.from_pretrained("/content/tomato_vit/model").eval()

ds = load_dataset("wellCh4n/tomato-leaf-disease-image")
im = ds["validation"][0]["image"].convert("RGB")        # sample image, forced to 3 channels
inputs = proc(images=im, return_tensors="pt")
# Inference only: skip autograd bookkeeping, as the NER inference cell does.
with torch.no_grad():
    pred = model(**inputs).logits.argmax(-1).item()
print("Tomato prediction:", model.config.id2label[pred])


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/224M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.5M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/14218 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3569 [00:00<?, ? examples/s]

Tomato prediction: A tomato leaf with Late Blight


In [None]:
from datasets import load_dataset
import torch, numpy as np
from transformers import AutoImageProcessor, AutoModelForImageClassification

# make a tiny val split
# NOTE(review): this is only a genuine held-out set if the training cell
# excluded these same rows - confirm before quoting numbers from it.
ds = load_dataset("anthony2261/paddy-disease-classification")
ds = ds["train"].train_test_split(test_size=0.1, seed=42)
val = ds["test"]

proc = AutoImageProcessor.from_pretrained("/content/paddy_vit/processor")
model = AutoModelForImageClassification.from_pretrained("/content/paddy_vit/model").eval()

# one prediction + confidence
im = val[0]["image"].convert("RGB")  # force 3 channels for the processor
inputs = proc(images=im, return_tensors="pt")
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = int(logits.argmax(-1))
# Softmax over the class axis gives the per-class confidence.
probs = torch.softmax(logits, dim=-1).squeeze().tolist()
print("Pred:", model.config.id2label[pred_id], "| conf:", round(probs[pred_id], 3))


Pred: hispa | conf: 0.734


In [None]:
from sklearn.metrics import accuracy_score
import torch

# Accuracy on (at most) the first 200 rows of `val`, using the `model` and
# `proc` loaded in the previous cell.
# NOTE(review): only meaningful as held-out accuracy if these rows were not
# used for training - confirm against the training cell.
subset = val.select(range(min(200, len(val))))
EVAL_BATCH = 32  # images per forward pass

preds = []
with torch.no_grad():  # inference only: no autograd graph needed
    for start in range(0, len(subset), EVAL_BATCH):
        # Slicing a Dataset returns a dict of lists with the images decoded.
        chunk = subset[start:start + EVAL_BATCH]
        images = [im.convert("RGB") for im in chunk["image"]]
        logits = model(**proc(images=images, return_tensors="pt")).logits
        preds.extend(logits.argmax(-1).tolist())
print("acc:", accuracy_score(subset["label"], preds))


acc: 0.97


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Reload the saved Naamapadam NER model and tag one Hindi sentence.
tok   = AutoTokenizer.from_pretrained("/content/naamapadam_hi_ner/tokenizer", use_fast=True)
model = AutoModelForTokenClassification.from_pretrained("/content/naamapadam_hi_ner/model").eval()

text = "राहुल ने पटना में किसान मेले का उद्घाटन किया।"

# get offsets so we can map tokens back to the original text
enc = tok(text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
offsets = enc.pop("offset_mapping")[0].tolist()

with torch.no_grad():
    pred_ids = model(**enc).logits.argmax(-1)[0].tolist()

labels = [model.config.id2label[i] for i in pred_ids]

def _continues_span(span, start, tag, label):
    """Return True when the token at `start` belongs to the open entity `span`.

    A token extends the open span when it has the same entity type and either
    (a) it starts exactly where the span ends, i.e. it is another piece of the
        same word, whatever its B-/I- prefix, or
    (b) it carries an I- tag and only whitespace separates it from the span,
        i.e. it is the next word of a multi-word entity.
    """
    if span is None or span["tag"] != tag:
        return False
    gap = text[span["end"]:start]
    if gap == "":
        return True
    return label.startswith("I") and gap.strip() == ""

# merge B-/I- tags into spans
spans = []
current = None
for (start, end), lab in zip(offsets, labels):
    if start == end:  # special tokens like <s>, </s>
        continue
    if lab == "O":
        if current: spans.append(current); current = None
        continue
    tag = lab.split("-", 1)[-1]  # PER/LOC/ORG...
    if _continues_span(current, start, tag, lab):
        current["end"] = end
    else:
        if current: spans.append(current)
        current = {"start": start, "end": end, "tag": tag}
if current: spans.append(current)

print([(text[s["start"]:s["end"]], s["tag"]) for s in spans])


[('राहुल', 'PER'), ('पटना', 'LOC')]


In [None]:
import os, glob, pathlib

# Sanity-check the saved mT5 checkpoint: does the folder exist, what is in it,
# and how large is the weights file?
p = "/content/agroqa_mt5/model"
print("exists?", os.path.exists(p))

entry_names = list(map(os.path.basename, glob.glob(p+"/*")))
print("files:", entry_names)

# one of these should be big:
for weights_name in ("pytorch_model.bin", "model.safetensors"):
    weights_path = pathlib.Path(p, weights_name)
    if not weights_path.exists():
        continue
    size_mb = round(weights_path.stat().st_size/1e6,1)
    print(weights_name, size_mb, "MB")


exists? True
files: ['config.json', 'generation_config.json', 'model.safetensors']
model.safetensors 1200.7 MB


In [None]:
from datasets import load_dataset

# Show the reference answer of the first training row for a manual comparison.
ds = load_dataset("Rahulrayudu/AgroQA")
first_row = ds["train"][0]
print("GT:", first_row["Answer"])


GT: Machinery weeders are available


In [None]:
import inspect
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import torch # Import torch

# Fallback Q&A model: fine-tune flan-t5-small on AgroQA (90/10 train/eval split).
ds = load_dataset("Rahulrayudu/AgroQA"); ds = ds["train"].train_test_split(test_size=0.1, seed=42)

ckpt = "google/flan-t5-small"
tok = AutoTokenizer.from_pretrained(ckpt)
def prep(b):
    """Tokenize questions as inputs and answers as labels for seq2seq training."""
    # `text_target` tokenizes the answers into "labels" in the same call and
    # replaces the deprecated `as_target_tokenizer()` context manager.
    # Answers are passed through str() so every target is a string.
    return tok(["question: "+q for q in b["Question"]],
               text_target=[str(a) for a in b["Answer"]],
               truncation=True)

train = ds["train"].select(range(min(5000, len(ds["train"])))).map(prep, batched=True, remove_columns=ds["train"].column_names)
val   = ds["test"].map(prep, batched=True, remove_columns=ds["test"].column_names)

model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)
coll  = DataCollatorForSeq2Seq(tok, model=model)

# Pick the keyword names the installed transformers version understands.
eval_kw = "eval_strategy" if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters else "evaluation_strategy"
tok_kw = "processing_class" if "processing_class" in inspect.signature(Trainer.__init__).parameters else "tokenizer"

args  = TrainingArguments("/content/agroqa_flan",
    per_device_train_batch_size=8, per_device_eval_batch_size=8,
    gradient_accumulation_steps=2, learning_rate=3e-4, num_train_epochs=2,
    save_strategy="no",
    # The run is only a few hundred optimizer steps long, so the default
    # logging interval (500 steps) would never log a training loss.
    logging_steps=50,
    report_to="none",
    # Evaluate once per epoch; without an eval strategy `eval_dataset` is unused.
    **{eval_kw: "epoch"})
flan_trainer = Trainer(model=model, args=args, data_collator=coll,
                       train_dataset=train, eval_dataset=val,
                       **{tok_kw: tok})
flan_trainer.train()
model.save_pretrained("/content/agroqa_flan/model"); tok.save_pretrained("/content/agroqa_flan/tokenizer")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/2739 [00:00<?, ? examples/s]



Map:   0%|          | 0/305 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  Trainer(model=model, args=args, tokenizer=tok, data_collator=coll,


Step,Training Loss


('/content/agroqa_flan/tokenizer/tokenizer_config.json',
 '/content/agroqa_flan/tokenizer/special_tokens_map.json',
 '/content/agroqa_flan/tokenizer/spiece.model',
 '/content/agroqa_flan/tokenizer/added_tokens.json',
 '/content/agroqa_flan/tokenizer/tokenizer.json')

In [None]:
# Reload the fine-tuned FLAN-T5 checkpoint from disk for a quick manual check.
tok = AutoTokenizer.from_pretrained("/content/agroqa_flan/tokenizer")
model = AutoModelForSeq2SeqLM.from_pretrained("/content/agroqa_flan/model").eval()

def ask_flan(q):
    """Generate an answer to question `q` with beam search and print it.

    The question is prefixed with "question: " (the same prefix used when the
    training data was tokenized). Nothing is returned; the decoded text is
    printed.
    """
    encoded = tok("question: "+q, return_tensors="pt")
    generated = model.generate(**encoded, max_new_tokens=80, num_beams=4)
    answer = tok.decode(generated[0], skip_special_tokens=True)
    print(answer)

ask_flan("When should I irrigate wheat during winter?")


During the winter, it is best to irrigate wheat during the winter.


In [None]:
# ==== ONE-CELL GRADIO DEMO WITH PATH FIX ====
!pip -q install gradio pillow

import gradio as gr, torch, os, glob
from pathlib import Path
from PIL import Image
from transformers import (
    AutoImageProcessor, AutoModelForImageClassification,
    AutoTokenizer, AutoModelForTokenClassification,
    AutoModelForSeq2SeqLM, logging
)
# `logging` here is transformers.logging (imported above), not the stdlib module:
# only error-level messages from transformers are shown while the demo loads.
logging.set_verbosity_error()

# ---- Model root paths (edit if you saved elsewhere) ----
# Each root may hold split subfolders (model/ plus processor/ or tokenizer/)
# or the files directly; the loading code below handles both layouts.
TOM = "/content/tomato_vit"           # tomato leaf image classifier
PAD = "/content/paddy_vit"            # paddy leaf image classifier
NER = "/content/naamapadam_hi_ner"    # Hindi NER token classifier
QA_FLAN = "/content/agroqa_flan"      # preferred for Q&A when its model/ subfolder exists
QA_MT5  = "/content/agroqa_mt5"       # used for Q&A otherwise

# (Optional) If you unzipped a bundle like /content/ava_models.zip, ensure folders exist:
# !unzip -q /content/ava_models.zip -d /content

# ---- Helper: resolve model vs processor subdirs or flat dir ----
def _resolve_img_dirs(base_dir: str):
    """
    Locate the model and image-processor folders under ``base_dir``.

    Two layouts are accepted: split (``base_dir/model`` and
    ``base_dir/processor``) or flat (files directly in ``base_dir``). Each
    half is resolved on its own, so a mixed layout works too.

    Returns:
        ``(model_dir, processor_dir)`` as path strings.

    Raises:
        FileNotFoundError: if the model folder does not exist, or if
            ``preprocessor_config.json`` is found neither in the processor
            folder nor in ``base_dir`` itself.
    """
    root = Path(base_dir)

    def _sub_or_root(name):
        # Use the dedicated subfolder when it exists, otherwise the root.
        candidate = root / name
        return candidate if candidate.exists() else root

    model_dir = _sub_or_root("model")
    proc_dir = _sub_or_root("processor")

    # Friendly error when there is nothing at the model location.
    if not model_dir.exists():
        listing = os.listdir(base_dir) if root.exists() else 'MISSING'
        raise FileNotFoundError(f"Model folder not found at {model_dir}. Contents of {base_dir}: {listing}")

    # The processor can only be loaded from a folder holding its config file.
    if (proc_dir / "preprocessor_config.json").exists():
        return str(model_dir), str(proc_dir)

    # A processor/ subfolder without the config: fall back to the root.
    if (root / "preprocessor_config.json").exists():
        return str(model_dir), str(root)

    listing = os.listdir(proc_dir) if proc_dir.exists() else 'MISSING'
    raise FileNotFoundError(
        f"preprocessor_config.json not found in {proc_dir}. "
        f"Available files: {listing}"
    )

# =======================
# Tomato (Image classifier)
# =======================
# Resolve where the tomato checkpoint lives, then load its image processor
# and classifier; the classifier is put in inference mode.
tom_model_dir, tom_proc_dir = _resolve_img_dirs(TOM)
tom_proc = AutoImageProcessor.from_pretrained(tom_proc_dir)
tom_mod = AutoModelForImageClassification.from_pretrained(tom_model_dir)
tom_mod.eval()

def pred_tomato(img):
    """Classify a tomato-leaf image and describe the top prediction.

    Args:
        img: A PIL image, an array accepted by ``Image.fromarray``, or
            ``None`` (the UI passes ``None`` when Predict is pressed with
            nothing uploaded).

    Returns:
        ``"<label>  (confidence <p>)"`` where ``<label>`` comes from the
        model's ``id2label`` and ``<p>`` is the softmax probability of the
        top class to two decimals; or a short prompt when ``img`` is ``None``.
    """
    if img is None:
        # Nothing to classify; Image.fromarray(None) would raise.
        return "Please upload an image first."

    # Normalize to a 3-channel PIL image regardless of the input type.
    im = img.convert("RGB") if isinstance(img, Image.Image) else Image.fromarray(img).convert("RGB")
    inputs = tom_proc(images=im, return_tensors="pt")
    with torch.no_grad():
        logits = tom_mod(**inputs).logits
        pred = int(logits.argmax(-1))
        conf = torch.softmax(logits, -1)[0, pred].item()
    return f"{tom_mod.config.id2label[pred]}  (confidence {conf:.2f})"

# =======================
# Paddy (Image classifier)
# =======================
# Resolve where the paddy checkpoint lives, then load its image processor
# and classifier; the classifier is put in inference mode.
pad_model_dir, pad_proc_dir = _resolve_img_dirs(PAD)
pad_proc = AutoImageProcessor.from_pretrained(pad_proc_dir)
pad_mod = AutoModelForImageClassification.from_pretrained(pad_model_dir)
pad_mod.eval()

def pred_paddy(img):
    """Classify a paddy-leaf image and describe the top prediction.

    Args:
        img: A PIL image, an array accepted by ``Image.fromarray``, or
            ``None`` (the UI passes ``None`` when Predict is pressed with
            nothing uploaded).

    Returns:
        ``"<label>  (confidence <p>)"`` where ``<label>`` comes from the
        model's ``id2label`` and ``<p>`` is the softmax probability of the
        top class to two decimals; or a short prompt when ``img`` is ``None``.
    """
    if img is None:
        # Nothing to classify; Image.fromarray(None) would raise.
        return "Please upload an image first."

    # Normalize to a 3-channel PIL image regardless of the input type.
    im = img.convert("RGB") if isinstance(img, Image.Image) else Image.fromarray(img).convert("RGB")
    inputs = pad_proc(images=im, return_tensors="pt")
    with torch.no_grad():
        logits = pad_mod(**inputs).logits
        pred = int(logits.argmax(-1))
        conf = torch.softmax(logits, -1)[0, pred].item()
    return f"{pad_mod.config.id2label[pred]}  (confidence {conf:.2f})"

# =======================
# NER (Hindi)
# =======================
# Prefer split dirs; fall back to flat if needed
# Use the tokenizer/ and model/ subfolders when present; otherwise treat the
# NER root itself as the folder holding the files.
ner_tok_dir = Path(NER, "tokenizer")
if not ner_tok_dir.exists():
    ner_tok_dir = Path(NER)
ner_model_dir = Path(NER, "model")
if not ner_model_dir.exists():
    ner_model_dir = Path(NER)

# A fast tokenizer is requested; ner_hi asks it for offset mappings.
ner_tok = AutoTokenizer.from_pretrained(str(ner_tok_dir), use_fast=True)
ner_mod = AutoModelForTokenClassification.from_pretrained(str(ner_model_dir))
ner_mod.eval()
# Label-id -> tag-name map used to decode predictions.
id2label = ner_mod.config.id2label

def ner_hi(text):
    """Extract named entities from a Hindi sentence for the NER tab.

    The text is tokenized with character offsets, every token receives its
    arg-max label from ``ner_mod``, and character-contiguous tokens that share
    an entity type are merged into one span.

    Args:
        text: Sentence to tag. ``None`` or blank input is treated as
            containing no entities.

    Returns:
        A list of ``(surface_text, entity_type)`` tuples. When nothing is
        found the list holds the single unlabeled item
        ``("No entities", None)``, so the result always has the list-of-pairs
        shape that ``gr.HighlightedText`` renders (a bare string does not).
    """
    no_entities = [("No entities", None)]
    if text is None or not text.strip():
        return no_entities

    enc = ner_tok(text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
    # Offsets are only needed for span building; the model must not receive them.
    offsets = enc.pop("offset_mapping")[0].tolist()
    with torch.no_grad():
        pred_ids = ner_mod(**enc).logits.argmax(-1)[0].tolist()
    labels = [id2label[i] for i in pred_ids]

    # Merge to spans.
    # NOTE(review): only tokens that touch (start == previous end) are merged,
    # so the words of a multi-word entity come out as separate spans —
    # confirm this is intended.
    spans, cur = [], None
    for (s, e), lab in zip(offsets, labels):
        if s == e:  # zero-width offsets: special tokens
            continue
        if lab == "O":
            if cur:
                spans.append(cur)
                cur = None
            continue
        tag = lab.split("-", 1)[-1]  # drop the B-/I- prefix, keep the type
        if cur and cur["tag"] == tag and s == cur["end"]:
            cur["end"] = e
        else:
            if cur:
                spans.append(cur)
            cur = {"start": s, "end": e, "tag": tag}
    if cur:
        spans.append(cur)

    ents = [(text[x["start"]:x["end"]], x["tag"]) for x in spans]
    return ents if ents else no_entities

# =======================
# Q&A (AgroQA) — prefer FLAN if available, else mT5
# =======================
# Choose the FLAN root when its model/ subfolder exists, otherwise the mT5 root.
qa_dir = QA_MT5
if Path(QA_FLAN, "model").exists():
    qa_dir = QA_FLAN

# Within the chosen root, use tokenizer/ and model/ subfolders when present,
# else the root itself.
qa_tok_dir = Path(qa_dir, "tokenizer")
if not qa_tok_dir.exists():
    qa_tok_dir = Path(qa_dir)
qa_model_dir = Path(qa_dir, "model")
if not qa_model_dir.exists():
    qa_model_dir = Path(qa_dir)

qa_tok = AutoTokenizer.from_pretrained(str(qa_tok_dir))
qa_mod = AutoModelForSeq2SeqLM.from_pretrained(str(qa_model_dir))
qa_mod.eval()

# Align the model config's pad/EOS ids with the tokenizer's.
qa_mod.config.pad_token_id = qa_tok.pad_token_id
qa_mod.config.eos_token_id = qa_tok.eos_token_id

# mT5 only: start decoding from the pad token and build a ban list of the
# 100 <extra_id_N> tokens, passed to generate() as bad_words_ids.
# `bad` stays None for any other checkpoint.
bad = None
if "mt5" in qa_tok.name_or_path.lower() or "agroqa_mt5" in str(qa_dir).lower():
    qa_mod.config.decoder_start_token_id = qa_tok.pad_token_id
    bad = [[qa_tok.convert_tokens_to_ids(f"<extra_id_{i}>")] for i in range(100)]

def ask(q):
    """Answer an agronomy question with the loaded seq2seq model.

    Args:
        q: The user's question. ``None`` or blank input short-circuits with a
            prompt instead of generating from an empty question.

    Returns:
        The decoded text of the first generated sequence (special tokens
        removed), or ``"Please enter a question."`` for blank input.
    """
    if q is None or not q.strip():
        return "Please enter a question."

    q = "question: " + q
    # Cap the encoded prompt at 256 tokens.
    x = qa_tok(q, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        y = qa_mod.generate(
            **x,
            max_new_tokens=80,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            bad_words_ids=bad,  # None unless the mT5 ban list was built
            eos_token_id=qa_tok.eos_token_id,
            pad_token_id=qa_tok.pad_token_id
        )
    return qa_tok.decode(y[0], skip_special_tokens=True)

# =======================
# Gradio UI
# =======================
# One tab per model. Components are created inside their `with` context, so
# statement order here determines the on-page layout.
with gr.Blocks() as demo:
    gr.Markdown("# AgriVerse (AVA) — Demo")
    with gr.Tab("Tomato disease"):
        img_t = gr.Image(label="Upload tomato leaf")
        out_t = gr.Textbox(label="Prediction")
        # Predict whenever the image value changes, and also on an explicit click.
        # NOTE(review): the handler can be invoked with no image (value None) —
        # confirm it tolerates that.
        img_t.change(pred_tomato, img_t, out_t)
        gr.Button("Predict").click(pred_tomato, img_t, out_t)

    with gr.Tab("Paddy disease"):
        img_p = gr.Image(label="Upload paddy leaf")
        out_p = gr.Textbox(label="Prediction")
        # Same wiring as the tomato tab, against the paddy classifier.
        img_p.change(pred_paddy, img_p, out_p)
        gr.Button("Predict").click(pred_paddy, img_p, out_p)

    with gr.Tab("NER (Hindi)"):
        txt_n = gr.Textbox(label="Type Hindi sentence")
        # HighlightedText is meant to be fed (text, label) pairs by its handler.
        out_n = gr.HighlightedText(label="Entities")
        gr.Button("Extract").click(ner_hi, txt_n, out_n)

    with gr.Tab("Agri Q&A"):
        txt_q = gr.Textbox(label="Ask a question")
        out_q = gr.Textbox(label="Answer")
        gr.Button("Answer").click(ask, txt_q, out_q)

# share=True requests a temporary public URL in addition to the local server.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0e963f7adc708db56f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Bundle every trained model folder that exists into one zip and download it.
import os
import shutil

# Stage copies under /content/ava_models; folders that are absent are skipped.
os.makedirs("/content/ava_models", exist_ok=True)
for p in ["/content/tomato_vit","/content/paddy_vit","/content/naamapadam_hi_ner","/content/agroqa_mt5","/content/agroqa_flan"]:
    if os.path.isdir(p):
        shutil.copytree(p, os.path.join("/content/ava_models", os.path.basename(p)), dirs_exist_ok=True)

# Archive with paths relative to the staging folder, so the zip's top level is
# tomato_vit/, paddy_vit/, ... and `unzip -q /content/ava_models.zip -d /content`
# restores /content/tomato_vit etc. (zipping the absolute path instead stores
# everything under content/ava_models/). make_archive also rewrites the file
# from scratch, so a re-run never keeps stale entries from an earlier bundle.
shutil.make_archive("/content/ava_models", "zip", root_dir="/content/ava_models")

# Colab-only helper: triggers a browser download of the bundle.
from google.colab import files
files.download("/content/ava_models.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>