In [39]:
# Attach Google Drive: both the input dataset and the output file used in this
# notebook live under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import torch
from transformers import pipeline, AutoTokenizer
from datasets import Dataset
import os
import random
from dotenv import load_dotenv


In [41]:
# Location of the source dataset (a JSON file on the mounted Drive).
dataset_path = "/content/drive/MyDrive/agriculture_dataset.json"

# Seed Python's global RNG so the random synonym substitution performed later
# in this notebook gives the same result on every Restart & Run All.
SEED = 42
random.seed(SEED)

In [None]:
# Load variables from a .env file (python-dotenv) before HF_TOKEN is read from
# the environment. NOTE(review): presumably the .env file is expected in the
# working directory of the runtime - confirm where it is uploaded.
load_dotenv()

In [None]:
# Hugging Face access token taken from the environment; None when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [43]:
# Load the raw records. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open(dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [44]:
# Pull the "prompt" field out of every record, keeping the dataset order.
prompts = [record["prompt"] for record in data]

In [45]:
# Tokenizer for the paraphrase model. Reuses the HF_TOKEN constant defined
# earlier in the notebook instead of reading the environment a second time.
tokenizer = AutoTokenizer.from_pretrained(
    "Vamsi/T5_Paraphrase_Paws",
    legacy=False,
    token=HF_TOKEN,
)

In [46]:
# Build the paraphrasing pipeline. Use the first CUDA device when one is
# available and fall back to CPU (-1) otherwise, so the notebook still runs on
# a runtime without a GPU instead of failing when the pipeline is constructed.
# The access token comes from the HF_TOKEN constant defined earlier.
paraphraser = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    token=HF_TOKEN,
)

Device set to use cuda:0


In [47]:
# Wrap the prompts in a single-column ("prompt") Dataset so they can be
# processed in batches with Dataset.map.
dataset = Dataset.from_dict({"prompt": prompts})

In [48]:
# Lookup table for synonym substitution: each key is a lowercase word and the
# value lists the replacements enhance_prompt picks from at random. Some
# replacements are multi-word phrases.
synonym_map = {
    "crop": ["plant", "vegetation", "harvest"],
    "fertilizer": ["manure", "compost", "nutrients"],
    "yield": ["production", "harvest outcome", "crop output"],
    "disease": ["infection", "pathogen", "plant illness"],
    "recommendation": ["suggestion", "best option", "advice"]
}

In [49]:
def enhance_prompt(prompt, synonyms=None):
    """Return ``prompt`` with known words randomly swapped for a synonym.

    The prompt is split on whitespace and re-joined with single spaces. Each
    token is looked up with surrounding punctuation ignored and, if the exact
    spelling has no entry, in lowercase - so ``"Crop?"`` is matched by the
    ``"crop"`` entry. The surrounding punctuation and a leading capital are
    carried over to the replacement. Tokens without an entry are returned
    unchanged.

    Args:
        prompt: Text to augment.
        synonyms: Optional mapping of word -> list of replacement strings.
            Defaults to the notebook-level ``synonym_map``.

    Returns:
        The augmented prompt as a single string.
    """
    table = synonym_map if synonyms is None else synonyms
    # Punctuation that may be glued to a word by whitespace tokenisation.
    edge_chars = ".,;:!?\"'()[]{}"

    def swap(token):
        core = token.strip(edge_chars)
        if not core:
            return token
        # Exact spelling first (keeps any mixed-case keys usable), then lowercase.
        options = table.get(core) or table.get(core.lower())
        if not options:
            return token
        start = len(token) - len(token.lstrip(edge_chars))
        end = start + len(core)
        choice = random.choice(options)
        if core[0].isupper():
            choice = choice[:1].upper() + choice[1:]
        return token[:start] + choice + token[end:]

    return " ".join(swap(token) for token in prompt.split())

In [50]:
def batch_paraphrase(batch):
    """Attach up to two augmented variants to every prompt in ``batch``.

    For each prompt the candidates are the beam-search paraphrases returned
    by ``paraphraser`` followed by the synonym-substituted prompt from
    ``enhance_prompt``. Duplicates are removed while keeping that order and
    the first two candidates are kept, so the selection is deterministic
    rather than depending on the iteration order of a ``set``.

    Args:
        batch: Mapping with a ``"prompt"`` entry holding a list of strings,
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The same ``batch`` with an added ``"augmented_prompts"`` entry
        containing one list of variants per prompt.
    """
    sources = batch["prompt"]
    if not sources:
        # Nothing to paraphrase; avoid calling the pipeline with no inputs.
        batch["augmented_prompts"] = []
        return batch

    # NOTE(review): the Vamsi/T5_Paraphrase_Paws model card feeds inputs as
    # "paraphrase: <sentence>"; the task prefix is added here so the model is
    # prompted the way it was fine-tuned. Confirm against the model card.
    task_prefix = "paraphrase: "
    model_inputs = [
        text if text.startswith(task_prefix) else task_prefix + text
        for text in sources
    ]
    outputs = paraphraser(model_inputs, num_return_sequences=2, num_beams=5, batch_size=8)

    augmented = []
    for source, output_set in zip(sources, outputs):
        candidates = [o["generated_text"] for o in output_set]
        candidates.append(enhance_prompt(source))
        # dict.fromkeys removes duplicates but preserves first-seen order.
        augmented.append(list(dict.fromkeys(candidates))[:2])

    batch["augmented_prompts"] = augmented
    return batch

In [51]:
# Run batch_paraphrase over the whole dataset, eight prompts per call; the
# result carries the extra "augmented_prompts" column.
augmented_dataset = dataset.map(batch_paraphrase, batched=True, batch_size=8)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [52]:
# Merge each original record with the augmented prompts produced for the same
# row position in augmented_dataset.
final_data = [
    {
        "prompt": record["prompt"],
        "response": record["response"],
        "augmented_prompts": augmented_dataset[row]["augmented_prompts"],
    }
    for row, record in enumerate(data)
]

In [53]:
# Persist the merged records to Drive as JSON with a 2-space indent.
with open("/content/drive/MyDrive/rag_augmented_dataset.json", "w") as out_file:
    out_file.write(json.dumps(final_data, indent=2))