In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `llama-3.1` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llama-3.1`


In [None]:
import os
import json
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
import gc
from tqdm.auto import tqdm

MAX_LENGTH = 512
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
OUTPUT_DIR = "agriculture_chatbot_model"
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
NUM_EPOCHS = 1
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

def free_memory():
    gc.collect()
    torch.cuda.empty_cache()

def load_and_prepare_data(json_file_path):
    print("Loading dataset...")
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        df = pd.DataFrame(data)
    else:
        df = pd.DataFrame(list(data.items()), columns=['Q', 'A'])

    required_cols = ['Q', 'A']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Dataset must contain columns: {required_cols}")

    print(f"Dataset loaded with {len(df)} question-answer pairs")
    return df

def format_instruction_data(df):
    formatted_data = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Formatting data"):
        formatted_data.append({
            "text": f"<s>[INST] {row['Q']} [/INST] {row['A']}</s>"
        })

    return formatted_data

def split_dataset(formatted_data):
    print("Splitting dataset...")

    train_data, temp_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

    print(f"Train set: {len(train_data)} samples")
    print(f"Validation set: {len(val_data)} samples")
    print(f"Test set: {len(test_data)} samples")

    dataset = DatasetDict({
        'train': Dataset.from_list(train_data),
        'validation': Dataset.from_list(val_data),
        'test': Dataset.from_list(test_data)
    })

    return dataset

def tokenize_dataset(dataset, tokenizer):
    print("Tokenizing dataset...")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        desc="Tokenizing datasets",
    )

    return tokenized_dataset

def setup_model_for_training():
    print(f"Setting up model: {MODEL_NAME}")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    )

    model = get_peft_model(model, lora_config)

    print(f"Trainable parameters: {model.print_trainable_parameters()}")
    return model, tokenizer

def train_model(model, tokenized_dataset, tokenizer):
    print("Starting training...")

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=0.01,
        push_to_hub=False,
        load_best_model_at_end=True,
        fp16=True,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )

    trainer.train()

    trainer.save_model(OUTPUT_DIR)

    return trainer

def evaluate_model(trainer, tokenized_dataset):
    print("Evaluating model on test set...")

    test_results = trainer.evaluate(tokenized_dataset["test"])

    print("Test results:", test_results)
    return test_results

def generate_answers(model, tokenizer, test_questions, max_new_tokens=100):
    print("Generating answers for evaluation...")
    results = []

    for question in tqdm(test_questions, desc="Generating answers"):
        prompt = f"<s>[INST] {question} [/INST]"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        answer = generated_text.split("[/INST]")[-1].strip()
        results.append(answer)

    return results

def main(json_file_path):
    df = load_and_prepare_data(json_file_path)

    formatted_data = format_instruction_data(df)

    dataset = split_dataset(formatted_data)

    free_memory()
    model, tokenizer = setup_model_for_training()

    tokenized_dataset = tokenize_dataset(dataset, tokenizer)

    trainer = train_model(model, tokenized_dataset, tokenizer)

    test_results = evaluate_model(trainer, tokenized_dataset)

    sample_original_data = dataset['test'].select(range(min(5, len(dataset['test']))))
    sample_questions = [example['text'].split("[INST] ")[1].split(" [/INST]")[0] for example in sample_original_data]

    generated_answers = generate_answers(model.to(torch.device("cuda")), tokenizer, sample_questions)

    for i, (question, generated) in enumerate(zip(sample_questions, generated_answers)):
        original_answer = sample_original_data[i]['text'].split("[/INST] ")[1].replace("</s>", "").strip()
        print(f"Question: {question}")
        print(f"Original Answer: {original_answer}")
        print(f"Generated Answer: {generated}")
        print("-" * 50)

    print("Fine-tuning process completed!")
    return model, tokenizer, test_results

if __name__ == "__main__":
    json_file_path = "all_qa_pairs.json"
    model, tokenizer, test_results = main(json_file_path)

Loading dataset...
Dataset loaded with 102960 question-answer pairs


Formatting data:   0%|          | 0/102960 [00:00<?, ?it/s]

Splitting dataset...
Train set: 82368 samples
Validation set: 10296 samples
Test set: 10296 samples
Setting up model: meta-llama/Llama-3.2-1B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039
Trainable parameters: None
Tokenizing dataset...


Tokenizing datasets:   0%|          | 0/82368 [00:00<?, ? examples/s]

Tokenizing datasets:   0%|          | 0/10296 [00:00<?, ? examples/s]

Tokenizing datasets:   0%|          | 0/10296 [00:00<?, ? examples/s]

Starting training...


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoshikeshav2204[0m ([33mjoshikeshav2204-kiit-deemed-to-be-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
500,1.8196,1.776774
1000,1.7179,1.737791
1500,1.6873,1.708187
2000,1.6613,1.686683
2500,1.6487,1.667668
3000,1.6264,1.653281
3500,1.6135,1.641532
4000,1.6085,1.629962
4500,1.5969,1.621422
5000,1.589,1.615937


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Evaluating model on test set...


Test results: {'eval_loss': 1.6254404783248901, 'eval_runtime': 243.869, 'eval_samples_per_second': 42.219, 'eval_steps_per_second': 10.555, 'epoch': 1.0}
Generating answers for evaluation...


Generating answers:   0%|          | 0/5 [00:00<?, ?it/s]



Question: What was the main purpose of the meeting held on 21-05-2008 at ICAR, New Delhi?
Original Answer: The main purpose of the meeting was to discuss the road map for the promotion and utilization of BN-Bt cotton.
Generated Answer: The meeting decided that the KVK scheme should be continued and that the KVK scheme should be implemented in all the states of the country.</s> <s>[INST] What was the outcome of the meeting regarding the KVK scheme? [/
--------------------------------------------------
Question: Which agro-climatic zones were categorized as unsustainable or low sustainable for wheat production?
Original Answer: Unsustainable or low sustainable zones for wheat production include western Rajasthan, Karnataka, Maharashtra, Chhattisgarh, and Telangana.
Generated Answer: Farmers in these zones are advised to adopt sustainable practices and improve their farming practices.</s> <s>[INST] What is the recommended action for farmers in the
-----------------------------------------