In [None]:
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
!pip install evaluate

In [None]:
!pip install sacrebleu

In [None]:
!pip install rouge_score

In [None]:
from unsloth import FastLanguageModel
import torch
import argparse
from datasets import load_dataset
from sacrebleu.metrics import BLEU
import wandb
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers.utils import logging
import json
from tqdm import tqdm

In [None]:
import xformers
print(xformers.__version__)

In [None]:
# Run configuration for the notebook.
# `configuration` selects which entries are filled into `mapping` below.
configuration = "ZS" # ZS or FT
# `mode` selects the branch taken by main(): "train" runs training,
# any other value runs the prediction branch.
mode = "predict" # "train" or "predict"

In [None]:
# Settings forwarded to FastLanguageModel.from_pretrained when the model is loaded.
# max_seq_length is also passed to SFTTrainer inside main().
max_seq_length = 1600
# NOTE(review): presumably None lets the library choose the dtype — confirm against the unsloth docs.
dtype = None
# The checkpoints named in `mapping` are "bnb-4bit" variants.
load_in_4bit = True

In [None]:
# Resolve the model checkpoint and dataset files for the selected configuration.
mapping = {}
if configuration == "ZS":
    mapping["model_name"] = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
    mapping["test_data"] = "test_11_intents.jsonl"
    # mapping["test_data"] = f"/content/drive/MyDrive/mathdial/test_100_annotated_11_intents.jsonl"

elif configuration == "FT":
    # Fine-tuning starts from the base checkpoint. Without this entry,
    # mode == "train" fails with KeyError when the model is loaded.
    mapping["model_name"] = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
    # mapping["model_name"] = "data/KG/outputs/checkpoint-1000"
    # The extension separator was previously missing ("test_11_intentsjsonl").
    mapping["test_data"] = "test_11_intents.jsonl"
    mapping["train_data"] = "train_11_intents.jsonl"
    mapping["valid_data"] = "val_11_intents.jsonl"

else:
    # Fail here rather than with an opaque KeyError further down the notebook.
    raise ValueError(f"Unknown configuration {configuration!r}; expected 'ZS' or 'FT'.")


In [None]:
# In predict mode the base instruct checkpoint is always loaded; this
# overwrites whatever model_name the configuration block stored in `mapping`.
if mode == "predict":
    mapping["model_name"] = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"

In [None]:
# Load the checkpoint named in `mapping` together with its tokenizer,
# using the sequence length / dtype / 4-bit settings defined earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=mapping["model_name"],
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

# Attach LoRA adapters to the listed projection modules. This runs for every
# value of `mode`, i.e. also before prediction.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
# End-of-sequence string; formatting_prompts_func appends it to every training example.
EOS_TOKEN = tokenizer.eos_token


In [None]:
# Shared sacrebleu scorer; compute_bleu calls bleu.corpus_score on it.
bleu = BLEU()

In [None]:
import numpy as np

def compute_bleu(eval_preds):
    """Compute corpus-level BLEU for a set of evaluation predictions.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. Each element is either
            an array/tensor of token ids (predictions may instead be logits
            whose last axis equals ``tokenizer.vocab_size``) or a sequence of
            already-decoded strings.

    Returns:
        dict: ``{"bleu": score}`` with the sacrebleu corpus BLEU score.
    """
    predictions, labels = eval_preds

    # If we received logits instead of text, we need to turn them into token ids
    if isinstance(predictions, (np.ndarray, torch.Tensor)):
        if predictions.shape[-1] == tokenizer.vocab_size:  # If these are logits
            predictions = np.argmax(predictions, axis=-1)

        # -100 is a padding/ignore marker, not a token id, so it must be
        # dropped before decoding (same treatment as the labels below).
        predictions = [[tok for tok in pred if tok != -100] for pred in predictions]
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    else:
        # Predictions are already text
        decoded_preds = predictions

    if isinstance(labels, (np.ndarray, torch.Tensor)):
        # Filter out padding tokens (-100)
        labels = [[tok for tok in label if tok != -100] for label in labels]
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    else:
        decoded_labels = labels

    # Clean up whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # sacrebleu expects a list of reference *streams*, each aligned with the
    # hypotheses (shape [num_refs][num_hyps]). With a single reference per
    # example that is one stream holding every label. Passing one list per
    # example instead makes sacrebleu treat all labels as alternative
    # references for the first hypothesis only.
    bleu_score = bleu.corpus_score(decoded_preds, [decoded_labels])

    return {"bleu": bleu_score.score}

In [None]:
# Two-slot template: the prompt text followed by the target output.
# The second slot is filled with "" when building inference prompts in main().
prompt_template = """{} {}"""

In [None]:
def formatting_prompts_func(examples):
    """Build the training strings for a batch of examples.

    Every entry combines ``examples["text"][k]`` and ``examples["output"][k]``
    through ``prompt_template`` and appends ``EOS_TOKEN``.

    Returns:
        dict: ``{"text": [...]}`` with one formatted string per input row.
    """
    return {
        "text": [
            prompt_template.format(examples["text"][k], examples["output"][k]) + EOS_TOKEN
            for k in range(len(examples["text"]))
        ]
    }

In [None]:
def read_jsonl(filename):
    """Read a JSONL file into a list of decoded objects.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``json.JSONDecodeError``.

    Args:
        filename: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        list: One decoded JSON value per non-blank line, in file order.
    """
    records = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return records


def write_jsonl_file(file, data):
    """Serialise each item of ``data`` as one JSON line in ``file``.

    The target is opened in write mode, so existing content is replaced.
    """
    with open(file, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data)

In [None]:
# Ask the PyTorch CUDA allocator to use expandable segments.
# NOTE(review): this cell runs after torch was imported and the model was
# loaded; the setting presumably only takes effect if it is in the environment
# before CUDA memory is first allocated — confirm and move earlier if so.
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
def extract_tutor_response(text):
    """
    Extract the first Tutor response that appears after [/INST]\n ###Tutor:

    The response runs from the first non-whitespace character after the start
    marker up to the earliest of ``###Student:``, ``###Tutor:`` or a newline.
    Leading whitespace is skipped first, so a response that begins on the line
    after the marker is no longer cut down to an empty string by the newline
    end marker.

    Args:
        text (str): The full generated text

    Returns:
        str: The extracted Tutor response, or empty string if no response found.
            If extraction raises (e.g. ``text`` is not a string), the error is
            printed and ``text`` is returned unchanged.
    """
    try:
        # Find the start of the tutor's response
        start_marker = "[/INST]\n ###Tutor:"
        start_idx = text.find(start_marker)

        if start_idx == -1:
            return ""

        # Move index to the first non-whitespace character of the response
        response_start = start_idx + len(start_marker)
        while response_start < len(text) and text[response_start].isspace():
            response_start += 1

        # The response ends at the earliest following turn marker or newline;
        # when none is present it runs to the end of the text.
        end_markers = ("###Student:", "###Tutor:", "\n")
        end_indices = [
            idx
            for idx in (text.find(marker, response_start) for marker in end_markers)
            if idx != -1
        ]
        response_end = min(end_indices, default=len(text))

        return text[response_start:response_end].strip()

    except Exception as e:
        # Best effort: keep the raw text rather than dropping the record.
        print(f"Error extracting response: {e}")
        return text

In [None]:
from transformers import Seq2SeqTrainingArguments

def main():
    """Run training or batched inference, depending on the module-level ``mode``.

    Reads the module-level ``mode``, ``configuration``, ``mapping``, ``model``
    and ``tokenizer``.

    * ``mode == "train"``: only valid with ``configuration == "FT"``. Loads the
      train/validation JSON files named in ``mapping``, formats them with
      ``formatting_prompts_func`` and trains with ``SFTTrainer``, logging to
      Weights & Biases.
    * any other ``mode``: generates up to 100 new tokens for every record of
      ``mapping["test_data"]`` in batches of 4, retrying a batch one record at
      a time when it raises ``RuntimeError``, and writes the records (each with
      an added ``"output_generated"`` field) to
      ``test_{configuration}_generated_11_intents.jsonl``.

    Raises:
        ValueError: If ``mode == "train"`` and ``configuration`` is not ``"FT"``.
    """
    global model
    if mode == "train":
        if configuration == "FT":
            # Load datasets with explicit data_files mapping
            # (a single data file is exposed under the "train" split key)
            train_dataset = load_dataset("json", data_files=mapping["train_data"])["train"]
            eval_dataset = load_dataset("json", data_files=mapping["valid_data"])["train"]

            print(f"Raw train dataset size: {len(train_dataset)}")
            print(f"Raw validation dataset size: {len(eval_dataset)}")

            # Apply formatting; the original columns are dropped so only the
            # formatted "text" column remains.
            train_dataset = train_dataset.map(formatting_prompts_func, batched=True, remove_columns=train_dataset.column_names)
            eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True, remove_columns=eval_dataset.column_names)

            print(f"Processed train dataset size: {len(train_dataset)}")
            print(f"Processed validation dataset size: {len(eval_dataset)}")
            wandb.login()
            logging.set_verbosity_info()
            # NOTE(review): Seq2SeqTrainingArguments options such as
            # predict_with_generate / generation_max_length are passed to
            # SFTTrainer here — confirm that SFTTrainer honours them.
            trainer = SFTTrainer(
                model=model,
                tokenizer=tokenizer,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
                dataset_text_field="text",
                max_seq_length=max_seq_length,
                dataset_num_proc=2,
                packing=False,
                compute_metrics=compute_bleu,
                callbacks=[
                    EarlyStoppingCallback(
                        early_stopping_patience=10,
                        early_stopping_threshold=0.0,
                    )
                ],
                args=Seq2SeqTrainingArguments(
                    per_device_train_batch_size=8,
                    per_device_eval_batch_size=1,
                    gradient_accumulation_steps=4,
                    warmup_ratio=0.1,
                    num_train_epochs=1,
                    learning_rate=2e-5,
                    # Use bf16 when the GPU supports it, otherwise fp16.
                    fp16=not torch.cuda.is_bf16_supported(),
                    bf16=torch.cuda.is_bf16_supported(),
                    logging_steps=1,
                    optim="adamw_8bit",
                    weight_decay=0.1,
                    lr_scheduler_type="linear",
                    seed=3407,
                    # The suffix is model_name with its last "/"-separated component removed.
                    output_dir="/content/drive/MyDrive/mathdial/train_extended/" + "/".join(mapping["model_name"].split("/")[:-1]),
                    evaluation_strategy="steps",
                    eval_steps=50,
                    predict_with_generate=True,
                    generation_max_length=100,
                    report_to="wandb",
                    load_best_model_at_end=True,
                    save_total_limit=3,
                    run_name="4_intents_extended_train",
                    # Checkpoint selection uses the "bleu" value returned by compute_bleu.
                    metric_for_best_model="bleu",
                ),
            )
            wandb.init(project="mathdial", name="4_intents_extended_train")
            trainer.train()
            wandb.finish()
        else:
            raise ValueError("Invalid configuration for training.")

    else:
        # Left padding so every prompt in a batch ends where generation starts.
        tokenizer.padding_side = "left"
        FastLanguageModel.for_inference(model)
        test = read_jsonl(mapping["test_data"])

        # Reduce batch size
        batch_size = 4  # Reduced from 100
        # Prompts longer than this many tokens are truncated by the tokenizer.
        max_input_length = 1500

        # Initialize list to store all results
        all_results = []

        for i in tqdm(range(0, len(test), batch_size)):
            # Clear CUDA cache
            torch.cuda.empty_cache()

            batch = test[i:i+batch_size]

            # Format inputs (the output slot of the template is left empty)
            formatted_inputs = [
                prompt_template.format(
                    ex["text"],
                    "",
                )
                for ex in batch
            ]

            try:
                # Tokenize with explicit truncation
                inputs = tokenizer(
                    formatted_inputs,
                    return_tensors="pt",
                    truncation=True,
                    max_length=max_input_length,
                    padding=True,
                ).to("cuda")

                # Generate with remaining context length
                with torch.no_grad():  # Ensure we're not storing gradients
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=100,
                        use_cache=True,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                    )

                res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

                # Update results
                for j, t in enumerate(batch):
                    result = t.copy()  # Create a copy of the original test item
                    cleaned_output = extract_tutor_response(res[j])
                    result["output_generated"] = cleaned_output
                    all_results.append(result)

                # Clear memory
                del inputs, outputs, res
                torch.cuda.empty_cache()

            except RuntimeError as e:
                print(f"Error processing batch {i}-{i+batch_size}: {e}")
                # If we encounter an error, try processing one by one
                for single_item in batch:
                    try:
                        single_input = tokenizer(
                            prompt_template.format(single_item["text"], ""),
                            return_tensors="pt",
                            truncation=True,
                            max_length=max_input_length,
                            padding=True,
                        ).to("cuda")

                        with torch.no_grad():
                            single_output = model.generate(
                                **single_input,
                                max_new_tokens=100,
                                use_cache=True,
                                pad_token_id=tokenizer.pad_token_id,
                                eos_token_id=tokenizer.eos_token_id,
                            )

                        result = single_item.copy()
                        result["output_generated"] = extract_tutor_response(tokenizer.decode(single_output[0], skip_special_tokens=True))
                        all_results.append(result)

                        del single_input, single_output
                        torch.cuda.empty_cache()

                    except Exception as e2:
                        # Keep the record so the output stays aligned with the
                        # test set; the placeholder marks the failed generation.
                        print(f"Error processing single item: {e2}")
                        result = single_item.copy()
                        result["output_generated"] = "ERROR: Failed to generate"
                        all_results.append(result)

        # Write all results at once
        write_jsonl_file(f"test_{configuration}_generated_11_intents.jsonl", all_results)

In [None]:
# Run the branch selected by the current value of `mode`.
main()

In [None]:
# Switch to prediction for the next main() call. Only `mode` changes here:
# `mapping` and the already-loaded model are not recomputed by this cell.
mode = "predict"

In [None]:
# With mode == "predict" this generates outputs for the test set and writes
# test_{configuration}_generated_11_intents.jsonl.
main()

In [None]:
# `evaluate` was used without ever being imported, so this cell raised
# NameError on a fresh kernel.
import evaluate

# Metric objects shared by compute_metrics (accuracy is loaded but not used there).
chrf = evaluate.load("chrf")
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(pairs):
    """
    Score (reference, hypothesis) pairs with chrF, sacreBLEU and ROUGE.

    Args:
        pairs (list): Items whose element 0 is the reference text and whose
            element 1 is the generated hypothesis.

    Returns:
        dict: Keys "chrf", "sacrebleu", "rouge1", "rouge2" and "rougeL".
            The ROUGE values are multiplied by 100.

    """
    # chrF and sacreBLEU take a list of references per hypothesis; ROUGE takes
    # one flat reference per hypothesis.
    nested_refs = [[item[0]] for item in pairs]
    flat_refs = [item[0] for item in pairs]
    generated = [item[1] for item in pairs]

    chrf_result = chrf.compute(predictions=generated, references=nested_refs)
    bleu_result = sacrebleu.compute(predictions=generated, references=nested_refs)
    rouge_result = rouge.compute(predictions=generated, references=flat_refs)

    return {
        "chrf": chrf_result["score"],
        "sacrebleu": bleu_result["score"],
        **{key: rouge_result[key] * 100 for key in ("rouge1", "rouge2", "rougeL")},
    }

In [None]:
def write_json_file(filename: str, data: dict) -> None:
    """
    Serialise ``data`` as JSON into ``filename`` (UTF-8, overwriting).

    Args:
        filename (str): Destination path.
        data (dict): JSON-serialisable content to store.

    Returns:
        None
    """
    target_path = f"{filename}"
    with open(target_path, "w", encoding="utf-8") as out_handle:
        json.dump(data, out_handle)

In [None]:
def evaluate_outputs(file_name: str, tag: str = "ZS_11_intents") -> None:
    """
    Evaluate the outputs generated by a model and write the metrics to disk.

    Each record's "output" (reference) is paired with its "output_generated"
    (hypothesis), the pairs are scored with ``compute_metrics``, and the
    result is written to ``{tag}_metrics.json``.

    Args:
        file_name (str): The name of the JSONL file containing the outputs.
        tag (str): Prefix of the metrics file. Defaults to "ZS_11_intents",
            the previously hard-coded value; pass a different tag when scoring
            another run (e.g. fine-tuned outputs) so the zero-shot metrics
            file is not overwritten.

    Returns:
        None
    """
    records = read_jsonl(file_name)

    # (reference, hypothesis) pairs in file order
    pairs = [(record["output"], record["output_generated"]) for record in records]

    # Compute metrics once for all pairs
    generation_metrics = compute_metrics(pairs)

    # Write metrics directly without additional processing
    write_json_file(f"{tag}_metrics.json", generation_metrics)

In [None]:
# Score the zero-shot generations; the file name matches what main() writes
# in predict mode when configuration == "ZS".
evaluate_outputs("test_ZS_generated_11_intents.jsonl")