In [1]:
# %%capture
# !pip install unsloth
# # Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
# %pip uninstall unsloth unsloth_zoo -y
# %pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# %pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"

In [1]:
import os
# Restrict this process to the GPU with index 1. This is set here, before torch is
# imported in a later cell, so the "cuda" device used below refers to that GPU.
# NOTE(review): the index is host-specific — confirm it matches the machine in use.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
# Alpaca-style prompt template. The three positional `{}` slots are filled, in order,
# with the instruction, the input, and the response (e.g. via `alpaca_prompt.format(...)`).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# `tokenizer` is only created in the model-loading cell further down, so on a fresh
# kernel (Restart & Run All) it does not exist yet when this cell executes. Guard the
# lookup so the cell no longer dies with a NameError; when the tokenizer is already
# loaded the value is exactly what it was before.
try:
    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
except NameError:
    # Tokenizer not loaded yet: re-run this cell after the model-loading cell if
    # EOS_TOKEN is needed.
    EOS_TOKEN = None

In [2]:
import json
import time
import transformers
import torch

# Load the PMC-LLaMA 7B tokenizer and causal-LM weights through transformers,
# then place the model on the GPU.
PMC_LLAMA_CHECKPOINT = 'chaoyi-wu/PMC_LLAMA_7B'
tokenizer = transformers.LlamaTokenizer.from_pretrained(PMC_LLAMA_CHECKPOINT)
model = transformers.LlamaForCausalLM.from_pretrained(PMC_LLAMA_CHECKPOINT)
model = model.to("cuda")

def load_json(file_paths):
    """Read one or more JSON files and concatenate their records into one list.

    Args:
        file_paths: An iterable of file paths, or a single path given as ``str``,
            ``bytes`` or ``os.PathLike``. Each file must hold a JSON array at the
            top level.

    Returns:
        list: The elements of every array, in the order the files were given.

    Raises:
        ValueError: If a file's top-level JSON value is not an array.
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    import os  # local import keeps the function self-contained

    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, bytes, os.PathLike)):
        file_paths = [file_paths]

    data = []
    for file_path in file_paths:
        with open(file_path, "r", encoding="utf-8") as file:
            records = json.load(file)
        # list.extend() on a dict would silently append its keys instead of records.
        if not isinstance(records, list):
            raise ValueError(
                f"{file_path}: expected a JSON array at top level, "
                f"got {type(records).__name__}"
            )
        data.extend(records)
    return data

def save_to_jsonl(file_path, data):
    """Append ``data`` to ``file_path`` as a single JSON-encoded line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(file_path, mode="a", encoding="utf-8") as sink:
        record_line = json.dumps(data, ensure_ascii=False)
        sink.write(f"{record_line}\n")

def generate_response(instruction_text, input_text, max_tokens):
    """Sample a completion for one instruction/input pair and return only the new text.

    The prompt is ``instruction_text + "\\n\\n" + input_text``. Generation uses the
    module-level ``tokenizer`` and ``model`` with top-k sampling (``top_k=50``), so
    results are not deterministic unless the caller seeds torch.

    Args:
        instruction_text: Instruction part of the prompt.
        input_text: Input/context part of the prompt.
        max_tokens: Upper bound on newly generated tokens (``max_new_tokens``).

    Returns:
        str: The decoded continuation with special tokens removed and surrounding
        whitespace stripped. The prompt itself is never part of the result.
    """
    prompt = instruction_text + "\n\n" + input_text
    # Put the inputs on whatever device the model lives on.
    inputs = tokenizer(
        [prompt], return_tensors="pt", add_special_tokens=True
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            # Pass the mask explicitly so generate() does not have to guess it.
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=max_tokens,
            do_sample=True,
            top_k=50,
        )

    # generate() returns prompt + continuation for a causal LM. Drop the prompt by
    # token position instead of str.replace() on the decoded text: replace() also
    # deletes every occurrence of the instruction/input inside the answer (a short
    # input such as "yes" would be wiped from the response), and it misses the
    # prompt when the decode round-trip does not reproduce it exactly.
    continuation_ids = outputs[:, prompt_length:]
    decoded_outputs = tokenizer.batch_decode(continuation_ids, skip_special_tokens=True)

    return decoded_outputs[0].strip()


# Run inference over every test sample and append each result to a JSONL file.
input_json_paths = [
    "/data/jaesung/llm_for_diabetes/src/model/mLoRA/demo/data_classification_test.json",
    "/data/jaesung/llm_for_diabetes/src/model/mLoRA/demo/data_generation_test.json"
]
output_json_path = "/data/jaesung/llm_for_diabetes/src/model/unsloth/model_output/pmc_model_output.jsonl"

# Tasks in this set get a 32-token generation budget; every other task gets 512.
SHORT_ANSWER_TASKS = {"qa1", "qa2", "qa3", "nli", "ie", "re"}

data = load_json(input_json_paths)
total_samples = len(data)

# save_to_jsonl() appends, so restarting from sample 0 after an interruption would
# write the already-finished samples a second time. Count the records that are
# already in the output file and resume right after them.
# NOTE(review): this assumes an existing output file was produced from the same
# inputs in the same order — delete the file to force a clean run.
try:
    with open(output_json_path, "r", encoding="utf-8") as existing_output:
        already_done = sum(1 for line in existing_output if line.strip())
except FileNotFoundError:
    already_done = 0
already_done = min(already_done, total_samples)
if already_done:
    print(f"Resuming: {already_done} samples already present in {output_json_path}")

start_time = time.time()
for idx, item in enumerate(data):
    if idx < already_done:
        continue

    sample_start_time = time.time()

    input_text = item.get("input", "")
    instruction = item.get("instruction", "")
    task = item.get("task", "")

    max_tokens = 32 if task in SHORT_ANSWER_TASKS else 512

    model_output = generate_response(instruction, input_text, max_tokens)

    # Keep every original field and add the generation next to it.
    output_data = item.copy()
    output_data.update({
        "model_output": model_output,
    })
    save_to_jsonl(output_json_path, output_data)

    # ETA is based on the samples processed in this run only, so skipped
    # (already finished) samples do not distort the average.
    elapsed_time = time.time() - start_time
    avg_time_per_sample = elapsed_time / (idx + 1 - already_done)
    remaining_samples = total_samples - (idx + 1)
    estimated_remaining_time = remaining_samples * avg_time_per_sample

    print(f"[{idx+1}/{total_samples}] Sample processed in {time.time() - sample_start_time:.2f}s, ETA: {estimated_remaining_time/60:.2f} min")

print(f"\nAll samples processed. Total time: {(time.time() - start_time)/60:.2f} min")


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
2025-02-12 11:05:55.423089: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-12 11:05:55.438013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory 

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

[1/1181] Sample processed in 1.71s, ETA: 33.62 min
[2/1181] Sample processed in 1.16s, ETA: 28.18 min
[3/1181] Sample processed in 1.08s, ETA: 25.87 min
[4/1181] Sample processed in 1.08s, ETA: 24.71 min
[5/1181] Sample processed in 1.15s, ETA: 24.24 min
[6/1181] Sample processed in 1.20s, ETA: 24.11 min
[7/1181] Sample processed in 1.09s, ETA: 23.70 min
[8/1181] Sample processed in 1.08s, ETA: 23.35 min
[9/1181] Sample processed in 1.07s, ETA: 23.07 min
[10/1181] Sample processed in 1.09s, ETA: 22.87 min
[11/1181] Sample processed in 1.16s, ETA: 22.82 min
[12/1181] Sample processed in 1.20s, ETA: 22.86 min
[13/1181] Sample processed in 1.20s, ETA: 22.88 min
[14/1181] Sample processed in 1.47s, ETA: 23.27 min
[15/1181] Sample processed in 1.08s, ETA: 23.10 min
[16/1181] Sample processed in 1.15s, ETA: 23.03 min
[17/1181] Sample processed in 1.15s, ETA: 22.98 min
[18/1181] Sample processed in 1.20s, ETA: 22.97 min
[19/1181] Sample processed in 1.31s, ETA: 23.08 min
[20/1181] Sample proc

KeyboardInterrupt: 