### Text generation

In [1]:
# Title the model is asked to write an academic paragraph about.
# Edit this value, then re-run the generation cell to produce a new paragraph.
# NOTE: this name shadows Python's built-in input(); it is kept because the
# generation cell reads the title from a variable with exactly this name.
input = "LoRA"

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# === Config ===
# Hub ids of the base checkpoint and of the LoRA adapter that is applied on top of it.
base_model_name = "mistralai/Mistral-7B-v0.1"
lora_repo = "Joshua-Sun-CompSci/Mistral_academic_style_tune"
# Seed for the sampling step, so re-running the cell on the same setup
# yields the same paragraph. Change it to draw a different sample.
SEED = 42

# === Tokenizer ===
# The tokenizer is read from the adapter repo; EOS is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(lora_repo)
tokenizer.pad_token = tokenizer.eos_token

# === QLoRA quantization config ===
# 4-bit NF4 weights with double quantization; computation is done in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# === Load base model with quantization ===
# trust_remote_code is intentionally not passed: the Mistral architecture ships
# with transformers itself, so there is no reason to allow Python code from the
# Hub repository to be executed while loading.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# === Load LoRA adapter from Hugging Face ===
model = PeftModel.from_pretrained(base_model, lora_repo)
model.eval()  # inference only: switch off training-time behaviour such as dropout

# === Prompt ===
# `input` is the title string set in the first cell (it shadows the builtin).
# The template text, including the blank lines and the trailing newline after
# "### Response:", is kept exactly as it was.
prompt = f"""### Instruction:
Write an academic paragraph given the title.

### Input: {input}

### Response:
"""

# === Tokenize and generate ===
torch.manual_seed(SEED)  # make the sampled continuation repeatable
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=150,  # hard cap on the response length; long answers are cut off here
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        # Passed explicitly so generate() does not have to fall back to
        # eos_token_id and emit the "Setting `pad_token_id`..." warning.
        pad_token_id=tokenizer.pad_token_id,
    )

# === Decode ===
# output[0] holds the prompt tokens followed by the generated continuation,
# so the printed text starts with the prompt itself.
print(tokenizer.decode(output[0], skip_special_tokens=True))


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.30s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### Instruction:
Write an academic paragraph given the title.

### Input: LoRA

### Response:
We present LoRA (Loosely Regularized Adapters), a novel approach to fine-tune large language models (LLMs) for in-context learning tasks. Unlike conventional fine-tuning methods that modify the model's weights, LoRA only adds a few hundred additional trainable parameters to the model, reducing the risk of overfitting and maintaining the model's core functionality. Our experiments show that LoRA achieves comparable performance to conventional fine-tuning, while requiring significantly fewer trainable parameters. LoRA also offers additional benefits such as faster training times, reduced memory requirements, and better generalization to unseen data. We demonstrate the effectiveness of LoRA on a variety of in-context learning tasks, including
