In [3]:
%%capture
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

!pip install pip3-autoremove
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124
!pip install unsloth
!pip install --upgrade transformers==4.52.4


### Unsloth

In [5]:
from unsloth import FastLanguageModel
import os
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the 4-bit base model and its tokenizer.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# (None when unset) instead of being embedded in the notebook. The token that was
# previously hardcoded here should be revoked, since it has been exposed.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = os.environ.get("HF_TOKEN"),
)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


ImportError: Unsloth: Your transformers version of 4.52.4 does not support FalconH1.
The minimum required version is 4.53.0.
Try `pip install --upgrade "transformers>=4.53.0"`
to obtain the latest transformers build, then restart this session.

In [None]:
# Attach LoRA adapters to the loaded base model. Every tunable is spelled out
# so the configuration can be read (and changed) in one place.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, 0 is the optimized setting
    bias="none",  # any value is supported, "none" is the optimized setting
    # True or "unsloth"; "unsloth" is the variant recommended for very long context
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,  # rank-stabilized LoRA disabled
    loftq_config=None,  # LoftQ disabled
)

In [None]:
import re
from datasets import load_dataset
# Load the "train" split of the corpus stored in the Kaggle input directory.
dataset = load_dataset("/kaggle/input/final-formatted-text", split="train")  # Replace with your dataset

# Preprocess to split by sections (e.g., "Section 3. Formation of company.")
def chunk_by_section(examples):
    """Split every document in a batch into one training sample per section.

    A section starts at a "Section <number>." marker and runs up to the next
    marker (or the end of the text). Anything before the first marker is
    dropped. Sections longer than ``max_seq_length`` tokens are truncated.

    Args:
        examples: batch dict with a "text" key holding a list of strings.

    Returns:
        ``{"text": [...]}`` with one entry per section; the number of rows
        generally differs from the input batch.

    Relies on the notebook-level ``tokenizer`` and ``max_seq_length``.
    """
    # Iterate over matches instead of using re.split: split() also returns the
    # text captured inside the lookahead, which produced bogus "Section N."
    # chunks in addition to the real sections.
    # NOTE(review): the marker is not anchored to a line start, so an inline
    # cross-reference such as "Section 5." inside a body also starts a chunk.
    section_pattern = re.compile(r'Section \d+\..*?(?=Section \d+\.|$)', flags=re.DOTALL)

    new_texts = []
    for text in examples["text"]:
        for match in section_pattern.finditer(text):
            section = match.group(0).strip()
            # Truncate if section exceeds max_seq_length
            tokens = tokenizer.encode(section, add_special_tokens=False)
            if len(tokens) > max_seq_length:
                section = tokenizer.decode(tokens[:max_seq_length]).strip()
            new_texts.append(section)
    return {"text": new_texts}

# chunk_by_section returns a different number of rows than it receives, so the
# original columns are dropped to keep every column the same length.
dataset = dataset.map(chunk_by_section, batched=True, remove_columns=dataset.column_names)
# Fixed seed (same value used for LoRA and the trainer) so the train/validation
# split is identical on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=3407)  # 10% for validation
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

def formatting_prompts_func(examples):
    """Pass the "text" column through unchanged.

    The section chunks are already plain training text, so no prompt template
    is applied; every other key in the batch is omitted from the result.
    """
    text_column = examples["text"]
    return {"text": text_column}

# Run both splits through formatting_prompts_func in batched mode.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

In [None]:
# Sanity check: show the first 200 characters of up to five training chunks,
# then the size of each split.
print("Sample Chunks:")
for i in range(min(5, len(train_dataset))):
    print(f"Sample {i}: {train_dataset[i]['text'][:200]}...")
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(eval_dataset)}")

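Let's see how the `ChatML` format works by printing the element at index 5 of the dataset (see the `dataset[5]` cells further down, once the chat template has been applied).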

In [None]:
# Install TRL, which provides the SFTConfig / SFTTrainer imported in the next cell.
!pip install trl

In [None]:
from trl import SFTConfig, SFTTrainer

# Supervised fine-tuning on the per-section chunks. Short samples are packed
# together up to max_seq_length tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=True,
    args=SFTConfig(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,  # a single pass over the training split
        learning_rate=1e-4,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

In [None]:
# @title Show current memory stats
# Record the reserved-memory baseline before training; the final stats cell
# subtracts start_gpu_memory from the post-training peak.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run fine-tuning; the returned stats are read by the memory/time report cell.
trainer_stats = trainer.train()

In [None]:
# Write the model and tokenizer side by side into "lora_model_v1"; a later cell
# passes this directory name to FastLanguageModel.from_pretrained.
model.save_pretrained("lora_model_v1")  # Save with a unique name
tokenizer.save_pretrained("lora_model_v1")

In [None]:
# @title Show final memory and time stats
# Peak reserved memory after training, in GB, and the share attributable to
# training (peak minus the baseline captured before trainer.train()).
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

<a name="Inference"></a>
### Inference
Let's run the model! Since we're using `ChatML`, use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [None]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Test prompts related to Companies Act 2013
test_prompts = [
    {"from": "human", "value": "What are the requirements for registering a private company under the Companies Act 2013?"},
    {"from": "human", "value": "Explain the duties of a director as outlined in the Companies Act 2013."},
    {"from": "human", "value": "What is the procedure for winding up a company under the Companies Act 2013?"},
]

# Run inference on each test prompt and print results
for prompt in test_prompts:
    inputs = tokenizer.apply_chat_template(
        [prompt],
        tokenize=True,
        add_generation_prompt=True, # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    print(f"\nPrompt: {prompt['value']}")
    print("Model Response:")
    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True)
    # generate() returns prompt + completion. Drop the prompt by token position:
    # the ChatML template never emits an "Assistant:" marker, so splitting the
    # decoded text on that string left the prompt in the printed response.
    generated_tokens = outputs[0][inputs.shape[1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(response)
    print("-" * 50)

In [None]:
# Reload from the "lora_model_v1" directory written by the save cell above,
# reusing the sequence length, dtype and 4-bit settings from the first load.
# This rebinds both `model` and `tokenizer`.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model_v1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
import re
import spacy
import json
from typing import List, Tuple
import gc
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load spaCy model (small model to save memory); NER and the lemmatizer are
# disabled because the QA rules below only use POS tags, dependencies,
# noun chunks and sentence boundaries.
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
except OSError:
    logger.error("Please install the spaCy model: python -m spacy download en_core_web_sm")
    # Re-raise rather than exit(1): exiting from inside a notebook shuts the
    # kernel down and discards the loaded model and all other session state.
    raise

def parse_document(file_path: str) -> List[Tuple[str, str]]:
    """Read a text file and return its ``(heading, body)`` section pairs.

    A heading is a line of the form "Section <n>. <title>"; the body is all
    text up to the next "Section <n>." marker or the end of the file. Both
    parts are whitespace-stripped. Text before the first heading is ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # group(1) = heading line, group(2) = body up to the next marker
    heading_and_body = re.compile(r'(Section \d+\.\s+[^\n]+)\n([\s\S]*?)(?=(Section \d+\.|$))')
    sections = [
        (hit.group(1).strip(), hit.group(2).strip())
        for hit in heading_and_body.finditer(raw_text)
    ]

    logger.info(f"Parsed {len(sections)} sections from the document.")
    return sections

def generate_questions(sentence: str, section_title: str) -> List[Tuple[str, str]]:
    """Build rule-based ``(question, answer)`` pairs for one sentence.

    Two rules are applied, in this order:
      1. "What does <subject> do ..." for every verb that is the ROOT or an
         xcomp and has a (passive) nominal subject among its children.
      2. "Who is referred to by ..." for every noun chunk that mentions
         "company" or "person" (case-insensitive).

    Answers embed the lower-cased sentence. Relies on the notebook-level
    spaCy pipeline ``nlp``.
    """
    parsed = nlp(sentence)
    lowered_sentence = sentence.lower()
    pairs = []

    # Rule 1: "What" questions for main actions
    for token in parsed:
        if token.pos_ != "VERB" or token.dep_ not in ("ROOT", "xcomp"):
            continue
        # First child that acts as the subject, if any
        subject = next(
            (child.text for child in token.children if child.dep_ in ("nsubj", "nsubjpass")),
            None,
        )
        if not subject:
            continue
        pairs.append((
            f"What does {subject} do under {section_title}?",
            f"Under {section_title}, {lowered_sentence}",
        ))

    # Rule 2: "Who" questions for entities
    for chunk in parsed.noun_chunks:
        chunk_lower = chunk.text.lower()
        if "company" not in chunk_lower and "person" not in chunk_lower:
            continue
        pairs.append((
            f"Who is referred to by '{chunk.text}' in {section_title}?",
            f"In {section_title}, '{chunk.text}' refers to {lowered_sentence}",
        ))

    return pairs

def process_section(section_title: str, section_content: str, max_questions: int = 5) -> List[Tuple[str, str]]:
    """Generate at most ``max_questions`` QA pairs for one section.

    The section body is split into sentences with spaCy; sentences of 10
    characters or fewer (after stripping) are skipped. Only the first
    ``max_questions`` remaining sentences are used, and the combined result
    is cut to ``max_questions`` pairs.
    """
    parsed = nlp(section_content)

    usable_sentences = []
    for sent in parsed.sents:
        cleaned = sent.text.strip()
        if len(cleaned) > 10:
            usable_sentences.append(cleaned)

    collected = []
    for sentence in usable_sentences[:max_questions]:  # cap the sentences examined
        collected += generate_questions(sentence, section_title)

    return collected[:max_questions]

def save_qa_pairs(qa_pairs: List[Tuple[str, str]], output_file: str):
    """Write the pairs to ``output_file`` as JSON Lines.

    Each line is an object with "question" and "answer" keys. The file is
    overwritten if it already exists.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        for question, answer in qa_pairs:
            record = {"question": question, "answer": answer}
            sink.write(json.dumps(record) + '\n')
    logger.info(f"Saved {len(qa_pairs)} QA pairs to {output_file}")

def main(input_file: str, output_file: str, chunk_size: int = 10):
    """Generate QA pairs for every section of ``input_file`` and save them.

    Args:
        input_file: path of the text file handed to ``parse_document``.
        output_file: JSONL path handed to ``save_qa_pairs``.
        chunk_size: number of sections processed between garbage-collection
            passes.
    """
    sections = parse_document(input_file)
    all_qa_pairs = []

    # Ceiling division: the previous "len // chunk_size + 1" over-counted by one
    # whenever the section count was an exact multiple of chunk_size.
    total_chunks = (len(sections) + chunk_size - 1) // chunk_size

    # Process sections in chunks to manage memory
    for chunk_number, start in enumerate(range(0, len(sections), chunk_size), start=1):
        chunk = sections[start:start + chunk_size]
        logger.info(f"Processing chunk {chunk_number}/{total_chunks}")

        for section_title, section_content in chunk:
            all_qa_pairs.extend(process_section(section_title, section_content))

        # Clear memory
        gc.collect()

    save_qa_pairs(all_qa_pairs, output_file)

# Entry point: generate QA pairs from the consolidated raw text and write them
# to qa_pairs.jsonl in the working directory.
if __name__ == "__main__":
    input_file = "/kaggle/input/final-formatted-text/consolidated_raw_text.txt"  # Replace with your input file
    output_file = "qa_pairs.jsonl"
    main(input_file, output_file)

In [None]:
import json
from datasets import Dataset
import logging

# Configure logging
# NOTE: the QA-generation cell above already calls logging.basicConfig with the
# same arguments; basicConfig leaves an already-configured root logger alone, so
# this call only matters when this cell is run on its own.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def convert_qa_to_conversations(input_file: str, output_file: str):
    """Convert QA pairs from JSONL to a ShareGPT-style conversations dataset.

    Every input line must be a JSON object with "question" and "answer" keys.
    Each pair becomes one record of the form
    ``{"conversations": [{"from": "human", ...}, {"from": "gpt", ...}]}``.
    Blank lines (for example a trailing empty line) are skipped.

    Args:
        input_file: path of the JSONL file holding the QA pairs.
        output_file: path of the JSON Lines file to write.
    """
    conversations = []

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # json.loads("") raises, so tolerate empty lines explicitly.
                continue
            qa = json.loads(line)
            convo = [
                {"from": "human", "value": qa["question"]},
                {"from": "gpt", "value": qa["answer"]}
            ]
            conversations.append({"conversations": convo})

    # Create a Hugging Face Dataset
    dataset = Dataset.from_list(conversations)

    # Save to JSON
    dataset.to_json(output_file, orient="records", lines=True)
    logger.info(f"Converted {len(conversations)} QA pairs to conversations dataset and saved to {output_file}")

# Entry point for the conversion step.
# NOTE(review): the input is read from a Kaggle input dataset, not from the
# "qa_pairs.jsonl" that the QA-generation cell writes to the working directory —
# confirm this is the intended file.
if __name__ == "__main__":
    input_file = "/kaggle/input/preliminary-qa-jsonl/qa_pairs.jsonl"  # Input from previous script
    output_file = "conversations_dataset.jsonl"  # Output for tokenization
    convert_qa_to_conversations(input_file, output_file)

In [None]:
from unsloth.chat_templates import get_chat_template

# Switch the tokenizer to the ChatML template and tell it that messages use
# ShareGPT-style keys ("from"/"value" with "human"/"gpt" speakers).
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

def formatting_prompts_func(examples):
    """Render each conversation in the batch to one chat-template string.

    Reads ``examples["conversations"]`` and returns ``{"text": [...]}`` with
    one rendered string per conversation. No generation prompt is appended
    because the conversations already contain the assistant reply. Relies on
    the notebook-level ``tokenizer``.
    """
    rendered = []
    for convo in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        )
    return {"text": rendered}

from datasets import load_dataset
# Load the conversations dataset and add a "text" column holding each
# conversation rendered with the chat template configured above.
# This rebinds `dataset`, replacing the section-chunk DatasetDict from earlier.
dataset = load_dataset("/kaggle/input/final-jsonl", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [None]:
# Inspect the raw ShareGPT-style messages of the record at index 5.
dataset[5]["conversations"]

In [None]:
# The same record after the chat template was applied.
print(dataset[5]["text"])

In [None]:
# Example of a hand-written Jinja chat template: BOS token, a fixed system line,
# then one ">>> User: " / ">>> Assistant: " line per message (assistant turns are
# followed by the EOS token), and an open ">>> Assistant: " when a generation
# prompt is requested. The pieces are adjacent string literals joined by line
# continuations, so the template is a single string.
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant to the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
# NOTE(review): this is the literal string "eos_token"; presumably a sentinel that
# tells get_chat_template to keep the tokenizer's own EOS token — confirm.
unsloth_eos_token = "eos_token"

# Disabled on purpose: flip to True to use the custom template instead of ChatML.
if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead
    )

In [None]:
from trl import SFTConfig, SFTTrainer

# Second fine-tuning stage: train on the chat-formatted QA conversations.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,  # Conversations dataset loaded above from /kaggle/input/final-jsonl
    dataset_text_field="text",
    max_seq_length=2048,  # NOTE(review): the model was loaded with max_seq_length = 4096 — confirm 2048 is intended
    dataset_num_proc=2,
    packing=True,
    args=SFTConfig(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=10,  # The first training stage used 5
        num_train_epochs=1,  # NOTE(review): a previous comment claimed 2 epochs; the value in effect is 1 — confirm
        learning_rate=3e-4,  # NOTE(review): a previous comment claimed 2e-4; the value in effect is 3e-4 — confirm
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.1,  # The first training stage used 0.01
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

In [None]:
# Run the second fine-tuning stage; this overwrites the earlier trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Save the model and tokenizer from the QA fine-tuning stage into "qa_pairs_lora".
# `logger` is the module logger created in the QA-generation / conversion cells.
model.save_pretrained("qa_pairs_lora")
tokenizer.save_pretrained("qa_pairs_lora")
logger.info("Saved QA pairs fine-tuned LoRA adapters to 'qa_pairs_lora'")

In [None]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Test prompts related to Companies Act 2013
test_prompts = [
    {"from": "human", "value": "What are the requirements for registering a private company under the Companies Act 2013?"},
    {"from": "human", "value": "Explain the duties of a director as outlined in the Companies Act 2013."},
    {"from": "human", "value": "What is the procedure for winding up a company under the Companies Act 2013?"},
]

# Run inference on each test prompt and print results
for prompt in test_prompts:
    inputs = tokenizer.apply_chat_template(
        [prompt],
        tokenize=True,
        add_generation_prompt=True, # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    print(f"\nPrompt: {prompt['value']}")
    print("Model Response:")
    outputs = model.generate(input_ids=inputs, max_new_tokens=2048, use_cache=True)
    # generate() returns prompt + completion. Drop the prompt by token position:
    # the ChatML template never emits an "Assistant:" marker, so splitting the
    # decoded text on that string left the prompt in the printed response.
    generated_tokens = outputs[0][inputs.shape[1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(response)
    print("-" * 50)

In [None]:
pip install pymupdf langchain sentence-transformers faiss-gpu transformers torch

In [None]:
import fitz  # PyMuPDF
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import logging
from typing import List, Dict

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load fine-tuned Mistral 7B model and tokenizer
# NOTE(review): model_name is still a placeholder path and must be replaced before
# this cell can run. These assignments also rebind the notebook-level `tokenizer`
# and `model` that the Unsloth cells created, so later cells see these objects.
model_name = "path/to/your/fine-tuned-mistral-7b"  # Replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use FP16 for memory efficiency
    device_map="auto",  # Automatically map to GPU
    low_cpu_mem_usage=True
)

# Load stronger embedding model
# The embedding model runs on the GPU when one is available, otherwise on the CPU.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",  # Stronger embedding model
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"}
)

def extract_text_from_pdf(pdf_path: str) -> List[Dict]:
    """Extract text from PDF file page by page with page metadata.

    Args:
        pdf_path: path of the PDF to read.

    Returns:
        One ``{"text": ..., "metadata": {"page": n}}`` dict per non-empty page,
        where ``n`` is the 1-based page number and the text is stripped.

    Raises:
        Exception: any error raised while opening or reading the PDF is logged
            and re-raised unchanged.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            pages = []
            for page_num in range(len(doc)):
                text = doc[page_num].get_text("text").strip()
                if text:  # Only include non-empty pages
                    pages.append({"text": text, "metadata": {"page": page_num + 1}})
        finally:
            # Close the document even when a page fails to parse; previously the
            # handle leaked on any error raised after the file was opened.
            doc.close()
        logger.info(f"Extracted text from {pdf_path} ({len(pages)} pages).")
        return pages
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        raise

def create_document_chunks(pages: List[Dict], chunk_size: int = 500, chunk_overlap: int = 50) -> List[Document]:
    """Split each page's text into overlapping chunks that keep the page metadata.

    Sizes are measured with ``len`` (characters). Pages are split one at a
    time, so a chunk never spans two pages.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = [
        piece
        for page in pages
        for piece in splitter.create_documents([page["text"]], metadatas=[page["metadata"]])
    ]
    logger.info(f"Split into {len(chunks)} chunks.")
    return chunks

def create_vector_store(chunks: List[Document], batch_size: int = 16) -> FAISS:
    """Embed the chunks in batches and collect them in a FAISS vector store.

    The first batch creates the store and later batches are appended to it.
    The CUDA cache is emptied after every batch. Returns ``None`` when
    ``chunks`` is empty, because no store is ever created.
    """
    store = None
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        if store is not None:
            store.add_documents(batch)
        else:
            store = FAISS.from_documents(batch, embedding_model)
        torch.cuda.empty_cache()  # release cached GPU memory between batches
    logger.info("Created FAISS vector store.")
    return store

def retrieve_context(query: str, vector_store: FAISS, k: int = 5) -> List[Dict]:
    """Return the ``k`` chunks most similar to ``query`` as plain dicts.

    Each result has a "content" key (the chunk text) and a "metadata" key
    (the chunk's metadata), in the order returned by the similarity search.
    """
    hits = vector_store.similarity_search(query, k=k)
    results = []
    for hit in hits:
        results.append({"content": hit.page_content, "metadata": hit.metadata})
    return results

def generate_response(query: str, context: List[Dict], max_length: int = 512) -> str:
    """Generate response using fine-tuned model and retrieved context.

    Args:
        query: the user question.
        context: retrieved chunks; each needs "content" and
            ``metadata["page"]``.
        max_length: maximum number of tokens to generate for the answer
            (the prompt is not counted against it).

    Returns:
        The generated answer text, stripped. If the model repeats an
        "Answer:" marker, only the text after the last one is returned.

    Relies on the notebook-level ``tokenizer`` and ``model``.
    """
    context_str = "\n".join([f"Page {c['metadata']['page']}:\n{c['content']}" for c in context])
    prompt = f"""Based on the following context from the Companies Act 2013, answer the query. Include specific section or schedule numbers in your response where applicable:

Context:
{context_str}

Query:
{query}

Answer:
"""
    # Follow the model's own device instead of assuming "cuda" (it is loaded
    # with device_map="auto").
    # NOTE(review): truncation cuts the END of the prompt, i.e. the query and the
    # "Answer:" cue, if context pushes it past 1024 tokens — confirm acceptable.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    prompt_token_count = len(inputs["input_ids"][0])

    # Generate response. max_new_tokens budgets only the answer; the former
    # max_length=512 also counted the prompt, which may be up to 1024 tokens,
    # leaving no room to generate.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the tokens produced after the prompt so the context and query
    # are never echoed back as part of the answer.
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    answer = response.split("Answer:")[-1].strip()
    return answer

def main():
    """Build the FAISS index for the PDF, save it, and answer two sample queries.

    Steps: extract page text, split it into chunks, embed the chunks into a
    vector store, persist the store to the "faiss_index" directory, then log
    the model's response to each example query.
    """
    # File paths
    pdf_file = "/kaggle/input/mcapdf/MCA.pdf"  # Replace with your PDF file
    faiss_index = "faiss_index"  # Directory to save FAISS index

    # PDF -> pages -> chunks -> vector store
    vector_store = create_vector_store(
        create_document_chunks(extract_text_from_pdf(pdf_file))
    )
    vector_store.save_local(faiss_index)
    logger.info(f"Saved vector store to {faiss_index}")

    # Example queries
    queries = [
        "What are the requirements for forming a company under Section 3?",
        "What does Schedule I cover?"
    ]
    for query in queries:
        response = generate_response(query, retrieve_context(query, vector_store))
        logger.info(f"Query: {query}\nResponse: {response}")

# Entry point: run the RAG pipeline defined by main() in this cell.
if __name__ == "__main__":
    main()

In [None]:
from transformers import TextStreamer

# NOTE(review): if the RAG cell above has been run, `model` and `tokenizer` now
# refer to the objects it loaded, not the Unsloth ones — confirm which is intended.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # required so the model starts an assistant turn
    return_tensors="pt",
).to("cuda")

# Stream tokens to the cell output as they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Save the current model and tokenizer into the local "lora_model" directory.
# The commented lines show the equivalent upload to the Hugging Face Hub.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
from transformers import TextStreamer

# Disabled by default: set to True to reload the adapters saved in "lora_model"
# before running the demo prompt below.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "What is a famous tall tower in Paris?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # required so the model starts an assistant turn
    return_tensors="pt",
).to("cuda")

# Stream tokens to the cell output as they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
)

You can also use Hugging Face's `AutoPeftModelForCausalLM` (from the `peft` package). Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Disabled by default: plain PEFT/Transformers fallback for environments
# without Unsloth.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    # The class exported by peft is AutoPeftModelForCausalLM; the previous name
    # (AutoModelForPeftCausalLM) does not exist and failed at import time.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit=load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every export below is disabled; change the relevant `if False` to `if True`
# to run it. The Hub uploads need a repository name and an access token.

# Merge to 16bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_16bit", token="")

# Merge to 4bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_4bit")
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_4bit", token="")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token="")
    tokenizer.push_to_hub("hf/model", token="")


### GGUF / llama.cpp Conversion
Saving to `GGUF` / `llama.cpp` is now supported natively! We clone `llama.cpp` and save to `q8_0` by default. All quantization methods, such as `q4_k_m`, are allowed. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Every GGUF export below is disabled; change the relevant `if False` to
# `if True` to run it.

# Save to 8bit Q8_0 (no quantization_method argument is passed)
if False:
    model.save_pretrained_gguf("model", tokenizer)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in llama.cpp or a UI based system like Jan or Open WebUI. You can install Jan [here](https://github.com/janhq/jan) and Open WebUI [here](https://github.com/open-webui/open-webui)

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!

<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>

  Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
</div>
