<a href="https://colab.research.google.com/github/Heather-Herbert/colab/blob/main/PDF_Training_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Google Colab Notebook Template for Fine-tuning an LLM with bitsandbytes

# Install necessary libraries
!pip install transformers datasets peft pypdf sentencepiece bitsandbytes accelerate huggingface_hub

# Import libraries
from pypdf import PdfReader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from huggingface_hub import notebook_login, HfApi

# Mount Google Drive
# The PDF read later in this notebook lives under '/content/drive/My Drive/...',
# so the Drive mount must succeed before the extraction step runs.
from google.colab import drive
drive.mount('/content/drive')

# Log in to Hugging Face
# The notebook ends with push_to_hub() calls; presumably these rely on the
# credentials stored by this interactive login (confirm the token has write access).
notebook_login()

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are concatenated in document order with no separator inserted
    between them.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``PdfReader``.

    Returns:
        The concatenated page text; an empty string if no page yields text.
    """
    reader = PdfReader(pdf_path)
    # "".join builds the result in one pass instead of re-allocating the string
    # per page. `or ""` keeps a page that yields no text (None or "") from
    # raising TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

pdf_text = extract_text_from_pdf('/content/drive/My Drive/subdirectory/your_pdf.pdf')

# Prepare dataset
# Split on blank lines into smaller chunks, trim surrounding whitespace, and
# drop chunks that end up empty: consecutive or trailing blank lines would
# otherwise become empty training rows.
chunks = [chunk.strip() for chunk in pdf_text.split('\n\n')]
chunks = [chunk for chunk in chunks if chunk]
if not chunks:
    # Fail here with a clear message rather than deep inside training.
    raise ValueError("No text could be extracted from the PDF; nothing to train on.")
data = {"text": chunks}
dataset = Dataset.from_dict(data)

# Load Qwen 2.5-7B model and tokenizer with bitsandbytes for quantization
from transformers import BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Request 4-bit loading through quantization_config only: from_pretrained
# rejects load_in_4bit passed as a keyword alongside quantization_config.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# PEFT configuration for LoRA
from peft import prepare_model_for_kbit_training

# Ready the quantized base model for training before adapters are attached
# (the PEFT-documented step for k-bit models).
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    # Declares the task so get_peft_model returns the causal-LM wrapper
    # rather than the generic one.
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" column of a batch to fixed-length sequences.

    Every sequence is padded to, and truncated at, 512 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **tokenizer_options)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # No evaluation schedule is requested: this notebook builds its Trainer
    # without an eval_dataset, and asking for per-epoch evaluation without one
    # makes the Trainer raise. Leaving the setting at its default ("no") also
    # avoids the deprecated `evaluation_strategy` keyword.
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
)

# Trainer setup
from transformers import DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    # The tokenized dataset has no "labels" column. With mlm=False this
    # collator derives causal-LM labels from input_ids (padding positions are
    # ignored in the loss), so the model returns a loss for the Trainer to
    # optimize.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Train model
trainer.train()

# Save and push the fine-tuned model to Hugging Face Hub
# Both artifacts go to the same repository, model first and tokenizer second.
# NOTE(review): `model` is the PEFT-wrapped model, so presumably only the
# adapter weights are uploaded - confirm this is the intended artifact.
hub_repo_id = "your-huggingface-username/your-model-repo"
for artifact in (model, tokenizer):
    artifact.push_to_hub(hub_repo_id)
