## Data Augmentation from Raw PDF Files

In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-win_amd64.whl (16.6 MB)
   ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
   ----------------------------- ---------- 12.3/16.6 MB 71.6 MB/s eta 0:00:01
   ---------------------------------------- 16.6/16.6 MB 60.7 MB/s eta 0:00:00
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.4


In [5]:
import fitz  # PyMuPDF
import json
import re
import unicodedata

def extract_text_from_pdf(pdf_path, output_json):
    """
    Extract the text of every page of a PDF and save it as a JSON file.

    Args:
        pdf_path: Path of the PDF document to read.
        output_json: Path of the JSON file to write (overwritten if present).

    The JSON object maps "page_X" keys (X is the 1-based page number, in
    document order) to the text returned by PyMuPDF for that page.
    """
    # Open the document as a context manager so the underlying file handle is
    # released even when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        pdf_content = {
            f"page_{page_number}": page.get_text()
            for page_number, page in enumerate(doc, start=1)
        }

    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(pdf_content, f, indent=2, ensure_ascii=False)
    print(f"Extracted text saved to {output_json}")

def remove_headers_footers(text):
    """
    Strip lines that look like page headers or footers.

    A line is dropped when, ignoring surrounding whitespace, it is either a
    bare number or a "Page X" label (matched case-insensitively). The
    remaining lines are re-joined with newlines and the result is stripped.
    """
    def looks_like_header_or_footer(candidate):
        # Bare page number, e.g. "  12  "
        if re.match(r'^\s*\d+\s*$', candidate):
            return True
        # Page label, e.g. "Page 3" / "page 3"
        return bool(re.match(r'^\s*Page\s+\d+\s*$', candidate, re.IGNORECASE))

    kept = [
        candidate
        for candidate in text.splitlines()
        if not looks_like_header_or_footer(candidate)
    ]
    return "\n".join(kept).strip()

def clean_special_characters(text, ascii_only=False):
    """
    Normalize unicode characters and remove non-printable characters.
    Also collapses multiple spaces/newlines.

    Args:
        text: The string to clean.
        ascii_only: When True, every character outside printable ASCII
            (0x20-0x7E) is replaced by a space, which was the previous
            behaviour of this function. When False (default), printable
            non-ASCII characters such as accented letters are preserved and
            only non-printable characters are replaced.

    Returns:
        The cleaned text with whitespace runs collapsed to a single space and
        leading/trailing whitespace removed.
    """
    # NFKC folds compatibility forms (ligatures, non-breaking spaces, combining
    # sequences) into their canonical composed equivalents.
    text = unicodedata.normalize('NFKC', text)
    if ascii_only:
        text = re.sub(r'[^\x20-\x7E]+', ' ', text)
    else:
        # Replace each non-printable character with a space. Whitespace is
        # left in place here because it is collapsed by the next step.
        text = ''.join(
            ch if (ch.isprintable() or ch.isspace()) else ' '
            for ch in text
        )
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, max_words=100):
    """
    Split text into smaller chunks if it exceeds max_words.
    Adjust max_words based on your model's input size limitations.

    Args:
        text: The text to split; words are separated on whitespace.
        max_words: Maximum number of words per chunk. Must be at least 1.

    Returns:
        A list of strings. Text with at most max_words words is returned
        unchanged as a single-element list; longer text is split into
        consecutive chunks whose words are joined by single spaces.

    Raises:
        ValueError: If max_words is less than 1.
    """
    # A non-positive size would otherwise either crash inside range() or, for
    # negative values, silently return no chunks and drop the text.
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words!r}")

    words = text.split()
    if len(words) <= max_words:
        return [text]

    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

def process_pdf_json(input_file, output_file, max_words_per_chunk=100):
    """
    Loads the extracted PDF JSON, cleans the text, and splits long pages into chunks.
    Then, it formats each chunk as a dictionary with a "text" key and saves the data.

    Args:
        input_file: Path of a JSON file mapping page keys to raw page text.
        output_file: Path of the JSON file to write (overwritten if present).
        max_words_per_chunk: Maximum number of words per training example.

    Pages that contain no text after cleaning (for example blank pages, or
    pages holding only a page number) contribute no training examples.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        pdf_data = json.load(f)

    training_examples = []

    for raw_text in pdf_data.values():
        # Remove headers/footers
        cleaned_text = remove_headers_footers(raw_text)
        # Clean special characters and extra whitespace
        cleaned_text = clean_special_characters(cleaned_text)
        # Split the cleaned text into chunks if necessary
        chunks = chunk_text(cleaned_text, max_words=max_words_per_chunk)
        # Add each non-empty chunk as a separate training example; an empty
        # string would otherwise become a useless {"text": ""} record.
        training_examples.extend(
            {"text": chunk} for chunk in chunks if chunk.strip()
        )

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(training_examples, f, indent=2, ensure_ascii=False)

    print(f"Processed fine-tuning dataset saved to {output_file}")

if __name__ == '__main__':
    # Source PDF, the intermediate per-page JSON, and the final dataset file.
    pdf_path, extracted_json, fine_tuning_json = (
        './data/augmentation.pdf',
        './data/extracted_pdf.json',
        './data/fine_tuning_dataset.json',
    )

    # Stage 1: dump the text of every PDF page to the intermediate JSON.
    extract_text_from_pdf(pdf_path=pdf_path, output_json=extracted_json)

    # Stage 2: clean and chunk the page text into fine-tuning examples.
    process_pdf_json(
        input_file=extracted_json,
        output_file=fine_tuning_json,
        max_words_per_chunk=100,
    )


Extracted text saved to ./data/extracted_pdf.json
Processed fine-tuning dataset saved to ./data/fine_tuning_dataset.json


In [8]:
!pip install transformers@git+https://github.com/huggingface/transformers@46350f5eae87ac1d168ddfdc57a0b39b64b9a029
!pip install huggingface-hub==0.29.3

Collecting transformers@ git+https://github.com/huggingface/transformers@46350f5eae87ac1d168ddfdc57a0b39b64b9a029
  Cloning https://github.com/huggingface/transformers (to revision 46350f5eae87ac1d168ddfdc57a0b39b64b9a029) to c:\users\lucasmartins\appdata\local\temp\pip-install-u1498rba\transformers_33d1cc310d5b4d8682ac396f2b1d4d77
  Resolved https://github.com/huggingface/transformers to commit 46350f5eae87ac1d168ddfdc57a0b39b64b9a029
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting numpy>=1.17 (from transformers@ git+https://github.com/huggingface/transformers@46350f5eae87ac1d168ddfdc57a0b39b64b9a029)
  Downloading numpy-2.2.4-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting regex!=2019.12.17

  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\LucasMartins\AppData\Local\Temp\pip-install-u1498rba\transformers_33d1cc310d5b4d8682ac396f2b1d4d77'
  Running command git rev-parse -q --verify 'sha^46350f5eae87ac1d168ddfdc57a0b39b64b9a029'
  Running command git fetch -q https://github.com/huggingface/transformers 46350f5eae87ac1d168ddfdc57a0b39b64b9a029
  Running command git checkout -q 46350f5eae87ac1d168ddfdc57a0b39b64b9a029




In [12]:
!pip install trl
!pip install peft

Collecting trl
  Using cached trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting rich (from trl)
  Using cached rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.21.0->trl)
  Using cached pyarrow-19.0.1-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets>=2.21.0->trl)
  Using cached pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Using cached xxhash-3.5.0-cp313-cp313-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=2.21.0->trl)
  Using cac

In [20]:
!pip3 install torch



In [2]:
# Smoke test: LoRA fine-tuning of Gemma 3 with TRL's SFTTrainer.

from transformers import AutoTokenizer, TrainingArguments, Gemma3ForCausalLM
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset

# Load a sample dataset (for example purposes, we use wikitext-2)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Define the model name
model_name = "google/gemma-3-4b-pt"

# Load the tokenizer. The model classes are imported from transformers itself,
# so executing code shipped in the model repository (trust_remote_code) is not
# needed.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only fall back to EOS when the tokenizer has no pad token of its own:
# overwriting an existing pad token with EOS can make collators that mask
# padding also mask real EOS tokens out of the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the Gemma 3 model
# NOTE(review): the recorded run of this cell failed with
# "NotImplementedError: Cannot copy out of meta tensor; no data!", and the
# checkpoint shards "loaded" in well under a second. That presumably means the
# checkpoint weights did not map onto Gemma3ForCausalLM (the 4B checkpoint may
# be published for a multimodal architecture) and parameters stayed on the
# meta device. Confirm the matching model class / checkpoint before training.
model = Gemma3ForCausalLM.from_pretrained(model_name, device_map="cpu")
model.config.use_cache = False  # Disable caching for training

# Set up LoRA configuration for causal language modeling
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    learning_rate=2e-4,
    logging_steps=1,
    save_steps=25,
    report_to="tensorboard",
    group_by_length=True,
)

# Create the SFTTrainer with LoRA parameters.
# `processing_class` is the current keyword for passing the tokenizer; the
# older `tokenizer=` keyword is deprecated in TRL.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=lora_config,
    processing_class=tokenizer,
    args=training_args,
)

# Fine-tune the model
trainer.train()

# Save through the trainer so that the model it actually optimized (the
# LoRA-wrapped one) is written, rather than the object that was passed in.
trainer.save_model("./gemma3_finetuned")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|██████████| 2/2 [25:42<00:00, 771.13s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.75it/s]


NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.