<a href="https://colab.research.google.com/github/Heather-Herbert/colab/blob/main/PDF_Training_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Install necessary libraries
!pip install transformers datasets pypdf sentencepiece bitsandbytes accelerate huggingface_hub scipy --upgrade
!pip install nltk --upgrade

# Import libraries
import torch
import nltk
from pypdf import PdfReader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model  # Import from peft directly
from huggingface_hub import login

# Download punkt tokenizer data
nltk.download('punkt')

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Log in to Hugging Face
login(token="<ADD YOUR HUGGING FACE KEY HERE>")

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text
    return text

pdf_text = extract_text_from_pdf('/content/drive/My Drive/GEO-LGBT-Action-Plan.pdf')

# Prepare dataset
sentences = [s for s in pdf_text.split('\n') if s.strip()]  # Remove empty strings
data = {"text": sentences}
dataset = Dataset.from_dict(data)
print(f"Number of samples in dataset: {len(dataset)}")

if len(dataset) > 1:
    dataset = dataset.train_test_split(test_size=0.2, seed=42)
    train_dataset, eval_dataset = dataset["train"], dataset["test"]
else:
    train_dataset, eval_dataset = dataset, None

# Load Qwen 2.5-1.5B model and tokenizer with bitsandbytes
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model_name = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16
)

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True) if eval_dataset else None

# Adjust dataset for causal language modeling
tokenized_train_dataset = tokenized_train_dataset.map(lambda examples: {"input_ids": examples["input_ids"], "labels": examples["input_ids"]}, batched=True)
tokenized_eval_dataset = tokenized_eval_dataset.map(lambda examples: {"input_ids": examples["input_ids"], "labels": examples["input_ids"]}, batched=True) if tokenized_eval_dataset else None

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    fp16=True,
    push_to_hub=True,
    hub_model_id="Heather1911/Selkies"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset
)

trainer.train()





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Using device: cuda
Mounted at /content/drive
Number of samples in dataset: 943


Map:   0%|          | 0/754 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Map:   0%|          | 0/754 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

trainable params: 2,179,072 || all params: 1,545,893,376 || trainable%: 0.1410


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,6.452041
2,No log,3.629404


TrainOutput(global_step=141, training_loss=5.6504356438386525, metrics={'train_runtime': 603.5453, 'train_samples_per_second': 3.748, 'train_steps_per_second': 0.234, 'total_flos': 8983430698106880.0, 'train_loss': 5.6504356438386525, 'epoch': 2.9523809523809526})